| from transformers.models.auto.tokenization_auto import get_class_from_dynamic_module | |
| from transformers.tokenization_utils import AddedToken | |
# Load the CodeGen2.5 tokenizer class and its tiktoken-backed encoder factory
# as remote code from the Hub repo "Salesforce/codegen25-7b-multi".
# NOTE(review): this executes code downloaded from the Hub at import time —
# requires network access (or a warm HF cache) and trust in that repo.
CodeGen25Tokenizer = get_class_from_dynamic_module("tokenization_codegen25.CodeGen25Tokenizer",
                                                   "Salesforce/codegen25-7b-multi")
# Factory function (not a class) that builds the underlying tiktoken encoding;
# presumably mirrors tiktoken_tokenizer() in the remote tokenization_codegen25.py — verify there.
tiktoken_tokenizer = get_class_from_dynamic_module("tokenization_codegen25.tiktoken_tokenizer",
                                                   "Salesforce/codegen25-7b-multi")
class DeciCoderTokenizer(CodeGen25Tokenizer):
    """Tokenizer for DeciCoder, a thin wrapper over the CodeGen2.5 tokenizer.

    Differences from the base class:
      * builds the tiktoken encoder itself before delegating to the parent
        ``__init__`` (the parent is assumed to read ``self.encoder`` — confirm
        against the remote ``tokenization_codegen25.py``);
      * ``_convert_id_to_token`` returns ``None`` instead of raising when an
        id cannot be decoded.
    """

    def __init__(
        self,
        pad_token=None,
        eos_token="<|endoftext|>",
        add_eos_token=False,
        add_special_tokens=True,
        **kwargs,
    ):
        """Build the tiktoken encoder and initialize the base tokenizer.

        Args:
            pad_token: Padding token (str, ``AddedToken``, or ``None``).
            eos_token: End-of-sequence token.
            add_eos_token: Whether an EOS token is appended on encoding.
            add_special_tokens: Forwarded to the tiktoken encoder factory
                and to the base class.
            **kwargs: Passed through to ``CodeGen25Tokenizer.__init__``.
        """
        self.add_eos_token = add_eos_token
        # Must be set before super().__init__ — the parent presumably uses
        # self.encoder during its own setup (TODO: confirm in remote code).
        self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
        # Wrap plain strings as AddedToken so they are matched without
        # stripping surrounding whitespace; pass non-str values through as-is.
        pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        super().__init__(
            pad_token=pad_token_added,
            eos_token=eos_token_added,
            add_eos_token=add_eos_token,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

    def _convert_id_to_token(self, index):
        """Return the token string for ``index``, or ``None`` if undecodable.

        The base implementation raises on unknown/invalid ids; DeciCoder
        callers apparently expect a ``None`` sentinel instead.
        """
        try:
            return super()._convert_id_to_token(index)
        # FIX: the original used a bare `except:`, which also swallows
        # SystemExit, KeyboardInterrupt, and GeneratorExit. Catch only
        # ordinary exceptions so interpreter-level signals still propagate.
        except Exception:
            return None