Source code for jptranstokenizer.tokenization_utils

import collections
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import transformers
from transformers import (
    AddedToken,
    AlbertTokenizer,
    BertJapaneseTokenizer,
    PreTrainedTokenizer,
    logging,
)
from transformers.models.bert.tokenization_bert import (
    BasicTokenizer,
    WordpieceTokenizer,
    load_vocab,
)
from transformers.models.bert_japanese.tokenization_bert_japanese import (
    CharacterTokenizer,
    MecabTokenizer,
)


# Configure the ``transformers`` logging utilities when this module is imported:
# set the verbosity to INFO, enable the explicit log format, and obtain the
# logger object used by the rest of this module.
# NOTE(review): these calls run as an import side effect and are not scoped to
# this module's own logger -- confirm that this is intended.
logging.set_verbosity_info()
logging.enable_explicit_format()
logger = logging.get_logger()

# Preset keyword arguments for publicly available models, keyed by model id.
# When ``JapaneseTransformerTokenizer.from_pretrained`` receives one of these
# ids, every key/value pair of the matching entry is copied into its keyword
# arguments (replacing any value the caller passed under the same key).
# ``tokenizer_class`` names the transformers tokenizer class used to load the
# vocabulary; the other keys select and configure the word tokenizer.
PUBLIC_AVAILABLE_SETTING_MAP: Dict[str, Dict[str, Union[str, bool]]] = {
    "cl-tohoku/bert-base-japanese": {
        "word_tokenizer_type": "mecab",
        "tokenizer_class": "BertJapaneseTokenizer",
        "mecab_dic": "ipadic",
    },
    "cl-tohoku/bert-base-japanese-v2": {
        "word_tokenizer_type": "mecab",
        "tokenizer_class": "BertJapaneseTokenizer",
        "mecab_dic": "unidic_lite",
    },
    "cl-tohoku/bert-base-japanese-whole-word-masking": {
        "word_tokenizer_type": "mecab",
        "tokenizer_class": "BertJapaneseTokenizer",
        "mecab_dic": "ipadic",
    },
    "cl-tohoku/bert-base-japanese-char": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "tokenizer_class": "BertJapaneseTokenizer",
        "subword_tokenizer_type": "character",
    },
    "cl-tohoku/bert-base-japanese-char-whole-word-masking": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "tokenizer_class": "BertJapaneseTokenizer",
        "subword_tokenizer_type": "character",
    },
    "cl-tohoku/bert-large-japanese": {
        "word_tokenizer_type": "mecab",
        "tokenizer_class": "BertJapaneseTokenizer",
        "mecab_dic": "unidic_lite",
    },
    "ken11/albert-base-japanese-v1-with-japanese-tokenizer": {
        "word_tokenizer_type": "mecab",
        "tokenizer_class": "BertJapaneseTokenizer",
        "mecab_dic": "ipadic",
    },
    "ku-nlp/deberta-v2-base-japanese": {
        "word_tokenizer_type": "juman",
        "tokenizer_class": "DebertaV2Tokenizer",
        "do_subword_by_word": False,
    },
    "ku-nlp/deberta-v2-large-japanese": {
        "word_tokenizer_type": "juman",
        "tokenizer_class": "DebertaV2Tokenizer",
        "do_subword_by_word": False,
    },
    "ku-nlp/deberta-v2-tiny-japanese": {
        "word_tokenizer_type": "juman",
        "tokenizer_class": "DebertaV2Tokenizer",
        "do_subword_by_word": False,
    },
    "nlp-waseda/roberta-base-japanese": {
        "word_tokenizer_type": "juman",
        "tokenizer_class": "AlbertTokenizer",
        "do_subword_by_word": False,
    },
    "nlp-waseda/roberta-large-japanese": {
        "word_tokenizer_type": "juman",
        "tokenizer_class": "AlbertTokenizer",
        "do_subword_by_word": False,
    },
    "nlp-waseda/roberta-large-japanese-seq512": {
        "word_tokenizer_type": "juman",
        "tokenizer_class": "AlbertTokenizer",
        "do_subword_by_word": False,
    },
    # Word tokenization is switched off for this model, so the word tokenizer
    # type is left empty.
    "rinna/japanese-roberta-base": {
        "do_word_tokenize": False,
        "word_tokenizer_type": "",
        "tokenizer_class": "T5Tokenizer",
    },
}

# Presets for the izumi-lab models: every model listed below shares the same
# MeCab (ipadic) + BertJapaneseTokenizer configuration. A fresh settings dict
# is created per model so that the entries do not alias each other.
IZUMILAB_SETTING_MAP: Dict[str, Dict[str, str]] = {
    "izumi-lab/" + name: dict(
        word_tokenizer_type="mecab",
        tokenizer_class="BertJapaneseTokenizer",
        mecab_dic="ipadic",
    )
    for name in (
        "bert-small-japanese",
        "bert-small-japanese-fin",
        "electra-base-japanese-discriminator",
        "electra-base-japanese-generator",
        "electra-small-japanese-discriminator",
        "electra-small-japanese-fin-discriminator",
        "electra-small-japanese-fin-generator",
        "electra-small-japanese-generator",
        "electra-small-paper-japanese-discriminator",
        "electra-small-paper-japanese-fin-discriminator",
        "electra-small-paper-japanese-fin-generator",
        "electra-small-paper-japanese-generator",
    )
}

# Merge the izumi-lab presets into the public map so that a single lookup in
# PUBLIC_AVAILABLE_SETTING_MAP covers both groups of models.
PUBLIC_AVAILABLE_SETTING_MAP.update(IZUMILAB_SETTING_MAP)


def get_word_tokenizer(
    word_tokenizer_type: str,
    normalize_text: bool = True,
    ignore_max_byte_error: bool = False,
    do_lower_case: bool = False,
    mecab_dic: Optional[str] = "ipadic",
    mecab_option: Optional[str] = None,
    sudachi_split_mode: Optional[str] = "A",
    sudachi_config_path: Optional[str] = None,
    sudachi_resource_dir: Optional[str] = None,
    sudachi_dict_type: Optional[str] = "core",
):
    """Build the main-word tokenizer selected by ``word_tokenizer_type``.

    You can import this function shortly:

    .. code-block:: none

       >> from jptranstokenizer import get_word_tokenizer

    Args:
        word_tokenizer_type (``str``):
            Type of word tokenizer (required). One of ``"mecab"``, ``"juman"``,
            ``"spacy-luw"``, ``"sudachi"``, ``"basic"``, or ``"none"`` (only
            normalize texts).
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization to text before tokenization.
            Not forwarded to the ``"basic"`` tokenizer (a warning is logged).
        ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
            Whether or not to ignore the max-bytes error. Only forwarded to the
            Juman and Sudachi tokenizers.
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the input when tokenizing.
        mecab_dic (``str``, *optional*, defaults to ``"ipadic"``):
            (For MeCab) Name of the dictionary used for MeCab initialization,
            e.g. ``"ipadic"``, ``"unidic"``, or ``"unidic_lite"``. For a
            system-installed dictionary set this to ``None`` and use
            *mecab_option*.
        mecab_option (``str``, *optional*):
            (For MeCab) String passed to the MeCab constructor.
        sudachi_split_mode (``str``, *optional*, defaults to ``"A"``):
            (For Sudachi) The mode of splitting: ``"A"``, ``"B"``, or ``"C"``.
        sudachi_config_path (``str``, *optional*):
            (For Sudachi) Path to a SudachiPy config file used for dictionary
            initialization.
        sudachi_resource_dir (``str``, *optional*):
            (For Sudachi) Path to a resource dir containing resource files,
            such as ``"sudachi.json"``.
        sudachi_dict_type (``str``, *optional*, defaults to ``"core"``):
            (For Sudachi) Dictionary type: ``"small"``, ``"core"``, or
            ``"full"``.

    Returns:
        The constructed word tokenizer object.

    Raises:
        ValueError: If ``word_tokenizer_type`` is not one of the supported
            values.
    """
    if word_tokenizer_type == "basic":
        logger.warning("Argument normalize_text is ignored")
        return BasicTokenizer(do_lower_case=do_lower_case, tokenize_chinese_chars=False)

    if word_tokenizer_type == "mecab":
        return MecabTokenizer(
            do_lower_case=do_lower_case,
            normalize_text=normalize_text,
            mecab_dic=mecab_dic,
            mecab_option=mecab_option,
        )

    # The remaining backends live in ``.mainword`` and are imported only when
    # requested, so they are not loaded unless actually selected.
    if word_tokenizer_type == "juman":
        from .mainword import JumanTokenizer

        return JumanTokenizer(
            do_lower_case=do_lower_case,
            normalize_text=normalize_text,
            ignore_max_byte_error=ignore_max_byte_error,
        )

    if word_tokenizer_type == "spacy-luw":
        from .mainword import SpacyluwTokenizer

        return SpacyluwTokenizer(
            do_lower_case=do_lower_case, normalize_text=normalize_text
        )

    if word_tokenizer_type == "sudachi":
        from .mainword import SudachiTokenizer

        return SudachiTokenizer(
            do_lower_case=do_lower_case,
            normalize_text=normalize_text,
            ignore_max_byte_error=ignore_max_byte_error,
            split_mode=sudachi_split_mode,
            config_path=sudachi_config_path,
            resource_dir=sudachi_resource_dir,
            dict_type=sudachi_dict_type,
        )

    if word_tokenizer_type == "none":
        from .mainword import Normalizer

        return Normalizer(do_lower_case=do_lower_case, normalize_text=normalize_text)

    raise ValueError(
        f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified."
    )
class JapaneseTransformerTokenizer(BertJapaneseTokenizer):
    """Japanese tokenizer of main and sub word.

    Inherited from ``transformers.BertJapaneseTokenizer``.
    You can import this module shortly:

    .. code-block:: none

       >> from jptranstokenizer import JapaneseTransformerTokenizer

    Args:
        vocab_file (``str`` or ``os.PathLike``, *optional*):
            Path to the vocabulary file. Must point to an existing file when
            ``subword_tokenizer_type`` is ``"wordpiece"`` or ``"character"``
            and the instance is not created through ``from_pretrained``.
        word_tokenizer_type (``str``, defaults to ``"basic"``):
            Type of word tokenizer.
            ``"mecab"``, ``"juman"``, ``"spacy-luw"``, ``"sudachi"``,
            ``"basic"``, ``"none"`` (only normalize texts) can be specified.
        subword_tokenizer_type (``str``, defaults to ``"wordpiece"``):
            Type of subword tokenizer.
            ``"wordpiece"``, ``"sentencepiece"``, ``"character"`` (split by
            one token) can be specified.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization to text before tokenization.
        ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
            Whether or not to ignore error of max bytes (only valid with Juman
            and Sudachi). If valid, the tokenizer returns an empty list.
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the input when tokenizing.
        do_word_tokenize (``bool``, *optional*, defaults to ``True``):
            Whether to do (main) word tokenization.
        do_subword_tokenize (``bool``, *optional*, defaults to ``True``):
            Whether to do subword tokenization.
        do_subword_by_word (``bool``, *optional*, defaults to ``True``):
            Whether to apply subword tokenization by word or not.
            In case ``False``, subword tokenization is performed on the whole
            input, joined with spaces, at once.
        unk_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token representing an out-of-vocabulary token.
        sep_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token separating two different sentences in the same
            input (used by BERT for instance).
        pad_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token used to make arrays of tokens the same size for
            batching purpose. Will then be ignored by attention mechanisms or
            loss computation.
        cls_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token representing the class of the input (used by BERT
            for instance).
        mask_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token representing a masked token (used by
            masked-language modeling pretraining objectives, like BERT).
        call_from_pretrained (``bool``, *optional*, defaults to ``False``):
            Whether ``__init__`` is called from ``from_pretrained``.
            You don't need to set manually.
        mecab_dic (``str``, *optional*, defaults to ``"ipadic"``):
            (For MeCab) Name of dictionary to be used for MeCab initialization.
            Maybe ``"ipadic"``, ``"unidic"``, ``"unidic_lite"`` is used.
            If you are using a system-installed dictionary, set this option to
            ``None`` and modify *mecab_option*.
        mecab_option (``str``, *optional*):
            (For MeCab) String passed to MeCab constructor.
        sudachi_split_mode (``str``, *optional*, defaults to ``"A"``):
            (For Sudachi) The mode of splitting.
            ``"A"``, ``"B"``, or ``"C"`` can be specified.
        sudachi_config_path (``str``, *optional*):
            (For Sudachi) Path to a config file of SudachiPy to be used for the
            sudachi dictionary initialization.
        sudachi_resource_dir (``str``, *optional*):
            (For Sudachi) Path to a resource dir containing resource files,
            such as ``"sudachi.json"``.
        sudachi_dict_type (``str``, *optional*, defaults to ``"core"``):
            (For Sudachi) Sudachi dictionary type to be used for tokenization.
            ``"small"``, ``"core"``, or ``"full"`` can be specified.
        sp_model_kwargs (``Dict[str, Any]``, *optional*):
            (For sentencepiece) Optional arguments for
            ``sentencepiece.SentencePieceProcessor``.

    Raises:
        ValueError: If a wordpiece/character vocabulary file is missing, or if
            ``subword_tokenizer_type`` is not a supported value.
    """

    def __init__(
        self,
        vocab_file: Optional[Union[str, os.PathLike]] = None,
        word_tokenizer_type: str = "basic",
        subword_tokenizer_type: str = "wordpiece",
        normalize_text: bool = True,
        ignore_max_byte_error: bool = False,
        do_lower_case: bool = False,
        do_word_tokenize: bool = True,
        do_subword_tokenize: bool = True,
        do_subword_by_word: bool = True,
        unk_token: Optional[Union[str, AddedToken]] = "[UNK]",
        sep_token: Optional[Union[str, AddedToken]] = "[SEP]",
        pad_token: Optional[Union[str, AddedToken]] = "[PAD]",
        cls_token: Optional[Union[str, AddedToken]] = "[CLS]",
        mask_token: Optional[Union[str, AddedToken]] = "[MASK]",
        call_from_pretrained: bool = False,
        mecab_dic: Optional[str] = "ipadic",
        mecab_option: Optional[str] = None,
        sudachi_split_mode: Optional[str] = "A",
        sudachi_config_path: Optional[str] = None,
        sudachi_resource_dir: Optional[str] = None,
        sudachi_dict_type: Optional[str] = "core",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        # Call the PreTrainedTokenizer initializer directly (not
        # BertJapaneseTokenizer.__init__): the word/subword tokenizers are
        # built below by this class instead.
        PreTrainedTokenizer.__init__(
            self,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            do_lower_case=do_lower_case,
            **kwargs,
        )
        self.do_word_tokenize = do_word_tokenize
        self.do_subword_tokenize = do_subword_tokenize
        self.do_subword_by_word = do_subword_by_word
        self.word_tokenizer_type = word_tokenizer_type
        self.subword_tokenizer_type = subword_tokenizer_type

        if do_word_tokenize:
            self.word_tokenizer = get_word_tokenizer(
                word_tokenizer_type=word_tokenizer_type,
                normalize_text=normalize_text,
                ignore_max_byte_error=ignore_max_byte_error,
                do_lower_case=do_lower_case,
                mecab_dic=mecab_dic,
                mecab_option=mecab_option,
                sudachi_split_mode=sudachi_split_mode,
                sudachi_config_path=sudachi_config_path,
                sudachi_resource_dir=sudachi_resource_dir,
                sudachi_dict_type=sudachi_dict_type,
            )

        # When called from ``from_pretrained`` the subword tokenizer and the
        # vocabulary are attached by the caller after construction.
        if self.do_subword_tokenize and not call_from_pretrained:
            if self.subword_tokenizer_type in ["wordpiece", "character"]:
                # ``os.path.isfile(None)`` raises TypeError, so check for a
                # missing path explicitly to report the intended ValueError.
                if vocab_file is None or not os.path.isfile(vocab_file):
                    raise ValueError(
                        f"Can't find a vocabulary file at path '{vocab_file}'.\n"
                        "To load the vocabulary from a Google pretrained model use "
                        "`AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                    )
                self.vocab = load_vocab(vocab_file)
                self.ids_to_tokens = collections.OrderedDict(
                    (ids, tok) for tok, ids in self.vocab.items()
                )
                if self.subword_tokenizer_type == "wordpiece":
                    self.subword_tokenizer = WordpieceTokenizer(
                        vocab=self.vocab, unk_token=self.unk_token
                    )
                elif self.subword_tokenizer_type == "character":
                    self.subword_tokenizer = CharacterTokenizer(
                        vocab=self.vocab, unk_token=self.unk_token
                    )
            elif self.subword_tokenizer_type == "sentencepiece":
                from .subword import SentencepieceTokenizer

                self.subword_tokenizer = SentencepieceTokenizer(
                    vocab_file=vocab_file, sp_model_kwargs=sp_model_kwargs
                )
                self.vocab = self.subword_tokenizer.vocab
                self.ids_to_tokens = collections.OrderedDict(
                    (i, self.subword_tokenizer.sp_model.IdToPiece(i))
                    for i in range(self.subword_tokenizer.bpe_vocab_size)
                )
            else:
                raise ValueError(
                    f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified."
                )

        # This is needed for leave special tokens as it is when tokenizing
        self.unique_no_split_tokens = list(self.special_tokens_map.values())

        if self.subword_tokenizer_type == "sentencepiece":
            # NOTE(review): this stores the plain function on the instance, so
            # it is not bound to ``self`` when called -- confirm that saving a
            # sentencepiece vocabulary works as intended.
            self.save_vocabulary = AlbertTokenizer.save_vocabulary

        if not call_from_pretrained:
            # Check all our special tokens are registered as "no split" token
            # (we don't cut them) and are in the vocab
            added_tokens = self.sanitize_special_tokens()
            if added_tokens:
                logger.warning_advice(
                    "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
                    " fine-tuned or trained."
                )

    @classmethod
    def from_pretrained(cls, tokenizer_name_or_path: Union[str, os.PathLike], **kwargs):
        """
        Instantiate a ``transformers.BertJapaneseTokenizer`` (or a derived class) from a predefined tokenizer.

        Args:
            tokenizer_name_or_path (``str`` or ``os.PathLike``):
                Can be either:

                - A string, the *model id* of a predefined tokenizer hosted
                  inside a model repo on huggingface.co. Valid model ids can be
                  namespaced under a user or organization name, like
                  ``cl-tohoku/bert-base-japanese``.
                - A path to a *directory* containing vocabulary files required
                  by the tokenizer, for instance saved using the
                  ``transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained``
                  method, e.g., ``./my_model_directory/``.
                - (**Deprecated**, not applicable to all derived classes) A
                  path or url to a single saved vocabulary file (if and only if
                  the tokenizer only requires a single vocabulary file like
                  Bert or XLNet), e.g., ``./my_model_directory/vocab.txt``.
            word_tokenizer_type (``str``):
                Type of word tokenizer. Must be specified when
                ``tokenizer_name_or_path`` is not in the supported list.
                ``"mecab"``, ``"juman"``, ``"spacy-luw"``, ``"sudachi"``,
                ``"basic"``, ``"none"`` (only normalize texts) can be specified.
            tokenizer_class (``str``, *optional*):
                Must be specified when ``tokenizer_name_or_path`` is not in the
                supported list.
                ``"AlbertTokenizer"``, ``"T5Tokenizer"``, and
                ``"BertJapaneseTokenizer"`` (whose classes are in transformers
                library) are available.
            normalize_text (``bool``, *optional*, defaults to ``True``):
                Whether to apply unicode normalization to text before
                tokenization.
            ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
                Whether or not to ignore error of max bytes (only valid with
                Juman and Sudachi). If valid, the tokenizer returns an empty
                list.
            do_lower_case (``bool``, *optional*, defaults to ``False``):
                Whether or not to lowercase the input when tokenizing.
            do_word_tokenize (``bool``, *optional*, defaults to ``True``):
                Whether to do (main) word tokenization.
            do_subword_by_word (``bool``, *optional*, defaults to ``True``):
                Whether to apply subword tokenization by word or not.
                In case ``False``, subword tokenization is performed on the
                whole input, joined with spaces, at once.
            mecab_dic (``str``, *optional*, defaults to ``"ipadic"``):
                (For MeCab) Name of dictionary to be used for MeCab
                initialization. Maybe ``"ipadic"``, ``"unidic"``,
                ``"unidic_lite"`` is used. If you are using a system-installed
                dictionary, set this option to ``None`` and modify
                *mecab_option*.
            mecab_option (``str``, *optional*):
                (For MeCab) String passed to MeCab constructor.
            sudachi_split_mode (``str``, *optional*, defaults to ``"A"``):
                (For Sudachi) The mode of splitting.
                ``"A"``, ``"B"``, or ``"C"`` can be specified.
            sudachi_config_path (``str``, *optional*):
                (For Sudachi) Path to a config file of SudachiPy to be used for
                the sudachi dictionary initialization.
            sudachi_resource_dir (``str``, *optional*):
                (For Sudachi) Path to a resource dir containing resource files,
                such as ``"sudachi.json"``.
            sudachi_dict_type (``str``, *optional*, defaults to ``"core"``):
                (For Sudachi) Sudachi dictionary type to be used for
                tokenization. ``"small"``, ``"core"``, or ``"full"`` can be
                specified.
            sp_model_kwargs (``Dict[str, Any]``, *optional*):
                (For sentencepiece) Optional arguments for
                ``sentencepiece.SentencePieceProcessor``.

        Raises:
            NotImplementedError: If the model id is one of the explicitly
                unsupported megagonlabs models, or the loaded tokenizer is of
                an unsupported kind.
            ValueError: If the model is not in the supported list and
                ``word_tokenizer_type`` or ``tokenizer_class`` is missing, or
                if ``tokenizer_class`` cannot be resolved to a class.
        """

        def _from_pretrained(
            tokenizer_class: str,
            word_tokenizer_type: str = "basic",
            normalize_text: bool = True,
            ignore_max_byte_error: bool = False,
            do_lower_case: bool = False,
            do_word_tokenize: bool = True,
            do_subword_by_word: bool = True,
            mecab_dic: Optional[str] = "ipadic",
            mecab_option: Optional[str] = None,
            sudachi_split_mode: Optional[str] = "A",
            sudachi_config_path: Optional[str] = None,
            sudachi_resource_dir: Optional[str] = None,
            sudachi_dict_type: Optional[str] = "core",
            sp_model_kwargs: Optional[Dict[str, Any]] = None,
            *init_inputs,
            **kwargs,
        ):
            # Load the model's original tokenizer first; its subword tokenizer,
            # vocabulary and special tokens are reused below.
            resolved_class = (
                transformers.models.auto.tokenization_auto.tokenizer_class_from_name(
                    tokenizer_class
                )
            )
            if resolved_class is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise ValueError(
                    f"Unknown tokenizer_class '{tokenizer_class}' is specified."
                )
            tentative_tokenizer = resolved_class.from_pretrained(
                tokenizer_name_or_path, *init_inputs, **kwargs
            )
            # NOTE(review): DebertaTokenizer is handled here as a sentencepiece
            # tokenizer exposing ``_tokenizer.spm`` -- confirm this holds for
            # that class.
            if isinstance(
                tentative_tokenizer,
                (
                    transformers.AlbertTokenizer,
                    transformers.DebertaTokenizer,
                    transformers.DebertaV2Tokenizer,
                    transformers.T5Tokenizer,
                ),
            ):
                # sentencepiece
                subword_tokenizer_type = "sentencepiece"
                if isinstance(
                    tentative_tokenizer,
                    (transformers.AlbertTokenizer, transformers.T5Tokenizer),
                ):
                    sp_model = tentative_tokenizer.sp_model
                else:
                    # Deberta or DebertaV2
                    sp_model = tentative_tokenizer._tokenizer.spm
                from .subword import SentencepieceTokenizer

                subword_tokenizer = SentencepieceTokenizer(
                    vocab_file=None, sp_model_kwargs=sp_model_kwargs, sp_model=sp_model
                )
                vocab = subword_tokenizer.vocab
                ids_to_tokens = collections.OrderedDict(
                    (i, subword_tokenizer.sp_model.IdToPiece(i))
                    for i in range(subword_tokenizer.bpe_vocab_size)
                )
            elif isinstance(tentative_tokenizer, BertJapaneseTokenizer):
                # WordPiece or character
                subword_tokenizer = tentative_tokenizer.subword_tokenizer
                if isinstance(subword_tokenizer, WordpieceTokenizer):
                    subword_tokenizer_type = "wordpiece"
                elif isinstance(subword_tokenizer, CharacterTokenizer):
                    subword_tokenizer_type = "character"
                else:
                    raise NotImplementedError()
                vocab = tentative_tokenizer.vocab
                ids_to_tokens = tentative_tokenizer.ids_to_tokens
            else:
                raise NotImplementedError()

            special_tokens = tentative_tokenizer.special_tokens_map
            tokenizer = cls(
                word_tokenizer_type=word_tokenizer_type,
                subword_tokenizer_type=subword_tokenizer_type,
                normalize_text=normalize_text,
                ignore_max_byte_error=ignore_max_byte_error,
                do_lower_case=do_lower_case,
                do_word_tokenize=do_word_tokenize,
                do_subword_tokenize=True,
                do_subword_by_word=do_subword_by_word,
                unk_token=special_tokens["unk_token"],
                sep_token=special_tokens["sep_token"],
                pad_token=special_tokens["pad_token"],
                cls_token=special_tokens["cls_token"],
                mask_token=special_tokens["mask_token"],
                call_from_pretrained=True,
                mecab_dic=mecab_dic,
                mecab_option=mecab_option,
                sudachi_split_mode=sudachi_split_mode,
                sudachi_config_path=sudachi_config_path,
                sudachi_resource_dir=sudachi_resource_dir,
                sudachi_dict_type=sudachi_dict_type,
            )
            # ``__init__`` skipped these because call_from_pretrained=True.
            tokenizer.subword_tokenizer = subword_tokenizer
            tokenizer.vocab = vocab
            tokenizer.ids_to_tokens = ids_to_tokens
            # This is needed for leave special tokens as it is when tokenizing
            tokenizer.unique_no_split_tokens = list(
                tokenizer.special_tokens_map.values()
            )
            # Check all our special tokens are registered as "no split" token
            # (we don't cut them) and are in the vocab
            added_tokens = tokenizer.sanitize_special_tokens()
            if added_tokens:
                logger.warning_advice(
                    "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
                    " fine-tuned or trained."
                )
            return tokenizer

        if tokenizer_name_or_path in [
            "megagonlabs/electra-base-japanese-discriminator",
            "megagonlabs/transformers-ud-japanese-electra-base-discriminator",
        ]:
            raise NotImplementedError(
                (
                    f"Loading {tokenizer_name_or_path} is not expected in this module.\n"
                    "Please use the official implementation."
                )
            )

        if tokenizer_name_or_path in PUBLIC_AVAILABLE_SETTING_MAP:
            # Known model: its preset values replace any same-named kwargs.
            dct_setting: Dict[str, Union[str, bool]] = PUBLIC_AVAILABLE_SETTING_MAP[
                tokenizer_name_or_path
            ]
            kwargs.update(dct_setting)
        else:
            if kwargs.get("word_tokenizer_type") is None:
                raise ValueError("word_tokenizer_type must be specified")
            if kwargs.get("tokenizer_class") is None:
                raise ValueError("tokenizer_class must be specified")
        return _from_pretrained(**kwargs)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into word tokens, then into subword tokens.

        Each stage is skipped when its ``do_*_tokenize`` flag is false. With
        ``do_subword_by_word`` false, the word tokens are joined with single
        spaces and subword-tokenized in one call.
        """
        if self.do_word_tokenize:
            tokens = self.word_tokenizer.tokenize(
                text, never_split=self.all_special_tokens
            )
        else:
            tokens = [text]

        if not self.do_subword_tokenize:
            return tokens

        if self.do_subword_by_word:
            return [
                sub_token
                for token in tokens
                for sub_token in self.subword_tokenizer.tokenize(token)
            ]
        return self.subword_tokenizer.tokenize(" ".join(tokens))

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a sequence of tokens back into a single string.

        Character and wordpiece tokens are handled by the parent class;
        sentencepiece tokens are decoded with the sentencepiece model.

        Raises:
            NotImplementedError: If ``subword_tokenizer_type`` is not one of
                ``"character"``, ``"wordpiece"``, or ``"sentencepiece"``.
        """
        if self.subword_tokenizer_type in ["character", "wordpiece"]:
            return super().convert_tokens_to_string(tokens)
        if self.subword_tokenizer_type == "sentencepiece":
            return self.subword_tokenizer.sp_model.decode(tokens)
        # Report the type name: the tokenizer object itself may not exist when
        # subword tokenization is disabled.
        raise NotImplementedError(
            f"{self.subword_tokenizer_type} is not allowed for convert_tokens_to_string"
        )