Source code for jptranstokenizer.tokenization_utils

import collections
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import transformers
from transformers import (
    AddedToken,
    AlbertTokenizer,
    BertJapaneseTokenizer,
    PreTrainedTokenizer,
    logging,
)
from transformers.models.bert.tokenization_bert import (
    BasicTokenizer,
    WordpieceTokenizer,
    load_vocab,
)
from transformers.models.bert_japanese.tokenization_bert_japanese import (
    CharacterTokenizer,
    MecabTokenizer,
)


# ``logging`` here is ``transformers.logging`` (imported above), not the
# standard-library module.
# NOTE(review): the two calls below run at import time and appear to change
# the logging configuration shared with the ``transformers`` library
# (verbosity level and record format) — confirm that importing this module is
# meant to have that global effect.
logging.set_verbosity_info()
logging.enable_explicit_format()
# Module-level logger used for warnings emitted by the functions below.
# NOTE(review): ``get_logger()`` is called without a name — presumably this
# returns the library-wide logger rather than one scoped to this module.
logger = logging.get_logger()

# Predefined tokenizer settings keyed by Hugging Face model id.
# Each value holds keyword arguments that are injected into
# ``JapaneseTransformerTokenizer.from_pretrained`` for that model.
# Models sharing the same shape of settings are grouped; the insertion order of
# the resulting mapping follows the order in which the groups are listed.
PUBLIC_AVAILABLE_SETTING_MAP: Dict[str, Dict[str, Union[str, bool]]] = {
    # MeCab word tokenization; only the MeCab dictionary differs.
    **{
        model_id: {
            "word_tokenizer_type": "mecab",
            "tokenizer_class": "BertJapaneseTokenizer",
            "mecab_dic": dictionary,
        }
        for model_id, dictionary in (
            ("cl-tohoku/bert-base-japanese", "ipadic"),
            ("cl-tohoku/bert-base-japanese-v2", "unidic_lite"),
            ("cl-tohoku/bert-base-japanese-whole-word-masking", "ipadic"),
        )
    },
    # MeCab word tokenization followed by character-level subwords.
    **{
        model_id: {
            "do_lower_case": False,
            "word_tokenizer_type": "mecab",
            "tokenizer_class": "BertJapaneseTokenizer",
            "subword_tokenizer_type": "character",
        }
        for model_id in (
            "cl-tohoku/bert-base-japanese-char",
            "cl-tohoku/bert-base-japanese-char-whole-word-masking",
        )
    },
    # Remaining MeCab-based models.
    **{
        model_id: {
            "word_tokenizer_type": "mecab",
            "tokenizer_class": "BertJapaneseTokenizer",
            "mecab_dic": dictionary,
        }
        for model_id, dictionary in (
            ("cl-tohoku/bert-large-japanese", "unidic_lite"),
            ("ken11/albert-base-japanese-v1-with-japanese-tokenizer", "ipadic"),
        )
    },
    # Juman word tokenization; subword tokenization is applied to the whole
    # space-joined input instead of word by word.
    **{
        model_id: {
            "word_tokenizer_type": "juman",
            "tokenizer_class": loader_class,
            "do_subword_by_word": False,
        }
        for model_id, loader_class in (
            ("ku-nlp/deberta-v2-base-japanese", "DebertaV2Tokenizer"),
            ("ku-nlp/deberta-v2-large-japanese", "DebertaV2Tokenizer"),
            ("ku-nlp/deberta-v2-tiny-japanese", "DebertaV2Tokenizer"),
            ("nlp-waseda/roberta-base-japanese", "AlbertTokenizer"),
            ("nlp-waseda/roberta-large-japanese", "AlbertTokenizer"),
            ("nlp-waseda/roberta-large-japanese-seq512", "AlbertTokenizer"),
        )
    },
    # No word tokenization at all.
    "rinna/japanese-roberta-base": {
        "do_word_tokenize": False,
        "word_tokenizer_type": "",
        "tokenizer_class": "T5Tokenizer",
    },
}

# All izumi-lab models listed here share one identical MeCab/ipadic setting;
# every model id still gets its own dict object.
IZUMILAB_SETTING_MAP: Dict[str, Dict[str, str]] = dict(
    (
        "izumi-lab/" + suffix,
        {
            "word_tokenizer_type": "mecab",
            "tokenizer_class": "BertJapaneseTokenizer",
            "mecab_dic": "ipadic",
        },
    )
    for suffix in (
        "bert-small-japanese",
        "bert-small-japanese-fin",
        "electra-base-japanese-discriminator",
        "electra-base-japanese-generator",
        "electra-small-japanese-discriminator",
        "electra-small-japanese-fin-discriminator",
        "electra-small-japanese-fin-generator",
        "electra-small-japanese-generator",
        "electra-small-paper-japanese-discriminator",
        "electra-small-paper-japanese-fin-discriminator",
        "electra-small-paper-japanese-fin-generator",
        "electra-small-paper-japanese-generator",
    )
)

# Expose the izumi-lab entries through the public map as well.
PUBLIC_AVAILABLE_SETTING_MAP.update(IZUMILAB_SETTING_MAP)


def get_word_tokenizer(
    word_tokenizer_type: str,
    normalize_text: bool = True,
    ignore_max_byte_error: bool = False,
    do_lower_case: bool = False,
    mecab_dic: Optional[str] = "ipadic",
    mecab_option: Optional[str] = None,
    sudachi_split_mode: Optional[str] = "A",
    sudachi_config_path: Optional[str] = None,
    sudachi_resource_dir: Optional[str] = None,
    sudachi_dict_type: Optional[str] = "core",
):
    """Load a main word tokenizer dynamically.

    You can import this module shortly:

    .. code-block:: none

       >> from jptranstokenizer import get_word_tokenizer

    Args:
        word_tokenizer_type (``str``):
            Type of word tokenizer (required, there is no default).
            ``"mecab"``, ``"juman"``, ``"spacy-luw"``, ``"sudachi"``, ``"basic"``, ``"none"`` (only normalize texts) can be specified.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization to text before tokenization.
            Not forwarded (and a warning is logged) when *word_tokenizer_type* is ``"basic"``.
        ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
            Whether or not to ignore error of max bytes (only forwarded to Juman and Sudachi).
            If enabled, the tokenizer returns an empty list.
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the input when tokenizing.
        mecab_dic (``str``, *optional*, defaults to ``"ipadic"``):
            (For MeCab) Name of dictionary to be used for MeCab initialization.
            Maybe ``"ipadic"``, ``"unidic"``, or ``"unidic_lite"`` is used.
            If you are using a system-installed dictionary, set this option to ``None`` and modify *mecab_option*.
        mecab_option (``str``, *optional*):
            (For MeCab) String passed to MeCab constructor.
        sudachi_split_mode (``str``, *optional*, defaults to ``"A"``):
            (For Sudachi) The mode of splitting. ``"A"``, ``"B"``, or ``"C"`` can be specified.
        sudachi_config_path (``str``, *optional*):
            (For Sudachi) Path to a config file of SudachiPy to be used for the sudachi dictionary initialization.
        sudachi_resource_dir (``str``, *optional*):
            (For Sudachi) Path to a resource dir containing resource files, such as ``"sudachi.json"``.
        sudachi_dict_type (``str``, *optional*, defaults to ``"core"``):
            (For Sudachi) Sudachi dictionary type to be used for tokenization.
            ``"small"``, ``"core"``, or ``"full"`` can be specified.

    Returns:
        The word tokenizer instance selected by *word_tokenizer_type*.

    Raises:
        ValueError: If *word_tokenizer_type* is not one of the supported names.
    """
    if word_tokenizer_type == "basic":
        # BasicTokenizer is not given normalize_text, so tell the caller.
        logger.warning("Argument normalize_text is ignored")
        return BasicTokenizer(
            do_lower_case=do_lower_case, tokenize_chinese_chars=False
        )

    if word_tokenizer_type == "mecab":
        return MecabTokenizer(
            do_lower_case=do_lower_case,
            normalize_text=normalize_text,
            mecab_dic=mecab_dic,
            mecab_option=mecab_option,
        )

    # The package-local tokenizers below are imported only in the branch that
    # needs them (presumably so their optional dependencies are required only
    # when that tokenizer is actually requested).
    if word_tokenizer_type == "juman":
        from .mainword import JumanTokenizer

        return JumanTokenizer(
            do_lower_case=do_lower_case,
            normalize_text=normalize_text,
            ignore_max_byte_error=ignore_max_byte_error,
        )

    if word_tokenizer_type == "spacy-luw":
        from .mainword import SpacyluwTokenizer

        return SpacyluwTokenizer(
            do_lower_case=do_lower_case, normalize_text=normalize_text
        )

    if word_tokenizer_type == "sudachi":
        from .mainword import SudachiTokenizer

        return SudachiTokenizer(
            do_lower_case=do_lower_case,
            normalize_text=normalize_text,
            ignore_max_byte_error=ignore_max_byte_error,
            split_mode=sudachi_split_mode,
            config_path=sudachi_config_path,
            resource_dir=sudachi_resource_dir,
            dict_type=sudachi_dict_type,
        )

    if word_tokenizer_type == "none":
        from .mainword import Normalizer

        return Normalizer(
            do_lower_case=do_lower_case, normalize_text=normalize_text
        )

    raise ValueError(
        f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified."
    )


class JapaneseTransformerTokenizer(BertJapaneseTokenizer):
    """Japanese tokenizer of main and sub word.

    Inherited from ``transformers.BertJapaneseTokenizer``.
    You can import this module shortly:

    .. code-block:: none

       >> from jptranstokenizer import JapaneseTransformerTokenizer

    Args:
        vocab_file (``str`` or ``os.PathLike``, *optional*):
            Path to the vocabulary file.
            Must point to an existing file when *subword_tokenizer_type* is ``"wordpiece"`` or ``"character"``
            (unless the instance is created through ``from_pretrained``).
            For ``"sentencepiece"`` it is passed on to the sentencepiece subword tokenizer.
        word_tokenizer_type (``str``, defaults to ``"basic"``):
            Type of word tokenizer. "mecab", "juman", "spacy-luw", "sudachi", "basic", "none" (only normalize texts) can be specified.
        subword_tokenizer_type (``str``, defaults to ``"wordpiece"``):
            Type of subword tokenizer. "wordpiece", "sentencepiece", "character" (split by one token) can be specified.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization to text before tokenization.
        ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
            Whether or not to ignore error of max bytes (only valid with Juman and Sudachi).
            If enabled, the tokenizer returns an empty list.
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the input when tokenizing.
        do_word_tokenize (``bool``, *optional*, defaults to ``True``):
            Whether to do (main) word tokenization.
        do_subword_tokenize (``bool``, *optional*, defaults to ``True``):
            Whether to do subword tokenization.
        do_subword_by_word (``bool``, *optional*, defaults to ``True``):
            Whether to apply subword tokenization by word or not.
            In case ``False``, subword tokenization is performed on the whole input, joined with spaces, at once.
        unk_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token representing an out-of-vocabulary token.
        sep_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token separating two different sentences in the same input (used by BERT for instance).
        pad_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        cls_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token representing the class of the input (used by BERT for instance).
        mask_token (``str`` or ``tokenizers.AddedToken``, *optional*):
            A special token used for masking (used by masked-language modeling pretraining objectives, like
            BERT).
        call_from_pretrained (``bool``, *optional*, defaults to ``False``):
            Whether `__init__` is called from `from_pretrained`.
            When ``True``, loading of the vocabulary / subword tokenizer and the special-token check are skipped
            here because `from_pretrained` performs them. You don't need to set this manually.
        mecab_dic (``str``, *optional*, defaults to ``"ipadic"``):
            (For MeCab) Name of dictionary to be used for MeCab initialization.
            Maybe ``"ipadic"``, ``"unidic"``, ``"unidic_lite"`` is used.
            If you are using a system-installed dictionary, set this option to ``None`` and modify *mecab_option*.
        mecab_option (``str``, *optional*):
            (For MeCab) String passed to MeCab constructor.
        sudachi_split_mode (``str``, *optional*, defaults to ``"A"``):
            (For Sudachi) The mode of splitting. ``"A"``, ``"B"``, or ``"C"`` can be specified.
        sudachi_config_path (``str``, *optional*):
            (For Sudachi) Path to a config file of SudachiPy to be used for the sudachi dictionary initialization.
        sudachi_resource_dir (``str``, *optional*):
            (For Sudachi) Path to a resource dir containing resource files, such as ``"sudachi.json"``.
        sudachi_dict_type (``str``, *optional*, defaults to ``"core"``):
            (For Sudachi) Sudachi dictionary type to be used for tokenization.
            ``"small"``, ``"core"``, or ``"full"`` can be specified.
        sp_model_kwargs (``Dict[str, Any]``, *optional*):
            (For sentencepiece) Optional arguments for ``sentencepiece.SentencePieceProcessor``.

    Raises:
        ValueError: If the vocabulary file is missing for ``"wordpiece"`` / ``"character"``, or if
            *subword_tokenizer_type* (or *word_tokenizer_type*) is not a supported name.
    """

    def __init__(
        self,
        vocab_file: Optional[Union[str, os.PathLike]] = None,
        word_tokenizer_type: str = "basic",
        subword_tokenizer_type: str = "wordpiece",
        normalize_text: bool = True,
        ignore_max_byte_error: bool = False,
        do_lower_case: bool = False,
        do_word_tokenize: bool = True,
        do_subword_tokenize: bool = True,
        do_subword_by_word: bool = True,
        unk_token: Optional[Union[str, AddedToken]] = "[UNK]",
        sep_token: Optional[Union[str, AddedToken]] = "[SEP]",
        pad_token: Optional[Union[str, AddedToken]] = "[PAD]",
        cls_token: Optional[Union[str, AddedToken]] = "[CLS]",
        mask_token: Optional[Union[str, AddedToken]] = "[MASK]",
        call_from_pretrained: bool = False,
        mecab_dic: Optional[str] = "ipadic",
        mecab_option: Optional[str] = None,
        sudachi_split_mode: Optional[str] = "A",
        sudachi_config_path: Optional[str] = None,
        sudachi_resource_dir: Optional[str] = None,
        sudachi_dict_type: Optional[str] = "core",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        # PreTrainedTokenizer.__init__ is called directly (not super().__init__),
        # so BertJapaneseTokenizer.__init__ is not executed for this class.
        PreTrainedTokenizer.__init__(
            self,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            do_lower_case=do_lower_case,
            **kwargs,
        )
        self.do_word_tokenize = do_word_tokenize
        self.do_subword_tokenize = do_subword_tokenize
        self.do_subword_by_word = do_subword_by_word
        self.word_tokenizer_type = word_tokenizer_type
        self.subword_tokenizer_type = subword_tokenizer_type

        if do_word_tokenize:
            self.word_tokenizer = get_word_tokenizer(
                word_tokenizer_type=word_tokenizer_type,
                normalize_text=normalize_text,
                ignore_max_byte_error=ignore_max_byte_error,
                do_lower_case=do_lower_case,
                mecab_dic=mecab_dic,
                mecab_option=mecab_option,
                sudachi_split_mode=sudachi_split_mode,
                sudachi_config_path=sudachi_config_path,
                sudachi_resource_dir=sudachi_resource_dir,
                sudachi_dict_type=sudachi_dict_type,
            )

        # When called from ``from_pretrained`` the vocabulary and the subword
        # tokenizer are attached to the instance afterwards, so skip loading.
        if self.do_subword_tokenize and not call_from_pretrained:
            if self.subword_tokenizer_type in ["wordpiece", "character"]:
                # ``os.path.isfile(None)`` raises TypeError, so the default
                # ``vocab_file=None`` is checked explicitly to report the
                # intended ValueError instead.
                if vocab_file is None or not os.path.isfile(vocab_file):
                    raise ValueError(
                        f"Can't find a vocabulary file at path '{vocab_file}'.\n"
                        "To load the vocabulary from a Google pretrained model use "
                        "`AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                    )
                self.vocab = load_vocab(vocab_file)
                self.ids_to_tokens = collections.OrderedDict(
                    [(ids, tok) for tok, ids in self.vocab.items()]
                )

            if self.subword_tokenizer_type == "wordpiece":
                self.subword_tokenizer = WordpieceTokenizer(
                    vocab=self.vocab, unk_token=self.unk_token
                )
            elif self.subword_tokenizer_type == "character":
                self.subword_tokenizer = CharacterTokenizer(
                    vocab=self.vocab, unk_token=self.unk_token
                )
            elif self.subword_tokenizer_type == "sentencepiece":
                from .subword import SentencepieceTokenizer

                self.subword_tokenizer = SentencepieceTokenizer(
                    vocab_file=vocab_file, sp_model_kwargs=sp_model_kwargs
                )
                self.vocab = self.subword_tokenizer.vocab
                self.ids_to_tokens = collections.OrderedDict(
                    [
                        (i, self.subword_tokenizer.sp_model.IdToPiece(i))
                        for i in range(self.subword_tokenizer.bpe_vocab_size)
                    ]
                )
            else:
                raise ValueError(
                    f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified."
                )
        # Needed to leave special tokens as they are when tokenizing.
        self.unique_no_split_tokens = list(self.special_tokens_map.values())
        if self.subword_tokenizer_type == "sentencepiece":
            # NOTE(review): this stores the plain function taken from the
            # AlbertTokenizer class on the instance, so it is not bound to
            # ``self`` when called as ``tokenizer.save_vocabulary(...)`` —
            # confirm that saving a sentencepiece vocabulary works as intended.
            self.save_vocabulary = AlbertTokenizer.save_vocabulary

        if not call_from_pretrained:
            # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
            added_tokens = self.sanitize_special_tokens()
            if added_tokens:
                logger.warning_advice(
                    "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
                    " fine-tuned or trained."
                )

    @classmethod
    def from_pretrained(cls, tokenizer_name_or_path: Union[str, os.PathLike], **kwargs):
        """
        Instantiate this tokenizer from a predefined tokenizer.

        The tokenizer named by *tokenizer_class* is loaded first; its subword tokenizer, vocabulary and
        special tokens are then reused by the returned instance.
        For model ids listed in ``PUBLIC_AVAILABLE_SETTING_MAP`` the predefined settings are applied
        and take precedence over keyword arguments of the same name.

        Args:
            tokenizer_name_or_path (``str`` or ``os.PathLike``):
                Can be either:

                - A string, the *model id* of a predefined tokenizer hosted inside
                  a model repo on huggingface.co. Valid model ids can be namespaced under a user or organization name, like ``cl-tohoku/bert-base-japanese``.
                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                  using the ``transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`` method, e.g.,
                  ``./my_model_directory/``.
                - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
                  file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
                  ``./my_model_directory/vocab.txt``.
            word_tokenizer_type (``str``):
                Must be specified when `tokenizer_name_or_path` is not in the supported list.
                Type of word tokenizer. ``"mecab"``, ``"juman"``, ``"spacy-luw"``, ``"sudachi"``, ``"basic"``, ``"none"`` (only normalize texts) can be specified.
            tokenizer_class (``str``, *optional*):
                Must be specified when `tokenizer_name_or_path` is not in the supported list.
                ``"AlbertTokenizer"``, ``"DebertaV2Tokenizer"``, ``"T5Tokenizer"``, and ``"BertJapaneseTokenizer"`` (whose classes are in transformers library) are available.
            normalize_text (``bool``, *optional*, defaults to ``True``):
                Whether to apply unicode normalization to text before tokenization.
            ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
                Whether or not to ignore error of max bytes (only valid with Juman and Sudachi).
                If enabled, the tokenizer returns an empty list.
            do_lower_case (``bool``, *optional*, defaults to ``False``):
                Whether or not to lowercase the input when tokenizing.
            do_word_tokenize (``bool``, *optional*, defaults to ``True``):
                Whether to do (main) word tokenization.
            do_subword_by_word (``bool``, *optional*, defaults to ``True``):
                Whether to apply subword tokenization by word or not.
                In case ``False``, subword tokenization is performed on the whole input, joined with spaces, at once.
            mecab_dic (``str``, *optional*, defaults to ``"ipadic"``):
                (For MeCab) Name of dictionary to be used for MeCab initialization.
                Maybe ``"ipadic"``, ``"unidic"``, ``"unidic_lite"`` is used.
                If you are using a system-installed dictionary, set this option to `None` and modify *mecab_option*.
            mecab_option (``str``, *optional*):
                (For MeCab) String passed to MeCab constructor.
            sudachi_split_mode (``str``, *optional*, defaults to ``"A"``):
                (For Sudachi) The mode of splitting. ``"A"``, ``"B"``, or ``"C"`` can be specified.
            sudachi_config_path (``str``, *optional*):
                (For Sudachi) Path to a config file of SudachiPy to be used for the sudachi dictionary initialization.
            sudachi_resource_dir (``str``, *optional*):
                (For Sudachi) Path to a resource dir containing resource files, such as ``"sudachi.json"``.
            sudachi_dict_type (``str``, *optional*, defaults to ``"core"``):
                (For Sudachi) Sudachi dictionary type to be used for tokenization.
                ``"small"``, ``"core"``, or ``"full"`` can be specified.
            sp_model_kwargs (``Dict[str, Any]``, *optional*):
                (For sentencepiece) Optional arguments for ``sentencepiece.SentencePieceProcessor``.

        Returns:
            An instance of this class with subword tokenization enabled.

        Raises:
            NotImplementedError: If the model id is one of the explicitly unsupported megagonlabs models,
                or the loaded tokenizer / its subword tokenizer is of an unsupported class.
            ValueError: If `tokenizer_name_or_path` is not in the supported list and
                `word_tokenizer_type` or `tokenizer_class` is missing, or if `tokenizer_class` is unknown.
        """

        def _from_pretrained(
            tokenizer_class: str,
            word_tokenizer_type: str = "basic",
            normalize_text: bool = True,
            ignore_max_byte_error: bool = False,
            do_lower_case: bool = False,
            do_word_tokenize: bool = True,
            do_subword_by_word: bool = True,
            mecab_dic: Optional[str] = "ipadic",
            mecab_option: Optional[str] = None,
            sudachi_split_mode: Optional[str] = "A",
            sudachi_config_path: Optional[str] = None,
            sudachi_resource_dir: Optional[str] = None,
            sudachi_dict_type: Optional[str] = "core",
            sp_model_kwargs: Optional[Dict[str, Any]] = None,
            *init_inputs,
            **kwargs,
        ):
            # Resolve the class name to the transformers tokenizer class.
            tokenizer_cls = (
                transformers.models.auto.tokenization_auto.tokenizer_class_from_name(
                    tokenizer_class
                )
            )
            if tokenizer_cls is None:
                # Fail with a clear message instead of an AttributeError on
                # ``None.from_pretrained`` below.
                raise ValueError(
                    f"Invalid tokenizer_class '{tokenizer_class}' is specified."
                )
            # Load the original tokenizer only to borrow its subword
            # tokenizer, vocabulary and special tokens.
            tentative_tokenizer = tokenizer_cls.from_pretrained(
                tokenizer_name_or_path, *init_inputs, **kwargs
            )
            if isinstance(
                tentative_tokenizer,
                (
                    transformers.AlbertTokenizer,
                    transformers.DebertaTokenizer,
                    transformers.DebertaV2Tokenizer,
                    transformers.T5Tokenizer,
                ),
            ):
                # sentencepiece
                subword_tokenizer_type = "sentencepiece"
                if isinstance(
                    tentative_tokenizer,
                    (transformers.AlbertTokenizer, transformers.T5Tokenizer),
                ):
                    sp_model = tentative_tokenizer.sp_model
                else:
                    # Deberta or DebertaV2
                    sp_model = tentative_tokenizer._tokenizer.spm
                from .subword import SentencepieceTokenizer

                subword_tokenizer = SentencepieceTokenizer(
                    vocab_file=None, sp_model_kwargs=sp_model_kwargs, sp_model=sp_model
                )
                vocab = subword_tokenizer.vocab
                ids_to_tokens = collections.OrderedDict(
                    [
                        (i, subword_tokenizer.sp_model.IdToPiece(i))
                        for i in range(subword_tokenizer.bpe_vocab_size)
                    ]
                )
            elif isinstance(tentative_tokenizer, BertJapaneseTokenizer):
                # WordPiece or character
                subword_tokenizer = tentative_tokenizer.subword_tokenizer
                if isinstance(subword_tokenizer, WordpieceTokenizer):
                    subword_tokenizer_type = "wordpiece"
                elif isinstance(subword_tokenizer, CharacterTokenizer):
                    subword_tokenizer_type = "character"
                else:
                    raise NotImplementedError()
                vocab = tentative_tokenizer.vocab
                ids_to_tokens = tentative_tokenizer.ids_to_tokens
            else:
                raise NotImplementedError()

            special_tokens = tentative_tokenizer.special_tokens_map
            tokenizer = cls(
                word_tokenizer_type=word_tokenizer_type,
                subword_tokenizer_type=subword_tokenizer_type,
                normalize_text=normalize_text,
                ignore_max_byte_error=ignore_max_byte_error,
                do_lower_case=do_lower_case,
                do_word_tokenize=do_word_tokenize,
                do_subword_tokenize=True,
                do_subword_by_word=do_subword_by_word,
                unk_token=special_tokens["unk_token"],
                sep_token=special_tokens["sep_token"],
                pad_token=special_tokens["pad_token"],
                cls_token=special_tokens["cls_token"],
                mask_token=special_tokens["mask_token"],
                call_from_pretrained=True,
                mecab_dic=mecab_dic,
                mecab_option=mecab_option,
                sudachi_split_mode=sudachi_split_mode,
                sudachi_config_path=sudachi_config_path,
                sudachi_resource_dir=sudachi_resource_dir,
                sudachi_dict_type=sudachi_dict_type,
            )
            # __init__ skipped these because call_from_pretrained=True.
            tokenizer.subword_tokenizer = subword_tokenizer
            tokenizer.vocab = vocab
            tokenizer.ids_to_tokens = ids_to_tokens

            # Needed to leave special tokens as they are when tokenizing.
            tokenizer.unique_no_split_tokens = list(
                tokenizer.special_tokens_map.values()
            )
            # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
            added_tokens = tokenizer.sanitize_special_tokens()
            if added_tokens:
                logger.warning_advice(
                    "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
                    " fine-tuned or trained."
                )
            return tokenizer

        if tokenizer_name_or_path in [
            "megagonlabs/electra-base-japanese-discriminator",
            "megagonlabs/transformers-ud-japanese-electra-base-discriminator",
        ]:
            raise NotImplementedError(
                (
                    f"Loading {tokenizer_name_or_path} is not expected in this module.\n"
                    "Please use the official implementation."
                )
            )

        if tokenizer_name_or_path in PUBLIC_AVAILABLE_SETTING_MAP:
            dct_setting: Dict[str, Union[str, bool]] = PUBLIC_AVAILABLE_SETTING_MAP[
                tokenizer_name_or_path
            ]
            # Predefined settings override user-supplied values of the same key.
            kwargs.update(dct_setting)
        else:
            if kwargs.get("word_tokenizer_type") is None:
                raise ValueError("word_tokenizer_type must be specified")
            if kwargs.get("tokenizer_class") is None:
                raise ValueError("tokenizer_class must be specified")
        return _from_pretrained(**kwargs)

    def _tokenize(self, text):
        """Split *text* into word tokens and then into subword tokens.

        Each stage is skipped when the corresponding ``do_word_tokenize`` /
        ``do_subword_tokenize`` flag is false.
        """
        if self.do_word_tokenize:
            tokens = self.word_tokenizer.tokenize(
                text, never_split=self.all_special_tokens
            )
        else:
            tokens = [text]

        if self.do_subword_tokenize:
            if self.do_subword_by_word:
                split_tokens = [
                    sub_token
                    for token in tokens
                    for sub_token in self.subword_tokenizer.tokenize(token)
                ]
            else:
                # Subword-tokenize the whole input at once, words joined by spaces.
                split_tokens = self.subword_tokenizer.tokenize(" ".join(tokens))
        else:
            split_tokens = tokens

        return split_tokens

    def convert_tokens_to_string(self, tokens: List[str]):
        """Convert a sequence of tokens into a single string.

        ``"character"`` and ``"wordpiece"`` delegate to the parent class;
        ``"sentencepiece"`` decodes with the sentencepiece model.

        Raises:
            NotImplementedError: If ``subword_tokenizer_type`` is none of the above.
        """
        if self.subword_tokenizer_type in ["character", "wordpiece"]:
            return super().convert_tokens_to_string(tokens)
        if self.subword_tokenizer_type == "sentencepiece":
            return self.subword_tokenizer.sp_model.decode(tokens)
        # Report the type name: ``self.subword_tokenizer`` may not be set at
        # all on this path, which would turn this error into an AttributeError.
        raise NotImplementedError(
            f"{self.subword_tokenizer_type} is not allowed for convert_tokens_to_string"
        )