Source code for jptranstokenizer.mainword.sudachi

import unicodedata
from typing import Any, Dict, List, Optional

from .base import MainTokenizerABC

# cf. https://pypi.org/project/SudachiTra/
# cf. https://github.com/WorksApplications/SudachiTra/blob/main/sudachitra/tokenization_bert_sudachipy.py
# cf. https://github.com/WorksApplications/SudachiTra/blob/main/sudachitra/sudachipy_word_tokenizer.py


[docs]class SudachiTokenizer(MainTokenizerABC):
    """Tokenizer to split into words using Sudachi.
    SudachiTra is required to use.
    For installation of SudachiTra, see https://pypi.org/project/SudachiTra/
    You can import this module shortly:

    .. code-block:: none

       >> from jptranstokenizer.mainword import SudachiTokenizer

    Args:
        split_mode (``str``, *optional*, defaults to ``"A"``):
            The mode of splitting. ``"A"``, ``"B"``, or ``"C"`` can be specified.
            For detail, see: `Sudachi#The modes of splitting <https://github.com/WorksApplications/Sudachi#the-modes-of-splitting>`_ or `Sudachi#分割モード <https://github.com/WorksApplications/Sudachi#%E5%88%86%E5%89%B2%E3%83%A2%E3%83%BC%E3%83%89>`_
        config_path (``str``, *optional*):
            Path to a config file of SudachiPy to be used for the sudachi dictionary initialization.
        resource_dir (``str``, *optional*):
            Path to a resource dir containing resource files, such as ``"sudachi.json"``.
        dict_type (``str``, *optional*, defaults to ``"core"``):
            Sudachi dictionary type to be used for tokenization.
            ``"small"``, ``"core"``, or ``"full"`` can be specified.
            For detail, see: `Sudachi#Dictionaries <https://github.com/WorksApplications/Sudachi#dictionaries>`_ or `Sudachi#辞書の取得 <https://github.com/WorksApplications/Sudachi#%E8%BE%9E%E6%9B%B8%E3%81%AE%E5%8F%96%E5%BE%97>`_
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the input when tokenizing.Defaults to None.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization to text before tokenization.
        ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
            Whether or not to ignore error of max bytes (only valid with Juman and Sudachi).
            If valid, the tokenizer return empty list.

    .. seealso::
        - SudachiTra https://github.com/WorksApplications/SudachiTra
        - Sudachi https://github.com/WorksApplications/Sudachi
    """

    def __init__(
        self,
        split_mode: Optional[str] = "A",
        config_path: Optional[str] = None,
        resource_dir: Optional[str] = None,
        dict_type: Optional[str] = "core",
        do_lower_case: bool = False,
        normalize_text: bool = True,
        ignore_max_byte_error: bool = False,
    ):

        super().__init__(do_lower_case=do_lower_case, normalize_text=normalize_text)
        self.ignore_max_byte_error = ignore_max_byte_error
        try:
            from sudachitra.sudachipy_word_tokenizer import SudachipyWordTokenizer
            from sudachitra.word_formatter import word_formatter
        except ModuleNotFoundError as error:
            raise error.__class__(
                "You need to install sudachitra to use SudachipyWordTokenizer."
                "See https://pypi.org/project/SudachiTra/ for installation."
            )
        self.sudachi_tokenizer = SudachipyWordTokenizer(
            split_mode=split_mode,
            config_path=config_path,
            resource_dir=resource_dir,
            dict_type=dict_type,
        )
        self.word_formatter = word_formatter(
            "surface", self.sudachi_tokenizer.sudachi_dict
        )

[docs]    def tokenize(self, text: str, **kwargs: Dict[str, Any]) -> List[str]:
        """Converts a string in a sequence of words.
        Other kwargs (such as *never_split*) are ignored.

        Args:
            text (``str``): A sequence to be encoded.

        Returns:
            ``List[str]``: A list of words.
        """
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)
        tokens: List[str]
        if self.ignore_max_byte_error and len(text.encode()) > 49149:
            tokens = []
        else:
            tokens = [
                self.word_formatter(token)
                for token in self.sudachi_tokenizer.tokenize(text)
            ]
            if self.do_lower_case:
                tokens = [token.lower() for token in tokens]
        return tokens