import unicodedata
from typing import Any, Dict, List, Optional
from .base import MainTokenizerABC
# cf. https://pypi.org/project/SudachiTra/
# cf. https://github.com/WorksApplications/SudachiTra/blob/main/sudachitra/tokenization_bert_sudachipy.py
# cf. https://github.com/WorksApplications/SudachiTra/blob/main/sudachitra/sudachipy_word_tokenizer.py
class SudachiTokenizer(MainTokenizerABC):
    """Tokenizer to split into words using Sudachi.

    SudachiTra is required to use this tokenizer.
    For installation of SudachiTra, see https://pypi.org/project/SudachiTra/

    You can import this module shortly:

    .. code-block:: none

       >>> from jptranstokenizer.mainword import SudachiTokenizer

    Args:
        split_mode (``str``, *optional*, defaults to ``"A"``):
            The mode of splitting. ``"A"``, ``"B"``, or ``"C"`` can be specified.
            For detail, see: `Sudachi#The modes of splitting <https://github.com/WorksApplications/Sudachi#the-modes-of-splitting>`_ or `Sudachi#分割モード <https://github.com/WorksApplications/Sudachi#%E5%88%86%E5%89%B2%E3%83%A2%E3%83%BC%E3%83%89>`_
        config_path (``str``, *optional*):
            Path to a config file of SudachiPy to be used for the sudachi dictionary initialization.
        resource_dir (``str``, *optional*):
            Path to a resource dir containing resource files, such as ``"sudachi.json"``.
        dict_type (``str``, *optional*, defaults to ``"core"``):
            Sudachi dictionary type to be used for tokenization.
            ``"small"``, ``"core"``, or ``"full"`` can be specified.
            For detail, see: `Sudachi#Dictionaries <https://github.com/WorksApplications/Sudachi#dictionaries>`_ or `Sudachi#辞書の取得 <https://github.com/WorksApplications/Sudachi#%E8%BE%9E%E6%9B%B8%E3%81%AE%E5%8F%96%E5%BE%97>`_
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the tokens produced by the tokenizer.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization (NFKC) to text before tokenization.
        ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
            Whether or not to ignore error of max bytes (only valid with Juman and Sudachi).
            If enabled, :meth:`tokenize` returns an empty list for input whose
            UTF-8 encoding is longer than 49149 bytes instead of passing it to Sudachi.

    Raises:
        ModuleNotFoundError: If ``sudachitra`` is not installed.

    .. seealso::
        - SudachiTra https://github.com/WorksApplications/SudachiTra
        - Sudachi https://github.com/WorksApplications/Sudachi
    """

    # Byte-length cutoff applied when ``ignore_max_byte_error`` is enabled.
    # NOTE(review): appears to mirror Sudachi's maximum accepted input size
    # in UTF-8 bytes -- confirm against the Sudachi version in use.
    _MAX_INPUT_BYTES = 49149

    def __init__(
        self,
        split_mode: Optional[str] = "A",
        config_path: Optional[str] = None,
        resource_dir: Optional[str] = None,
        dict_type: Optional[str] = "core",
        do_lower_case: bool = False,
        normalize_text: bool = True,
        ignore_max_byte_error: bool = False,
    ):
        super().__init__(do_lower_case=do_lower_case, normalize_text=normalize_text)
        self.ignore_max_byte_error = ignore_max_byte_error
        # Imported lazily so that the package can be used without SudachiTra
        # as long as this tokenizer is never instantiated.
        try:
            from sudachitra.sudachipy_word_tokenizer import SudachipyWordTokenizer
            from sudachitra.word_formatter import word_formatter
        except ModuleNotFoundError as error:
            # Re-raise with installation guidance while keeping the missing
            # module's name/path and the original exception as the cause.
            raise ModuleNotFoundError(
                "You need to install sudachitra to use SudachipyWordTokenizer. "
                "See https://pypi.org/project/SudachiTra/ for installation.",
                name=error.name,
                path=error.path,
            ) from error
        self.sudachi_tokenizer = SudachipyWordTokenizer(
            split_mode=split_mode,
            config_path=config_path,
            resource_dir=resource_dir,
            dict_type=dict_type,
        )
        # Callable applied to each item yielded by the Sudachi tokenizer;
        # configured with the "surface" word form.
        self.word_formatter = word_formatter(
            "surface", self.sudachi_tokenizer.sudachi_dict
        )

    def tokenize(self, text: str, **kwargs: Any) -> List[str]:
        """Converts a string in a sequence of words.

        Other kwargs (such as *never_split*) are ignored.

        Args:
            text (``str``): A sequence to be encoded.

        Returns:
            ``List[str]``: A list of words. The list is empty when
            ``ignore_max_byte_error`` is enabled and the (optionally
            normalized) text is longer than 49149 bytes in UTF-8.
        """
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        # The length check runs on the normalized text because that is what
        # would be handed to Sudachi.
        if (
            self.ignore_max_byte_error
            and len(text.encode("utf-8")) > self._MAX_INPUT_BYTES
        ):
            return []

        tokens: List[str] = [
            self.word_formatter(token)
            for token in self.sudachi_tokenizer.tokenize(text)
        ]
        if self.do_lower_case:
            tokens = [token.lower() for token in tokens]
        return tokens