Source code for jptranstokenizer.mainword.base

import unicodedata
from abc import ABC, abstractmethod
from typing import Any, Dict, List


class MainTokenizerABC(ABC):
    """Abstract tokenizer class for main word division.

    This base class only stores the preprocessing options; concrete
    subclasses implement :meth:`tokenize`.

    Args:
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the input when tokenizing.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization to text before tokenization.
    """

    def __init__(
        self, do_lower_case: bool = False, normalize_text: bool = True
    ) -> None:
        self.do_lower_case = do_lower_case
        self.normalize_text = normalize_text

    @abstractmethod
    def tokenize(self, text: str, **kwargs: Any) -> List[str]:
        """Divide the sequence into words.

        Args:
            text (``str``):
                A sequence to be divided.
            **kwargs:
                Extra keyword options; each implementation decides which
                ones it uses. The annotation applies to every keyword
                *value*, hence ``Any`` rather than a dict type.

        Returns:
            ``List[str]``: The words of ``text``.
        """
class Normalizer(MainTokenizerABC):
    """A main word tokenizer, which only normalizes and lowercases the text.

    No word division is performed: the whole (preprocessed) input is
    returned as a single token.

    Args:
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the input when tokenizing.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization (NFKC) to text before
            lowercasing.
    """

    def __init__(
        self, do_lower_case: bool = False, normalize_text: bool = True
    ) -> None:
        super().__init__(do_lower_case=do_lower_case, normalize_text=normalize_text)

    def tokenize(self, text: str, **kwargs: Any) -> List[str]:
        """Normalize and lowercase the text without splitting it.

        May be used as a dummy main tokenizer. Other kwargs (such as
        *never_split*) are accepted but ignored.

        Args:
            text (``str``):
                A sequence to be encoded.
            **kwargs:
                Ignored.

        Returns:
            ``List[str]``: A one-element list holding the processed text.
        """
        # Normalization runs first, so lowercasing sees the NFKC form.
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)
        if self.do_lower_case:
            text = text.lower()
        return [text]