Source code for jptranstokenizer.mainword.juman

import re
import unicodedata
from typing import Any, Dict, List

from .base import MainTokenizerABC


class JumanTokenizer(MainTokenizerABC):
    """Tokenizer to split into words using Juman.

    Juman++ and pyknp are required to use.
    You can import this module shortly:

    .. code-block:: none

       >> from jptranstokenizer.mainword import JumanTokenizer

    Args:
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the input when tokenizing.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization to text before tokenization.
        ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
            Whether or not to ignore error of max bytes
            (only valid with Juman and Sudachi).
            If valid, the tokenizer returns an empty list for inputs whose
            UTF-8 encoding is longer than 4096 bytes.

    .. seealso::
        - Juman++ https://github.com/ku-nlp/jumanpp
        - pyknp https://github.com/ku-nlp/pyknp
    """

    # Inputs whose UTF-8 encoding exceeds this many bytes are skipped when
    # ``ignore_max_byte_error`` is enabled.
    _MAX_INPUT_BYTES = 4096

    def __init__(
        self,
        do_lower_case: bool = False,
        normalize_text: bool = True,
        ignore_max_byte_error: bool = False,
    ):
        """Store the options and create the underlying ``pyknp.Juman`` object.

        Raises:
            ModuleNotFoundError: If pyknp is not installed.
        """
        super().__init__(do_lower_case=do_lower_case, normalize_text=normalize_text)
        self.ignore_max_byte_error = ignore_max_byte_error
        # pyknp is imported lazily so that this module can be imported
        # even when pyknp is not installed.
        try:
            from pyknp import Juman
        except ModuleNotFoundError as error:
            raise error.__class__(
                "You need to install pyknp to use JumanTokenizer. "
                "See https://github.com/ku-nlp/pyknp for installation."
            ) from error
        self.juman = Juman()

    def tokenize(self, text: str, **kwargs: Dict[str, Any]) -> List[str]:
        """Converts a string in a sequence of words.

        Other kwargs (such as *never_split*) are ignored.

        Args:
            text (``str``):
                A sequence to be encoded.

        Returns:
            ``List[str]``: A list of words. An empty list is returned when
            ``ignore_max_byte_error`` is set and the (normalized) text is
            longer than 4096 bytes in UTF-8.

        Raises:
            RuntimeError: If Juman fails to analyze the text.
        """
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        # "#" and "@" at the beginning of a sentence cause a timeout error in
        # Juman, so the leading character is swapped for its full-width form
        # (U+FF03 / U+FF20). This happens after NFKC normalization on purpose:
        # normalization would fold the full-width form back to ASCII.
        # The leading token is returned in its full-width form.
        if text.startswith("#"):
            text = "\uff03" + text[1:]
        elif text.startswith("@"):
            text = "\uff20" + text[1:]

        if self.ignore_max_byte_error and len(text.encode()) > self._MAX_INPUT_BYTES:
            return []

        restore_quote = False
        drop_underscore = False
        try:
            result = self.juman.analysis(text)
        except ValueError:
            # This error occurs because of how Juman handles spaces (and
            # double quotes): substitute them and retry once.
            if '"' in text:
                text = text.replace('"', "”")
                restore_quote = True
            if re.search(r"\s", text):
                text = re.sub(r"\s", "_", text)
                drop_underscore = True
            try:
                result = self.juman.analysis(text)
            except Exception as error:
                raise RuntimeError(
                    f"Juman failed to analyze the text: {text!r}"
                ) from error
        except Exception as error:
            # Report the offending text to the caller instead of terminating
            # the whole process from inside library code.
            raise RuntimeError(
                f"Juman failed to analyze the text: {text!r}"
            ) from error

        tokens = [mrph.midasi for mrph in result]
        if self.do_lower_case:
            tokens = [token.lower() for token in tokens]
        if drop_underscore:
            # Drop the placeholder tokens that stood in for whitespace.
            # NOTE: a literal "_" token from the input is dropped as well.
            tokens = [token for token in tokens if token != "_"]
        if restore_quote:
            # Undo the quote substitution made before the retry.
            # NOTE: a "”" that was already in the input is converted too.
            tokens = [token.replace("”", '"') for token in tokens]
        return tokens