Source code for jptranstokenizer.mainword.juman

import re
import unicodedata
from typing import Any, Dict, List

from .base import MainTokenizerABC


class JumanTokenizer(MainTokenizerABC):
    """Tokenizer to split into words using Juman.

    Juman++ and pyknp are required to use this tokenizer.
    You can import this module shortly:

    .. code-block:: none

       >>> from jptranstokenizer.mainword import JumanTokenizer

    Args:
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase the input when tokenizing.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization to text before tokenization.
        ignore_max_byte_error (``bool``, *optional*, defaults to ``False``):
            Whether or not to ignore error of max bytes (only valid with Juman
            and Sudachi). When enabled, text longer than 4096 UTF-8 bytes is
            not analyzed and the tokenizer returns an empty list.

    Raises:
        ModuleNotFoundError: If pyknp is not installed.

    .. seealso::
        - Juman++ https://github.com/ku-nlp/jumanpp
        - pyknp https://github.com/ku-nlp/pyknp
    """

    # Upper bound (in UTF-8 bytes) on the text handed to Juman when
    # ``ignore_max_byte_error`` is enabled.
    _MAX_INPUT_BYTES = 4096

    def __init__(
        self,
        do_lower_case: bool = False,
        normalize_text: bool = True,
        ignore_max_byte_error: bool = False,
    ):
        super().__init__(do_lower_case=do_lower_case, normalize_text=normalize_text)
        self.ignore_max_byte_error = ignore_max_byte_error
        # pyknp is an optional dependency, so it is imported lazily here.
        try:
            from pyknp import Juman
        except ModuleNotFoundError as error:
            raise ModuleNotFoundError(
                "You need to install pyknp to use JumanTokenizer. "
                "See https://github.com/ku-nlp/pyknp for installation."
            ) from error
        self.juman = Juman()

    def tokenize(self, text: str, **kwargs: Dict[str, Any]) -> List[str]:
        """Converts a string in a sequence of words.
        Other kwargs (such as *never_split*) are ignored.

        Args:
            text (``str``): A sequence to be encoded.

        Returns:
            ``List[str]``: A list of words. The list is empty when
            ``ignore_max_byte_error`` is enabled and the (normalized) text is
            longer than 4096 UTF-8 bytes.

        Raises:
            Exception: Any error raised by Juman's analysis is propagated to
                the caller, except for a first ``ValueError``, which triggers
                one retry with double quotes and whitespace substituted.
        """
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)
        # "#" and "@" at the beginning of a sentence causes timeout error,
        # so a leading one is swapped for its full-width form.
        if text.startswith("#"):
            text = "＃" + text[1:]
        elif text.startswith("@"):
            text = "＠" + text[1:]

        if self.ignore_max_byte_error and len(text.encode()) > self._MAX_INPUT_BYTES:
            return []

        restore_quote = False
        drop_underscore = False
        try:
            result = self.juman.analysis(text)
        except ValueError:
            # Per the original note this error comes from Juman's handling of
            # spaces. Retry once with double quotes and whitespace replaced by
            # stand-in characters, which are undone on the tokens below.
            if '"' in text:
                text = text.replace('"', "”")
                restore_quote = True
            if re.search(r"\s", text):
                text = re.sub(r"\s", "_", text)
                drop_underscore = True
            # A failure of the retry is raised to the caller instead of
            # terminating the whole process.
            result = self.juman.analysis(text)

        tokens = [mrph.midasi for mrph in result]
        if self.do_lower_case:
            tokens = [token.lower() for token in tokens]
        if drop_underscore:
            # NOTE: this also drops "_" tokens that were in the original text.
            tokens = [token for token in tokens if token != "_"]
        if restore_quote:
            # NOTE: this also turns "”" from the original text into '"'.
            tokens = [token.replace("”", '"') for token in tokens]
        return tokens