Source code for jptranstokenizer.mainword.spacy_luw

import unicodedata
from typing import Any, Dict, List

from .base import MainTokenizerABC


class SpacyluwTokenizer(MainTokenizerABC):
    """Tokenizer to split text into words using ja_gsdluw in spaCy.

    spaCy and ja_gsdluw are required to use this tokenizer.
    For installation, see `spaCy <https://pypi.org/project/spacy/>`_ and
    `ja_gsdluw <https://github.com/megagonlabs/UD_Japanese-GSD/releases/tag/r2.9-NE>`_

    You can import this module shortly:

    .. code-block:: none

       >>> from jptranstokenizer.mainword import SpacyluwTokenizer

    Args:
        do_lower_case (``bool``, *optional*, defaults to ``False``):
            Whether or not to lowercase each token after tokenization.
        normalize_text (``bool``, *optional*, defaults to ``True``):
            Whether to apply unicode normalization (NFKC) to text
            before tokenization.

    Raises:
        ModuleNotFoundError: If spaCy cannot be imported.
        OSError: If the ``ja_gsdluw`` pipeline cannot be loaded.

    .. seealso::
        - spaCy https://github.com/explosion/spaCy
        - megagonlabs/UD_Japanese-GSD https://github.com/megagonlabs/UD_Japanese-GSD
        - ja_gsdluw https://github.com/megagonlabs/UD_Japanese-GSD/releases/tag/r2.9-NE
    """

    def __init__(self, do_lower_case: bool = False, normalize_text: bool = True):
        super().__init__(do_lower_case=do_lower_case, normalize_text=normalize_text)
        # spaCy is imported lazily so that the package can be imported
        # without this optional dependency being installed.
        try:
            import spacy
        except ModuleNotFoundError as error:
            # Re-raise the same exception type with installation guidance,
            # chaining the original error explicitly as the cause.
            raise error.__class__(
                "You need to install spacy to use SpacyluwTokenizer.\n"
                "See https://pypi.org/project/spacy/ for spacy installation.\n"
                "Also you would need to install ja_gsdluw "
                "https://github.com/megagonlabs/UD_Japanese-GSD/releases/tag/r2.9-NE "
                "for ja_gsdluw installation."
            ) from error
        try:
            self.nlp = spacy.load("ja_gsdluw")
        except OSError as error:
            # Same approach as above: keep the exception type, add guidance.
            raise error.__class__(
                "You need to install ja_gsdluw to use SpacyluwTokenizer.\n"
                "See https://github.com/megagonlabs/UD_Japanese-GSD/releases/tag/r2.9-NE "
                "for installation."
            ) from error

    def tokenize(self, text: str, **kwargs: Any) -> List[str]:
        """Converts a string into a sequence of words.

        Other kwargs (such as *never_split*) are ignored.

        Args:
            text (``str``):
                A sequence to be encoded.

        Returns:
            ``List[str]``: A list of words.
        """
        # NOTE(review): ``normalize_text`` / ``do_lower_case`` are read from
        # the instance; presumably stored by the base-class initializer.
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)
        doc = self.nlp(text)
        # Walk the document sentence by sentence and collect each token's
        # surface text.
        tokens = [token.text for sent in doc.sents for token in sent]
        if self.do_lower_case:
            tokens = [token.lower() for token in tokens]
        return tokens