from typing import Any, Dict, Optional, List
SPIECE_UNDERLINE = "▁"
[docs]class SentencepieceTokenizer:
"""Runs sentencepiece tokenization.
You can import this module shortly:
.. code-block:: none
>> from jptranstokenizer.subword import SentencepieceTokenizer
Args:
vocab_file (``str``):
The sentencepiece model file path.
sp_model_kwargs (``Dict[str, Any]``, *optional*):
Arguments of dict to pass ``sentencepiece.SentencePieceProcessor``.
sp_model (``sentencepiece.SentencePieceProcessor``, *optional*):
Already trained ``SentencePieceProcessor`` model.
"""
def __init__(
self,
vocab_file: Optional[str] = None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
sp_model: Optional[Any] = None,
):
if vocab_file is None and sp_model is None:
raise ValueError("vocab_file or sp_model must be specified")
try:
import sentencepiece as sp
except ModuleNotFoundError as error:
raise error.__class__(
"You need to install sentencepiece to use SentencepieceTokenizer."
"See https://github.com/google/sentencepiece for installation."
)
self.sp_model: sp.SentencePieceProcessor
if sp_model is None:
import sentencepiece as sp
self.sp_model_kwargs: Dict[str, Any] = (
{} if sp_model_kwargs is None else sp_model_kwargs
)
self.sp_model = sp.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
else:
self.sp_model = sp_model
self.bpe_vocab_size: int = self.sp_model.GetPieceSize()
self.vocab: Dict[str, int] = {
self.sp_model.IdToPiece(i): i for i in range(self.bpe_vocab_size)
}
[docs] def tokenize(self, text: str) -> List[str]:
"""Converts a string in a sequence of tokens.
Args:
text (``str``): A single token to be encoded.
Returns:
``List[str]``: A list of sentencepiece tokens.
"""
pieces: List[str] = self.sp_model.encode(text, out_type=str)
tokens: List[str] = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(
piece[:-1].replace(SPIECE_UNDERLINE, "")
)
if (
piece[0] != SPIECE_UNDERLINE
and cur_pieces[0][0] == SPIECE_UNDERLINE
):
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
tokens.extend(cur_pieces)
else:
tokens.append(piece)
return tokens