Source code for texar.torch.data.tokenizers.sentencepiece_tokenizer

# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SentencePiece Tokenizer.
"""

from typing import Any, Dict, List, Optional, Tuple

import os
from shutil import copyfile, move

import sentencepiece as spm

from texar.torch.data.tokenizers.tokenizer_base import TokenizerBase
from texar.torch.modules.pretrained.pretrained_base import default_download_dir
from texar.torch.utils.utils_io import maybe_create_dir

__all__ = [
    "SentencePieceTokenizer"
]


class SentencePieceTokenizer(TokenizerBase):
    r"""SentencePiece Tokenizer.

    This class is a wrapper of Google's `SentencePiece`_ with richer
    ready-to-use functionalities such as adding tokens and saving/loading.

    `SentencePiece` is an unsupervised text tokenizer mainly for Neural
    Network-based text generation systems where the vocabulary size is
    predetermined prior to the neural model training. `SentencePiece`
    implements sub-word units (e.g., byte-pair-encoding (BPE) and unigram
    language model) with the extension of direct training from raw sentences.

    The supported algorithms in `SentencePiece` are: ``bpe``, ``word``,
    ``char``, and ``unigram``; the algorithm to use is specified in
    :attr:`hparams`.

    Args:
        cache_dir (optional): the path to a folder in which the trained
            `sentencepiece` model will be cached. If `None` (default),
            a default directory (``texar_data`` folder under the user's home
            directory) will be used.
        hparams (dict or HParams, optional): Hyperparameters. Missing
            hyperparameters will be set to default values. See
            :meth:`default_hparams` for the hyperparameter structure and
            default values.

    .. _`SentencePiece`: https://github.com/google/sentencepiece
    """

    _IS_PRETRAINED = False
    _VOCAB_FILE_NAMES = {
        'vocab_file': 'spiece.model',
    }
    _TRAIN_ARG_MAP = {
        'text_file': 'input',
        'model_type': 'model_type',
        'vocab_size': 'vocab_size',
        'bos_token': 'bos_piece',
        'eos_token': 'eos_piece',
        'unk_token': 'unk_piece',
        'pad_token': 'pad_piece',
    }

    def __init__(self, cache_dir: Optional[str] = None, hparams=None):
        super().__init__(hparams=hparams)

        self.__dict__: Dict

        if self.hparams['vocab_file'] is not None:
            # Load an existing sentencepiece model and read back the special
            # tokens it was trained with (an id of -1 means the piece is
            # disabled).
            self.vocab_file = self.hparams['vocab_file']
            self.sp_model = spm.SentencePieceProcessor()
            self.sp_model.Load(self.vocab_file)

            bos_id = self.sp_model.bos_id()
            eos_id = self.sp_model.eos_id()
            unk_id = self.sp_model.unk_id()
            pad_id = self.sp_model.pad_id()

            self.bos_token = None
            if bos_id != -1:
                self.bos_token = self.sp_model.IdToPiece(bos_id)

            self.eos_token = None
            if eos_id != -1:
                self.eos_token = self.sp_model.IdToPiece(eos_id)

            self.unk_token = None
            if unk_id != -1:
                self.unk_token = self.sp_model.IdToPiece(unk_id)

            self.pad_token = None
            if pad_id != -1:
                self.pad_token = self.sp_model.IdToPiece(pad_id)
        elif self.hparams['text_file'] is not None:
            # Train a new sentencepiece model from the raw text file,
            # translating hyperparameters into command-line flags.
            cmd = ['--model_prefix=spiece']
            for arg, val in self.hparams.items():
                if arg in self._TRAIN_ARG_MAP:
                    if arg in self._SPECIAL_TOKENS_ATTRIBUTES and val is None:
                        cmd.append('--' + arg.replace('token', 'id') + '=-1')
                    else:
                        cmd.append(
                            '--' + self._TRAIN_ARG_MAP[arg] + '=' + str(val))

            cache_path = self.train(" ".join(cmd), cache_dir)

            self.vocab_file = os.path.join(
                cache_path, self._VOCAB_FILE_NAMES['vocab_file'])
            self.sp_model = spm.SentencePieceProcessor()
            self.sp_model.Load(self.vocab_file)
        else:
            raise ValueError("'vocab_file' and 'text_file' can not be None "
                             "at the same time.")
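
    # Usage sketch: the two construction paths handled by ``__init__`` above.
    # The file paths and vocabulary size below are hypothetical.
    #
    #     # Load a previously trained sentencepiece model:
    #     tokenizer = SentencePieceTokenizer(
    #         hparams={'vocab_file': 'spiece.model'})
    #
    #     # Or train a new model directly from a raw text corpus:
    #     tokenizer = SentencePieceTokenizer(
    #         hparams={'text_file': 'data/train.txt', 'vocab_size': 8000,
    #                  'model_type': 'bpe'})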

    @classmethod
    def train(cls, cmd: str,  # type: ignore
              cache_dir: Optional[str] = None) -> str:
        r"""Trains the tokenizer from the raw text file. This function is a
        wrapper of the `sentencepiece.SentencePieceTrainer.Train`_ function.

        Example:

        .. code-block:: python

            SentencePieceTokenizer.train(
                '--input=test/botchan.txt --model_prefix=spiece '
                '--vocab_size=1000')

        Args:
            cmd (str): the command for the tokenizer training procedure. See
                ``sentencepiece.SentencePieceTrainer.Train`` for the detailed
                usage.
            cache_dir (optional): the path to a folder in which the trained
                `sentencepiece` model will be cached. If `None` (default),
                a default directory (`texar_pytorch` folder under the user's
                home directory) will be used.

        Returns:
            Path to the cache directory.

        .. _`sentencepiece.SentencePieceTrainer.Train`:
            https://github.com/google/sentencepiece/blob/master/python/sentencepiece.py
        """
        if cache_dir is None:
            cache_path = str(default_download_dir('SentencePiece'))
        else:
            if not os.path.isdir(cache_dir):
                raise ValueError(f"Cache directory ({cache_dir}) should be a "
                                 f"directory.")
            cache_path = os.path.abspath(cache_dir)

        maybe_create_dir(cache_path)

        spm.SentencePieceTrainer.Train(cmd)
        cwd = os.getcwd()

        # The trained model is written to the current working directory;
        # move it into the cache directory.
        vocab_file = os.path.join(cwd, cls._VOCAB_FILE_NAMES['vocab_file'])
        out_vocab_file = os.path.join(
            cache_path, cls._VOCAB_FILE_NAMES['vocab_file'])

        if os.path.abspath(vocab_file) != os.path.abspath(out_vocab_file):
            move(vocab_file, out_vocab_file)

        # Delete spiece.vocab (we might want to keep it as well).
        extra_file = vocab_file.rstrip('model') + 'vocab'
        os.remove(extra_file)

        return cache_path
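
    # ``train`` can also be pointed at an explicit cache directory. A sketch,
    # assuming ``data/train.txt`` exists and ``./sp_cache`` is an existing
    # directory; ``--model_prefix=spiece`` makes the output file name match
    # ``_VOCAB_FILE_NAMES['vocab_file']`` so it is moved into the cache:
    #
    #     cache_path = SentencePieceTokenizer.train(
    #         '--input=data/train.txt --model_prefix=spiece '
    #         '--vocab_size=1000', cache_dir='./sp_cache')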

    # spm.SentencePieceProcessor() is a SwigPyObject object which cannot be
    # pickled. We need to define __getstate__ here.
    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["vocab_file"] = None
        return state, self.vocab_file

    # spm.SentencePieceProcessor() is a SwigPyObject object which cannot be
    # pickled. We need to define __setstate__ here.
    def __setstate__(self, d):
        self.__dict__, self.vocab_file = d
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)
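
    # Because ``__getstate__``/``__setstate__`` drop the SWIG-backed
    # ``sp_model`` and reload it from ``vocab_file``, instances can be
    # pickled, e.g. when used inside DataLoader worker processes. A minimal
    # sketch, assuming ``tokenizer`` was constructed as above:
    #
    #     import pickle
    #     restored = pickle.loads(pickle.dumps(tokenizer))
    #     assert restored.vocab_size == tokenizer.vocab_size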

    def save_vocab(self, save_dir: str) -> Tuple[str]:
        r"""Save the sentencepiece vocabulary (copy original file) to
        a directory.
        """
        if not os.path.isdir(save_dir):
            raise ValueError("Vocabulary path ({}) should be a "
                             "directory".format(save_dir))

        out_vocab_file = os.path.join(
            save_dir, self._VOCAB_FILE_NAMES['vocab_file'])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
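
    # ``save_vocab`` only copies the trained ``spiece.model`` file, so a
    # tokenizer can later be re-created from the saved copy. A sketch,
    # assuming ``./saved_tok`` is an existing directory:
    #
    #     (vocab_path,) = tokenizer.save_vocab('./saved_tok')
    #     reloaded = SentencePieceTokenizer(
    #         hparams={'vocab_file': vocab_path})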

    @property
    def vocab_size(self) -> int:
        return len(self.sp_model)

    def _map_text_to_token(self, text: str) -> List[str]:  # type: ignore
        return self.sp_model.EncodeAsPieces(text)

    def _map_token_to_id(self, token: str) -> int:
        return self.sp_model.PieceToId(token)

    def _map_id_to_token(self, index: int) -> str:
        token = self.sp_model.IdToPiece(index)
        return token

    def map_token_to_text(self, tokens: List[str]) -> str:
        r"""Maps a sequence of tokens (string) to a single string."""
        out_string = self.sp_model.DecodePieces(tokens)
        return out_string
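
    # Round trip between text, sub-word tokens, and back, using
    # ``map_token_to_text`` above together with the piece-level mapping
    # methods. The tokenization shown is illustrative only:
    #
    #     tokens = tokenizer._map_text_to_token('Hello world')
    #     # e.g. ['▁Hello', '▁world']
    #     ids = [tokenizer._map_token_to_id(t) for t in tokens]
    #     text = tokenizer.map_token_to_text(tokens)   # 'Hello world'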

    @staticmethod
    def default_hparams() -> Dict[str, Any]:
        r"""Returns a dictionary of hyperparameters with default values.

        * If `hparams['vocab_file']` is specified, the tokenizer is directly
          loaded from the vocabulary file. In this case, all other
          configurations in `hparams` are ignored.
        * Otherwise, the tokenizer is automatically trained based on
          `hparams['text_file']`. In this case, `hparams['vocab_size']`
          must be specified.
        * `hparams['vocab_file']` and `hparams['text_file']` can not be
          None at the same time.

        .. code-block:: python

            {
                "vocab_file": None,
                "text_file": None,
                "vocab_size": None,
                "model_type": "unigram",
                "bos_token": "<s>",
                "eos_token": "</s>",
                "unk_token": "<unk>",
                "pad_token": "<pad>",
            }

        Here:

        `"vocab_file"`: str or None
            The path to a sentencepiece vocabulary file.

        `"text_file"`: str or None
            Comma-separated list of raw text files (one sentence per line)
            used to train the tokenizer.

        `"vocab_size"`: int or None
            Vocabulary size. The user can specify the vocabulary size, and
            the tokenizer training procedure will train and yield a
            vocabulary of the specified size.

        `"model_type"`: str
            Model algorithm used to train the tokenizer. Available algorithms
            are: ``bpe``, ``word``, ``char``, and ``unigram``.

        `"bos_token"`: str or None
            Beginning-of-sentence token. Set to `None` to disable
            ``bos_token``.

        `"eos_token"`: str or None
            End-of-sentence token. Set to `None` to disable ``eos_token``.

        `"unk_token"`: str or None
            Unknown token. Set to `None` to disable ``unk_token``.

        `"pad_token"`: str or None
            Padding token. Set to `None` to disable ``pad_token``.
        """
        return {
            'vocab_file': None,
            'text_file': None,
            'vocab_size': None,
            'model_type': 'unigram',
            'bos_token': '<s>',
            'eos_token': '</s>',
            'pad_token': '<pad>',
            'unk_token': '<unk>',
        }
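
    # Any of the special tokens above can be disabled by overriding its
    # default with ``None``; ``__init__`` then passes ``--bos_id=-1`` (etc.)
    # to the sentencepiece trainer. A sketch with a hypothetical corpus:
    #
    #     tokenizer = SentencePieceTokenizer(
    #         hparams={'text_file': 'data/train.txt', 'vocab_size': 8000,
    #                  'bos_token': None, 'eos_token': None})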

    @classmethod
    def _transform_config(cls, pretrained_model_name: str,
                          cache_dir: str) -> Dict[str, Any]:
        pass

    def _init_from_checkpoint(self, pretrained_model_name: str,
                              cache_dir: str, **kwargs):
        pass