# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Pre-trained T5 tokenizer.
"""
from typing import Any, Dict, Optional

import os
import re

from texar.torch.data.tokenizers.sentencepiece_tokenizer import \
    SentencePieceTokenizer
from texar.torch.modules.pretrained.t5 import PretrainedT5Mixin

__all__ = [
    'T5Tokenizer',
]


class T5Tokenizer(SentencePieceTokenizer, PretrainedT5Mixin):
r"""Pre-trained T5 Tokenizer.
Args:
pretrained_model_name (optional): a `str`, the name of
pre-trained model (e.g., `T5-Small`). Please refer to
:class:`~texar.torch.modules.PretrainedT5Mixin` for
all supported models.
If None, the model name in :attr:`hparams` is used.
cache_dir (optional): the path to a folder in which the
pre-trained models will be cached. If `None` (default),
a default directory (``texar_data`` folder under user's home
directory) will be used.
hparams (dict or HParams, optional): Hyperparameters. Missing
hyperparameters will be set to default values. See
:meth:`default_hparams` for the hyperparameter structure
and default values.
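
    Example:

        A minimal construction sketch. The checkpoint name follows the
        supported-model list; the vocabulary path in the second variant
        is a placeholder:

        .. code-block:: python

            # Load everything from a pre-trained checkpoint (downloaded
            # and cached on first use).
            tokenizer = T5Tokenizer(pretrained_model_name='T5-Small')

            # Or configure manually through ``hparams``.
            tokenizer = T5Tokenizer(hparams={
                'pretrained_model_name': None,
                'vocab_file': '/path/to/sentencepiece.model',
            })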
"""

    _IS_PRETRAINED = True
    _VOCAB_FILE_NAMES = {
        'vocab_file': 'sentencepiece.model',
    }
    _MAX_INPUT_SIZE = {
        'T5-Small': 512,
        'T5-Base': 512,
        'T5-Large': 512,
        'T5-3B': 512,
        'T5-11B': 512,
    }

    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):
        self.load_pretrained_config(pretrained_model_name, cache_dir, hparams)

        if self.pretrained_model_dir is not None:
            assert self.pretrained_model_name is not None
            vocab_file = os.path.join(self.pretrained_model_dir,
                                      self._VOCAB_FILE_NAMES['vocab_file'])
            if self._MAX_INPUT_SIZE.get(self.pretrained_model_name):
                self.max_len = self._MAX_INPUT_SIZE[
                    self.pretrained_model_name]
            setattr(self.hparams, 'vocab_file', vocab_file)
        else:
            if self.hparams.get('max_len'):
                self.max_len = self.hparams['max_len']

        # Add the `extra_ids` sentinel tokens to the special token list.
        additional_special_tokens = []
        extra_ids = self.hparams['extra_ids']
        if extra_ids > 0:
            additional_special_tokens.extend(
                ["<extra_id_{}>".format(i) for i in range(extra_ids)])
        setattr(self.hparams, 'additional_special_tokens',
                additional_special_tokens)

        # `load_pretrained_config` has already merged the configuration
        # into `self.hparams`, so no further hparams are passed up.
        super().__init__(hparams=None)

    @staticmethod
    def default_hparams() -> Dict[str, Any]:
r"""Returns a dictionary of hyperparameters with default values.
* The tokenizer is determined by the constructor argument
:attr:`pretrained_model_name` if it's specified. In this case,
`hparams` are ignored.
* Otherwise, the tokenizer is determined by
`hparams['pretrained_model_name']` if it's specified. All other
configurations in `hparams` are ignored.
* If the above two are `None`, the tokenizer is defined by the
configurations in `hparams`.
.. code-block:: python
{
"pretrained_model_name": "T5-Small",
"vocab_file": None,
"max_len": 512,
"bos_token": None,
"eos_token": "</s>",
"unk_token": "<unk>",
"pad_token": "<pad>",
"extra_ids": 100,
"additional_special_tokens": [],
"name": "t5_tokenizer",
}
Here:
`"pretrained_model_name"`: str or None
The name of the pre-trained T5 model.
`"vocab_file"`: str or None
The path to a sentencepiece vocabulary file.
`"max_len"`: int or None
The maximum sequence length that this model might ever be used with.
`"bos_token"`: str or None
Beginning of sentence token. Set None to disable ``bos_token``.
`"eos_token"`: str
End of sentence token. Set None to disable ``eos_token``.
`"unk_token"`: str
Unknown token. Set None to disable ``unk_token``.
`"pad_token"`: str
Padding token. Set None to disable ``pad_token``.
`"extra_ids"`: int
Add a number of extra ids added to the end of the vocabulary for
use as sentinels. These tokens are accessible as `<extra_id_{%d}>`
where `{%d}` is a number between 0 and extra_ids-1. Extra tokens
are indexed from the end of the vocabulary up to beginning
(<extra_id_0> is the last token in the vocabulary) (like in T5
preprocessing) see:
`https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117`
`"additional_special_tokens"`: list
A list of additional special tokens.
`"name"`: str
Name of the tokenizer.
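
        Example:

            A sketch of how the sentinel ids sit at the top of the id
            range. The concrete numbers assume the default
            ``extra_ids=100`` and a SentencePiece model with 32,000
            pieces; both are assumptions for illustration:

            .. code-block:: python

                tokenizer = T5Tokenizer(pretrained_model_name='T5-Small')
                tokenizer.vocab_size                         # 32000 + 100
                tokenizer._map_token_to_id('<extra_id_0>')   # vocab_size - 1
                tokenizer._map_token_to_id('<extra_id_99>')  # vocab_size - 100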
"""
        return {
            'pretrained_model_name': 'T5-Small',
            'vocab_file': None,
            'max_len': 512,
            'bos_token': None,
            'eos_token': '</s>',
            'unk_token': '<unk>',
            'pad_token': '<pad>',
            'extra_ids': 100,
            'additional_special_tokens': [],
            'name': 't5_tokenizer',
            '@no_typecheck': ['pretrained_model_name'],
        }

    @property
    def vocab_size(self) -> int:
        # The full vocabulary: SentencePiece pieces plus the `extra_ids`
        # sentinels appended at the end.
        return len(self.sp_model) + self.hparams['extra_ids']

    def _map_token_to_id(self, token: str) -> int:
        if token.startswith("<extra_id_"):
            # Sentinels occupy the top of the id range: <extra_id_0> maps
            # to `vocab_size - 1`, <extra_id_1> to `vocab_size - 2`, etc.
            match = re.match(r"<extra_id_(\d+)>", token)
            num = int(match.group(1))  # type: ignore
            return self.vocab_size - num - 1
        return self.sp_model.PieceToId(token)

    def _map_id_to_token(self, index: int) -> str:
        if index < self.sp_model.get_piece_size():
            token = self.sp_model.IdToPiece(index)
        else:
            # Ids beyond the SentencePiece range are sentinels; invert the
            # arithmetic used in `_map_token_to_id`.
            token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
        return token
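

if __name__ == "__main__":
    # A minimal round-trip sketch, not part of the library API. It assumes
    # the `T5-Small` checkpoint can be downloaded (or is already cached),
    # and calls the private mapping helpers defined above so the example
    # stays self-contained.
    tokenizer = T5Tokenizer(pretrained_model_name='T5-Small')

    sentinel_id = tokenizer._map_token_to_id('<extra_id_0>')
    assert sentinel_id == tokenizer.vocab_size - 1
    assert tokenizer._map_id_to_token(sentinel_id) == '<extra_id_0>'
    print('vocab size:', tokenizer.vocab_size)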