Source code for texar.torch.data.tokenizers.tokenizer_base

# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Base class for all tokenizers.

The code structure is adapted from:
    `https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/tokenization_utils.py`
"""

from typing import Any, Dict, List, Optional, Tuple, Union, overload

import json
import os
import warnings

from texar.torch.module_base import ModuleBase

__all__ = [
    "TokenizerBase",
]

SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
ADDED_TOKENS_FILE = 'added_tokens.json'
CONFIG_FILE = 'config.json'


class TokenizerBase(ModuleBase):
    r"""Base class inherited by all tokenizer classes. This class handles
    downloading and loading pre-trained tokenizers and adding tokens to the
    vocabulary.

    Derived classes can set up a few special tokens to be used in common
    scripts and internals: :attr:`bos_token`, :attr:`eos_token`,
    :attr:`unk_token`, :attr:`sep_token`, :attr:`pad_token`,
    :attr:`cls_token`, :attr:`mask_token`, and
    :attr:`additional_special_tokens`.

    We define an :attr:`added_tokens_encoder` to add new tokens to the
    vocabulary without having to handle the specific vocabulary augmentation
    methods of the various underlying dictionary structures (`BPE`,
    `sentencepiece` ...).
    """

    _IS_PRETRAINED: bool
    _MAX_INPUT_SIZE: Dict[str, Optional[int]]
    _VOCAB_FILE_NAMES: Dict[str, str]
    _VOCAB_FILE_MAP: Dict[str, Dict[str, str]]

    _SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token",
                                  "sep_token", "pad_token", "cls_token",
                                  "mask_token", "additional_special_tokens"]

    def __init__(self, hparams):
        super().__init__(hparams=hparams)

        self.config = None

        self.bos_token = None
        self.eos_token = None
        self.unk_token = None
        self.sep_token = None
        self.pad_token = None
        self.cls_token = None
        self.mask_token = None
        self.additional_special_tokens = []

        self.max_len = int(1e12)
        self.added_tokens_encoder = {}
        self.added_tokens_decoder = {}

        for key, value in self.hparams.items():
            if key in self._SPECIAL_TOKENS_ATTRIBUTES:
                if key == 'additional_special_tokens':
                    assert isinstance(value, (list, tuple)) and \
                           all(isinstance(v, str) for v in value)
                else:
                    if value is not None:
                        assert isinstance(value, str)
                    else:
                        warnings.warn(
                            f"Trying to set None as the value of special "
                            f"token '{key}'. Proceed only if you are sure!",
                            UserWarning)
                setattr(self, key, value)

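    # Illustrative sketch (not part of the original module): the constructor
    # only picks up hparams keys listed in `_SPECIAL_TOKENS_ATTRIBUTES`.
    # `MyTokenizer` below is a hypothetical concrete subclass.
    #
    #     hparams = {
    #         'unk_token': '<unk>',
    #         'pad_token': '<pad>',
    #         'additional_special_tokens': ['<url>', '<user>'],
    #     }
    #     tokenizer = MyTokenizer(hparams=hparams)
    #     assert tokenizer.unk_token == '<unk>'
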
    @classmethod
    def load(cls, pretrained_model_path: str,
             configs: Optional[Dict] = None):
        r"""Instantiate a tokenizer from the vocabulary files or the saved
        tokenizer files.

        Args:
            pretrained_model_path: The path to a vocabulary file or a folder
                that contains the saved pre-trained tokenizer files.
            configs: Tokenizer configurations. You can overwrite the original
                tokenizer configurations saved in the configuration file
                by this dictionary.

        Returns:
            A tokenizer instance.
        """
        vocab_files = {}
        # Look for the tokenizer main vocabulary files.
        for file_id, file_name in cls._VOCAB_FILE_NAMES.items():
            full_file_name: Optional[str]
            if os.path.isdir(pretrained_model_path):
                # If a directory is provided, we look for the standard
                # file name.
                full_file_name = os.path.join(pretrained_model_path,
                                              file_name)
            else:
                # If a path to a file is provided, we use it (will only work
                # for non-BPE tokenizers using a single vocabulary file).
                full_file_name = pretrained_model_path

            if not os.path.exists(full_file_name):
                print("Can't find file {}. We won't load it.".format(
                    full_file_name))
                full_file_name = None

            vocab_files[file_id] = full_file_name

        # Look for the additional tokens files.
        all_vocab_files_names = {
            'added_tokens_file': ADDED_TOKENS_FILE,
            'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE,
            'config_file': CONFIG_FILE}

        # If a path to a file was provided, get the parent directory.
        saved_directory = pretrained_model_path
        if os.path.exists(saved_directory) and not os.path.isdir(
                saved_directory):
            saved_directory = os.path.dirname(saved_directory)

        for file_id, file_name in all_vocab_files_names.items():
            full_file_name = os.path.join(saved_directory, file_name)
            if not os.path.exists(full_file_name):
                print("Can't find file {}. We won't load it.".format(
                    full_file_name))
                full_file_name = None
            vocab_files[file_id] = full_file_name

        if all(full_file_name is None
               for full_file_name in vocab_files.values()):
            raise ValueError("Can't find tokenizer files in {}.".format(
                saved_directory))

        kwargs: Dict[str, Any]
        if cls._IS_PRETRAINED:
            kwargs = {'pretrained_model_name': None}
        else:
            kwargs = {}

        added_tokens_file = vocab_files.pop('added_tokens_file', None)
        special_tokens_map_file = vocab_files.pop(
            'special_tokens_map_file', None)
        tokenizer_config_file = vocab_files.pop('config_file', None)

        for args_name, file_path in vocab_files.items():
            if args_name not in kwargs:
                kwargs[args_name] = file_path

        if special_tokens_map_file is not None:
            with open(special_tokens_map_file, encoding="utf-8") as f:
                special_tokens_map = json.load(f)
            for key, value in special_tokens_map.items():
                if key not in kwargs:
                    kwargs[key] = value

        if tokenizer_config_file is not None:
            with open(tokenizer_config_file, encoding="utf-8") as f:
                tokenizer_config = json.load(f)
            for key, value in tokenizer_config.items():
                if key not in kwargs:
                    kwargs[key] = value

        if configs is not None:
            for key, value in configs.items():
                kwargs[key] = value

        tokenizer = cls(hparams=kwargs)

        # Add supplementary tokens.
        if added_tokens_file is not None:
            with open(added_tokens_file, encoding="utf-8") as f:
                added_tok_encoder = json.load(f)
            added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
            tokenizer.added_tokens_encoder.update(added_tok_encoder)
            tokenizer.added_tokens_decoder.update(added_tok_decoder)

        return tokenizer

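    # Illustrative usage sketch for :meth:`load` (not part of the original
    # module), assuming a concrete subclass such as
    # ``texar.torch.data.BERTTokenizer`` and a hypothetical directory
    # ``saved_tokenizer/`` produced by :meth:`save`:
    #
    #     from texar.torch.data import BERTTokenizer
    #     tokenizer = BERTTokenizer.load('saved_tokenizer/')
    #     # Entries in `configs` override values read from config.json.
    #     tokenizer = BERTTokenizer.load('saved_tokenizer/',
    #                                    configs={'max_len': 256})
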
    def save(self, save_dir: str) -> Tuple[str, ...]:
        r"""Save the tokenizer vocabulary files (with added tokens), the
        tokenizer configuration file, and a dictionary mapping special token
        class attributes (:attr:`cls_token`, :attr:`unk_token`, ...) to their
        values (`<unk>`, `<cls>`, ...) to a directory, so that it can be
        re-loaded using :meth:`~load`.

        Args:
            save_dir: The path to a folder in which the tokenizer files
                will be saved.

        Return:
            The paths to the vocabulary file, added token file, special token
            mapping file, and the configuration file.
        """
        if not os.path.isdir(save_dir):
            raise ValueError("Saving directory ({}) should be a "
                             "directory".format(save_dir))

        special_tokens_map_file = os.path.join(save_dir,
                                               SPECIAL_TOKENS_MAP_FILE)
        added_tokens_file = os.path.join(save_dir, ADDED_TOKENS_FILE)
        config_file = os.path.join(save_dir, CONFIG_FILE)

        with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))

        with open(added_tokens_file, 'w', encoding='utf-8') as f:
            if self.added_tokens_encoder:
                out_str = json.dumps(self.added_tokens_encoder,
                                     ensure_ascii=False)
            else:
                out_str = u"{}"
            f.write(out_str)

        with open(config_file, 'w', encoding='utf-8') as f:
            if self.config:
                out_str = json.dumps(self.config, ensure_ascii=False)
            else:
                out_str = u"{}"
            f.write(out_str)

        vocab_files = self.save_vocab(save_dir)

        return vocab_files + (special_tokens_map_file, added_tokens_file,
                              config_file)

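    # Illustrative usage sketch for :meth:`save` (not part of the original
    # module); `tokenizer` is assumed to be an instance of a concrete
    # subclass, and the target directory must already exist:
    #
    #     import os
    #     os.makedirs('saved_tokenizer', exist_ok=True)  # hypothetical path
    #     paths = tokenizer.save('saved_tokenizer')
    #     # `paths` contains the vocabulary file(s) followed by the
    #     # special_tokens_map.json, added_tokens.json, and config.json paths.
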
    def save_vocab(self, save_dir):
        r"""Save the tokenizer vocabulary to a directory. This method does
        not save added tokens, special token mappings, or the configuration
        file.

        Please use :meth:`~save` to save the full tokenizer state so that it
        can be reloaded using :meth:`~load`.
        """
        raise NotImplementedError

    @property
    def vocab_size(self) -> int:
        raise NotImplementedError

    def __len__(self) -> int:
        return self.vocab_size + len(self.added_tokens_encoder)

    def add_tokens(self, new_tokens: List[Optional[str]]) -> int:
        r"""Add a list of new tokens to the tokenizer class. If the new
        tokens are not in the vocabulary, they are added to the
        :attr:`added_tokens_encoder` with indices starting from the last
        index of the current vocabulary.

        Args:
            new_tokens: A list of new tokens.

        Returns:
            Number of tokens added to the vocabulary, which can be used to
            correspondingly increase the size of the associated model
            embedding matrices.
        """
        if not new_tokens:
            return 0

        to_add_tokens = []
        for token in new_tokens:
            assert isinstance(token, str)
            if token != self.unk_token and \
                    (self.map_token_to_id(token) ==
                     self.map_token_to_id(self.unk_token)):
                to_add_tokens.append(token)

        added_tok_encoder = dict((tok, len(self) + i)
                                 for i, tok in enumerate(to_add_tokens))
        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
        self.added_tokens_decoder.update(added_tok_decoder)

        return len(to_add_tokens)

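    # Illustrative sketch for :meth:`add_tokens` (not part of the original
    # module); `tokenizer` is assumed to be a loaded concrete subclass. The
    # returned count is what a model would use to resize its embeddings.
    #
    #     old_len = len(tokenizer)
    #     num_added = tokenizer.add_tokens(['<url>', '<user>'])
    #     assert len(tokenizer) == old_len + num_added
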
    def add_special_tokens(self,
                           special_tokens_dict: Dict[str, str]) -> int:
        r"""Add a dictionary of special tokens to the encoder and link them
        to class attributes. If the special tokens are not in the vocabulary,
        they are added to it and indexed starting from the last index of the
        current vocabulary.

        Args:
            special_tokens_dict: A dictionary mapping special token class
                attributes (:attr:`cls_token`, :attr:`unk_token`, ...) to
                their values (`<unk>`, `<cls>`, ...).

        Returns:
            Number of tokens added to the vocabulary, which can be used to
            correspondingly increase the size of the associated model
            embedding matrices.
        """
        if not special_tokens_dict:
            return 0

        added_tokens = 0
        for key, value in special_tokens_dict.items():
            assert key in self._SPECIAL_TOKENS_ATTRIBUTES
            if key == 'additional_special_tokens':
                assert isinstance(value, (list, tuple)) and all(
                    isinstance(t, str) for t in value)
                added_tokens += self.add_tokens(value)
            else:
                assert isinstance(value, str)
                added_tokens += self.add_tokens([value])
            setattr(self, key, value)

        return added_tokens

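    # Illustrative sketch for :meth:`add_special_tokens` (not part of the
    # original module): keys must come from `_SPECIAL_TOKENS_ATTRIBUTES`;
    # each value is added to the vocabulary if missing and assigned to the
    # corresponding class attribute.
    #
    #     num_added = tokenizer.add_special_tokens(
    #         {'cls_token': '<CLS>',
    #          'additional_special_tokens': ['<EOT>']})
    #     assert tokenizer.cls_token == '<CLS>'
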
    def map_text_to_token(self, text: Optional[str],
                          **kwargs) -> List[str]:
        r"""Maps a string to a sequence of tokens (string), using the
        tokenizer. Splits into words for word-based vocabularies or into
        sub-words for sub-word-based vocabularies
        (`BPE`/`SentencePiece`/`WordPiece`). This function also takes care
        of the added tokens.

        Args:
            text: An input string.

        Return:
            A list of tokens.
        """
        def split_on_tokens(tok_list, string):
            if not string:
                return []
            if not tok_list:
                return self._map_text_to_token(string, **kwargs)
            tok = tok_list[0]
            split_text = string.split(tok)
            return sum((split_on_tokens(tok_list[1:], sub_text.strip())
                        + [tok] for sub_text in split_text), [])[:-1]

        added_tokens = list(
            self.added_tokens_encoder.keys()) + self.all_special_tokens
        tokenized_text = split_on_tokens(added_tokens, text)
        return tokenized_text

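    # Illustrative sketch for :meth:`map_text_to_token` (not part of the
    # original module): added and special tokens are split off first; only
    # the remaining text reaches the subclass's `_map_text_to_token`, so the
    # exact sub-word output depends on the concrete tokenizer.
    #
    #     tokenizer.add_tokens(['<user>'])
    #     tokens = tokenizer.map_text_to_token('<user> says hello')
    #     # tokens[0] == '<user>'; the remaining tokens are subclass-specific.
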
    def _map_text_to_token(self, text: str, **kwargs) -> \
            Union[List[str], List[Tuple[str, int, int]]]:
        r"""Maps a string to a sequence of tokens (string), using the
        tokenizer. Splits into words for word-based vocabularies or into
        sub-words for sub-word-based vocabularies
        (`BPE`/`SentencePiece`/`WordPiece`). This function does not take
        care of the added tokens.
        """
        raise NotImplementedError

    # TODO: Remove these once pylint supports function stubs.
    # pylint: disable=unused-argument,function-redefined

    @overload
    def map_token_to_id(self, tokens: str) -> int:
        ...

    @overload
    def map_token_to_id(self, tokens: List[str]) -> List[int]:
        ...

    def map_token_to_id(self, tokens):
        r"""Maps a single token or a sequence of tokens to an integer id
        (resp. a sequence of ids), using the vocabulary.

        Args:
            tokens: A single token or a list of tokens.

        Returns:
            A single token id or a list of token ids.
        """
        if isinstance(tokens, str):
            return self._map_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._map_token_to_id_with_added_voc(token))
        if len(ids) > self.max_len:
            warnings.warn(
                "Token indices sequence length is longer than the specified "
                "maximum sequence length for this model ({} > {}). Running "
                "this sequence through the model will result in indexing "
                "errors".format(len(ids), self.max_len), UserWarning)
        return ids

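    # Illustrative sketch for :meth:`map_token_to_id` (not part of the
    # original module): a single token returns a single id, a list returns a
    # list, and tokens registered via :meth:`add_tokens` resolve through
    # `added_tokens_encoder` before the base vocabulary is consulted.
    #
    #     unk_id = tokenizer.map_token_to_id(tokenizer.unk_token)
    #     ids = tokenizer.map_token_to_id(['hello', 'world'])
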
    # pylint: enable=unused-argument,function-redefined

    def _map_token_to_id_with_added_voc(self, token: str) -> int:
        if token in self.added_tokens_encoder:
            return self.added_tokens_encoder[token]
        return self._map_token_to_id(token)

    def _map_token_to_id(self, token: str) -> int:
        raise NotImplementedError

    def map_text_to_id(self, text: str) -> List[int]:
        r"""Maps a string to a sequence of ids (integer), using the
        tokenizer and vocabulary. Same as
        `self.map_token_to_id(self.map_text_to_token(text))`.

        Args:
            text: An input string.

        Returns:
            A list of token ids.
        """
        return self.map_token_to_id(self.map_text_to_token(text))

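    # Illustrative sketch (not part of the original module):
    # :meth:`map_text_to_id` is exactly the composition of the two steps.
    #
    #     ids = tokenizer.map_text_to_id('hello world')
    #     assert ids == tokenizer.map_token_to_id(
    #         tokenizer.map_text_to_token('hello world'))
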
    # TODO: Remove these once pylint supports function stubs.
    # pylint: disable=unused-argument,function-redefined

    @overload
    def map_id_to_token(self, token_ids: int,
                        skip_special_tokens: bool = False) -> str:
        ...

    @overload
    def map_id_to_token(self, token_ids: List[int],
                        skip_special_tokens: bool = False) -> List[str]:
        ...

    def map_id_to_token(self, token_ids, skip_special_tokens=False):
        r"""Maps a single id or a sequence of ids to a token (resp. a
        sequence of tokens), using the vocabulary and added tokens.

        Args:
            token_ids: A single token id or a list of token ids.
            skip_special_tokens: Whether to skip the special tokens.

        Returns:
            A single token or a list of tokens.
        """
        if isinstance(token_ids, int):
            if token_ids in self.added_tokens_decoder:
                return self.added_tokens_decoder[token_ids]
            else:
                return self._map_id_to_token(token_ids)

        tokens = []
        for index in token_ids:
            if index in self.all_special_ids and skip_special_tokens:
                continue
            if index in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[index])
            else:
                tokens.append(self._map_id_to_token(index))
        return tokens

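    # Illustrative sketch for :meth:`map_id_to_token` (not part of the
    # original module): ids registered via :meth:`add_tokens` decode through
    # `added_tokens_decoder`, and special tokens can be dropped from list
    # inputs.
    #
    #     tokens = tokenizer.map_id_to_token(ids, skip_special_tokens=True)
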
    # pylint: enable=unused-argument,function-redefined

    def _map_id_to_token(self, token_id: int) -> str:
        raise NotImplementedError

    def map_token_to_text(self, tokens: List[str]) -> str:
        r"""Maps a sequence of tokens (string) into a single string. The
        simplest way to do this is :python:`' '.join(tokens)`, but we often
        want to remove sub-word tokenization artifacts at the same time.
        """
        raise NotImplementedError

    def map_id_to_text(self, token_ids: List[int],
                       skip_special_tokens: bool = False,
                       clean_up_tokenization_spaces: bool = True) -> str:
        r"""Maps a sequence of ids (integer) to a string, using the tokenizer
        and vocabulary, with options to remove special tokens and clean up
        tokenization spaces.

        Args:
            token_ids: A list of token ids.
            skip_special_tokens: Whether to skip the special tokens.
            clean_up_tokenization_spaces: Whether to clean up simple English
                tokenization artifacts like spaces before punctuation and
                abbreviated forms.
        """
        filtered_tokens = self.map_id_to_token(
            token_ids, skip_special_tokens=skip_special_tokens)
        text = self.map_token_to_text(filtered_tokens)
        if clean_up_tokenization_spaces:
            text = self.clean_up_tokenization(text)
        return text

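    # Illustrative sketch for :meth:`map_id_to_text` (not part of the
    # original module): a common round trip when decoding model output.
    #
    #     ids = tokenizer.map_text_to_id('hello world')
    #     text = tokenizer.map_id_to_text(ids, skip_special_tokens=True)
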
    def encode_text(self, text_a: str, text_b: Optional[str] = None,
                    max_seq_length: Optional[int] = None):
        r"""Adds special tokens to a sequence or sequence pair and computes
        other information such as segment ids, input mask, and sequence
        length for specific tasks.
        """
        raise NotImplementedError

    @property
    def special_tokens_map(self) -> Dict[str, str]:
        r"""A dictionary mapping special token class attributes
        (:attr:`cls_token`, :attr:`unk_token`, ...) to their values
        (`<unk>`, `<cls>`, ...).
        """
        set_attr = {}
        for attr in self._SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr

    @property
    def all_special_tokens(self) -> List[str]:
        r"""List all the special tokens (`<unk>`, `<cls>`, ...) mapped to
        class attributes (:attr:`cls_token`, :attr:`unk_token`, ...).
        """
        all_toks: List[str] = []
        set_attr = self.special_tokens_map
        for attr_value in set_attr.values():
            all_toks = all_toks + (
                attr_value if isinstance(attr_value, (list, tuple))
                else [attr_value])
        all_toks = list(set(all_toks))
        return all_toks

    @property
    def all_special_ids(self) -> List[int]:
        r"""List the vocabulary indices of the special tokens
        (`<unk>`, `<cls>`, ...) mapped to class attributes
        (:attr:`cls_token`, :attr:`unk_token`, ...).
        """
        all_toks = self.all_special_tokens
        all_ids: List[int] = [self.map_token_to_id(t) for t in all_toks]
        return all_ids

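    # Illustrative sketch for the three properties above (not part of the
    # original module):
    #
    #     tokenizer.special_tokens_map   # e.g. {'unk_token': '<unk>', ...}
    #     tokenizer.all_special_tokens   # flat, de-duplicated token list
    #     tokenizer.all_special_ids      # the corresponding vocabulary ids
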
    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        r"""Clean up a list of simple English tokenization artifacts like
        spaces before punctuation and abbreviated forms.
        """
        out_string = out_string.replace(' .', '.').replace(' ?', '?'). \
            replace(' !', '!').replace(' ,', ',').replace(" ' ", "'"). \
            replace(" n't", "n't").replace(" 'm", "'m"). \
            replace(" do not", " don't").replace(" 's", "'s"). \
            replace(" 've", "'ve").replace(" 're", "'re")
        return out_string
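
    # Worked example for :meth:`clean_up_tokenization` (not part of the
    # original module); the replacements above re-attach punctuation and
    # contractions:
    #
    #     TokenizerBase.clean_up_tokenization("do n't hesitate , please !")
    #     # -> "don't hesitate, please!"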