Source code for

# -*- coding: utf-8 -*-
# Copyright 2019 The Texar Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper functions and classes for vocabulary processing.
"""
import warnings
from collections import defaultdict
from typing import DefaultDict, Dict, List, Optional, Sequence, Tuple, Union
from asyml_utilities.special_tokens import SpecialTokens

import numpy as np

from texar.torch.utils.utils import (
    _recur_split, dict_lookup, str_join, strip_special_tokens)

__all__ = [

def _make_defaultdict(keys: Sequence[Union[int, str]],
                      values: Sequence[Union[int, str]],
                      default_value: Union[int, str]) \
        -> DefaultDict[Union[int, str], Union[int, str]]:
    r"""Creates a Python `defaultdict`.

        keys (list): Keys of the dictionary.
        values (list): Values correspond to keys. The two lists :attr:`keys` and
            :attr:`values` must be of the same length.
        default_value: default value returned when key is missing.

        defaultdict: A Python `defaultdict` instance that maps keys to values.
    dict_: DefaultDict[Union[int, str], Union[int, str]]
    dict_ = defaultdict(lambda: default_value)
    for k, v in zip(keys, values):
        dict_[k] = v
    return dict_

class Vocab:
    r"""Vocabulary class that loads vocabulary from file, and maintains mapping
    tables between token strings and indexes.

    Each line of the vocab file should contain one vocabulary token, e.g.::

        vocab_token_1
        vocab token 2
        vocab token | 3 .
        ...

    The four special tokens are prepended to the tokens read from the file, in
    the order pad, bos, eos, unk, so they occupy ids 0 to 3.

    Args:
        filename (str): Path to the vocabulary file where each line contains
            one token.
        pad_token (str): A special token that is used to do padding.
        bos_token (str): A special token that will be added to the beginning
            of sequences.
        eos_token (str): A special token that will be added to the end of
            sequences.
        unk_token (str): A special token that will replace all unknown tokens
            (tokens not included in the vocabulary).
    """

    def __init__(self,
                 filename: str,
                 pad_token: str = SpecialTokens.PAD,
                 bos_token: str = SpecialTokens.BOS,
                 eos_token: str = SpecialTokens.EOS,
                 unk_token: str = SpecialTokens.UNK):
        self._filename = filename
        self._pad_token = pad_token
        self._bos_token = bos_token
        self._eos_token = eos_token
        self._unk_token = unk_token

        # The special tokens must be set before loading: `load` reads them.
        self._id_to_token_map_py, self._token_to_id_map_py \
            = self.load(self._filename)

    def load(self, filename: str) \
            -> Tuple[Dict[int, str], Dict[str, int]]:
        r"""Loads the vocabulary from the file.

        Args:
            filename (str): Path to the vocabulary file.

        Returns:
            A tuple of mapping tables between word string and index,
            (:attr:`id_to_token_map_py`, :attr:`token_to_id_map_py`). Both are
            plain Python `dict` instances.

        Raises:
            ValueError: If any of the special tokens already appears as a
                line of the vocabulary file.
        """
        # No explicit encoding is passed, so the platform default applies.
        with open(filename, "r") as vocab_file:
            vocab = [line.strip() for line in vocab_file]

        # Checked in this order so the first reported clash is unchanged.
        reserved = (
            ("begin-of-seq", self._bos_token),
            ("end-of-seq", self._eos_token),
            ("UNK", self._unk_token),
            ("padding", self._pad_token),
        )
        # Silence UnicodeWarning only for the membership tests.
        # `catch_warnings` restores the caller's filters on exit, including
        # when a ValueError propagates, instead of leaving the process-wide
        # filter list modified.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UnicodeWarning)
            for description, token in reserved:
                if token in vocab:
                    raise ValueError(
                        "Special %s token already exists in the "
                        "vocabulary: '%s'" % (description, token))

        # Places _pad_token at the beginning to make sure it takes index 0.
        vocab = [self._pad_token, self._bos_token, self._eos_token,
                 self._unk_token] + vocab

        # Creates python maps to interface with python code.
        id_to_token_map_py = dict(enumerate(vocab))
        token_to_id_map_py = dict(zip(vocab, range(len(vocab))))

        return id_to_token_map_py, token_to_id_map_py

    def map_ids_to_tokens_py(self, ids: Union[List[int], np.ndarray]) \
            -> np.ndarray:
        r"""Maps ids into text tokens.

        The input :attr:`ids` and returned tokens are both python
        arrays or list.

        Args:
            ids: An `int` numpy array or (possibly nested) list of token ids.

        Returns:
            A numpy array of text tokens of the same shape as :attr:`ids`.
        """
        # `unk_token` is passed to `dict_lookup` as the fallback value.
        return dict_lookup(self.id_to_token_map_py, ids, self.unk_token)

    def map_tokens_to_ids_py(self, tokens: List[str]) -> np.ndarray:
        r"""Maps text tokens into ids.

        The input :attr:`tokens` and returned ids are both python
        arrays or list.

        Args:
            tokens: A numpy array or (possibly nested) list of text tokens.

        Returns:
            A numpy array of token ids of the same shape as :attr:`tokens`.
        """
        # `unk_token_id` is passed to `dict_lookup` as the fallback value.
        return dict_lookup(self.token_to_id_map_py, tokens, self.unk_token_id)

    @property
    def id_to_token_map_py(self) -> Dict[int, str]:
        r"""The dictionary instance that maps from token index to the string
        form.
        """
        return self._id_to_token_map_py

    @property
    def token_to_id_map_py(self) -> Dict[str, int]:
        r"""The dictionary instance that maps from token string to the index.
        """
        return self._token_to_id_map_py

    @property
    def size(self) -> int:
        r"""The vocabulary size, i.e. the number of distinct token strings
        (special tokens included).
        """
        return len(self.token_to_id_map_py)

    @property
    def bos_token(self) -> str:
        r"""A string of the special token indicating the beginning of sequence.
        """
        return self._bos_token

    @property
    def bos_token_id(self) -> int:
        r"""The `int` index of the special token indicating the beginning
        of sequence.
        """
        return self.token_to_id_map_py[self._bos_token]

    @property
    def eos_token(self) -> str:
        r"""A string of the special token indicating the end of sequence.
        """
        return self._eos_token

    @property
    def eos_token_id(self) -> int:
        r"""The `int` index of the special token indicating the end
        of sequence.
        """
        return self.token_to_id_map_py[self._eos_token]

    @property
    def unk_token(self) -> str:
        r"""A string of the special token indicating unknown token.
        """
        return self._unk_token

    @property
    def unk_token_id(self) -> int:
        r"""The `int` index of the special token indicating unknown token.
        """
        return self.token_to_id_map_py[self._unk_token]

    @property
    def pad_token(self) -> str:
        r"""A string of the special token indicating padding token. The
        default padding token is ``SpecialTokens.PAD``.
        """
        return self._pad_token

    @property
    def pad_token_id(self) -> int:
        r"""The `int` index of the special token indicating padding token.
        """
        return self.token_to_id_map_py[self._pad_token]

    @property
    def special_tokens(self) -> List[str]:
        r"""The list of special tokens
        [:attr:`pad_token`, :attr:`bos_token`, :attr:`eos_token`,
        :attr:`unk_token`].
        """
        return [self._pad_token, self._bos_token, self._eos_token,
                self._unk_token]
def map_ids_to_strs(ids: Union[np.ndarray, Sequence[int]],
                    vocab: Vocab,
                    join: bool = True,
                    strip_pad: Optional[str] = '<PAD>',
                    strip_bos: Optional[str] = '<BOS>',
                    strip_eos: Optional[str] = '<EOS>') \
        -> Union[np.ndarray, List[str]]:
    r"""Transforms ``int`` indexes to strings by mapping ids to tokens,
    concatenating tokens into sentences, and stripping special tokens, etc.

    Args:
        ids: An n-D numpy array or (possibly nested) list of ``int`` indexes.
        vocab: An instance of :class:`Vocab`.
        join (bool): Whether to concatenate along the last dimension of the
            tokens into a string separated with a space character.
        strip_pad (str): The PAD token to strip from the strings (i.e., remove
            the leading and trailing PAD tokens of the strings). Default
            is ``"<PAD>"``. Set to `None` or `False` to disable the stripping.
        strip_bos (str): The BOS token to strip from the strings (i.e., remove
            the leading BOS tokens of the strings). Default is ``"<BOS>"``.
            Set to `None` or `False` to disable the stripping.
        strip_eos (str): The EOS token to strip from the strings (i.e., remove
            the EOS tokens and all subsequent tokens of the strings). Default
            is ``"<EOS>"``. Set to `None` or `False` to disable the stripping.

    Returns:
        If :attr:`join` is True, returns a `(n-1)`-D numpy array (or list) of
        concatenated strings. If :attr:`join` is False, returns an `n`-D numpy
        array (or list) of str tokens.

    Example:

        .. code-block:: python

            text_ids = [[1, 9, 6, 2, 0, 0], [1, 28, 7, 8, 2, 0]]

            text = map_ids_to_strs(text_ids, data.vocab)
            # text == ['a sentence', 'parsed from ids']

            text = map_ids_to_strs(
                text_ids, data.vocab, join=False,
                strip_pad=None, strip_bos=None, strip_eos=None)
            # text == [['<BOS>', 'a', 'sentence', '<EOS>', '<PAD>', '<PAD>'],
            #          ['<BOS>', 'parsed', 'from', 'ids', '<EOS>', '<PAD>']]
    """
    looked_up = vocab.map_ids_to_tokens_py(ids)  # type: ignore
    # Hand back plain lists when the caller supplied a list or tuple.
    if isinstance(ids, (list, tuple)):
        looked_up = looked_up.tolist()

    # Stripping is done on the joined sentences; when `join` is False the
    # stripped sentences are split again afterwards.
    sentences = strip_special_tokens(
        str_join(looked_up),  # type: ignore
        strip_pad=strip_pad, strip_bos=strip_bos, strip_eos=strip_eos)

    return sentences if join else _recur_split(sentences, ids)  # type: ignore