Source code for texar.torch.modules.embedders.embedders

# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Various embedders.
"""
from typing import Optional

import torch
from torch.nn import functional as F

from texar.torch.modules.embedders import embedder_utils
from texar.torch.modules.embedders.embedder_base import (
    EmbedderBase, EmbeddingDropout)

__all__ = [
    "WordEmbedder",
]


[docs]class WordEmbedder(EmbedderBase):
    r"""Simple word embedder that maps indexes into embeddings. The indexes
    can be soft (e.g., distributions over vocabulary).

    Either :attr:`init_value` or :attr:`vocab_size` is required. If both are
    given, there must be ``init_value.shape[0]==vocab_size``.

    Args:
        init_value (optional): A Tensor or numpy array that contains the
            initial value of embeddings. It is typically of shape
            ``[vocab_size] + embedding-dim``. Embeddings can have dimensionality
            > 1.

            If `None`, embedding is initialized as specified in
            ``hparams["initializer"]``. Otherwise, the
            ``"initializer"`` and ``"dim"`` hyperparameters in :attr:`hparams`
            are ignored.
        vocab_size (int, optional): The vocabulary size. Required if
            :attr:`init_value` is not given.
        hparams (dict, optional): Embedder hyperparameters. Missing
            hyperparameters will be set to default values. See
            :meth:`default_hparams` for the hyperparameter structure and
            default values.

    See :meth:`forward` for the inputs and outputs of the embedder.

    Example:

    .. code-block:: python

        ids = torch.empty([32, 10]).uniform_(to=10).type(torch.int64).
        soft_ids = torch.empty([32, 10, 100]).uniform_()

        embedder = WordEmbedder(vocab_size=100, hparams={'dim': 256})
        ids_emb = embedder(ids=ids) # shape: [32, 10, 256]
        soft_ids_emb = embedder(soft_ids=soft_ids) # shape: [32, 10, 256]

    .. code-block:: python

        # Use with Texar data module
        hparams={
            'dataset': {
                'embedding_init': {'file': 'word2vec.txt'}
                ...
            },
        }
        data = MonoTextData(data_params)
        iterator = DataIterator(data)
        batch = next(iter(iterator))

        # Use data vocab size
        embedder_1 = WordEmbedder(vocab_size=data.vocab.size)
        emb_1 = embedder_1(batch['text_ids'])

        # Use pre-trained embedding
        embedder_2 = WordEmbedder(init_value=data.embedding_init_value)
        emb_2 = embedder_2(batch['text_ids'])


    .. document private functions
    """

    def __init__(self, init_value: Optional[torch.Tensor] = None,
                 vocab_size: Optional[int] = None, hparams=None):

        if init_value is None and vocab_size is None:
            raise ValueError(
                "Either `init_value` or `vocab_size` is required.")

        super().__init__(init_value=init_value,
                         num_embeds=vocab_size, hparams=hparams)

        if vocab_size is None:
            self._vocab_size = self._num_embeds
        else:
            self._vocab_size = vocab_size
        if self._vocab_size != self._num_embeds:
            raise ValueError(
                f"vocab_size must equal to init_value.shape[0]. "
                f"Got {self._vocab_size} and {self._num_embeds}")

        self._dropout_layer = EmbeddingDropout(self._hparams.dropout_rate)

[docs]    @staticmethod
    def default_hparams():
        r"""Returns a dictionary of hyperparameters with default values.

        .. code-block:: python

            {
                "dim": 100,
                "dropout_rate": 0,
                "dropout_strategy": 'element',
                "initializer": {
                    "type": "random_uniform_initializer",
                    "kwargs": {
                        "minval": -0.1,
                        "maxval": 0.1,
                        "seed": None
                    }
                },
                "trainable": True,
                "name": "word_embedder",
            }

        Here:

        `"dim"`: int or list
            Embedding dimension. Can be a list of integers to yield embeddings
            with dimensionality > 1.

            Ignored if :attr:`init_value` is given to the embedder constructor.

        `"dropout_rate"`: float
            The dropout rate between 0 and 1. For example, ``dropout_rate=0.1``
            would zero out 10% of the embeddings. Set to 0 to disable dropout.

        `"dropout_strategy"`: str
            The dropout strategy. Can be one of the following

            - ``"element"``: The regular strategy that drops individual elements
              in the embedding vectors.
            - ``"item"``: Drops individual items (e.g., words) entirely. For
              example, for the word sequence "the simpler the better", the
              strategy can yield "_ simpler the better", where the first "the"
              is dropped.
            - ``"item_type"``: Drops item types (e.g., word types). For example,
              for the above sequence, the strategy can yield "_ simpler _
              better", where the word type "the" is dropped. The dropout will
              never yield "_ simpler the better" as in the ``"item"`` strategy.

        `"initializer"`: dict or None
            Hyperparameters of the initializer for embedding values. See
            :func:`~texar.torch.core.get_initializer` for the details. Ignored
            if :attr:`init_value` is given to the embedder constructor.

        `"trainable"`: bool
            Whether the embedding parameters are trainable. If false, freeze the
            embedding parameters.

        `"name"`: str
            Name of the embedding variable.
        """
        hparams = embedder_utils.default_embedding_hparams()
        hparams["name"] = "word_embedder"
        return hparams

[docs]    def extra_repr(self) -> str:
        return f"vocab_size={self.vocab_size}, embedding_dim={self.dim}"

[docs]    def forward(self,  # type: ignore
                ids: Optional[torch.LongTensor] = None,
                soft_ids: Optional[torch.Tensor] = None,
                **kwargs) -> torch.Tensor:
        r"""Embeds (soft) ids.

        Either :attr:`ids` or :attr:`soft_ids` must be given, and they
        must not be given at the same time.

        Args:
            ids (optional): An integer tensor containing the ids to embed.
            soft_ids (optional): A tensor of weights (probabilities) used to
                mix the embedding vectors.
            kwargs: Additional keyword arguments for
                :torch_nn:`functional.embedding` besides :attr:`params` and
                :attr:`ids`.

        Returns:
            If :attr:`ids` is given, returns a Tensor of shape
            ``list(ids.shape) + embedding-dim``. For example,
            if ``list(ids.shape) == [batch_size, max_time]``
            and ``list(embedding.shape) == [vocab_size, emb_dim]``, then the
            return tensor has shape ``[batch_size, max_time, emb_dim]``.

            If :attr:`soft_ids` is given, returns a Tensor of shape
            ``list(soft_ids.shape)[:-1] + embedding-dim``. For example,
            if ``list(soft_ids.shape) == [batch_size, max_time, vocab_size]``
            and ``list(embedding.shape) == [vocab_size, emb_dim]``, then the
            return tensor has shape ``[batch_size, max_time, emb_dim]``.
        """
        if ids is not None:
            if soft_ids is not None:
                raise ValueError(
                    'Must not specify `ids` and `soft_ids` at the same time.')
            ids_rank = ids.dim()
        elif soft_ids is not None:
            ids_rank = soft_ids.dim() - 1
        else:
            raise ValueError('Either `ids` or `soft_ids` must be given.')

        embedding = self._embedding

        if self._hparams.dropout_strategy == 'item_type':
            noise_shape = self._get_noise_shape(self._hparams.dropout_strategy)
            embedding = self._dropout_layer(embedding, noise_shape)

        if ids is not None:
            outputs = F.embedding(ids, embedding, **kwargs)
        else:
            outputs = embedder_utils.soft_embedding_lookup(embedding, soft_ids)

        if self._hparams.dropout_strategy != 'item_type':
            noise_shape = self._get_noise_shape(
                self._hparams.dropout_strategy,
                ids_rank=ids_rank, dropout_input=outputs)
            outputs = self._dropout_layer(outputs, noise_shape)

        return outputs

    @property
    def embedding(self) -> torch.Tensor:
        r"""The embedding tensor, of shape ``[vocab_size] + dim``.
        """
        return self._embedding

    @property
    def dim(self) -> int:
        r"""The embedding dimension.
        """
        return self._dim

    @property
    def vocab_size(self) -> int:
        r"""The vocabulary size.
        """
        return self._vocab_size

    @property
    def num_embeddings(self) -> int:
        r"""The vocabulary size. This interface matches
        :torch_nn:`Embedding`.
        """
        return self._vocab_size

    @property
    def output_size(self) -> int:
        r"""The feature size of :meth:`forward` output. If the :attr:`dim`
        hyperparameter is a ``list`` or ``tuple``, the feature size
        equals its final dimension; otherwise, if :attr:`dim` is an
        ``int``, the feature size equals :attr:`dim`.
        """
        if isinstance(self._dim, (list, tuple)):
            return self._dim[-1]
        else:
            return self._dim