
# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Various position embedders.
"""

import math
from typing import Optional

import torch
import torch.nn.functional as F

from texar.torch.modules.embedders import embedder_utils
from texar.torch.modules.embedders.embedder_base import (
    EmbedderBase, EmbeddingDropout)
from texar.torch.utils.shapes import mask_sequences

__all__ = [
    "PositionEmbedder",
    "SinusoidsPositionEmbedder",
]


class PositionEmbedder(EmbedderBase):
    r"""Simple position embedder that maps position indexes into embeddings
    via lookup.

    Either :attr:`init_value` or :attr:`position_size` is required. If both
    are given, there must be ``init_value.shape[0] == position_size``.

    Args:
        init_value (optional): A Tensor or numpy array that contains the
            initial value of embeddings. It is typically of shape
            ``[position_size, embedding dim]``. If `None`, embedding is
            initialized as specified in ``hparams["initializer"]``.
            Otherwise, the ``"initializer"`` and ``"dim"`` hyperparameters
            in :attr:`hparams` are ignored.
        position_size (int, optional): The number of possible positions,
            e.g., the maximum sequence length. Required if
            :attr:`init_value` is not given.
        hparams (dict, optional): Embedder hyperparameters. If it is not
            specified, the default hyperparameter setting is used. See
            :attr:`default_hparams` for the structure and default values.

    .. document private functions
    """

    def __init__(self,
                 position_size: Optional[int] = None,
                 init_value: Optional[torch.Tensor] = None,
                 hparams=None):
        if init_value is None and position_size is None:
            raise ValueError(
                "Either `init_value` or `position_size` is required.")

        super().__init__(position_size, init_value, hparams=hparams)

        self._position_size = position_size
        if position_size is None:
            self._position_size = self._num_embeds
        if self._position_size != self._num_embeds:
            raise ValueError(
                f"position_size must be equal to init_value.shape[0]. "
                f"Got {self._position_size} and {self._num_embeds}")

        self._built = True
        self._dropout_layer = EmbeddingDropout(self._hparams.dropout_rate)

    @staticmethod
    def default_hparams():
        r"""Returns a dictionary of hyperparameters with default values.

        .. code-block:: python

            {
                "dim": 100,
                "initializer": {
                    "type": "random_uniform_initializer",
                    "kwargs": {
                        "minval": -0.1,
                        "maxval": 0.1,
                        "seed": None
                    }
                },
                "dropout_rate": 0,
                "dropout_strategy": 'element',
                "trainable": True,
                "name": "position_embedder"
            }

        The hyperparameters have the same meaning as those in
        :meth:`texar.torch.modules.WordEmbedder.default_hparams`.
        """
        hparams = embedder_utils.default_embedding_hparams()
        hparams["name"] = "position_embedder"
        return hparams

    def extra_repr(self) -> str:
        return (f"position_size={self.position_size}, "
                f"embedding_dim={self.dim}")

    def forward(self,  # type: ignore
                positions: Optional[torch.LongTensor] = None,
                sequence_length: Optional[torch.LongTensor] = None,
                **kwargs):
        r"""Embeds the positions.

        Either :attr:`positions` or :attr:`sequence_length` is required:

        - If both are given, :attr:`sequence_length` is used to mask out
          embeddings of those time steps beyond the respective sequence
          lengths.
        - If only :attr:`sequence_length` is given, then positions from
          0 to ``sequence_length - 1`` are embedded.

        Args:
            positions (optional): A :tensor:`LongTensor` containing the
                position IDs to embed.
            sequence_length (optional): A :tensor:`LongTensor` of shape
                ``[batch_size]``. Time steps beyond the respective sequence
                lengths will have zero-valued embeddings.
            kwargs: Additional keyword arguments for
                :torch_nn:`functional.embedding` besides :attr:`params`
                and :attr:`ids`.

        Returns:
            A `Tensor` of shape ``shape(inputs) + embedding dimension``.
        """
        # Gets embedder inputs
        if positions is None:
            if sequence_length is None:
                raise ValueError(
                    'Either `positions` or `sequence_length` is required.')
            max_length = torch.max(sequence_length)
            single_inputs = torch.arange(start=0, end=max_length)
            # Expands `single_inputs` to have shape [batch_size, max_length]
            inputs = single_inputs.unsqueeze(0)
            inputs = inputs.expand(len(sequence_length), -1).contiguous()
        else:
            inputs = positions

        ids_rank = inputs.dim()
        embedding = self._embedding
        inputs = inputs.to(device=embedding.device)

        # Gets dropout strategy
        st = self._hparams.dropout_strategy

        # Applies the 'item_type' dropout strategy before embedding
        if st == 'item_type':
            noise_shape = self._get_noise_shape(
                dropout_strategy=st, dropout_input=embedding)
            embedding = self._dropout_layer(embedding, noise_shape)

        # Embeds
        outputs = torch.nn.functional.embedding(
            inputs.type(torch.long), embedding, **kwargs)

        # Applies the 'item' or 'element' dropout strategy after embedding
        if st != 'item_type':
            noise_shape = self._get_noise_shape(
                dropout_strategy=st, dropout_input=outputs,
                ids_rank=ids_rank)
            outputs = self._dropout_layer(outputs, noise_shape)

        # Optionally masks
        if sequence_length is not None:
            outputs = mask_sequences(outputs, sequence_length)

        return outputs

    @property
    def embedding(self):
        r"""The embedding tensor.
        """
        return self._embedding

    @property
    def dim(self):
        r"""The embedding dimension.
        """
        return self._dim

    @property
    def position_size(self):
        r"""The position size, i.e., maximum number of positions.
        """
        return self._position_size

    @property
    def output_size(self) -> int:
        r"""The feature size of :meth:`forward` output. If the :attr:`dim`
        hyperparameter is a ``list`` or ``tuple``, the feature size equals
        its final dimension; otherwise, if :attr:`dim` is an ``int``, the
        feature size equals :attr:`dim`.
        """
        if isinstance(self._dim, (list, tuple)):
            return self._dim[-1]
        else:
            return self._dim

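
# Usage sketch (illustrative, not part of the library): embeds explicit
# position indexes, and alternatively positions derived from sequence
# lengths with padding steps masked to zero. The sizes below (position_size,
# dim, batch shapes) are arbitrary assumptions for the example.
def _position_embedder_example():
    embedder = PositionEmbedder(position_size=100, hparams={"dim": 8})
    # Explicit position ids of shape [batch_size, max_time].
    positions = torch.arange(6).unsqueeze(0).expand(2, -1)
    out = embedder(positions=positions)             # shape [2, 6, 8]
    # Positions 0 .. length-1 per example; steps beyond each length are zero.
    lengths = torch.tensor([6, 3])
    out_masked = embedder(sequence_length=lengths)  # shape [2, 6, 8]
    return out, out_masked
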
class SinusoidsPositionEmbedder(EmbedderBase):
    r"""Sinusoid position embedder that maps position indexes into embeddings
    via sinusoid calculation. This module does not have trainable parameters.
    Used in, e.g., Transformer models `(Vaswani et al.) "Attention Is All
    You Need"`.

    Each channel of the input Tensor is incremented by a sinusoid of a
    different frequency and phase. This allows attention to learn to use
    absolute and relative positions.

    Timing signals should be added to some precursors of both the query
    and the memory inputs to attention. The use of relative position is
    possible because `sin(x+y)` and `cos(x+y)` can be expressed in terms
    of `y`, `sin(x)`, and `cos(x)`.

    In particular, we use a geometric sequence of timescales starting with
    `min_timescale` and ending with `max_timescale`. The number of different
    timescales is equal to ``dim / 2``. For each timescale, we generate the
    two sinusoidal signals `sin(timestep / timescale)` and
    `cos(timestep / timescale)`. All of these sinusoids are concatenated in
    the `dim` dimension.

    Args:
        position_size (int): The number of possible positions, e.g., the
            maximum sequence length. Set ``position_size=None`` and
            ``hparams['cache_embeddings']=False`` to use arbitrarily large
            or negative position indices.

    .. document private functions
    """

    signal: torch.Tensor
    inv_timescales: torch.Tensor

    def __init__(self, position_size: Optional[int] = None, hparams=None):
        super().__init__(hparams=hparams)
        self._num_embeds = position_size  # type: ignore
        self._dim = self._hparams.dim
        self._cache_embeddings = self._hparams.cache_embeddings

        num_timescales = self._dim // 2
        min_timescale = self._hparams.min_timescale
        max_timescale = self._hparams.max_timescale

        log_timescale_increment = (math.log(max_timescale / min_timescale) /
                                   (num_timescales - 1))
        inv_timescales = min_timescale * torch.exp(
            (torch.arange(num_timescales, dtype=torch.float) *
             -log_timescale_increment))
        if self._cache_embeddings:
            if position_size is None:
                raise ValueError("'position_size' must not be None when "
                                 "'cache_embeddings' is set to True")
            positions = torch.arange(position_size, dtype=torch.float)
            signal = self._compute_embeddings(positions, inv_timescales)
            self.register_buffer('signal', signal)
        else:
            self.register_buffer('inv_timescales', inv_timescales)

    @staticmethod
    def default_hparams():
        r"""Returns a dictionary of hyperparameters with default values.

        We use a geometric sequence of timescales starting with
        `min_timescale` and ending with `max_timescale`. The number of
        different timescales is equal to ``dim / 2``.

        .. code-block:: python

            {
                'min_timescale': 1.0,
                'max_timescale': 10000.0,
                'dim': 512,
                'cache_embeddings': True,
                'name': 'sinusoid_position_embedder',
            }

        Here:

        `"cache_embeddings"`: bool
            If `True`, precompute embeddings for positions in range
            ``[0, position_size - 1]``. This leads to faster lookup but
            requires lookup indices to be within this range. If `False`,
            embeddings are computed on-the-fly during lookup. Set to
            `False` if your application needs to handle sequences of
            arbitrary length, or requires embeddings at negative positions.
        """
        return {
            'min_timescale': 1.0,
            'max_timescale': 1.0e4,
            'dim': 512,
            'cache_embeddings': True,
            'name': 'sinusoid_position_embedder',
        }

    def extra_repr(self) -> str:
        return f"embedding_dim={self.dim}"

    def _compute_embeddings(self, positions: torch.Tensor,
                            inv_timescales: torch.Tensor) -> torch.Tensor:
        scaled_time = (positions.type_as(inv_timescales).view(-1, 1) *
                       inv_timescales.unsqueeze(0))
        signal = torch.cat(
            [torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
        if self._dim % 2 == 1:
            # Pad with an extra zero channel so the output width equals
            # `dim` when `dim` is odd.
            signal = torch.cat(
                [signal, signal.new_zeros(signal.size(0), 1)], dim=1)
        signal = signal.view(*positions.size(), -1).contiguous()
        return signal

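    # Worked example (illustrative assumption: dim=4, min_timescale=1.0,
    # max_timescale=1.0e4): num_timescales = 2, so the inverse timescales
    # are [1.0, 1e-4], and position p is mapped to the 4-dim vector
    #     [sin(p * 1.0), sin(p * 1e-4), cos(p * 1.0), cos(p * 1e-4)].
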
    def forward(self,  # type: ignore
                positions: Optional[torch.LongTensor] = None,
                sequence_length: Optional[torch.LongTensor] = None,
                **kwargs) -> torch.Tensor:
        r"""Embeds the positions.

        Either :attr:`positions` or :attr:`sequence_length` is required:

        - If both are given, :attr:`sequence_length` is used to mask out
          embeddings of those time steps beyond the respective sequence
          lengths.
        - If only :attr:`sequence_length` is given, then positions from
          `0` to ``sequence_length - 1`` are embedded.

        Args:
            positions (optional): A :tensor:`LongTensor` containing the
                position IDs to embed.
            sequence_length (optional): A :tensor:`LongTensor` of shape
                ``[batch_size]``. Time steps beyond the respective sequence
                lengths will have zero-valued embeddings.

        Returns:
            A Tensor of shape ``[batch_size, max_time, dim]``.
        """
        if positions is None:
            if sequence_length is None:
                raise ValueError(
                    'Either `positions` or `sequence_length` is required.')
            max_length = sequence_length.max()
            batch_size = sequence_length.size(0)
            inputs = torch.arange(max_length).to(
                device=sequence_length.device)
            inputs = inputs.expand(batch_size, max_length)
        else:
            inputs = positions

        if self._cache_embeddings:
            outputs = F.embedding(inputs, self.signal, **kwargs)
        else:
            outputs = self._compute_embeddings(inputs, self.inv_timescales)

        if sequence_length is not None:
            outputs = mask_sequences(outputs, sequence_length)

        return outputs

    @property
    def dim(self):
        r"""The embedding dimension.
        """
        return self._dim

    @property
    def output_size(self) -> int:
        r"""The feature size of :meth:`forward` output. If the :attr:`dim`
        hyperparameter is a ``list`` or ``tuple``, the feature size equals
        its final dimension; otherwise, if :attr:`dim` is an ``int``, the
        feature size equals :attr:`dim`.
        """
        if isinstance(self._dim, (list, tuple)):
            dim = self._dim[-1]
        else:
            dim = self._dim
        return dim
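

# Usage sketch (illustrative, not part of the library): the embedder has no
# trainable parameters; with `cache_embeddings=False` it can embed arbitrary
# (even negative) positions on the fly. The dim/position values below are
# arbitrary assumptions for the example.
def _sinusoids_position_embedder_example():
    # Cached table over positions [0, 50): fast lookup, bounded indexes.
    cached = SinusoidsPositionEmbedder(position_size=50, hparams={"dim": 16})
    out = cached(sequence_length=torch.tensor([5, 3]))   # shape [2, 5, 16]
    # On-the-fly computation: no table, arbitrary position values allowed.
    on_the_fly = SinusoidsPositionEmbedder(
        hparams={"dim": 16, "cache_embeddings": False})
    out2 = on_the_fly(positions=torch.tensor([[-2, 0, 1000]]))  # [1, 3, 16]
    return out, out2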