Source code for texar.torch.modules.decoders.decoder_base

# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Base class for decoders.
"""

import copy
from abc import ABC, abstractmethod
from typing import Callable, Generic, Optional, Tuple, TypeVar, Union, overload

import torch
from torch import nn

from texar.torch.core.layers import identity, Identity
from texar.torch.module_base import ModuleBase
from texar.torch.modules.decoders import decoder_helpers as helpers
from texar.torch.modules.decoders.decoder_helpers import Helper
from texar.torch.utils import utils
from texar.torch.utils.dtypes import torch_bool

__all__ = [
    '_make_output_layer',
    'DecoderBase',
]

State = TypeVar('State')
Output = TypeVar('Output')  # output type can be of any nested structure


def _make_output_layer(layer: Optional[Union[nn.Module, torch.Tensor]],
                       vocab_size: Optional[int],
                       output_size: int,
                       bias: bool) -> Tuple[nn.Module, Optional[int]]:
    r"""Construct the output layer for decoders. Based on the input, multiple
    types of output layers could be constructed:

    - If ``layer`` is a :torch_nn:`Module`, then the layer is returned as is.
    - If ``layer`` is `None`, then a :torch_nn:`Linear` layer is constructed
      with ``output_size`` and ``vocab_size`` as input and output dimensions.
    - If ``layer`` is a :tensor:`Tensor`, then a :torch_nn:`Linear` layer is
      constructed with the provided tensor as parameters. Note that this tensor
      should have transposed shape, i.e. shape of ``[vocab_size, output_size]``.
      Also, if the provided tensor is not an instance of :torch_nn:`Parameter`,
      it will **not** accumulate gradients.
    - If ``layer`` is :func:`~texar.torch.core.identity`, the identity
      function is used as the output layer.
    """
    if isinstance(layer, nn.Module):
        output_layer = layer
    elif layer is None:
        if vocab_size is None:
            raise ValueError(
                "Either `output_layer` or `vocab_size` must be provided. "
                "Set `output_layer=tx.core.identity` if no output "
                "layer is wanted.")
        output_layer = nn.Linear(output_size, vocab_size, bias)
    elif torch.is_tensor(layer):
        vocab_size = layer.size(0)
        output_layer = nn.Linear(layer.size(1), vocab_size, bias)
        if not isinstance(layer, nn.Parameter):
            layer = nn.Parameter(layer, requires_grad=False)
        output_layer.weight = layer
    elif layer is identity:
        output_layer = Identity()
    else:
        raise ValueError(
            f"output_layer should be an instance of `nn.Module`, a tensor, "
            f"or None. Unsupported type: {type(layer)}")

    return output_layer, vocab_size
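
# Usage sketch (illustrative only, not part of the library API): the three
# ways `_make_output_layer` resolves its `layer` argument. `emb_weight` is a
# hypothetical embedding weight of shape `[vocab_size, output_size]`, as one
# would use for weight tying.
#
#   layer, size = _make_output_layer(None, vocab_size=100,
#                                    output_size=64, bias=True)
#   # -> nn.Linear(64, 100), size == 100
#
#   emb_weight = torch.randn(100, 64)
#   layer, size = _make_output_layer(emb_weight, None, 64, False)
#   # -> nn.Linear sharing `emb_weight` as its weight (frozen, since the
#   #    tensor is not an `nn.Parameter`); size == 100
#
#   layer, size = _make_output_layer(identity, None, 64, True)
#   # -> Identity(); size remains None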


TokenEmbedder = Union[nn.Module, Callable[[torch.LongTensor], torch.Tensor]]
TokenPosEmbedder = Union[
    nn.Module, Callable[[torch.LongTensor, torch.LongTensor], torch.Tensor]]
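
# Sketch (assumed setup): any callable with a matching signature can serve as
# an embedder. A plain lookup satisfies `TokenEmbedder`; a callable that also
# consumes positions satisfies `TokenPosEmbedder`.
#
#   word_emb = nn.Embedding(100, 64)
#   pos_emb = nn.Embedding(512, 64)
#
#   token_embedder: TokenEmbedder = word_emb
#   token_pos_embedder: TokenPosEmbedder = (
#       lambda tokens, positions: word_emb(tokens) + pos_emb(positions))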


class DecoderBase(ModuleBase, Generic[State, Output], ABC):
    r"""Base class inherited by all RNN decoder classes.
    See :class:`~texar.torch.modules.BasicRNNDecoder` for the arguments.

    See :meth:`forward` for the inputs and outputs of RNN decoders in general.
    """

    def __init__(self,
                 token_embedder: Optional[TokenEmbedder] = None,
                 token_pos_embedder: Optional[TokenPosEmbedder] = None,
                 input_time_major: bool = False,
                 output_time_major: bool = False,
                 hparams=None):
        super().__init__(hparams=hparams)

        self._train_helper: Optional[Helper] = None
        self._infer_helper: Optional[Helper] = None
        self._input_time_major = input_time_major
        self._output_time_major = output_time_major

        if (token_embedder is not None and token_pos_embedder is not None):
            raise ValueError("At most one among `token_embedder` and "
                             "`token_pos_embedder` should be specified")
        if token_embedder is None and token_pos_embedder is None:
            embed_token_func = self.embed_tokens.__func__  # type: ignore
            if embed_token_func is DecoderBase.embed_tokens:
                raise ValueError(
                    "Either `token_embedder` or `token_pos_embedder` must not "
                    "be `None` if `DecoderBase.embed_tokens` is not "
                    "overridden.")

        self._token_embedder = token_embedder
        self._token_pos_embedder = token_pos_embedder
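
    # Construction sketch (`MyDecoder` is a hypothetical subclass): at most
    # one of the two embedder arguments may be given, and at least one is
    # required unless `embed_tokens` is overridden.
    #
    #   decoder = MyDecoder(token_embedder=nn.Embedding(100, 64))      # OK
    #   decoder = MyDecoder(token_pos_embedder=embed_fn)               # OK
    #   decoder = MyDecoder(token_embedder=e1, token_pos_embedder=e2)  # raises
    #   decoder = MyDecoder()  # raises, unless `embed_tokens` is overridden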

    def embed_tokens(self, tokens: torch.LongTensor,
                     positions: torch.LongTensor) -> torch.Tensor:
        r"""Convert tokens along with positions to embeddings.

        Args:
            tokens: A :tensor:`LongTensor` denoting the token indices to
                convert to embeddings.
            positions: A :tensor:`LongTensor` with the same size as
                :attr:`tokens`, denoting the positions of the tokens. This is
                useful if the decoder uses positional embeddings.

        Returns:
            A :tensor:`Tensor` of size ``tokens.size() + (embed_dim,)``,
            denoting the converted embeddings.
        """
        if self._token_embedder is not None:
            return self._token_embedder(tokens)
        assert self._token_pos_embedder is not None
        return self._token_pos_embedder(tokens, positions)
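
    # Override sketch (hypothetical subclass and attribute names): decoders
    # with their own position embeddings can override `embed_tokens` instead
    # of passing embedder callables to the constructor.
    #
    #   class MyTransformerDecoder(DecoderBase):
    #       def embed_tokens(self, tokens, positions):
    #           return (self.word_embedder(tokens) +
    #                   self.pos_embedder(positions))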

    def create_helper(self, *,
                      decoding_strategy: Optional[str] = None,
                      start_tokens: Optional[torch.LongTensor] = None,
                      end_token: Optional[int] = None,
                      softmax_temperature: Optional[float] = None,
                      infer_mode: Optional[bool] = None,
                      **kwargs) -> Helper:
        r"""Create a helper instance for the decoder. This is a shared
        interface for both :class:`~texar.torch.modules.BasicRNNDecoder` and
        :class:`~texar.torch.modules.AttentionRNNDecoder`.

        The function provides **3 ways** to specify the decoding method, with
        varying flexibility:

        1. The :attr:`decoding_strategy` argument: A string taking value of:

           - **"train_greedy"**: decoding in teacher-forcing fashion (i.e.,
             feeding the `ground truth` to decode the next step), and each
             sample is obtained by taking the `argmax` of the output logits.
             Arguments :attr:`(inputs, sequence_length)` are required for
             this strategy, and argument :attr:`embedding` is optional.
           - **"infer_greedy"**: decoding in inference fashion (i.e., feeding
             the `generated` sample to decode the next step), and each sample
             is obtained by taking the `argmax` of the output logits.
             Arguments :attr:`(embedding, start_tokens, end_token)` are
             required for this strategy, and argument
             :attr:`max_decoding_length` is optional.
           - **"infer_sample"**: decoding in inference fashion, and each
             sample is obtained by `random sampling` from the RNN output
             distribution. Arguments
             :attr:`(embedding, start_tokens, end_token)` are required for
             this strategy, and argument :attr:`max_decoding_length` is
             optional.

           This argument is used only when argument :attr:`helper` is `None`.

           Example:

           .. code-block:: python

               embedder = WordEmbedder(vocab_size=data.vocab.size)
               decoder = BasicRNNDecoder(vocab_size=data.vocab.size)

               # Teacher-forcing decoding
               outputs_1, _, _ = decoder(
                   decoding_strategy='train_greedy',
                   inputs=embedder(data_batch['text_ids']),
                   sequence_length=data_batch['length'] - 1)

               # Random sample decoding. Gets 100 sequence samples
               outputs_2, _, sequence_length = decoder(
                   decoding_strategy='infer_sample',
                   start_tokens=[data.vocab.bos_token_id] * 100,
                   end_token=data.vocab.eos_token_id,
                   embedding=embedder,
                   max_decoding_length=60)

        2. The :attr:`helper` argument: An instance of a subclass of
           :class:`~texar.torch.modules.decoders.decoder_helpers.Helper`.
           This provides a superset of the decoding strategies above, for
           example:

           - :class:`~texar.torch.modules.TrainingHelper` corresponding to
             the ``"train_greedy"`` strategy.
           - :class:`~texar.torch.modules.ScheduledEmbeddingTrainingHelper`
             and :class:`~texar.torch.modules.ScheduledOutputTrainingHelper`
             for scheduled sampling.
           - :class:`~texar.torch.modules.SoftmaxEmbeddingHelper` and
             :class:`~texar.torch.modules.GumbelSoftmaxEmbeddingHelper` for
             soft decoding and gradient backpropagation.

           This way of specifying the helper gives maximal flexibility in
           configuring the decoding strategy.

           Example:

           .. code-block:: python

               embedder = WordEmbedder(vocab_size=data.vocab.size)
               decoder = BasicRNNDecoder(vocab_size=data.vocab.size)

               # Teacher-forcing decoding, same as above with
               # `decoding_strategy='train_greedy'`
               helper_1 = TrainingHelper(
                   inputs=embedder(data_batch['text_ids']),
                   sequence_length=data_batch['length'] - 1)
               outputs_1, _, _ = decoder(helper=helper_1)

               # Gumbel-softmax decoding
               helper_2 = GumbelSoftmaxEmbeddingHelper(
                   embedding=embedder,
                   start_tokens=[data.vocab.bos_token_id] * 100,
                   end_token=data.vocab.eos_token_id,
                   tau=0.1)
               outputs_2, _, sequence_length = decoder(
                   max_decoding_length=60, helper=helper_2)

        3. ``hparams["helper_train"]`` and ``hparams["helper_infer"]``:
           Specifying the helper through hyperparameters. Train and infer
           strategies are toggled based on :attr:`self.training`. Appropriate
           arguments (e.g., :attr:`inputs`, :attr:`start_tokens`, etc.) are
           selected to construct the helper. Additional arguments for the
           helper constructor can be provided either through
           :attr:`**kwargs`, or through
           ``hparams["helper_train/infer"]["kwargs"]``.

           This way is used only when both :attr:`decoding_strategy` and
           :attr:`helper` are `None`.

           Example:

           .. code-block:: python

               h = {
                   "helper_infer": {
                       "type": "GumbelSoftmaxEmbeddingHelper",
                       "kwargs": { "tau": 0.1 }
                   }
               }

               embedder = WordEmbedder(vocab_size=data.vocab.size)
               decoder = BasicRNNDecoder(vocab_size=data.vocab.size,
                                         hparams=h)

               # Gumbel-softmax decoding
               decoder.eval()  # disable dropout
               output, _, _ = decoder(
                   decoding_strategy=None,  # set to `None` explicitly
                   embedding=embedder,
                   start_tokens=[data.vocab.bos_token_id] * 100,
                   end_token=data.vocab.eos_token_id,
                   max_decoding_length=60)

        Args:
            decoding_strategy (str): A string specifying the decoding
                strategy. Different arguments are required based on the
                strategy. Ignored if :attr:`helper` is given.
            start_tokens (optional): A :tensor:`LongTensor` of shape
                ``[batch_size]``, the start tokens. Used when
                :attr:`decoding_strategy` is ``"infer_greedy"`` or
                ``"infer_sample"``, or when an `hparams`-configured helper is
                used. When used with the Texar data module, to get
                ``batch_size`` samples where ``batch_size`` changes according
                to the data module, this can be set as
                ``start_tokens=torch.full_like(batch['length'],
                bos_token_id)``.
            end_token (optional): An integer or 0D :tensor:`LongTensor`, the
                token that marks the end of decoding. Used when
                :attr:`decoding_strategy` is ``"infer_greedy"`` or
                ``"infer_sample"``, or when an `hparams`-configured helper is
                used.
            softmax_temperature (float, optional): Value to divide the logits
                by before computing the softmax. Larger values (above 1.0)
                result in more random samples. Must be > 0. If `None`, 1.0 is
                used. Used when ``decoding_strategy="infer_sample"``.
            infer_mode (optional): If not `None`, overrides the mode given by
                :attr:`self.training`.
            **kwargs: Other keyword arguments for constructing helpers
                defined by ``hparams["helper_train"]`` or
                ``hparams["helper_infer"]``.

        Returns:
            The constructed helper instance.
        """
        if decoding_strategy is not None:
            if decoding_strategy == 'train_greedy':
                helper: Helper = helpers.TrainingHelper(
                    self._input_time_major)
            elif decoding_strategy in ['infer_greedy', 'infer_sample']:
                if start_tokens is None or end_token is None:
                    raise ValueError(
                        f"When using '{decoding_strategy}' decoding strategy, "
                        f"'start_tokens' and 'end_token' must not be `None`.")
                if decoding_strategy == 'infer_greedy':
                    helper = helpers.GreedyEmbeddingHelper(
                        start_tokens, end_token)
                else:
                    helper = helpers.SampleEmbeddingHelper(
                        start_tokens, end_token, softmax_temperature)
            else:
                raise ValueError(
                    f"Unknown decoding strategy: {decoding_strategy}")
        else:
            is_training = (not infer_mode if infer_mode is not None
                           else self.training)
            if is_training:
                kwargs_ = copy.copy(self._hparams.helper_train.kwargs.todict())
                helper_type = self._hparams.helper_train.type
            else:
                kwargs_ = copy.copy(self._hparams.helper_infer.kwargs.todict())
                helper_type = self._hparams.helper_infer.type
            kwargs_.update({
                'time_major': self._input_time_major,
                'start_tokens': start_tokens,
                'end_token': end_token,
                'softmax_temperature': softmax_temperature})
            kwargs_.update(kwargs)
            helper = helpers.get_helper(helper_type, **kwargs_)
        return helper

    def _create_or_get_helper(self, infer_mode: Optional[bool] = None,
                              **kwargs) -> Helper:
        # Prefer creating a new helper when at least one kwarg is specified.
        prefer_new = (len(kwargs) > 0)
        kwargs.update(infer_mode=infer_mode)
        is_training = (not infer_mode if infer_mode is not None
                       else self.training)
        helper = self._train_helper if is_training else self._infer_helper

        if prefer_new or helper is None:
            helper = self.create_helper(**kwargs)
            if is_training and self._train_helper is None:
                self._train_helper = helper
            elif not is_training and self._infer_helper is None:
                self._infer_helper = helper
        return helper

    def set_default_train_helper(self, helper: Helper):
        r"""Set the default helper used in training mode.

        Args:
            helper: The helper to set as default training helper.
        """
        self._train_helper = helper

    def set_default_infer_helper(self, helper: Helper):
        r"""Set the default helper used in eval (inference) mode.

        Args:
            helper: The helper to set as default inference helper.
        """
        self._infer_helper = helper
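
    # Usage sketch (assumed variables `start_tokens`, `eos_id`): registering
    # default helpers once, so later calls can omit helper arguments.
    #
    #   decoder.set_default_train_helper(
    #       helpers.TrainingHelper(time_major=False))
    #   decoder.set_default_infer_helper(
    #       helpers.GreedyEmbeddingHelper(start_tokens, eos_id))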

    def dynamic_decode(self, helper: Helper, inputs: Optional[torch.Tensor],
                       sequence_length: Optional[torch.LongTensor],
                       initial_state: Optional[State],
                       max_decoding_length: Optional[int] = None,
                       impute_finished: bool = False,
                       step_hook: Optional[Callable[[int], None]] = None) \
            -> Tuple[Output, Optional[State], torch.LongTensor]:
        r"""Generic routine for dynamic decoding. Please check the
        `documentation
        <https://www.tensorflow.org/api_docs/python/tf/contrib/seq2seq/dynamic_decode>`_
        for the TensorFlow counterpart.

        Returns:
            A tuple of output, final state, and sequence lengths. Note that
            final state could be `None`, when all sequences are of zero
            length and :attr:`initial_state` is also `None`.
        """
        # Decode
        finished, step_inputs, state = self.initialize(
            helper, inputs, sequence_length, initial_state)

        zero_outputs = step_inputs.new_zeros(
            step_inputs.size(0), self.output_size)

        if max_decoding_length is not None:
            finished |= (max_decoding_length <= 0)
        sequence_lengths = torch.zeros_like(
            finished, dtype=torch.long, device=finished.device)
        time = 0

        outputs = []

        while (not torch.all(finished).item() and
               (max_decoding_length is None or time < max_decoding_length)):
            next_outputs, decoder_state = self.step(
                helper, time, step_inputs, state)

            if max_decoding_length is not None and \
                    time + 1 == max_decoding_length:
                # Maximum decoding length reached, mark all batches as
                # finished. This requires special handling because performing
                # lookup on position embeddings with `time + 1` may result in
                # IndexError.
                decoder_finished = torch.tensor(
                    1, dtype=torch_bool, device=finished.device)
                # Since `next_inputs` will not be used, simply create a null
                # tensor.
                next_inputs = torch.empty(0)
            else:
                next_inputs, decoder_finished = self.next_inputs(
                    helper, time, next_outputs)

            if getattr(self, 'tracks_own_finished', False):
                next_finished = decoder_finished
            else:
                next_finished = decoder_finished | finished

            # Zero out output values past finish
            if impute_finished:
                emit = utils.map_structure_zip(
                    lambda new, cur: torch.where(finished, cur, new),
                    (next_outputs, zero_outputs))
                next_state = utils.map_structure_zip(
                    lambda new, cur: torch.where(finished, cur, new),
                    (decoder_state, state))
            else:
                emit = next_outputs
                next_state = decoder_state

            outputs.append(emit)
            sequence_lengths.index_fill_(
                dim=0, value=time + 1,
                index=torch.nonzero((~finished).long()).flatten())
            time += 1
            finished = next_finished
            step_inputs = next_inputs
            state = next_state

            if step_hook is not None:
                step_hook(time)

        final_outputs = utils.map_structure_zip(
            lambda *tensors: torch.stack(tensors),
            outputs)  # output at each time step may be a namedtuple
        final_state = state
        final_sequence_lengths = sequence_lengths

        try:
            final_outputs, final_state = self.finalize(
                final_outputs, final_state, final_sequence_lengths)
        except NotImplementedError:
            pass

        if not self._output_time_major:
            final_outputs = utils.map_structure(
                lambda x: x.transpose(0, 1) if x.dim() >= 2 else x,
                final_outputs)

        return final_outputs, final_state, final_sequence_lengths
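
    # Call sketch (hypothetical names): teacher-forcing decoding driven by a
    # training helper; `embedded` is `[batch_size, max_time, dim]` when
    # `input_time_major` is False.
    #
    #   helper = decoder.create_helper(decoding_strategy='train_greedy')
    #   outputs, final_state, lengths = decoder.dynamic_decode(
    #       helper, inputs=embedded, sequence_length=batch_lengths,
    #       initial_state=None, max_decoding_length=60)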

    @abstractmethod
    def initialize(self, helper: Helper, inputs: Optional[torch.Tensor],
                   sequence_length: Optional[torch.LongTensor],
                   initial_state: Optional[State]) \
            -> Tuple[torch.ByteTensor, torch.Tensor, Optional[State]]:
        r"""Called before any decoding iterations. This method must compute
        initial input values and initial state.

        Args:
            helper: The :class:`~texar.torch.modules.Helper` instance to use.
            inputs (optional): A (structure of) input tensors.
            sequence_length (optional): A :tensor:`LongTensor` representing
                lengths of each sequence.
            initial_state: A possibly nested structure of tensors indicating
                the initial decoder state.

        Returns:
            A tuple ``(finished, initial_inputs, initial_state)`` representing
            initial values of ``finished`` flags, inputs, and state.
        """
        raise NotImplementedError

    @abstractmethod
    def step(self, helper: Helper, time: int, inputs: torch.Tensor,
             state: Optional[State]) -> Tuple[Output, State]:
        r"""Compute the output and the state at the current time step.
        Called per step of decoding (but only once for dynamic decoding).

        Args:
            helper: The :class:`~texar.torch.modules.Helper` instance to use.
            time (int): Current step number.
            inputs: Inputs for this time step.
            state: Decoder state from the previous time step.

        Returns:
            A tuple ``(outputs, next_state)``.

            - ``outputs`` is an object containing the decoder output.
            - ``next_state`` is the decoder state for the next time step.
        """
        raise NotImplementedError

    @abstractmethod
    def next_inputs(self, helper: Helper, time: int,
                    outputs: Output) -> Tuple[torch.Tensor, torch.ByteTensor]:
        r"""Compute the input for the next time step. Called per step of
        decoding (but only once for dynamic decoding).

        Args:
            helper: The :class:`~texar.torch.modules.Helper` instance to use.
            time (int): Current step number.
            outputs: An object containing the decoder output.

        Returns:
            A tuple ``(next_inputs, finished)``.

            - ``next_inputs`` is the tensor that should be used as input for
              the next step.
            - ``finished`` is a :tensor:`ByteTensor` telling whether the
              sequence is complete, for each sequence in the batch.
        """
        raise NotImplementedError
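
    # Control-flow sketch: how the three abstract methods compose inside
    # `dynamic_decode` above (a condensed restatement, not new behavior).
    #
    #   finished, inputs, state = self.initialize(helper, inputs,
    #                                             sequence_length, state)
    #   while not all finished:
    #       outputs, state = self.step(helper, time, inputs, state)
    #       inputs, finished = self.next_inputs(helper, time, outputs)
    #   outputs, state = self.finalize(outputs, state, sequence_lengths)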

    # TODO: Remove these once pylint supports function stubs.
    # pylint: disable=missing-docstring,unused-argument,no-self-use
    # pylint: disable=function-redefined

    @overload
    def finalize(self, outputs: Output, final_state: State,
                 sequence_lengths: torch.LongTensor) -> Tuple[Output, State]:
        ...

    @overload
    def finalize(self, outputs: Output, final_state: Optional[State],
                 sequence_lengths: torch.LongTensor) \
            -> Tuple[Output, Optional[State]]:
        ...

    def finalize(self, outputs, final_state, sequence_lengths):
        r"""Called after all decoding iterations have finished.

        Args:
            outputs: Outputs at each time step.
            final_state: The RNNCell state after the last time step.
            sequence_lengths: Sequence lengths for each sequence in batch.

        Returns:
            A tuple ``(outputs, final_state)``.

            - ``outputs`` is an object containing the decoder output.
            - ``final_state`` is the final decoder state.
        """
        return outputs, final_state

    # pylint: enable=missing-docstring,unused-argument,no-self-use
    # pylint: enable=function-redefined

    @property
    def vocab_size(self):
        r"""The vocabulary size.
        """
        return self._vocab_size

    @property
    def output_layer(self):
        r"""The output layer.
        """
        return self._output_layer