
# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
RoBERTa classifier.
"""
from typing import Optional, Tuple, Union

import torch

from texar.torch.modules.encoders.roberta_encoder import RoBERTaEncoder
from texar.torch.modules.classifiers.bert_classifier import BERTClassifier
from texar.torch.modules.pretrained.roberta import \
    PretrainedRoBERTaMixin

__all__ = [
    "RoBERTaClassifier"
]


class RoBERTaClassifier(PretrainedRoBERTaMixin, BERTClassifier):
    r"""Classifier based on RoBERTa modules. Please see
    :class:`~texar.torch.modules.PretrainedRoBERTaMixin` for a brief
    description of RoBERTa.

    This is a combination of the
    :class:`~texar.torch.modules.RoBERTaEncoder` with a classification
    layer. Both step-wise classification and sequence-level classification
    are supported, specified in :attr:`hparams`.

    Arguments are the same as in
    :class:`~texar.torch.modules.RoBERTaEncoder`.

    Args:
        pretrained_model_name (optional): a `str`, the name of
            pre-trained model (e.g., ``roberta-base``). Please refer to
            :class:`~texar.torch.modules.PretrainedRoBERTaMixin` for
            all supported models. If `None`, the model name in
            :attr:`hparams` is used.
        cache_dir (optional): the path to a folder in which the
            pre-trained models will be cached. If `None` (default),
            a default directory (``texar_data`` folder under user's home
            directory) will be used.
        hparams (dict or HParams, optional): Hyperparameters. Missing
            hyperparameters will be set to default values. See
            :meth:`default_hparams` for the hyperparameter structure
            and default values.

    .. document private functions
    """

    _ENCODER_CLASS = RoBERTaEncoder

    @staticmethod
    def default_hparams():
        r"""Returns a dictionary of hyperparameters with default values.

        .. code-block:: python

            {
                # (1) Same hyperparameters as in RoBERTaEncoder
                ...
                # (2) Additional hyperparameters
                "num_classes": 2,
                "logit_layer_kwargs": None,
                "clas_strategy": "cls_time",
                "max_seq_length": None,
                "dropout": 0.1,
                "name": "roberta_classifier"
            }

        Here:

        1. Same hyperparameters as in
           :class:`~texar.torch.modules.RoBERTaEncoder`. See the
           :meth:`~texar.torch.modules.RoBERTaEncoder.default_hparams`.
           An instance of RoBERTaEncoder is created for feature extraction.

        2. Additional hyperparameters:

            `"num_classes"`: int
                Number of classes:

                - If **> 0**, an additional `Linear` layer is appended to
                  the encoder to compute the logits over classes.
                - If **<= 0**, no dense layer is appended. The number of
                  classes is assumed to be the final dense layer size of
                  the encoder.

            `"logit_layer_kwargs"`: dict
                Keyword arguments for the logit Dense layer constructor,
                except for argument "units" which is set to `num_classes`.
                Ignored if no extra logit layer is appended.

            `"clas_strategy"`: str
                The classification strategy, one of:

                - **cls_time**: Sequence-level classification based on the
                  output of the first time step (which is the `CLS` token).
                  Each sequence has a class.
                - **all_time**: Sequence-level classification based on the
                  output of all time steps. Each sequence has a class.
                - **time_wise**: Step-wise classification, i.e., make
                  classification for each time step based on its output.

            `"max_seq_length"`: int, optional
                Maximum possible length of input sequences. Required if
                `clas_strategy` is `all_time`.

            `"dropout"`: float
                The dropout rate of the RoBERTa encoder output.

            `"name"`: str
                Name of the classifier.
        """
        hparams = RoBERTaEncoder.default_hparams()
        hparams.update({
            "num_classes": 2,
            "logit_layer_kwargs": None,
            "clas_strategy": "cls_time",
            "max_seq_length": None,
            "dropout": 0.1,
            "name": "roberta_classifier"
        })
        return hparams
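
    # A minimal sketch of overriding the additional hyperparameters above,
    # e.g. for step-wise classification; the class count below is a
    # hypothetical illustration value:
    #
    #     clf = RoBERTaClassifier(hparams={
    #         "num_classes": 5,
    #         "clas_strategy": "time_wise",
    #     })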

    def forward(self,  # type: ignore
                inputs: Union[torch.Tensor, torch.LongTensor],
                sequence_length: Optional[torch.LongTensor] = None) \
            -> Tuple[torch.Tensor, torch.LongTensor]:
        r"""Feeds the inputs through the network and makes classification.

        The arguments are the same as in
        :class:`~texar.torch.modules.RoBERTaEncoder`.

        Args:
            inputs: Either a **2D Tensor** of shape `[batch_size, max_time]`,
                containing the ids of tokens in input sequences, or a
                **3D Tensor** of shape `[batch_size, max_time, vocab_size]`,
                containing soft token ids (i.e., weights or probabilities)
                used to mix the embedding vectors.
            sequence_length (optional): A 1D Tensor of shape `[batch_size]`.
                Input tokens beyond respective sequence lengths are masked
                out automatically.

        Returns:
            A tuple `(logits, preds)`, containing the logits over classes and
            the predictions, respectively.

            - If ``clas_strategy`` is ``cls_time`` or ``all_time``:

              - If ``num_classes`` == 1, ``logits`` and ``preds`` are both of
                shape ``[batch_size]``.
              - If ``num_classes`` > 1, ``logits`` is of shape
                ``[batch_size, num_classes]`` and ``preds`` is of shape
                ``[batch_size]``.

            - If ``clas_strategy`` is ``time_wise``:

              - If ``num_classes`` == 1, ``logits`` and ``preds`` are both of
                shape ``[batch_size, max_time]``.
              - If ``num_classes`` > 1, ``logits`` is of shape
                ``[batch_size, max_time, num_classes]`` and ``preds`` is of
                shape ``[batch_size, max_time]``.
        """
        logits, preds = super().forward(inputs=inputs,
                                        sequence_length=sequence_length,
                                        segment_ids=None)
        return logits, preds
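

if __name__ == "__main__":
    # A minimal usage sketch: build a sequence-level classifier (default
    # "cls_time" strategy, 2 classes) from the pre-trained ``roberta-base``
    # weights and classify a dummy batch of token ids. The batch size,
    # sequence lengths, and token ids below are arbitrary illustration
    # values; output shapes follow the ``forward`` docstring above.
    classifier = RoBERTaClassifier(pretrained_model_name="roberta-base")
    inputs = torch.randint(0, 100, (2, 16))   # [batch_size, max_time] token ids
    sequence_length = torch.tensor([16, 12])  # valid length of each sequence
    logits, preds = classifier(inputs, sequence_length)
    print(logits.shape)  # torch.Size([2, 2]) -- [batch_size, num_classes]
    print(preds.shape)   # torch.Size([2])    -- [batch_size]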