
# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utils of XLNet Modules.
"""

import json
import os
from abc import ABC
from typing import Any, Callable, Dict, Optional, Union

import torch
from torch import nn

from texar.torch.modules.pretrained.pretrained_base import PretrainedMixin
from texar.torch.modules.pretrained.xlnet_utils import (
    PositionWiseFF, RelativeMultiheadAttention, init_weights)

__all__ = [
    "PretrainedXLNetMixin",
]

_XLNET_PATH = "https://storage.googleapis.com/xlnet/released_models/"


class PretrainedXLNetMixin(PretrainedMixin, ABC):
    r"""A mixin class to support loading pre-trained checkpoints for modules
    that implement the XLNet model.

    The XLNet model was proposed in
    `XLNet: Generalized Autoregressive Pretraining for Language Understanding`_
    by `Yang et al.` It is based on the Transformer-XL model, pre-trained on a
    large corpus using a language modeling objective that considers all
    permutations of the input sentence.

    The available XLNet models are as follows:

      * ``xlnet-base-cased``: 12-layer, 768-hidden, 12-heads. This model is
        trained on full data (different from the one in the paper).
      * ``xlnet-large-cased``: 24-layer, 1024-hidden, 16-heads.

    We provide the following XLNet classes:

      * :class:`~texar.torch.modules.XLNetEncoder` for text encoding.
      * :class:`~texar.torch.modules.XLNetDecoder` for text generation and
        decoding.
      * :class:`~texar.torch.modules.XLNetClassifier` for text classification
        and sequence tagging.
      * :class:`~texar.torch.modules.XLNetRegressor` for text regression.

    .. _`XLNet: Generalized Autoregressive Pretraining for Language Understanding`:
        http://arxiv.org/abs/1906.08237
    """

    _MODEL_NAME = "XLNet"
    _MODEL2URL = {
        'xlnet-base-cased':
            _XLNET_PATH + "cased_L-12_H-768_A-12.zip",
        'xlnet-large-cased':
            _XLNET_PATH + "cased_L-24_H-1024_A-16.zip",
    }

    def reset_parameters(self):
        self.apply(init_weights)
        if not self._hparams.untie_r:
            nn.init.normal_(self.r_w_bias, 0.0, 0.02)
            nn.init.normal_(self.r_r_bias, 0.0, 0.02)
            if self._hparams.use_segments:
                nn.init.normal_(self.r_s_bias, 0.0, 0.02)

    @classmethod
    def _transform_config(cls, pretrained_model_name: str,
                          cache_dir: str) -> Dict[str, Any]:
        info = list(os.walk(cache_dir))
        root, _, files = info[0]
        config_path = None
        for file in files:
            if file.endswith('config.json'):
                config_path = os.path.join(root, file)
        if config_path is None:
            raise ValueError(f"Cannot find the config file in {cache_dir}")

        with open(config_path) as f:
            config_ckpt = json.loads(f.read())

        configs = {
            "head_dim": config_ckpt["d_head"],
            "ffn_inner_dim": config_ckpt["d_inner"],
            "hidden_dim": config_ckpt["d_model"],
            "activation": config_ckpt["ff_activation"],
            "num_heads": config_ckpt["n_head"],
            "num_layers": config_ckpt["n_layer"],
            "vocab_size": config_ckpt["n_token"],
            "untie_r": config_ckpt["untie_r"],
        }

        return configs

    def _init_from_checkpoint(self, pretrained_model_name: str,
                              cache_dir: str, **kwargs):
        # remember to call .contiguous after trans_fn
        try:
            import numpy as np
            import tensorflow as tf
        except ImportError:
            print("Loading TensorFlow models in PyTorch requires installing "
                  "TensorFlow. Please see https://www.tensorflow.org/install/ "
                  "for installation instructions.")
            raise

        ckpt = tf.train.load_checkpoint(
            os.path.join(cache_dir, 'xlnet_model.ckpt'))
        from_params: Dict[str, np.ndarray] = {
            key: ckpt.get_tensor(key)
            for key in ckpt.get_variable_to_shape_map().keys()}
        del from_params["global_step"]  # useless variable
        to_params: Dict[str, nn.Parameter] = dict(self.named_parameters())

        def get_weight(name: str) -> torch.Tensor:
            weight = from_params["model/" + name]
            del from_params["model/" + name]
            return torch.from_numpy(weight)

        TransFn = Callable[[torch.Tensor], torch.Tensor]

        def assign(param: nn.Parameter, weight: Union[str, torch.Tensor],
                   trans_fn: Optional[TransFn] = None,
                   allow_fail: bool = False):
            param_key = next(k for k, v in to_params.items() if v is param)
            # Delete regardless of whether weight exists.
            del to_params[param_key]
            if isinstance(weight, str):
                try:
                    weight = get_weight(weight)
                except KeyError:
                    if allow_fail:
                        print(f"Weight {weight} not found in checkpoint")
                        return
                    else:
                        raise
            if trans_fn is not None:
                weight = trans_fn(weight).contiguous()
            if param.size() != weight.size():
                raise ValueError(f"Expected size {param.size()}, "
                                 f"actual size {weight.size()}")
            param.data = weight

        def assign_linear(linear: nn.Linear, prefix: str):
            trans_fn = lambda p: p.view(p.size(0), -1).t()
            assign(linear.weight, prefix + "kernel", trans_fn)
            if linear.bias is not None:
                assign(linear.bias, prefix + "bias")

        def assign_layer_norm(layer_norm: nn.LayerNorm, prefix: str):
            assign(layer_norm.weight, prefix + "LayerNorm/gamma")
            assign(layer_norm.bias, prefix + "LayerNorm/beta")

        def load_xlnet_model(xlnet):
            n_layers = len(xlnet.attn_layers)

            for bias_name in ['r_r_bias', 'r_w_bias', 'r_s_bias']:
                weight = get_weight("transformer/" + bias_name)
                if xlnet.hparams.untie_r:
                    for idx in range(n_layers):
                        layer: RelativeMultiheadAttention
                        layer = xlnet.attn_layers[idx]
                        assign(getattr(layer, bias_name), weight[idx])
                else:
                    assign(getattr(xlnet, bias_name), weight)

            assign(xlnet.word_embed.weight,
                   "transformer/word_embedding/lookup_table")

            for idx in range(n_layers):
                layer: RelativeMultiheadAttention = xlnet.attn_layers[idx]
                prefix = f"transformer/layer_{idx}/rel_attn/"
                qkv_weights = [get_weight(prefix + f"{part}/kernel")
                               for part in "qkv"]
                assign(layer.head_projection.weight, torch.cat([
                    p.view(p.size(0), -1) for p in qkv_weights], dim=1).t())
                assign_linear(layer.pos_projection, prefix + "r/")
                assign(layer.output_projection.weight,  # DO NOT TRANSPOSE!!!!
                       prefix + "o/kernel", lambda p: p.view(p.size(0), -1))
                assign_layer_norm(layer.layer_norm, prefix)

            for idx in range(n_layers):
                layer: PositionWiseFF = xlnet.ff_layers[idx]
                prefix = f"transformer/layer_{idx}/ff/"
                for linear_idx in range(1, 2 + 1):
                    linear_prefix = f"{prefix}layer_{linear_idx}/"
                    linear_layer: nn.Linear = getattr(
                        layer, f"linear{linear_idx}")
                    assign_linear(linear_layer, linear_prefix)
                assign_layer_norm(layer.layer_norm, prefix)

            seg_embeds = [
                p.squeeze(0) for p in torch.chunk(
                    get_weight("transformer/seg_embed"), n_layers, dim=0)]
            for idx in range(n_layers):
                assign(xlnet.attn_layers[idx].segment_embed, seg_embeds[idx])

            if hasattr(xlnet, 'mask_emb') and hasattr(xlnet, 'lm_bias'):
                assign(xlnet.mask_emb, "transformer/mask_emb/mask_emb")
                assign(xlnet.lm_bias, "lm_loss/bias")

        load_xlnet_model(self)

        if len(from_params) > 0:
            print(f"WARNING: Certain weights from checkpoint are not loaded: "
                  f"{list(from_params.keys())}")

        filtered_to_params = [k for k in to_params if k.startswith("xlnet")]
        if len(filtered_to_params) > 0:
            print(f"WARNING: Certain parameters are not initialized: "
                  f"{list(filtered_to_params)}")