Source code for texar.torch.modules.pretrained.bert

# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utils of BERT Modules.
"""

from typing import Any, Dict

import json
import os

from abc import ABC

import torch

from texar.torch.modules.pretrained.pretrained_base import PretrainedMixin

__all__ = [
    "PretrainedBERTMixin",
]

_BERT_PATH = "https://storage.googleapis.com/bert_models/"
_BIOBERT_PATH = "https://github.com/naver/biobert-pretrained/releases/download/"
_SCIBERT_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-research/" \
                "scibert/tensorflow_models/"
_SPANBERT_PATH = "https://dl.fbaipublicfiles.com/fairseq/models/"


class PretrainedBERTMixin(PretrainedMixin, ABC):
    r"""A mixin class to support loading pre-trained checkpoints for modules
    that implement the BERT model.

    Both standard BERT models and many domain-specific BERT-based models are
    supported. You can specify the :attr:`pretrained_model_name` argument to
    pick which pre-trained BERT model to use. All available categories of
    pre-trained models (and names) include:

      * **Standard BERT**: proposed in (`Devlin et al`. 2018)
        `BERT: Pre-training of Deep Bidirectional Transformers for Language
        Understanding`_. A bidirectional Transformer language model
        pre-trained on large text corpora. Available model names include:

        * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads,
          110M parameters.
        * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads,
          340M parameters.
        * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads,
          110M parameters.
        * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads,
          340M parameters.
        * ``bert-base-multilingual-uncased``: 102 languages, 12-layer,
          768-hidden, 12-heads, 110M parameters.
        * ``bert-base-multilingual-cased``: 104 languages, 12-layer,
          768-hidden, 12-heads, 110M parameters.
        * ``bert-base-chinese``: Chinese Simplified and Traditional,
          12-layer, 768-hidden, 12-heads, 110M parameters.

      * **BioBERT**: proposed in (`Lee et al`. 2019)
        `BioBERT: a pre-trained biomedical language representation model for
        biomedical text mining`_. A domain-specific language representation
        model pre-trained on large-scale biomedical corpora. Based on the
        BERT architecture, BioBERT effectively transfers the knowledge from a
        large amount of biomedical texts to biomedical text mining models
        with minimal task-specific architecture modifications. Available
        model names include:

        * ``biobert-v1.0-pmc``: BioBERT v1.0 (+ PMC 270K) - based on
          BERT-base-Cased (same vocabulary).
        * ``biobert-v1.0-pubmed-pmc``: BioBERT v1.0 (+ PubMed 200K + PMC
          270K) - based on BERT-base-Cased (same vocabulary).
        * ``biobert-v1.0-pubmed``: BioBERT v1.0 (+ PubMed 200K) - based on
          BERT-base-Cased (same vocabulary).
        * ``biobert-v1.1-pubmed``: BioBERT v1.1 (+ PubMed 1M) - based on
          BERT-base-Cased (same vocabulary).

      * **SciBERT**: proposed in (`Beltagy et al`. 2019)
        `SciBERT: A Pretrained Language Model for Scientific Text`_. A BERT
        model trained on scientific text. SciBERT leverages unsupervised
        pre-training on a large multi-domain corpus of scientific
        publications to improve performance on downstream scientific NLP
        tasks. Available model names include:

        * ``scibert-scivocab-uncased``: Uncased version of the model trained
          on its own vocabulary.
        * ``scibert-scivocab-cased``: Cased version of the model trained on
          its own vocabulary.
        * ``scibert-basevocab-uncased``: Uncased version of the model trained
          on the original BERT vocabulary.
        * ``scibert-basevocab-cased``: Cased version of the model trained on
          the original BERT vocabulary.

      * **SpanBERT**: proposed in (`Joshi et al`. 2019)
        `SpanBERT: Improving Pre-training by Representing and Predicting
        Spans`_. As a variant of the standard BERT model, SpanBERT extends
        BERT by (1) masking contiguous random spans rather than random
        tokens, and (2) training the span boundary representations to
        predict the entire content of the masked span, without relying on
        the individual token representations within it. Unlike standard
        BERT, the SpanBERT model does not use segment embeddings. Available
        model names include:

        * ``spanbert-base-cased``: SpanBERT using the BERT-base architecture,
          12-layer, 768-hidden, 12-heads, 110M parameters.
        * ``spanbert-large-cased``: SpanBERT using the BERT-large
          architecture, 24-layer, 1024-hidden, 16-heads, 340M parameters.

    We provide the following BERT classes:

      * :class:`~texar.torch.modules.BERTEncoder` for text encoding.
      * :class:`~texar.torch.modules.BERTClassifier` for text classification
        and sequence tagging.
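
    For example, a pre-trained model can be loaded simply by name (a minimal
    sketch; the exact forward signature and return values are documented in
    :class:`~texar.torch.modules.BERTEncoder`, and the token ids below are
    placeholders):

    .. code-block:: python

        import torch
        from texar.torch.modules import BERTEncoder

        # The checkpoint is downloaded and cached on first use.
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        token_ids = torch.randint(30522, (2, 16))  # [batch_size, seq_len]
        outputs, pooled_output = encoder(inputs=token_ids)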

    .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
        https://arxiv.org/abs/1810.04805

    .. _`BioBERT: a pre-trained biomedical language representation model for biomedical text mining`:
        https://arxiv.org/abs/1901.08746

    .. _`SciBERT: A Pretrained Language Model for Scientific Text`:
        https://arxiv.org/abs/1903.10676

    .. _`SpanBERT: Improving Pre-training by Representing and Predicting Spans`:
        https://arxiv.org/abs/1907.10529
    """

    _MODEL_NAME = "BERT"
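
    # Download URL for each supported model, and the checkpoint file to load
    # from inside the downloaded (and extracted) archive.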
    _MODEL2URL = {
        # Standard BERT
        'bert-base-uncased':
            _BERT_PATH + "2018_10_18/uncased_L-12_H-768_A-12.zip",
        'bert-large-uncased':
            _BERT_PATH + "2018_10_18/uncased_L-24_H-1024_A-16.zip",
        'bert-base-cased':
            _BERT_PATH + "2018_10_18/cased_L-12_H-768_A-12.zip",
        'bert-large-cased':
            _BERT_PATH + "2018_10_18/cased_L-24_H-1024_A-16.zip",
        'bert-base-multilingual-uncased':
            _BERT_PATH + "2018_11_03/multilingual_L-12_H-768_A-12.zip",
        'bert-base-multilingual-cased':
            _BERT_PATH + "2018_11_23/multi_cased_L-12_H-768_A-12.zip",
        'bert-base-chinese':
            _BERT_PATH + "2018_11_03/chinese_L-12_H-768_A-12.zip",

        # BioBERT
        'biobert-v1.0-pmc':
            _BIOBERT_PATH + 'v1.0-pmc/biobert_v1.0_pmc.tar.gz',
        'biobert-v1.0-pubmed-pmc':
            _BIOBERT_PATH + 'v1.0-pubmed-pmc/biobert_v1.0_pubmed_pmc.tar.gz',
        'biobert-v1.0-pubmed':
            _BIOBERT_PATH + 'v1.0-pubmed/biobert_v1.0_pubmed.tar.gz',
        'biobert-v1.1-pubmed':
            _BIOBERT_PATH + 'v1.1-pubmed/biobert_v1.1_pubmed.tar.gz',

        # SciBERT
        'scibert-scivocab-uncased':
            _SCIBERT_PATH + 'scibert_scivocab_uncased.tar.gz',
        'scibert-scivocab-cased':
            _SCIBERT_PATH + 'scibert_scivocab_cased.tar.gz',
        'scibert-basevocab-uncased':
            _SCIBERT_PATH + 'scibert_basevocab_uncased.tar.gz',
        'scibert-basevocab-cased':
            _SCIBERT_PATH + 'scibert_basevocab_cased.tar.gz',

        # SpanBERT
        'spanbert-base-cased': _SPANBERT_PATH + "spanbert_hf_base.tar.gz",
        'spanbert-large-cased': _SPANBERT_PATH + "spanbert_hf.tar.gz",
    }
    _MODEL2CKPT = {
        # Standard BERT
        'bert-base-uncased': 'bert_model.ckpt',
        'bert-large-uncased': 'bert_model.ckpt',
        'bert-base-cased': 'bert_model.ckpt',
        'bert-large-cased': 'bert_model.ckpt',
        'bert-base-multilingual-uncased': 'bert_model.ckpt',
        'bert-base-multilingual-cased': 'bert_model.ckpt',
        'bert-base-chinese': 'bert_model.ckpt',

        # BioBERT
        'biobert-v1.0-pmc': 'biobert_model.ckpt',
        'biobert-v1.0-pubmed-pmc': 'biobert_model.ckpt',
        'biobert-v1.0-pubmed': 'biobert_model.ckpt',
        'biobert-v1.1-pubmed': 'model.ckpt-1000000',

        # SciBERT
        'scibert-scivocab-uncased': 'bert_model.ckpt',
        'scibert-scivocab-cased': 'bert_model.ckpt',
        'scibert-basevocab-uncased': 'bert_model.ckpt',
        'scibert-basevocab-cased': 'bert_model.ckpt',

        # SpanBERT
        'spanbert-base-cased': 'pytorch_model.bin',
        'spanbert-large-cased': 'pytorch_model.bin',
    }

    @classmethod
    def _transform_config(cls, pretrained_model_name: str,
                          cache_dir: str) -> Dict[str, Any]:
        info = list(os.walk(cache_dir))
        root, _, files = info[0]
        config_path = None
        for file in files:
            if file in ('bert_config.json', 'config.json'):
                config_path = os.path.join(root, file)
        if config_path is None:
            raise ValueError(f"Cannot find the config file in {cache_dir}")

        with open(config_path) as f:
            config_ckpt = json.loads(f.read())

        hidden_dim = config_ckpt['hidden_size']
        vocab_size = config_ckpt['vocab_size']
        if not pretrained_model_name.startswith('spanbert'):
            type_vocab_size = config_ckpt['type_vocab_size']
        position_size = config_ckpt['max_position_embeddings']
        embedding_dropout = config_ckpt['hidden_dropout_prob']
        num_blocks = config_ckpt['num_hidden_layers']
        num_heads = config_ckpt['num_attention_heads']
        dropout_rate = config_ckpt['attention_probs_dropout_prob']
        residual_dropout = config_ckpt['hidden_dropout_prob']
        intermediate_size = config_ckpt['intermediate_size']
        hidden_act = config_ckpt['hidden_act']

        configs = {
            'hidden_size': hidden_dim,
            'embed': {
                'name': 'word_embeddings',
                'dim': hidden_dim
            },
            'vocab_size': vocab_size,
            'position_embed': {
                'name': 'position_embeddings',
                'dim': hidden_dim
            },
            'position_size': position_size,
            'encoder': {
                'name': 'encoder',
                'embedding_dropout': embedding_dropout,
                'num_blocks': num_blocks,
                'multihead_attention': {
                    'use_bias': True,
                    'num_units': hidden_dim,
                    'num_heads': num_heads,
                    'output_dim': hidden_dim,
                    'dropout_rate': dropout_rate,
                    'name': 'self'
                },
                'residual_dropout': residual_dropout,
                'dim': hidden_dim,
                'eps': 1e-12,
                'use_bert_config': True,
                'poswise_feedforward': {
                    "layers": [{
                        'type': 'Linear',
                        'kwargs': {
                            'in_features': hidden_dim,
                            'out_features': intermediate_size,
                            'bias': True,
                        }
                    }, {
                        'type': 'Bert' + hidden_act.upper()
                    }, {
                        'type': 'Linear',
                        'kwargs': {
                            'in_features': intermediate_size,
                            'out_features': hidden_dim,
                            'bias': True,
                        }
                    }],
                },
            }
        }

        if not pretrained_model_name.startswith('spanbert'):
            configs.update({
                'segment_embed': {
                    'name': 'token_type_embeddings',
                    'dim': hidden_dim},
                'type_vocab_size': type_vocab_size,
            })

        return configs
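
    # For illustration, the ``bert_config.json`` shipped with
    # ``bert-base-uncased`` contains, among other fields,
    #     "hidden_size": 768, "num_hidden_layers": 12,
    #     "num_attention_heads": 12, "intermediate_size": 3072,
    #     "vocab_size": 30522, "max_position_embeddings": 512,
    #     "type_vocab_size": 2, "hidden_act": "gelu",
    #     "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1,
    # which `_transform_config` maps to ``hidden_size=768``,
    # ``encoder.num_blocks=12``, ``encoder.multihead_attention.num_heads=12``,
    # and a 768 -> 3072 -> 768 ``poswise_feedforward`` block with a
    # ``BertGELU`` activation.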

    def _init_from_checkpoint(self, pretrained_model_name: str,
                              cache_dir: str, **kwargs):
        if pretrained_model_name.startswith('spanbert'):
            global_tensor_map = {
                'bert.embeddings.word_embeddings.weight':
                    'word_embedder._embedding',
                'bert.embeddings.position_embeddings.weight':
                    'position_embedder._embedding',
                'bert.embeddings.LayerNorm.weight':
                    'encoder.input_normalizer.weight',
                'bert.embeddings.LayerNorm.bias':
                    'encoder.input_normalizer.bias',
            }
            attention_tensor_map = {
                "attention.self.key.bias": "self_attns.{}.K_dense.bias",
                "attention.self.query.bias": "self_attns.{}.Q_dense.bias",
                "attention.self.value.bias": "self_attns.{}.V_dense.bias",
                "attention.output.dense.bias": "self_attns.{}.O_dense.bias",
                "attention.output.LayerNorm.weight":
                    "poswise_layer_norm.{}.weight",
                "attention.output.LayerNorm.bias":
                    "poswise_layer_norm.{}.bias",
                "intermediate.dense.bias":
                    "poswise_networks.{}._layers.0.bias",
                "output.dense.bias": "poswise_networks.{}._layers.2.bias",
                "output.LayerNorm.weight": "output_layer_norm.{}.weight",
                "output.LayerNorm.bias": "output_layer_norm.{}.bias",
                "attention.self.key.weight": "self_attns.{}.K_dense.weight",
                "attention.self.query.weight": "self_attns.{}.Q_dense.weight",
                "attention.self.value.weight": "self_attns.{}.V_dense.weight",
                "attention.output.dense.weight":
                    "self_attns.{}.O_dense.weight",
                "intermediate.dense.weight":
                    "poswise_networks.{}._layers.0.weight",
                "output.dense.weight": "poswise_networks.{}._layers.2.weight",
            }

            checkpoint_path = os.path.abspath(os.path.join(
                cache_dir, self._MODEL2CKPT[pretrained_model_name]))
            device = next(self.parameters()).device
            # SpanBERT checkpoints are PyTorch state dicts
            # (``pytorch_model.bin``), so they can be loaded directly.
            params = torch.load(checkpoint_path, map_location=device)

            for name, tensor in params.items():
                if name in global_tensor_map:
                    v_name = global_tensor_map[name]
                    pointer = self._name_to_variable(v_name)
                    assert pointer.shape == tensor.shape
                    pointer.data = tensor.data.type(pointer.dtype)
                elif name.startswith('bert.encoder.layer.'):
                    # Strip the "bert.encoder.layer." prefix to obtain
                    # "<layer number>.<parameter name>".
                    name = name[len('bert.encoder.layer.'):]
                    layer_num, layer_name = name.split('.', 1)
                    if layer_name in attention_tensor_map:
                        v_name = attention_tensor_map[layer_name]
                        pointer = self._name_to_variable(
                            'encoder.' + v_name.format(layer_num))
                        assert pointer.shape == tensor.shape
                        pointer.data = tensor.data.type(pointer.dtype)
            return

        try:
            import numpy as np
            import tensorflow as tf
        except ImportError:
            print("Loading TensorFlow models in PyTorch requires installing "
                  "TensorFlow. Please see https://www.tensorflow.org/install/ "
                  "for installation instructions.")
            raise

        global_tensor_map = {
            'bert/embeddings/word_embeddings': 'word_embedder._embedding',
            'bert/embeddings/token_type_embeddings':
                'segment_embedder._embedding',
            'bert/embeddings/position_embeddings':
                'position_embedder._embedding',
            'bert/embeddings/LayerNorm/beta':
                'encoder.input_normalizer.bias',
            'bert/embeddings/LayerNorm/gamma':
                'encoder.input_normalizer.weight',
        }
        layer_tensor_map = {
            "attention/self/key/bias": "self_attns.{}.K_dense.bias",
            "attention/self/query/bias": "self_attns.{}.Q_dense.bias",
            "attention/self/value/bias": "self_attns.{}.V_dense.bias",
            "attention/output/dense/bias": "self_attns.{}.O_dense.bias",
            "attention/output/LayerNorm/gamma":
                "poswise_layer_norm.{}.weight",
            "attention/output/LayerNorm/beta": "poswise_layer_norm.{}.bias",
            "intermediate/dense/bias": "poswise_networks.{}._layers.0.bias",
            "output/dense/bias": "poswise_networks.{}._layers.2.bias",
            "output/LayerNorm/gamma": "output_layer_norm.{}.weight",
            "output/LayerNorm/beta": "output_layer_norm.{}.bias",
        }
        layer_transpose_map = {
            "attention/self/key/kernel": "self_attns.{}.K_dense.weight",
            "attention/self/query/kernel": "self_attns.{}.Q_dense.weight",
            "attention/self/value/kernel": "self_attns.{}.V_dense.weight",
            "attention/output/dense/kernel": "self_attns.{}.O_dense.weight",
            "intermediate/dense/kernel":
                "poswise_networks.{}._layers.0.weight",
            "output/dense/kernel": "poswise_networks.{}._layers.2.weight",
        }
        pooler_map = {
            'bert/pooler/dense/bias': 'pooler.0.bias',
            'bert/pooler/dense/kernel': 'pooler.0.weight'
        }

        tf_path = os.path.abspath(os.path.join(
            cache_dir, self._MODEL2CKPT[pretrained_model_name]))
        # Load weights from the TF checkpoint.
        init_vars = tf.train.list_variables(tf_path)
        tfnames, arrays = [], []
        for name, _ in init_vars:
            array = tf.train.load_variable(tf_path, name)
            tfnames.append(name)
            arrays.append(array.squeeze())

        py_prefix = "encoder."

        idx = 0
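        # The TF checkpoint stores variables under names such as
        #   bert/embeddings/word_embeddings
        #   bert/encoder/layer_0/attention/self/query/kernel
        #   bert/pooler/dense/kernel
        # The tables above map them onto the corresponding Texar parameters;
        # dense kernels are transposed because TF stores them as
        # [in_features, out_features] while ``torch.nn.Linear`` keeps
        # [out_features, in_features].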
        for name, array in zip(tfnames, arrays):
            if name.startswith('cls') or name == 'global_step' or \
                    name.endswith('adam_m') or name.endswith('adam_v'):
                # Skip variables that are not part of the encoder: the
                # pre-training heads ("cls/..."), the 'global_step' counter,
                # and the Adam optimizer state.
                continue

            if name in global_tensor_map:
                v_name = global_tensor_map[name]
                pointer = self._name_to_variable(v_name)
                assert pointer.shape == array.shape
                pointer.data = torch.from_numpy(array)
                idx += 1
            elif name in pooler_map:
                pointer = self._name_to_variable(pooler_map[name])
                if name.endswith('bias'):
                    assert pointer.shape == array.shape
                    pointer.data = torch.from_numpy(array)
                    idx += 1
                else:
                    array_t = np.transpose(array)
                    assert pointer.shape == array_t.shape
                    pointer.data = torch.from_numpy(array_t)
                    idx += 1
            else:
                # Per-layer variable, e.g.
                # "bert/encoder/layer_0/attention/self/query/kernel":
                # extract the layer number from the "layer_*" component.
                name_tmp = name.split("/")
                layer_no = name_tmp[2][6:]
                name_tmp = "/".join(name_tmp[3:])
                if name_tmp in layer_tensor_map:
                    v_name = layer_tensor_map[name_tmp].format(layer_no)
                    pointer = self._name_to_variable(py_prefix + v_name)
                    assert pointer.shape == array.shape
                    pointer.data = torch.from_numpy(array)
                elif name_tmp in layer_transpose_map:
                    v_name = layer_transpose_map[name_tmp].format(layer_no)
                    pointer = self._name_to_variable(py_prefix + v_name)
                    array_t = np.transpose(array)
                    assert pointer.shape == array_t.shape
                    pointer.data = torch.from_numpy(array_t)
                else:
                    raise NameError(f"Variable with name '{name}' not found")
                idx += 1