Source code for texar.torch.evals.bleu

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modifications copyright (C) 2019 Texar
# ==============================================================================
"""
Python implementation of BLEU and smoothed BLEU adapted from:
    `https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py`

This module provides a Python implementation of BLEU and smoothed BLEU.
Smooth BLEU is computed following the method outlined in the paper:

    (Lin et al. 2004) ORANGE: a method for evaluating automatic evaluation
    metrics for machine translation.
    Chin-Yew Lin, Franz Josef Och. COLING 2004.
"""

import collections
import math
from typing import Counter, List, Tuple, Union

from texar.torch.utils.dtypes import compat_as_text
from texar.torch.utils.types import MaybeList

__all__ = [
    "sentence_bleu",
    "corpus_bleu",
]


def _get_ngrams(segment: MaybeList[str],
                max_order: int) -> Counter[Tuple[str, ...]]:
    r"""Extracts all n-grams up to a given maximum order from an
    input segment.

    Args:
        segment: text segment from which n-grams will be extracted.
        max_order: maximum length in tokens of the n-grams returned
            by this method.

    Returns:
        A Counter containing all n-grams up to :attr:`max_order`
        in the segment, with a count of how many times each n-gram occurred.
    """
    ngram_counts: collections.Counter = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i + order])
            ngram_counts[ngram] += 1
    return ngram_counts
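
# Illustrative sketch (an addition, not original code): for a segment such as
# ["the", "cat", "sat"] with max_order=2, ``_get_ngrams`` is expected to
# return a Counter like
#     {("the",): 1, ("cat",): 1, ("sat",): 1,
#      ("the", "cat"): 1, ("cat", "sat"): 1}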


def _maybe_str_to_list(list_or_str: Union[str, List[str]]) -> List[str]:
    if isinstance(list_or_str, str):
        return list_or_str.split()
    return list_or_str


def _lowercase(str_list: List[str]) -> List[str]:
    return [str_.lower() for str_ in str_list]


def sentence_bleu(references: List[MaybeList[str]],
                  hypothesis: MaybeList[str],
                  max_order: int = 4,
                  lowercase: bool = False,
                  smooth: bool = False,
                  use_bp: bool = True,
                  return_all: bool = False) -> MaybeList[float]:
    r"""Calculates BLEU score of a hypothesis sentence.

    Args:
        references: A list of references for the hypothesis. Each reference
            can be either a list of string tokens, or a string containing
            tokenized tokens separated with whitespaces. The list can also
            be a numpy array.
        hypothesis: A hypothesis sentence. The hypothesis can be either a
            list of string tokens, or a string containing tokenized tokens
            separated with whitespaces. The list can also be a numpy array.
        lowercase (bool): If `True`, lowercase reference and hypothesis
            tokens.
        max_order (int): Maximum n-gram order to use when computing
            BLEU score.
        smooth (bool): Whether or not to apply `(Lin et al. 2004)` smoothing.
        use_bp (bool): Whether to apply brevity penalty.
        return_all (bool): If `True`, returns BLEU and all n-gram precisions.

    Returns:
        If :attr:`return_all` is `False` (default), returns a ``float32``
        BLEU score.

        If :attr:`return_all` is `True`, returns a list of ``float32``
        scores: ``[BLEU] + n-gram precisions``, which is of length
        :attr:`max_order` + 1.
    """
    return corpus_bleu([references], [hypothesis],
                       max_order=max_order,
                       lowercase=lowercase,
                       smooth=smooth,
                       use_bp=use_bp,
                       return_all=return_all)
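
# A minimal usage sketch for ``sentence_bleu`` (an illustrative addition; the
# sentences below are made-up examples). Token lists and whitespace-separated
# strings are interchangeable:
#
#     refs = [["the", "cat", "sat", "on", "the", "mat"],
#             "there is a cat on the mat"]
#     hyp = "the cat sat on the mat"
#     score = sentence_bleu(refs, hyp, smooth=True)
#     # `score` is a single BLEU value on a 0-100 scale; passing
#     # return_all=True instead yields [BLEU, p1, ..., p_max_order].
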

def corpus_bleu(list_of_references: List[List[MaybeList[str]]],
                hypotheses: List[MaybeList[str]],
                max_order: int = 4,
                lowercase: bool = False,
                smooth: bool = False,
                use_bp: bool = True,
                return_all: bool = False) -> MaybeList[float]:
    r"""Computes corpus-level BLEU score.

    Args:
        list_of_references: A list of lists of references for each
            hypothesis. Each reference can be either a list of string tokens,
            or a string containing tokenized tokens separated with
            whitespaces. The lists can also be numpy arrays.
        hypotheses: A list of hypothesis sentences. Each hypothesis can be
            either a list of string tokens, or a string containing tokenized
            tokens separated with whitespaces. The lists can also be numpy
            arrays.
        lowercase (bool): If `True`, lowercase reference and hypothesis
            tokens.
        max_order (int): Maximum n-gram order to use when computing
            BLEU score.
        smooth (bool): Whether or not to apply `(Lin et al. 2004)` smoothing.
        use_bp (bool): Whether to apply brevity penalty.
        return_all (bool): If `True`, returns BLEU and all n-gram precisions.

    Returns:
        If :attr:`return_all` is `False` (default), returns a ``float32``
        BLEU score.

        If :attr:`return_all` is `True`, returns a list of ``float32``
        scores: ``[BLEU] + n-gram precisions``, which is of length
        :attr:`max_order` + 1.
    """
    list_of_references = compat_as_text(list_of_references)
    hypotheses = compat_as_text(hypotheses)

    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    hypothesis_length = 0
    for (references, hypothesis) in zip(list_of_references, hypotheses):
        reference_length += min(len(r) for r in references)
        hypothesis_length += len(hypothesis)

        merged_ref_ngram_counts: Counter[Tuple[str, ...]] = \
            collections.Counter()
        for reference in references:
            reference = _maybe_str_to_list(reference)
            if lowercase:
                reference = _lowercase(reference)
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)

        hypothesis = _maybe_str_to_list(hypothesis)
        if lowercase:
            hypothesis = _lowercase(hypothesis)
        hypothesis_ngram_counts = _get_ngrams(hypothesis, max_order)

        overlap = hypothesis_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]

        for order in range(1, max_order + 1):
            possible_matches = len(hypothesis) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0.0] * max_order
    for i in range(0, max_order):
        if smooth:
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = (float(matches_by_order[i]) /
                                 possible_matches_by_order[i])
            else:
                precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    if use_bp:
        ratio = float(hypothesis_length) / reference_length
        if ratio > 1.0:
            bp = 1.
        else:
            if abs(ratio) < 1e-8:
                bp = 0.
            else:
                bp = math.exp(1 - 1. / ratio)
    else:
        bp = 1.

    bleu = geo_mean * bp

    if return_all:
        return [bleu * 100] + [p * 100 for p in precisions]
    else:
        return bleu * 100
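
# A minimal corpus-level usage sketch (an illustrative addition; the sentences
# below are made-up examples). ``list_of_references[i]`` holds the references
# for ``hypotheses[i]``:
#
#     list_of_refs = [["the cat sat on the mat"],
#                     ["hello world", "greetings world"]]
#     hyps = ["the cat is on the mat", "hello world"]
#     bleu = corpus_bleu(list_of_refs, hyps)  # single score on a 0-100 scale
#     all_scores = corpus_bleu(list_of_refs, hyps, return_all=True)
#     # all_scores == [BLEU, p1, ..., p_max_order], each scaled by 100.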