225 lines
8.6 KiB
Python
225 lines
8.6 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
# Natural Language Toolkit: ChrF score
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Authors: Maja Popovic
|
||
|
# Contributors: Liling Tan, Aleš Tamchyna (Memsource)
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
""" ChrF score implementation """
|
||
|
from __future__ import division
|
||
|
from collections import Counter, defaultdict
|
||
|
import re
|
||
|
|
||
|
from nltk.util import ngrams
|
||
|
|
||
|
|
||
|
def sentence_chrf(
    reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Calculates the sentence level CHRF (Character n-gram F-score) described in
    - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
      In Proceedings of the 10th Workshop on Machine Translation.
      http://www.statmt.org/wmt15/pdf/WMT49.pdf
    - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
      In Proceedings of the 1st Conference on Machine Translation.
      http://www.statmt.org/wmt16/pdf/W16-2341.pdf

    This implementation of CHRF only supports a single reference at the moment.

    For details not reported in the paper, consult Maja Popovic's original
    implementation: https://github.com/m-popovic/chrF

    The code should output results equivalent to running CHRF++ with the
    following options: -nw 0 -b 3

    An example from the original BLEU paper
    http://www.aclweb.org/anthology/P02-1040.pdf

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct').split()
    >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
    0.6349...
    >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
    0.3330...

    The infamous "the the the ... " example

    >>> ref = 'the cat is on the mat'.split()
    >>> hyp = 'the the the the the the the'.split()
    >>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS
    0.1468...

    An example to show that this function allows users to use strings instead of
    tokens, i.e. list(str) as inputs.

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands')
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party')
    >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
    0.6349...
    >>> type(ref1) == type(hyp1) == str
    True
    >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
    0.6349...

    To skip the unigrams and only use 2- to 3-grams:

    >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
    0.6617...

    :param reference: reference sentence
    :type reference: list(str) / str
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str) / str
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the sentence level CHRF score.
    :rtype: float
    """
    # A sentence is just a corpus of size one: delegate to corpus_chrf.
    return corpus_chrf(
        [reference],
        [hypothesis],
        min_len,
        max_len,
        beta=beta,
        ignore_whitespace=ignore_whitespace,
    )
|
||
|
|
||
|
|
||
|
def _preprocess(sent, ignore_whitespace):
|
||
|
if type(sent) != str:
|
||
|
# turn list of tokens into a string
|
||
|
sent = ' '.join(sent)
|
||
|
|
||
|
if ignore_whitespace:
|
||
|
sent = re.sub(r'\s+', '', sent)
|
||
|
return sent
|
||
|
|
||
|
|
||
|
def chrf_precision_recall_fscore_support(
    reference, hypothesis, n, beta=3.0, epsilon=1e-16
):
    """
    Compute precision, recall, f-score and support (true positive count) from
    the n-gram overlap between a reference and a hypothesis.

    The inputs are treated as plain sequences, so this function is agnostic to
    whether their elements are characters or tokens.

    :param reference: The reference sentence.
    :type reference: list
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: list
    :param n: Extract up to the n-th order ngrams
    :type n: int
    :param beta: The parameter to assign more importance to recall over precision.
    :type beta: float
    :param epsilon: The fallback value if the hypothesis or reference is empty.
    :type epsilon: float
    :return: Returns the precision, recall and f-score and support (true positive).
    :rtype: tuple(float)
    """
    ref_counts = Counter(ngrams(reference, n))
    hyp_counts = Counter(ngrams(hypothesis, n))

    # Multiset intersection gives, per n-gram, the matched (clipped) count;
    # their total is the number of true positives.
    true_positives = sum((ref_counts & hyp_counts).values())
    hyp_total = sum(hyp_counts.values())  # true positives + false positives
    ref_total = sum(ref_counts.values())  # true positives + false negatives

    try:
        precision = true_positives / hyp_total
        recall = true_positives / ref_total
        beta_sq = beta ** 2
        fscore = (1 + beta_sq) * (precision * recall) / (beta_sq * precision + recall)
    except ZeroDivisionError:
        # Empty reference/hypothesis n-gram sets (or zero overlap) would
        # divide by zero; fall back to a tiny epsilon instead.
        precision = recall = fscore = epsilon
    return precision, recall, fscore, true_positives
|
||
|
|
||
|
|
||
|
def corpus_chrf(
    references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Calculates the corpus level CHRF (Character n-gram F-score), it is the
    macro-averaged value of the sentence/segment level CHRF score.

    This implementation of CHRF only supports a single reference at the moment.

    >>> ref1 = str('It is a guide to action that ensures that the military '
    ...            'will forever heed Party commands').split()
    >>> ref2 = str('It is the guiding principle which guarantees the military '
    ...            'forces always being under the command of the Party').split()
    >>>
    >>> hyp1 = str('It is a guide to action which ensures that the military '
    ...            'always obeys the commands of the party').split()
    >>> hyp2 = str('It is to insure the troops forever hearing the activity '
    ...            'guidebook that party direct')
    >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
    0.3910...

    :param references: a corpus of list of reference sentences, w.r.t. hypotheses
    :type references: list(list(str))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the sentence level CHRF score.
    :rtype: float
    """

    assert len(references) == len(
        hypotheses
    ), "The number of hypotheses and their references should be the same"
    num_sents = len(hypotheses)

    # Keep f-scores for each n-gram order separate.
    ngram_fscores = defaultdict(list)

    # Iterate through each hypothesis and its corresponding reference.
    for reference, hypothesis in zip(references, hypotheses):

        # Preprocess both reference and hypothesis into plain strings
        # (whitespace-stripped when ignore_whitespace is set).
        reference = _preprocess(reference, ignore_whitespace)
        hypothesis = _preprocess(hypothesis, ignore_whitespace)

        # Calculate f-scores for each sentence and for each n-gram order
        # separately.
        for n in range(min_len, max_len + 1):
            # Compute the precision, recall, fscore and support.
            prec, rec, fscore, tp = chrf_precision_recall_fscore_support(
                reference, hypothesis, n, beta=beta
            )
            ngram_fscores[n].append(fscore)

    # How many n-gram orders were actually scored.
    num_ngram_sizes = len(ngram_fscores)

    # Sum of f-scores over all sentences, per n-gram order.
    total_scores = [sum(fscores) for fscores in ngram_fscores.values()]

    # Macro-average over n-gram orders and over all sentences.
    return (sum(total_scores) / num_ngram_sizes) / num_sents