# -*- coding: utf-8 -*-
# Natural Language Toolkit: Phrase Extraction Algorithm
#
# Copyright (C) 2001-2019 NLTK Project
# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT


def extract(
    f_start,
    f_end,
    e_start,
    e_end,
    alignment,
    f_aligned,
    srctext,
    trgtext,
    srclen,
    trglen,
    max_phrase_length,
):
"""
|
|
This function checks for alignment point consistency and extracts
|
|
phrases using the chunk of consistent phrases.
|
|
|
|
A phrase pair (e, f ) is consistent with an alignment A if and only if:
|
|
|
|
(i) No English words in the phrase pair are aligned to words outside it.
|
|
|
|
∀e i ∈ e, (e i , f j ) ∈ A ⇒ f j ∈ f
|
|
|
|
(ii) No Foreign words in the phrase pair are aligned to words outside it.
|
|
|
|
∀f j ∈ f , (e i , f j ) ∈ A ⇒ e i ∈ e
|
|
|
|
(iii) The phrase pair contains at least one alignment point.
|
|
|
|
∃e i ∈ e ̄ , f j ∈ f ̄ s.t. (e i , f j ) ∈ A
|
|
|
|
:type f_start: int
|
|
:param f_start: Starting index of the possible foreign language phrases
|
|
:type f_end: int
|
|
:param f_end: Starting index of the possible foreign language phrases
|
|
:type e_start: int
|
|
:param e_start: Starting index of the possible source language phrases
|
|
:type e_end: int
|
|
:param e_end: Starting index of the possible source language phrases
|
|
:type srctext: list
|
|
:param srctext: The source language tokens, a list of string.
|
|
:type trgtext: list
|
|
:param trgtext: The target language tokens, a list of string.
|
|
:type srclen: int
|
|
:param srclen: The number of tokens in the source language tokens.
|
|
:type trglen: int
|
|
:param trglen: The number of tokens in the target language tokens.
|
|
"""
    if f_end < 0:  # 0-based indexing.
        return set()
    # Check if alignment points are consistent.
    for e, f in alignment:
        if (f_start <= f <= f_end) and (e < e_start or e > e_end):
            return set()

    # Add phrase pairs (incl. additional unaligned f)
    phrases = set()
    fs = f_start
    while True:
        fe = min(f_end, f_start + max_phrase_length - 1)
        while True:
            # add phrase pair ([e_start, e_end], [fs, fe]) to set E
            # Need to +1 in range to include the end-point.
            src_phrase = " ".join(srctext[e_start : e_end + 1])
            trg_phrase = " ".join(trgtext[fs : fe + 1])
            # Include more data for later ordering.
            phrases.add(
                ((e_start, e_end + 1), (f_start, f_end + 1), src_phrase, trg_phrase)
            )
            fe += 1
            if fe in f_aligned or fe >= trglen:
                break
        fs -= 1
        if fs in f_aligned or fs < 0:
            break
    return phrases
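

# The consistency conditions (i)-(iii) documented in ``extract`` above can also
# be expressed as a standalone predicate. The sketch below is illustrative only
# and is not part of the original module; the helper name ``_is_consistent`` is
# hypothetical.
def _is_consistent(e_start, e_end, f_start, f_end, alignment):
    """Illustrative check of conditions (i)-(iii) for a candidate phrase pair."""
    has_point = False
    for e, f in alignment:
        e_inside = e_start <= e <= e_end
        f_inside = f_start <= f <= f_end
        if e_inside != f_inside:
            # An alignment point crosses the phrase boundary: violates (i) or (ii).
            return False
        if e_inside and f_inside:
            # Condition (iii): at least one alignment point inside the pair.
            has_point = True
    return has_point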


def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
    """
    The phrase extraction algorithm extracts all consistent phrase pairs from
    a word-aligned sentence pair.

    The idea is to loop over all possible source language (e) phrases and find
    the minimal foreign phrase (f) that matches each of them. Matching is done
    by identifying all alignment points for the source phrase and finding the
    shortest foreign phrase that includes all the foreign counterparts for the
    source words.

    In short, a phrase alignment has to
    (a) contain all alignment points for all covered words
    (b) contain at least one alignment point

    >>> srctext = "michael assumes that he will stay in the house"
    >>> trgtext = "michael geht davon aus , dass er im haus bleibt"
    >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
    ... (5,9), (6,7), (7,7), (8,8)]
    >>> phrases = phrase_extraction(srctext, trgtext, alignment)
    >>> for i in sorted(phrases):
    ...    print(i)
    ...
    ((0, 1), (0, 1), 'michael', 'michael')
    ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
    ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus ,')
    ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
    ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
    ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
    ((1, 2), (1, 4), 'assumes', 'geht davon aus')
    ((1, 2), (1, 4), 'assumes', 'geht davon aus ,')
    ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
    ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
    ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
    ((2, 3), (5, 6), 'that', ', dass')
    ((2, 3), (5, 6), 'that', 'dass')
    ((2, 4), (5, 7), 'that he', ', dass er')
    ((2, 4), (5, 7), 'that he', 'dass er')
    ((2, 9), (5, 10), 'that he will stay in the house', ', dass er im haus bleibt')
    ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
    ((3, 4), (6, 7), 'he', 'er')
    ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
    ((4, 6), (9, 10), 'will stay', 'bleibt')
    ((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt')
    ((6, 8), (7, 8), 'in the', 'im')
    ((6, 9), (7, 9), 'in the house', 'im haus')
    ((8, 9), (8, 9), 'house', 'haus')

    :type srctext: str
    :param srctext: The sentence string from the source language.
    :type trgtext: str
    :param trgtext: The sentence string from the target language.
    :type alignment: list(tuple)
    :param alignment: The word alignment outputs as list of tuples, where
        the first elements of tuples are the source words' indices and
        second elements are the target words' indices. This is also the output
        format of nltk.translate.ibm1
    :type max_phrase_length: int
    :param max_phrase_length: maximal phrase length; if 0 or not specified,
        it is set to the length of the longer sentence (srctext or trgtext).
    :rtype: set(tuple)
    :return: A set of tuples, where each tuple is a phrase pair made up of
        (i) its source span, (ii) its target span, (iii) the source phrase
        and (iv) the target phrase. The set represents all the possible
        phrases extracted from the word alignments.
    """

    srctext = srctext.split()  # e
    trgtext = trgtext.split()  # f
    srclen = len(srctext)  # len(e)
    trglen = len(trgtext)  # len(f)
    # Keeps an index of which target (foreign) words are aligned.
    f_aligned = [j for _, j in alignment]
    max_phrase_length = max_phrase_length or max(srclen, trglen)

    # set of phrase pairs BP
    bp = set()

    for e_start in range(srclen):
        max_idx = min(srclen, e_start + max_phrase_length)
        for e_end in range(e_start, max_idx):
            # Find the minimally matching foreign phrase.
            # (f_start, f_end) = (len(f), 0)
            # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
            f_start, f_end = trglen - 1, -1  # 0-based indexing

            for e, f in alignment:
                if e_start <= e <= e_end:
                    f_start = min(f, f_start)
                    f_end = max(f, f_end)
            # Add extract(f_start, f_end, e_start, e_end) to the set of phrase pairs BP.
            phrases = extract(
                f_start,
                f_end,
                e_start,
                e_end,
                alignment,
                f_aligned,
                srctext,
                trgtext,
                srclen,
                trglen,
                max_phrase_length,
            )
            if phrases:
                bp.update(phrases)
    return bp
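

# A minimal usage sketch, not part of the original module: running this file
# directly re-runs the example from the docstring above and also exercises the
# optional ``max_phrase_length`` argument. The exact output of the second call
# is not listed here because it depends on the chosen length limit.
if __name__ == "__main__":
    srctext = "michael assumes that he will stay in the house"
    trgtext = "michael geht davon aus , dass er im haus bleibt"
    alignment = [(0, 0), (1, 1), (1, 2), (1, 3), (2, 5), (3, 6), (4, 9),
                 (5, 9), (6, 7), (7, 7), (8, 8)]

    # All consistent phrase pairs, exactly as in the doctest.
    for phrase_pair in sorted(phrase_extraction(srctext, trgtext, alignment)):
        print(phrase_pair)

    # Restricting extracted phrases to at most 3 tokens.
    short_pairs = phrase_extraction(srctext, trgtext, alignment, max_phrase_length=3)
    print(len(short_pairs), "phrase pairs with max_phrase_length=3")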