optimized sorting giza alignments
This commit is contained in:
parent
a2b6853dcf
commit
38e51a90f7
@ -1,15 +1,15 @@
|
|||||||
SRC_LANG=en
|
SRC_LANG=pl
|
||||||
TRG_LANG=en
|
TRG_LANG=en
|
||||||
CORPUS_NAME=jrc_enes
|
CORPUS_NAME=europarl_sample
|
||||||
SEPARATOR=@\#@
|
SEPARATOR=@\#@
|
||||||
|
|
||||||
DICTIONARY_WEIGHT=5
|
DICTIONARY_WEIGHT=0
|
||||||
|
|
||||||
all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
|
all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
|
||||||
|
|
||||||
# Run mgiza, then merge the per-part alignment files it leaves behind into a
# single file ordered by sentence-pair number.
# The merged output is written to a temporary name and renamed only on success,
# so a failed merge never leaves a truncated aligned.txt that looks up to date.
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
	mgiza/mgizapp/bin/mgiza $<
	./mergeGizaAlignments.py corpora/$(CORPUS_NAME)/aligned*part* > $@.tmp
	mv -f $@.tmp $@
|
||||||
|
|
||||||
clean-intermediate-files:
|
clean-intermediate-files:
|
||||||
rm -f corpora/$(CORPUS_NAME)/*.lem
|
rm -f corpora/$(CORPUS_NAME)/*.lem
|
||||||
|
72
mgiza-aligner/mergeGizaAlignments.py
Executable file
72
mgiza-aligner/mergeGizaAlignments.py
Executable file
@ -0,0 +1,72 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import sys, re
|
||||||
|
|
||||||
|
# Header line of an alignment record, e.g. "# Sentence pair (12) ...".
# Written as a raw string: "\(" and "\d" are not valid escapes in an ordinary
# string literal and trigger warnings on current Python versions.
pair_pattern = re.compile(r"# Sentence pair \((\d+)\)")


def getNextSentencePair(pairs_file):
    """Advance pairs_file to its next three-line sentence-pair record.

    pairs_file is a dict holding an open text file under 'file'. Its
    'nextPair' entry is overwritten with either
    {'number': <int from the header>, 'lines': [<the three raw lines>]}
    or None once the file is exhausted. Nothing is returned.

    Raises:
        Exception: if the header line does not match pair_pattern, or the
            file ends before all three lines of the record were read.
    """
    first_line = pairs_file['file'].readline()
    if first_line == '':
        # we reached the end of file
        pairs_file['nextPair'] = None
        return

    m = pair_pattern.match(first_line)
    if not m:
        raise Exception("Could not read sentence pair number from line: %s" % first_line)

    lines = [first_line, pairs_file['file'].readline(), pairs_file['file'].readline()]
    # readline() returns '' only at end of file, so an empty entry means the
    # record was cut short; fail loudly instead of emitting a partial record.
    if '' in lines:
        raise Exception("Incomplete sentence pair %s: expected 3 lines" % m.group(1))

    pairs_file['nextPair'] = {'number': int(m.group(1)), 'lines': lines}
|
||||||
|
|
||||||
|
# Merge the alignment part files named on the command line into one stream on
# stdout, ordered by sentence-pair number. Each file is read sequentially and
# only its current record is held in memory.
files = []
try:
    for arg in sys.argv[1:]:
        pairs_file = {'file': open(arg, 'r'), 'nextPair': None}
        # Register the handle before the first read so it is closed even if
        # that read raises.
        files.append(pairs_file)
        getNextSentencePair(pairs_file)

    while True:
        # Files that still have a record waiting to be written.
        pending = [pairs_file for pairs_file in files if pairs_file['nextPair'] is not None]
        if not pending:
            break
        # min() returns the first minimal element, so when two files carry the
        # same number the one given earlier on the command line is written first.
        youngest_file = min(pending, key=lambda pairs_file: pairs_file['nextPair']['number'])
        print(''.join(youngest_file['nextPair']['lines']).rstrip())
        getNextSentencePair(youngest_file)
finally:
    for pairs_file in files:
        pairs_file['file'].close()
|
Loading…
Reference in New Issue
Block a user