Merge branch 'master' of rjawor.vm.wmi.amu.edu.pl:concordia-server
This commit is contained in:
commit
ad584fbdb3
@ -1,15 +1,15 @@
|
||||
SRC_LANG=en
|
||||
SRC_LANG=pl
|
||||
TRG_LANG=en
|
||||
CORPUS_NAME=jrc_enes
|
||||
CORPUS_NAME=europarl_sample
|
||||
SEPARATOR=@\#@
|
||||
|
||||
DICTIONARY_WEIGHT=5
|
||||
|
||||
DICTIONARY_WEIGHT=0
|
||||
|
||||
all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
|
||||
|
||||
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
|
||||
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
|
||||
cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
|
||||
./mergeGizaAlignments.py corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
|
||||
|
||||
clean-intermediate-files:
|
||||
rm -f corpora/$(CORPUS_NAME)/*.lem
|
||||
|
72
mgiza-aligner/mergeGizaAlignments.py
Executable file
72
mgiza-aligner/mergeGizaAlignments.py
Executable file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import sys, re
|
||||
|
||||
# Header line that opens every record, e.g. "# Sentence pair (42) ...".
# Raw string: "\(" and "\d" are regex escapes, not Python string escapes.
pair_pattern = re.compile(r"# Sentence pair \((\d+)\)")


def getNextSentencePair(pairs_file):
    """Read the next three-line sentence-pair record from an open file.

    The result is stored in ``pairs_file['nextPair']`` rather than returned:

    * ``None`` when the underlying file is exhausted;
    * otherwise a dict with ``'number'`` (the int parsed from the
      "# Sentence pair (N)" header) and ``'lines'`` (the header plus the
      two lines that follow it, exactly as read, newlines included).

    Args:
        pairs_file: dict with a ``'file'`` entry (an object providing
            ``readline()``) and a ``'nextPair'`` entry that is overwritten.

    Raises:
        ValueError: if the first line of the record does not start with a
            "# Sentence pair (N)" header.
    """
    source = pairs_file['file']
    first_line = source.readline()

    if first_line == '':
        # readline() returns '' only at end of file
        pairs_file['nextPair'] = None
        return

    m = pair_pattern.match(first_line)
    if m is None:
        raise ValueError("Could not read sentence pair number from line: %s" % first_line)

    # A record is always the header followed by exactly two more lines.
    lines = [first_line, source.readline(), source.readline()]
    pairs_file['nextPair'] = {'number': int(m.group(1)), 'lines': lines}
|
||||
|
||||
def main(paths):
    """Merge sentence-pair files into one stream ordered by pair number.

    Each path is opened and read record by record with
    ``getNextSentencePair``. On every step the pending record with the
    smallest pair number across all files is printed to stdout and the file
    it came from is advanced. When several files hold the same number, the
    file listed first wins. Each input is consumed front to back, so the
    output is globally ordered only if every input is already ordered.

    Args:
        paths: iterable of input file names.
    """
    files = []
    try:
        for path in paths:
            pairs_file = {'file': open(path, 'r'), 'nextPair': None}
            # Register the handle before parsing so it is closed even if
            # the very first header turns out to be malformed.
            files.append(pairs_file)
            getNextSentencePair(pairs_file)

        while True:
            # Only files that are not yet at end still have a pending record.
            pending = [f for f in files if f['nextPair'] is not None]
            if not pending:
                break
            youngest_file = min(pending, key=lambda f: f['nextPair']['number'])
            print(''.join(youngest_file['nextPair']['lines']).rstrip())
            getNextSentencePair(youngest_file)
    finally:
        for pairs_file in files:
            pairs_file['file'].close()


if __name__ == '__main__':
    main(sys.argv[1:])
|
Loading…
Reference in New Issue
Block a user