clean corpus fast align
This commit is contained in:
parent
a8c7db6ee4
commit
ffe20cdc3b
@ -5,32 +5,16 @@ SEPARATOR=@\#@
|
||||
|
||||
DICTIONARY_WEIGHT=3
|
||||
|
||||
all: corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem
|
||||
all: corpora/$(CORPUS_NAME)/falign_corpus.txt
|
||||
|
||||
|
||||
clean-intermediate-files:
|
||||
clean:
|
||||
rm -f corpora/$(CORPUS_NAME)/*.lem
|
||||
rm -f corpora/$(CORPUS_NAME)/*.tok
|
||||
rm -f corpora/$(CORPUS_NAME)/*.dict
|
||||
rm -f corpora/$(CORPUS_NAME)/*.classes
|
||||
rm -f corpora/$(CORPUS_NAME)/*.classes.cats
|
||||
rm -f corpora/$(CORPUS_NAME)/*.vcb
|
||||
rm -f corpora/$(CORPUS_NAME)/*.snt
|
||||
rm -f corpora/$(CORPUS_NAME)/*.cooc
|
||||
rm -f corpora/$(CORPUS_NAME)/aligned*part*
|
||||
rm -f corpora/$(CORPUS_NAME)/aligned.txt
|
||||
rm -f corpora/$(CORPUS_NAME)/giza.cfg
|
||||
rm -f corpora/$(CORPUS_NAME)/aligned.gizacfg
|
||||
rm -f corpora/$(CORPUS_NAME)/pasted.txt
|
||||
rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
|
||||
rm -f corpora/$(CORPUS_NAME)/src_clean.txt
|
||||
rm -f corpora/$(CORPUS_NAME)/trg_clean.txt
|
||||
|
||||
clean: clean-intermediate-files
|
||||
rm -f corpora/$(CORPUS_NAME)/src_final.txt
|
||||
rm -f corpora/$(CORPUS_NAME)/trg_final.txt
|
||||
rm -f corpora/$(CORPUS_NAME)/aligned_final.txt
|
||||
|
||||
rm -f corpora/$(CORPUS_NAME)/ids_clean.txt
|
||||
rm -f corpora/$(CORPUS_NAME)/falign_corpus.txt
|
||||
|
||||
corpora/$(CORPUS_NAME)/src.dict:
|
||||
./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@
|
||||
@ -48,8 +32,8 @@ corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
|
||||
|
||||
|
||||
|
||||
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
|
||||
./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)
|
||||
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem
|
||||
./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt
|
||||
|
||||
corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
|
||||
./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR)> $@
|
||||
|
@ -6,20 +6,22 @@ import sys
|
||||
max_tokens = 100
|
||||
max_ratio = 4.0
|
||||
|
||||
separator = sys.argv[9]
|
||||
#./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt
|
||||
|
||||
with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file, open(sys.argv[3]) as src_tok, open(sys.argv[4]) as trg_tok, open(sys.argv[5], 'w') as src_clean, open(sys.argv[6], 'w') as trg_clean, open(sys.argv[7], 'w') as src_clean_tok, open(sys.argv[8], 'w') as trg_clean_tok:
|
||||
for line in src_file:
|
||||
|
||||
with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as ids, open(sys.argv[4]) as src_lem, open(sys.argv[5]) as trg_lem, open(sys.argv[6], 'w') as src_clean, open(sys.argv[7], 'w') as trg_clean, open(sys.argv[8], 'w') as ids_clean, open(sys.argv[9], 'w') as falign_corpus:
|
||||
for line in src:
|
||||
src_line_orig = line.strip()
|
||||
trg_line_orig = trg_file.readline().strip()
|
||||
src_line_tok = src_tok.readline().strip()
|
||||
trg_line_tok = trg_tok.readline().strip()
|
||||
src_token_count = len(src_line_tok.split())
|
||||
trg_token_count = len(trg_line_tok.split())
|
||||
trg_line_orig = trg.readline().strip()
|
||||
id_orig = ids.readline().strip()
|
||||
src_line_lem = src_lem.readline().strip()
|
||||
trg_line_lem = trg_lem.readline().strip()
|
||||
src_token_count = len(src_line_lem.split())
|
||||
trg_token_count = len(trg_line_lem.split())
|
||||
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
|
||||
ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
|
||||
if (ratio <= max_ratio):
|
||||
src_clean.write(src_line_orig+"\n")
|
||||
trg_clean.write(trg_line_orig+"\n")
|
||||
src_clean_tok.write(src_line_tok+"\n")
|
||||
trg_clean_tok.write(trg_line_tok+"\n")
|
||||
ids_clean.write(id_orig+"\n")
|
||||
falign_corpus.write("%s|||%s\n" % (src_line_lem, trg_line_lem))
|
||||
|
Loading…
Reference in New Issue
Block a user