From ffe20cdc3bd07f10224d1bec0c5fb8a9a1e70ab1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Jaworski?= Date: Wed, 6 Feb 2019 15:34:08 +0100 Subject: [PATCH] clean corpus fast align --- fast-aligner/Makefile | 28 ++++++---------------------- fast-aligner/clean_corpus.py | 22 ++++++++++++---------- 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/fast-aligner/Makefile b/fast-aligner/Makefile index f080bf9..0a8d2d9 100644 --- a/fast-aligner/Makefile +++ b/fast-aligner/Makefile @@ -5,32 +5,16 @@ SEPARATOR=@\#@ DICTIONARY_WEIGHT=3 -all: corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem +all: corpora/$(CORPUS_NAME)/falign_corpus.txt -clean-intermediate-files: +clean: rm -f corpora/$(CORPUS_NAME)/*.lem - rm -f corpora/$(CORPUS_NAME)/*.tok rm -f corpora/$(CORPUS_NAME)/*.dict - rm -f corpora/$(CORPUS_NAME)/*.classes - rm -f corpora/$(CORPUS_NAME)/*.classes.cats - rm -f corpora/$(CORPUS_NAME)/*.vcb - rm -f corpora/$(CORPUS_NAME)/*.snt - rm -f corpora/$(CORPUS_NAME)/*.cooc - rm -f corpora/$(CORPUS_NAME)/aligned*part* - rm -f corpora/$(CORPUS_NAME)/aligned.txt - rm -f corpora/$(CORPUS_NAME)/giza.cfg - rm -f corpora/$(CORPUS_NAME)/aligned.gizacfg - rm -f corpora/$(CORPUS_NAME)/pasted.txt - rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt rm -f corpora/$(CORPUS_NAME)/src_clean.txt rm -f corpora/$(CORPUS_NAME)/trg_clean.txt - -clean: clean-intermediate-files - rm -f corpora/$(CORPUS_NAME)/src_final.txt - rm -f corpora/$(CORPUS_NAME)/trg_final.txt - rm -f corpora/$(CORPUS_NAME)/aligned_final.txt - + rm -f corpora/$(CORPUS_NAME)/ids_clean.txt + rm -f corpora/$(CORPUS_NAME)/falign_corpus.txt corpora/$(CORPUS_NAME)/src.dict: ./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@ @@ -48,8 +32,8 @@ corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt -corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok - ./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR) +corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem + ./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt ./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR)> $@ diff --git a/fast-aligner/clean_corpus.py b/fast-aligner/clean_corpus.py index a545ff4..3b5ecbc 100755 --- a/fast-aligner/clean_corpus.py +++ b/fast-aligner/clean_corpus.py @@ -6,20 +6,22 @@ import sys max_tokens = 100 max_ratio = 4.0 -separator = sys.argv[9] +#./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt -with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file, open(sys.argv[3]) as src_tok, open(sys.argv[4]) as trg_tok, open(sys.argv[5], 'w') as src_clean, open(sys.argv[6], 'w') as trg_clean, open(sys.argv[7], 'w') as src_clean_tok, open(sys.argv[8], 'w') as trg_clean_tok: - for line in src_file: + +with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as ids, open(sys.argv[4]) as src_lem, open(sys.argv[5]) as trg_lem, open(sys.argv[6], 'w') as src_clean, open(sys.argv[7], 'w') as trg_clean, open(sys.argv[8], 'w') as ids_clean, open(sys.argv[9], 'w') as falign_corpus: + for line in src: src_line_orig = line.strip() - trg_line_orig = trg_file.readline().strip() - src_line_tok = src_tok.readline().strip() - trg_line_tok = trg_tok.readline().strip() - src_token_count = len(src_line_tok.split()) - trg_token_count = len(trg_line_tok.split()) + trg_line_orig = trg.readline().strip() + id_orig = ids.readline().strip() + src_line_lem = src_lem.readline().strip() + trg_line_lem = trg_lem.readline().strip() + src_token_count = len(src_line_lem.split()) + trg_token_count = len(trg_line_lem.split()) if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens): ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count) if (ratio <= max_ratio): src_clean.write(src_line_orig+"\n") trg_clean.write(trg_line_orig+"\n") - src_clean_tok.write(src_line_tok+"\n") - trg_clean_tok.write(trg_line_tok+"\n") + ids_clean.write(id_orig+"\n") + falign_corpus.write("%s|||%s\n" % (src_line_lem, trg_line_lem))