From 5fa99f06a2f51cc785df7222d2897e87e8c07f8a Mon Sep 17 00:00:00 2001 From: rjawor Date: Thu, 29 Aug 2019 21:08:15 +0200 Subject: [PATCH] generating src_clean.tok --- Makefile | 6 +++--- prepare_corpus.py | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 3f70bbb..0e9baac 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ SEPARATOR=@\#@ DICTIONARY_WEIGHT=3 -all: corpora/$(CORPUS_NAME)/alignments.txt corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt +all: corpora/$(CORPUS_NAME)/alignments.txt corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt clean: @@ -37,8 +37,8 @@ corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.t -corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict - ./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG) +corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict + ./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG) diff --git a/prepare_corpus.py b/prepare_corpus.py index 14c2675..0a3a2e9 100755 --- a/prepare_corpus.py +++ b/prepare_corpus.py @@ -39,10 +39,11 @@ def containsProfanity(pattern, sentence): max_tokens = 100 max_ratio = 4.0 -#./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG) +# ./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG) -src_lang = sys.argv[13] -trg_lang = sys.argv[14] + +src_lang = sys.argv[15] +trg_lang = sys.argv[16] src_profanity_whole_pattern = readProfanityPattern(src_lang, True) src_profanity_parts_pattern = readProfanityPattern(src_lang, False) @@ -52,11 +53,12 @@ trg_profanity_parts_pattern = readProfanityPattern(trg_lang, False) -with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as ids, open(sys.argv[4]) as src_lem, open(sys.argv[5]) as trg_lem, open(sys.argv[6]) as src_dict, open(sys.argv[7]) as trg_dict, open(sys.argv[8], 'w') as src_clean, open(sys.argv[9], 'w') as src_clean_lem, open(sys.argv[10], 'w') as trg_clean, open(sys.argv[11], 'w') as ids_clean, open(sys.argv[12], 'w') as falign_corpus: +with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as ids, open(sys.argv[4]) as src_tok, open(sys.argv[5]) as src_lem, open(sys.argv[6]) as trg_lem, open(sys.argv[7]) as src_dict, open(sys.argv[8]) as trg_dict, open(sys.argv[9], 'w') as src_clean, open(sys.argv[10], 'w') as src_clean_tok, open(sys.argv[11], 'w') as src_clean_lem, open(sys.argv[12], 'w') as trg_clean, open(sys.argv[13], 'w') as ids_clean, open(sys.argv[14], 'w') as falign_corpus: for line in src: src_line_orig = line.strip() trg_line_orig = trg.readline().strip() id_orig = ids.readline().strip() + src_line_tok = src_tok.readline().strip() src_line_lem = src_lem.readline().strip() trg_line_lem = trg_lem.readline().strip() src_token_count = len(src_line_lem.split()) @@ -70,6 +72,7 @@ with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as id and (not containsProfanity(trg_profanity_parts_pattern, trg_line_lem)) ): src_clean.write(src_line_orig+"\n") + src_clean_tok.write(src_line_tok+"\n") src_clean_lem.write(src_line_lem+"\n") trg_clean.write(trg_line_orig+"\n") ids_clean.write(id_orig+"\n")