diff --git a/fast-aligner/Makefile b/fast-aligner/Makefile index 6e2cf9f..ba425c7 100644 --- a/fast-aligner/Makefile +++ b/fast-aligner/Makefile @@ -1,6 +1,6 @@ SRC_LANG=pl TRG_LANG=en -CORPUS_NAME=opensubtitles +CORPUS_NAME=opensubtitles_sample SEPARATOR=@\#@ DICTIONARY_WEIGHT=3 @@ -38,7 +38,7 @@ corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict - ./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt + ./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG) corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.txt diff --git a/fast-aligner/prepare_corpus.py b/fast-aligner/prepare_corpus.py index 1853067..c72a093 100755 --- a/fast-aligner/prepare_corpus.py +++ b/fast-aligner/prepare_corpus.py @@ -1,12 +1,40 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- -import sys +import 
def readProfanityPattern(lang):
    """Compile a regex that matches any entry of the bad-word list for `lang`.

    The list is read from 'bad-words/<lang>.txt' (relative to the current
    working directory), one entry per line.

    Args:
        lang: language code used to build the file name, e.g. 'pl' or 'en'.

    Returns:
        A compiled pattern of the form r'\\b(entry1|entry2|...)\\b', or None
        when the file does not exist or contains no non-blank entries.
        None means "no profanity filtering for this language".
    """
    file_path = 'bad-words/%s.txt' % lang
    if not os.path.isfile(file_path):
        return None

    # The script declares itself UTF-8; do not depend on the locale default.
    with open(file_path, encoding='utf-8') as profanity_file:
        entries = [line.strip() for line in profanity_file]

    # Skip blank lines: an empty alternative would make the pattern match at
    # every word boundary and flag every sentence as profane.
    # Escape each entry so characters such as '+', '(' or '.' are matched
    # literally instead of being interpreted as regex syntax.
    alternatives = [re.escape(entry) for entry in entries if entry]
    if not alternatives:
        # An empty group cannot be compiled into a meaningful pattern.
        return None

    return re.compile(r'\b(' + '|'.join(alternatives) + r')\b')


def containsProfanity(pattern, sentence):
    """Tell whether `sentence` contains a match for the profanity pattern.

    Args:
        pattern: compiled regex as returned by readProfanityPattern, or None.
        sentence: text to scan.

    Returns:
        True if the pattern matches anywhere in the sentence; False when it
        does not match or when `pattern` is None.
    """
    if pattern is None:
        return False
    return pattern.search(sentence) is not None
float(trg_token_count/src_token_count) - if (ratio <= max_ratio): + if (ratio <= max_ratio and (not containsProfanity(src_profanity_pattern, src_line_lem)) and (not containsProfanity(trg_profanity_pattern, trg_line_lem))): src_clean.write(src_line_orig+"\n") src_clean_lem.write(src_line_lem+"\n") trg_clean.write(trg_line_orig+"\n")