#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Filter a parallel corpus and build the word-alignment training file.

Usage (all arguments are positional):

    ./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt \
        corpora/$(CORPUS_NAME)/trg.txt \
        corpora/$(CORPUS_NAME)/ids.txt \
        corpora/$(CORPUS_NAME)/src.lem \
        corpora/$(CORPUS_NAME)/trg.lem \
        corpora/$(CORPUS_NAME)/src.dict \
        corpora/$(CORPUS_NAME)/trg.dict \
        corpora/$(CORPUS_NAME)/src_clean.txt \
        corpora/$(CORPUS_NAME)/src_clean.lem \
        corpora/$(CORPUS_NAME)/trg_clean.txt \
        corpora/$(CORPUS_NAME)/ids_clean.txt \
        corpora/$(CORPUS_NAME)/falign_corpus.txt \
        $SRC_LANG $TRG_LANG

Arguments 1-7 are read, arguments 8-12 are (over)written, and arguments
13-14 select the profanity lists ``bad-words/<lang>.txt`` (looked up relative
to the current working directory).
"""

import os
import re
import sys
from contextlib import ExitStack


def readProfanityPattern(lang):
    """Compile the profanity list for *lang* into a single regex.

    The list is read from ``bad-words/<lang>.txt``, one entry per line.
    Trailing whitespace is stripped from every entry and the entries are
    joined into one alternation anchored on word boundaries.

    Returns the compiled pattern, or ``None`` when the file does not exist
    or contains no non-blank entries (meaning: no profanity filtering).
    """
    file_path = 'bad-words/%s.txt' % lang
    if not os.path.isfile(file_path):
        return None

    with open(file_path) as profanity_file:
        stripped_lines = [line.rstrip() for line in profanity_file]

    # Blank lines must be dropped: an empty alternative makes the pattern
    # match at every word boundary, which would flag every sentence.
    words = [word for word in stripped_lines if word]
    if not words:
        return None

    # NOTE(review): entries are inserted into the regex verbatim (they are
    # not escaped), so regex metacharacters in the list keep their special
    # meaning. Confirm whether the word lists rely on that.
    return re.compile(r'\b(' + '|'.join(words) + r')\b')


def containsProfanity(pattern, sentence):
    """Return True if *pattern* matches anywhere in *sentence*.

    *pattern* is a compiled regex as returned by ``readProfanityPattern``;
    ``None`` means no list is available and always yields False.
    """
    if pattern is None:
        return False
    return pattern.search(sentence) is not None


# Sentence pairs are dropped when either side has more tokens than this...
max_tokens = 100
# ...or when the longer side has more than this many times the tokens of
# the shorter side.
max_ratio = 4.0

_USAGE = (
    'Usage: %s SRC TRG IDS SRC_LEM TRG_LEM SRC_DICT TRG_DICT SRC_CLEAN '
    'SRC_CLEAN_LEM TRG_CLEAN IDS_CLEAN FALIGN_CORPUS SRC_LANG TRG_LANG\n'
)


def _is_clean_pair(src_line_lem, trg_line_lem, src_pattern, trg_pattern):
    """Decide whether a lemmatized sentence pair passes every filter."""
    src_token_count = len(src_line_lem.split())
    trg_token_count = len(trg_line_lem.split())

    # Both sides must be non-empty and within the length limit.
    if not (0 < src_token_count <= max_tokens):
        return False
    if not (0 < trg_token_count <= max_tokens):
        return False

    # Longer side divided by shorter side; both counts are > 0 here.
    ratio = (max(src_token_count, trg_token_count)
             / min(src_token_count, trg_token_count))
    if ratio > max_ratio:
        return False

    return not (containsProfanity(src_pattern, src_line_lem)
                or containsProfanity(trg_pattern, trg_line_lem))


def main(argv):
    """Run the corpus preparation for the command line *argv*.

    *argv* has the layout of ``sys.argv`` (program name first). Returns the
    process exit status: 0 on success, 2 when arguments are missing.
    """
    if len(argv) < 15:
        sys.stderr.write(_USAGE % (argv[0] if argv else 'prepare_corpus.py'))
        return 2

    src_profanity_pattern = readProfanityPattern(argv[13])
    trg_profanity_pattern = readProfanityPattern(argv[14])

    with ExitStack() as stack:
        # Inputs are opened first, then outputs, in argument order.
        src, trg, ids, src_lem, trg_lem, src_dict, trg_dict = (
            stack.enter_context(open(path)) for path in argv[1:8]
        )
        src_clean, src_clean_lem, trg_clean, ids_clean, falign_corpus = (
            stack.enter_context(open(path, 'w')) for path in argv[8:13]
        )

        # The source text drives the loop; the other inputs are read in
        # lockstep. A file that runs out early yields '' from readline(),
        # which fails the token-count filter (or writes an empty id).
        for line in src:
            src_line_orig = line.strip()
            trg_line_orig = trg.readline().strip()
            id_orig = ids.readline().strip()
            src_line_lem = src_lem.readline().strip()
            trg_line_lem = trg_lem.readline().strip()

            if not _is_clean_pair(src_line_lem, trg_line_lem,
                                  src_profanity_pattern,
                                  trg_profanity_pattern):
                continue

            src_clean.write(src_line_orig + "\n")
            src_clean_lem.write(src_line_lem + "\n")
            trg_clean.write(trg_line_orig + "\n")
            ids_clean.write(id_orig + "\n")
            falign_corpus.write("%s ||| %s\n" % (src_line_lem, trg_line_lem))

        # Dictionary entries are appended to the alignment corpus
        # unfiltered, one source/target pair per line.
        for line in src_dict:
            src_word = line.rstrip()
            trg_word = trg_dict.readline().rstrip()
            falign_corpus.write("%s ||| %s\n" % (src_word, trg_word))

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))