concordia-server/fast-aligner/prepare_corpus.py

80 lines
3.6 KiB
Python
Raw Normal View History

2019-02-04 15:27:56 +01:00
#!/usr/bin/python3
# -*- coding: utf-8 -*-
2019-03-03 08:25:08 +01:00
import sys, re, os
2019-03-04 14:26:48 +01:00
def readProfanityPattern(lang, whole):
    """Build a compiled regex from the profanity word list for *lang*.

    The list is read from ``bad-words/whole.<lang>`` when *whole* is true and
    from ``bad-words/parts.<lang>`` otherwise (relative to the current working
    directory), one entry per line.  Entries are placed into the pattern
    verbatim (they are not escaped), joined as alternatives inside one group.
    When *whole* is false the group is additionally wrapped in ``\\b`` on
    both sides.

    NOTE(review): the names suggest word boundaries might have been meant for
    the "whole" list rather than the "parts" list -- the existing behaviour
    is kept here; confirm the intent against the word-list files.

    Returns the compiled pattern, or None when the list file does not exist
    or holds no non-blank entries.
    """
    if whole:
        file_path = 'bad-words/whole.' + lang
    else:
        file_path = 'bad-words/parts.' + lang

    if not os.path.isfile(file_path):
        return None

    with open(file_path) as profanity_file:
        # Blank lines are skipped: an empty alternative matches the empty
        # string, which would flag every sentence as profane.
        stripped = (line.rstrip() for line in profanity_file)
        words = [entry for entry in stripped if entry]

    if not words:
        # An empty list would otherwise produce the invalid pattern ")".
        return None

    raw_pattern = '(' + '|'.join(words) + ')'
    if not whole:
        raw_pattern = r'\b' + raw_pattern + r'\b'
    return re.compile(raw_pattern)
def containsProfanity(pattern, sentence):
    """Return True if *pattern* matches anywhere in *sentence*.

    *pattern* may be None (no word list available), in which case the
    sentence is never considered profane and False is returned.
    """
    return pattern is not None and re.search(pattern, sentence) is not None
2019-02-04 15:27:56 +01:00
# Filtering thresholds: a sentence pair is kept only when both lemmatized
# sides have between 1 and max_tokens tokens and the longer side has at most
# max_ratio times as many tokens as the shorter one.
max_tokens = 100
max_ratio = 4.0

# Usage (14 positional arguments):
#   ./prepare_corpus.py \
#       corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt \
#       corpora/$(CORPUS_NAME)/ids.txt \
#       corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem \
#       corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict \
#       corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem \
#       corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt \
#       corpora/$(CORPUS_NAME)/falign_corpus.txt $SRC_LANG $TRG_LANG

src_lang = sys.argv[13]
trg_lang = sys.argv[14]

# Any of these is None when the corresponding word-list file is missing.
src_profanity_whole_pattern = readProfanityPattern(src_lang, True)
src_profanity_parts_pattern = readProfanityPattern(src_lang, False)
trg_profanity_whole_pattern = readProfanityPattern(trg_lang, True)
trg_profanity_parts_pattern = readProfanityPattern(trg_lang, False)

# Inputs are opened before outputs, so a missing input file aborts the run
# before any output file is created or truncated.
with open(sys.argv[1]) as src, \
        open(sys.argv[2]) as trg, \
        open(sys.argv[3]) as ids, \
        open(sys.argv[4]) as src_lem, \
        open(sys.argv[5]) as trg_lem, \
        open(sys.argv[6]) as src_dict, \
        open(sys.argv[7]) as trg_dict, \
        open(sys.argv[8], 'w') as src_clean, \
        open(sys.argv[9], 'w') as src_clean_lem, \
        open(sys.argv[10], 'w') as trg_clean, \
        open(sys.argv[11], 'w') as ids_clean, \
        open(sys.argv[12], 'w') as falign_corpus:

    # Pass 1: walk the parallel files in lockstep, driven by the source file.
    for src_raw in src:
        src_line_orig = src_raw.strip()
        trg_line_orig = trg.readline().strip()
        id_orig = ids.readline().strip()
        src_line_lem = src_lem.readline().strip()
        trg_line_lem = trg_lem.readline().strip()

        # Lengths are measured on the lemmatized text, not the original.
        src_token_count = len(src_line_lem.split())
        trg_token_count = len(trg_line_lem.split())

        if src_token_count == 0 or trg_token_count == 0:
            continue
        if src_token_count > max_tokens or trg_token_count > max_tokens:
            continue

        # Longer side divided by shorter side, so the ratio is always >= 1.
        longer = max(src_token_count, trg_token_count)
        shorter = min(src_token_count, trg_token_count)
        ratio = longer / shorter
        if ratio > max_ratio:
            continue

        # Profanity is checked on the lemmatized lines, source side first.
        profanity_checks = (
            (src_profanity_whole_pattern, src_line_lem),
            (src_profanity_parts_pattern, src_line_lem),
            (trg_profanity_whole_pattern, trg_line_lem),
            (trg_profanity_parts_pattern, trg_line_lem),
        )
        if any(containsProfanity(pattern, text)
               for pattern, text in profanity_checks):
            continue

        src_clean.write(src_line_orig + "\n")
        src_clean_lem.write(src_line_lem + "\n")
        trg_clean.write(trg_line_orig + "\n")
        ids_clean.write(id_orig + "\n")
        falign_corpus.write("%s ||| %s\n" % (src_line_lem, trg_line_lem))

    # Pass 2: append every dictionary entry pair, unfiltered, to the
    # alignment corpus.  The source dictionary drives the loop; a shorter
    # target dictionary yields empty target words.
    for dict_entry in src_dict:
        src_word = dict_entry.rstrip()
        trg_word = trg_dict.readline().rstrip()
        falign_corpus.write("%s ||| %s\n" % (src_word, trg_word))