profanity filtering
This commit is contained in:
parent
78cd73b5ef
commit
7bcab5aff9
@ -1,6 +1,6 @@
|
|||||||
SRC_LANG=pl
|
SRC_LANG=pl
|
||||||
TRG_LANG=en
|
TRG_LANG=en
|
||||||
CORPUS_NAME=opensubtitles
|
CORPUS_NAME=opensubtitles_sample
|
||||||
SEPARATOR=@\#@
|
SEPARATOR=@\#@
|
||||||
|
|
||||||
DICTIONARY_WEIGHT=3
|
DICTIONARY_WEIGHT=3
|
||||||
@ -38,7 +38,7 @@ corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
|
|||||||
|
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict
|
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict
|
||||||
./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt
|
./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG)
|
||||||
|
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.txt
|
corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.txt
|
||||||
|
@ -1,12 +1,40 @@
|
|||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import sys
|
import sys, re, os
|
||||||
|
|
||||||
|
|
||||||
|
def readProfanityPattern(lang):
    """Build the compiled profanity regex for the language code ``lang``.

    The word list is read from ``bad-words/<lang>.txt`` (relative to the
    current working directory), one entry per line. Entries are joined
    into a single alternation wrapped in word-boundary anchors.

    Returns the compiled pattern, or None when the file does not exist or
    holds no non-blank entries.
    """
    file_path = 'bad-words/%s.txt' % lang
    if not os.path.isfile(file_path):
        return None

    with open(file_path) as profanity_file:
        # Trailing whitespace/newlines are stripped; blank lines are dropped
        # because an empty alternative would match at every word boundary
        # and flag virtually every sentence as profane.
        words = [entry for entry in (line.rstrip() for line in profanity_file) if entry]

    # An empty list must not be compiled: it would yield an unbalanced
    # pattern and raise re.error.
    if not words:
        return None

    # NOTE(review): entries are inserted as regex fragments, not escaped,
    # exactly as before -- confirm the word lists contain no metacharacters.
    return re.compile(r'\b(' + '|'.join(words) + r')\b')
|
||||||
|
|
||||||
|
def containsProfanity(pattern, sentence):
    """Report whether ``sentence`` contains a match for ``pattern``.

    pattern:  a regex (compiled or string) to search for, or None when no
              profanity list is available for the language.
    sentence: the text to scan.

    Returns True if the pattern matches anywhere in the sentence, False
    otherwise; a None pattern never matches.
    """
    if pattern is None:
        return False
    return re.search(pattern, sentence) is not None
|
||||||
|
|
||||||
max_tokens = 100
|
max_tokens = 100
|
||||||
max_ratio = 4.0
|
max_ratio = 4.0
|
||||||
|
|
||||||
#./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt
|
#./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $(SRC_LANG) $(TRG_LANG)
|
||||||
|
|
||||||
|
src_profanity_pattern = readProfanityPattern(sys.argv[13])
|
||||||
|
trg_profanity_pattern = readProfanityPattern(sys.argv[14])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -22,7 +50,7 @@ with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as id
|
|||||||
trg_token_count = len(trg_line_lem.split())
|
trg_token_count = len(trg_line_lem.split())
|
||||||
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
|
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
|
||||||
ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
|
ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
|
||||||
if (ratio <= max_ratio):
|
if (ratio <= max_ratio and (not containsProfanity(src_profanity_pattern, src_line_lem)) and (not containsProfanity(trg_profanity_pattern, trg_line_lem))):
|
||||||
src_clean.write(src_line_orig+"\n")
|
src_clean.write(src_line_orig+"\n")
|
||||||
src_clean_lem.write(src_line_lem+"\n")
|
src_clean_lem.write(src_line_lem+"\n")
|
||||||
trg_clean.write(trg_line_orig+"\n")
|
trg_clean.write(trg_line_orig+"\n")
|
||||||
|
Loading…
Reference in New Issue
Block a user