better bad words filtering

This commit is contained in:
Rafał Jaworski 2019-03-04 14:26:48 +01:00
parent a679b26726
commit 8496f12f16
5 changed files with 51 additions and 7 deletions

View File

@ -0,0 +1,3 @@
fuck
shit
nigga

View File

@ -0,0 +1,4 @@
pierdol
pierdal
kurw
ścierw

View File

@ -1,9 +1,14 @@
arse arse
ass ass
asshole
assholes
ballsack ballsack
bastard bastard
biatch biatch
bitch bitch
bitchcraft
bitchen
bitchy
blowjob blowjob
bollock bollock
bollok bollok
@ -17,6 +22,12 @@ crap
cunt cunt
damn damn
dick dick
dicker
dickerin
dickhead
dickheads
dickweed
dickweeds
dildo dildo
dyke dyke
fag fag
@ -30,10 +41,13 @@ fuck
fudgepacker fudgepacker
Goddamn Goddamn
hell hell
Horsedick
jizz jizz
knobend knobend
muff muff
needledick
nigga nigga
niggas
nigger nigger
penis penis
piss piss
@ -44,6 +58,8 @@ scrotum
sh1t sh1t
shit shit
slut slut
sluttish
slutty
spunk spunk
toss toss
tosser tosser
@ -51,4 +67,7 @@ turd
twat twat
vagina vagina
wank wank
wanker
whore whore
whorehouse
whoremonger

View File

@ -4,15 +4,24 @@
import sys, re, os import sys, re, os
def readProfanityPattern(lang, whole):
    """Compile the profanity regex for one language from a bad-words list.

    Parameters:
        lang:  language code used as the list file's extension
               (the file read is 'bad-words/whole.<lang>' or
               'bad-words/parts.<lang>').
        whole: True to load the whole-word list, False to load the
               word-part (stem) list.

    Returns:
        A compiled pattern, or None when the list file does not exist or
        contains no entries.

    Entries of the whole-word list are wrapped in \\b anchors so that e.g.
    "ass" does not match inside "class"; entries of the parts list are
    matched anywhere so that a stem also catches its inflected forms.
    Entries are inserted into the regex verbatim (they are not escaped).
    """
    if whole:
        file_path = 'bad-words/whole.'+ lang
    else:
        file_path = 'bad-words/parts.'+ lang

    if not os.path.isfile(file_path):
        return None

    with open(file_path) as profanity_file:
        entries = [line.rstrip() for line in profanity_file]
    # A blank line would become an empty alternative, which matches every
    # input and would flag each sentence as profane.
    entries = [entry for entry in entries if entry]
    if not entries:
        # An empty alternation cannot be compiled into a useful pattern;
        # behave as if the list were missing.
        return None

    raw_pattern = '(' + '|'.join(entries) + ')'
    if whole:
        # Word boundaries belong to the whole-word list only.
        raw_pattern = r'\b' + raw_pattern + r'\b'
    return re.compile(raw_pattern)
@ -32,9 +41,13 @@ max_ratio = 4.0
#./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $SRC_LANG $TRG_LANG #./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $SRC_LANG $TRG_LANG
# The source and target language codes are the 13th and 14th command-line
# arguments.
src_lang, trg_lang = sys.argv[13], sys.argv[14]

# Two patterns per language: the second argument selects the 'whole' list
# (True) or the 'parts' list (False). A pattern is None when its list file
# is absent.
src_profanity_whole_pattern = readProfanityPattern(src_lang, True)
src_profanity_parts_pattern = readProfanityPattern(src_lang, False)
trg_profanity_whole_pattern = readProfanityPattern(trg_lang, True)
trg_profanity_parts_pattern = readProfanityPattern(trg_lang, False)
@ -50,7 +63,12 @@ with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as id
trg_token_count = len(trg_line_lem.split()) trg_token_count = len(trg_line_lem.split())
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens): if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count) ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
if (ratio <= max_ratio and (not containsProfanity(src_profanity_pattern, src_line_lem)) and (not containsProfanity(trg_profanity_pattern, trg_line_lem))): if (ratio <= max_ratio
and (not containsProfanity(src_profanity_whole_pattern, src_line_lem))
and (not containsProfanity(src_profanity_parts_pattern, src_line_lem))
and (not containsProfanity(trg_profanity_whole_pattern, trg_line_lem))
and (not containsProfanity(trg_profanity_parts_pattern, trg_line_lem))
):
src_clean.write(src_line_orig+"\n") src_clean.write(src_line_orig+"\n")
src_clean_lem.write(src_line_lem+"\n") src_clean_lem.write(src_line_lem+"\n")
trg_clean.write(trg_line_orig+"\n") trg_clean.write(trg_line_orig+"\n")