better bad words filtering

This commit is contained in:
Rafał Jaworski 2019-03-04 14:26:48 +01:00
parent a679b26726
commit 8496f12f16
5 changed files with 51 additions and 7 deletions

View File

@ -0,0 +1,3 @@
fuck
shit
nigga

View File

@ -0,0 +1,4 @@
pierdol
pierdal
kurw
ścierw

View File

@ -1,9 +1,14 @@
arse
ass
asshole
assholes
ballsack
bastard
biatch
bitch
bitchcraft
bitchen
bitchy
blowjob
bollock
bollok
@ -17,6 +22,12 @@ crap
cunt
damn
dick
dicker
dickerin
dickhead
dickheads
dickweed
dickweeds
dildo
dyke
fag
@ -30,10 +41,13 @@ fuck
fudgepacker
Goddamn
hell
Horsedick
jizz
knobend
muff
needledick
nigga
niggas
nigger
penis
piss
@ -44,6 +58,8 @@ scrotum
sh1t
shit
slut
sluttish
slutty
spunk
toss
tosser
@ -51,4 +67,7 @@ turd
twat
vagina
wank
wanker
whore
whorehouse
whoremonger

View File

@ -4,15 +4,24 @@
import sys, re, os
def readProfanityPattern(lang, whole):
    """Build a compiled regex from a per-language profanity word list.

    The list is read from ``bad-words/whole.<lang>`` when *whole* is true
    and from ``bad-words/parts.<lang>`` otherwise, one entry per line.
    Entries are inserted into the pattern verbatim (they are not escaped),
    so a list entry may itself be a regex fragment.

    Args:
        lang: Language code used as the list file's extension (e.g. "en").
        whole: True to match entries only as complete words (the
            alternation is wrapped in ``\\b`` anchors); False to match
            entries anywhere inside a longer token, e.g. a stem inside an
            inflected form.

    Returns:
        A compiled pattern with a single capturing group over all entries,
        or None when the list file does not exist or has no non-blank
        entries.
    """
    prefix = 'whole.' if whole else 'parts.'
    file_path = 'bad-words/' + prefix + lang
    if not os.path.isfile(file_path):
        return None

    # The lists contain non-ASCII entries, so do not rely on the locale
    # encoding when decoding them.
    with open(file_path, encoding='utf-8') as profanity_file:
        stripped_lines = [line.strip() for line in profanity_file]

    # A blank line would become an empty alternative, which matches every
    # input string; drop such lines.
    words = [word for word in stripped_lines if word]
    if not words:
        # An empty alternation is not a usable pattern; report "no pattern"
        # the same way as for a missing file.
        return None

    raw_pattern = '(' + '|'.join(words) + ')'
    if whole:
        # Word-boundary anchors belong on the whole-word list only: without
        # them "ass" would match inside "class", and with them a stem such
        # as "kurw" could never match an inflected form.
        raw_pattern = r'\b' + raw_pattern + r'\b'
    return re.compile(raw_pattern)
@ -32,9 +41,13 @@ max_ratio = 4.0
#./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $SRC_LANG $TRG_LANG
# Build the profanity patterns for the source and target languages, whose
# codes are read from command-line arguments 13 and 14.
# NOTE(review): the next two calls use a one-argument form of
# readProfanityPattern; they look like the pre-change lines of this hunk and
# would not fit a two-argument (lang, whole) signature - confirm they are gone
# from the actual script.
src_profanity_pattern = readProfanityPattern(sys.argv[13])
trg_profanity_pattern = readProfanityPattern(sys.argv[14])
src_lang = sys.argv[13]
trg_lang = sys.argv[14]
# Two patterns per language: one requested with whole=True and one with
# whole=False.
src_profanity_whole_pattern = readProfanityPattern(src_lang, True)
src_profanity_parts_pattern = readProfanityPattern(src_lang, False)
trg_profanity_whole_pattern = readProfanityPattern(trg_lang, True)
trg_profanity_parts_pattern = readProfanityPattern(trg_lang, False)
@ -50,7 +63,12 @@ with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as id
trg_token_count = len(trg_line_lem.split())
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
if (ratio <= max_ratio and (not containsProfanity(src_profanity_pattern, src_line_lem)) and (not containsProfanity(trg_profanity_pattern, trg_line_lem))):
if (ratio <= max_ratio
and (not containsProfanity(src_profanity_whole_pattern, src_line_lem))
and (not containsProfanity(src_profanity_parts_pattern, src_line_lem))
and (not containsProfanity(trg_profanity_whole_pattern, trg_line_lem))
and (not containsProfanity(trg_profanity_parts_pattern, trg_line_lem))
):
src_clean.write(src_line_orig+"\n")
src_clean_lem.write(src_line_lem+"\n")
trg_clean.write(trg_line_orig+"\n")