better bad words filtering
This commit is contained in:
parent
a679b26726
commit
8496f12f16
3
fast-aligner/bad-words/parts.en
Normal file
3
fast-aligner/bad-words/parts.en
Normal file
@@ -0,0 +1,3 @@
|
||||
fuck
|
||||
shit
|
||||
nigga
|
4
fast-aligner/bad-words/parts.pl
Normal file
4
fast-aligner/bad-words/parts.pl
Normal file
@@ -0,0 +1,4 @@
|
||||
pierdol
|
||||
pierdal
|
||||
kurw
|
||||
ścierw
|
@@ -1,9 +1,14 @@
|
||||
arse
|
||||
ass
|
||||
asshole
|
||||
assholes
|
||||
ballsack
|
||||
bastard
|
||||
biatch
|
||||
bitch
|
||||
bitchcraft
|
||||
bitchen
|
||||
bitchy
|
||||
blowjob
|
||||
bollock
|
||||
bollok
|
||||
@@ -17,6 +22,12 @@ crap
|
||||
cunt
|
||||
damn
|
||||
dick
|
||||
dicker
|
||||
dickerin
|
||||
dickhead
|
||||
dickheads
|
||||
dickweed
|
||||
dickweeds
|
||||
dildo
|
||||
dyke
|
||||
fag
|
||||
@@ -30,10 +41,13 @@ fuck
|
||||
fudgepacker
|
||||
Goddamn
|
||||
hell
|
||||
Horsedick
|
||||
jizz
|
||||
knobend
|
||||
muff
|
||||
needledick
|
||||
nigga
|
||||
niggas
|
||||
nigger
|
||||
penis
|
||||
piss
|
||||
@@ -44,6 +58,8 @@ scrotum
|
||||
sh1t
|
||||
shit
|
||||
slut
|
||||
sluttish
|
||||
slutty
|
||||
spunk
|
||||
toss
|
||||
tosser
|
||||
@@ -51,4 +67,7 @@ turd
|
||||
twat
|
||||
vagina
|
||||
wank
|
||||
wanker
|
||||
whore
|
||||
whorehouse
|
||||
whoremonger
|
@@ -4,15 +4,24 @@
|
||||
import sys, re, os
|
||||
|
||||
|
||||
def readProfanityPattern(lang, whole):
    """Compile the profanity regex for ``lang`` from its word-list file.

    The list is read from ``bad-words/whole.<lang>`` when ``whole`` is true
    and from ``bad-words/parts.<lang>`` otherwise (both relative to the
    current working directory), one entry per line.

    Args:
        lang: Language code used as the word-list file extension.
        whole: True to match list entries only as complete words (the
            alternation is wrapped in ``\\b`` anchors); False to match
            entries anywhere, including inside longer words, which is what
            a list of stems needs.

    Returns:
        A compiled pattern, or None when the list file does not exist or
        contains no non-blank entries.
    """
    if whole:
        file_path = 'bad-words/whole.' + lang
    else:
        file_path = 'bad-words/parts.' + lang

    if not os.path.isfile(file_path):
        return None

    with open(file_path) as profanity_file:
        stripped_lines = [line.rstrip() for line in profanity_file]

    # Blank lines must be dropped: an empty alternative matches the empty
    # string, which would flag every single sentence as profane.
    entries = [entry for entry in stripped_lines if entry]
    if not entries:
        # An empty alternation would not be a usable pattern; behave as if
        # there were no list for this language.
        return None

    # Entries are inserted verbatim (not escaped), so a list line may
    # itself be a regular-expression fragment.
    raw_pattern = '(' + '|'.join(entries) + ')'
    if whole:
        # Word boundaries belong to the whole-word list only; anchoring the
        # stem list would stop it from matching inflected forms.
        raw_pattern = r'\b' + raw_pattern + r'\b'

    return re.compile(raw_pattern)
|
||||
|
||||
@@ -32,9 +41,13 @@ max_ratio = 4.0
|
||||
|
||||
#./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $SRC_LANG $TRG_LANG
|
||||
|
||||
src_profanity_pattern = readProfanityPattern(sys.argv[13])
|
||||
trg_profanity_pattern = readProfanityPattern(sys.argv[14])
|
||||
src_lang = sys.argv[13]
|
||||
trg_lang = sys.argv[14]
|
||||
|
||||
src_profanity_whole_pattern = readProfanityPattern(src_lang, True)
|
||||
src_profanity_parts_pattern = readProfanityPattern(src_lang, False)
|
||||
trg_profanity_whole_pattern = readProfanityPattern(trg_lang, True)
|
||||
trg_profanity_parts_pattern = readProfanityPattern(trg_lang, False)
|
||||
|
||||
|
||||
|
||||
@@ -50,7 +63,12 @@ with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as id
|
||||
trg_token_count = len(trg_line_lem.split())
|
||||
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
|
||||
ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
|
||||
if (ratio <= max_ratio and (not containsProfanity(src_profanity_pattern, src_line_lem)) and (not containsProfanity(trg_profanity_pattern, trg_line_lem))):
|
||||
if (ratio <= max_ratio
|
||||
and (not containsProfanity(src_profanity_whole_pattern, src_line_lem))
|
||||
and (not containsProfanity(src_profanity_parts_pattern, src_line_lem))
|
||||
and (not containsProfanity(trg_profanity_whole_pattern, trg_line_lem))
|
||||
and (not containsProfanity(trg_profanity_parts_pattern, trg_line_lem))
|
||||
):
|
||||
src_clean.write(src_line_orig+"\n")
|
||||
src_clean_lem.write(src_line_lem+"\n")
|
||||
trg_clean.write(trg_line_orig+"\n")
|
||||
|
Loading…
Reference in New Issue
Block a user