better bad words filtering
This commit is contained in:
parent
a679b26726
commit
8496f12f16
3
fast-aligner/bad-words/parts.en
Normal file
3
fast-aligner/bad-words/parts.en
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
fuck
|
||||||
|
shit
|
||||||
|
nigga
|
4
fast-aligner/bad-words/parts.pl
Normal file
4
fast-aligner/bad-words/parts.pl
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
pierdol
|
||||||
|
pierdal
|
||||||
|
kurw
|
||||||
|
ścierw
|
@ -1,9 +1,14 @@
|
|||||||
arse
|
arse
|
||||||
ass
|
ass
|
||||||
|
asshole
|
||||||
|
assholes
|
||||||
ballsack
|
ballsack
|
||||||
bastard
|
bastard
|
||||||
biatch
|
biatch
|
||||||
bitch
|
bitch
|
||||||
|
bitchcraft
|
||||||
|
bitchen
|
||||||
|
bitchy
|
||||||
blowjob
|
blowjob
|
||||||
bollock
|
bollock
|
||||||
bollok
|
bollok
|
||||||
@ -17,6 +22,12 @@ crap
|
|||||||
cunt
|
cunt
|
||||||
damn
|
damn
|
||||||
dick
|
dick
|
||||||
|
dicker
|
||||||
|
dickerin
|
||||||
|
dickhead
|
||||||
|
dickheads
|
||||||
|
dickweed
|
||||||
|
dickweeds
|
||||||
dildo
|
dildo
|
||||||
dyke
|
dyke
|
||||||
fag
|
fag
|
||||||
@ -30,10 +41,13 @@ fuck
|
|||||||
fudgepacker
|
fudgepacker
|
||||||
Goddamn
|
Goddamn
|
||||||
hell
|
hell
|
||||||
|
Horsedick
|
||||||
jizz
|
jizz
|
||||||
knobend
|
knobend
|
||||||
muff
|
muff
|
||||||
|
needledick
|
||||||
nigga
|
nigga
|
||||||
|
niggas
|
||||||
nigger
|
nigger
|
||||||
penis
|
penis
|
||||||
piss
|
piss
|
||||||
@ -44,6 +58,8 @@ scrotum
|
|||||||
sh1t
|
sh1t
|
||||||
shit
|
shit
|
||||||
slut
|
slut
|
||||||
|
sluttish
|
||||||
|
slutty
|
||||||
spunk
|
spunk
|
||||||
toss
|
toss
|
||||||
tosser
|
tosser
|
||||||
@ -51,4 +67,7 @@ turd
|
|||||||
twat
|
twat
|
||||||
vagina
|
vagina
|
||||||
wank
|
wank
|
||||||
|
wanker
|
||||||
whore
|
whore
|
||||||
|
whorehouse
|
||||||
|
whoremonger
|
@ -4,15 +4,24 @@
|
|||||||
import sys, re, os
|
import sys, re, os
|
||||||
|
|
||||||
|
|
||||||
def readProfanityPattern(lang, whole):
    """Build the compiled profanity regex for one language.

    The word list is read from 'bad-words/whole.<lang>' when `whole` is
    true and from 'bad-words/parts.<lang>' otherwise; both paths are
    relative to the current working directory. Each non-blank line of the
    file becomes one alternative of the pattern and is inserted verbatim,
    i.e. it is interpreted as a regular-expression fragment.

    Args:
        lang: language code used as the word-list file extension.
        whole: when true, entries must match as complete words (the
            alternation is wrapped in word boundaries); when false,
            entries match anywhere inside a token, so stems such as
            'kurw' also catch their inflected forms.

    Returns:
        The compiled pattern, or None when the word-list file does not
        exist or contains no entries.
    """
    if whole:
        file_path = 'bad-words/whole.' + lang
    else:
        file_path = 'bad-words/parts.' + lang

    if not os.path.isfile(file_path):
        return None

    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm that this is UTF-8 wherever lists with non-ASCII entries are used.
    with open(file_path) as profanity_file:
        entries = [line.strip() for line in profanity_file]

    # A blank line would become an empty alternative, which matches every
    # input and would make the filter reject all sentences.
    entries = [entry for entry in entries if entry]
    if not entries:
        return None

    raw_pattern = '(' + '|'.join(entries) + ')'
    if whole:
        # Only the whole-word list is anchored on word boundaries; anchoring
        # the parts list would stop stems from matching inside longer words.
        raw_pattern = r'\b' + raw_pattern + r'\b'
    return re.compile(raw_pattern)
|
||||||
|
|
||||||
@ -32,9 +41,13 @@ max_ratio = 4.0
|
|||||||
|
|
||||||
#./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $SRC_LANG $TRG_LANG
|
#./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt $SRC_LANG $TRG_LANG
|
||||||
|
|
||||||
src_profanity_pattern = readProfanityPattern(sys.argv[13])
|
src_lang = sys.argv[13]
|
||||||
trg_profanity_pattern = readProfanityPattern(sys.argv[14])
|
trg_lang = sys.argv[14]
|
||||||
|
|
||||||
|
src_profanity_whole_pattern = readProfanityPattern(src_lang, True)
|
||||||
|
src_profanity_parts_pattern = readProfanityPattern(src_lang, False)
|
||||||
|
trg_profanity_whole_pattern = readProfanityPattern(trg_lang, True)
|
||||||
|
trg_profanity_parts_pattern = readProfanityPattern(trg_lang, False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -50,7 +63,12 @@ with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as id
|
|||||||
trg_token_count = len(trg_line_lem.split())
|
trg_token_count = len(trg_line_lem.split())
|
||||||
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
|
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
|
||||||
ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
|
ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
|
||||||
if (ratio <= max_ratio and (not containsProfanity(src_profanity_pattern, src_line_lem)) and (not containsProfanity(trg_profanity_pattern, trg_line_lem))):
|
if (ratio <= max_ratio
|
||||||
|
and (not containsProfanity(src_profanity_whole_pattern, src_line_lem))
|
||||||
|
and (not containsProfanity(src_profanity_parts_pattern, src_line_lem))
|
||||||
|
and (not containsProfanity(trg_profanity_whole_pattern, trg_line_lem))
|
||||||
|
and (not containsProfanity(trg_profanity_parts_pattern, trg_line_lem))
|
||||||
|
):
|
||||||
src_clean.write(src_line_orig+"\n")
|
src_clean.write(src_line_orig+"\n")
|
||||||
src_clean_lem.write(src_line_lem+"\n")
|
src_clean_lem.write(src_line_lem+"\n")
|
||||||
trg_clean.write(trg_line_orig+"\n")
|
trg_clean.write(trg_line_orig+"\n")
|
||||||
|
Loading…
Reference in New Issue
Block a user