censor sources

This commit is contained in:
Rafał Jaworski 2019-03-07 13:49:08 +01:00
parent 8496f12f16
commit ced8bd00b6
2 changed files with 38 additions and 3 deletions

View File

@ -21,7 +21,6 @@ coon
crap crap
cunt cunt
damn damn
dick
dicker dicker
dickerin dickerin
dickhead dickhead
@ -40,7 +39,6 @@ flange
fuck fuck
fudgepacker fudgepacker
Goddamn Goddamn
hell
Horsedick Horsedick
jizz jizz
knobend knobend
@ -61,7 +59,6 @@ slut
sluttish sluttish
slutty slutty
spunk spunk
toss
tosser tosser
turd turd
twat twat

38
fast-aligner/censor_sources.py Executable file
View File

@ -0,0 +1,38 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys, re, os
# Censor profanities in a text file.
#
# Usage: censor_sources.py SOURCES_FILE
#
# Word lists are read from the ``bad-words`` directory (relative to the
# current working directory), one entry per line.  Files whose name starts
# with ``whole`` hold words that are censored only when they appear as whole
# words; entries from every other file are censored wherever they occur,
# even inside a longer word.  Each input line is printed to stdout (censored
# if needed) and every change is reported on stderr.

BAD_WORDS_DIR = 'bad-words'
CENSOR_MASK = '*' * 5


def load_profanities(directory=BAD_WORDS_DIR):
    """Read the word lists stored in *directory*.

    Returns a ``(whole_words, part_words)`` tuple of lists.  Entries from
    files whose name starts with ``whole`` go to the first list, entries
    from all other files to the second.  Blank lines are skipped.
    """
    whole_words = []
    part_words = []
    # Sorted so the result does not depend on the OS directory order.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue
        target = whole_words if file_name.startswith('whole') else part_words
        with open(file_path) as word_file:
            for raw_word in word_file:
                word = raw_word.rstrip()
                # An empty alternative would match at every position of
                # every line, so blank lines must never reach the pattern.
                if word:
                    target.append(word)
    return whole_words, part_words


def build_pattern(words, whole_word=False):
    """Compile a case-insensitive pattern matching any of *words*.

    Words are matched literally (regex metacharacters are escaped).  With
    ``whole_word=True`` the alternation is wrapped in ``\\b`` anchors.
    Returns ``None`` when *words* contains no non-empty entry, because an
    empty alternation cannot be compiled into a meaningful pattern.
    """
    # Longest first: regex alternation takes the first alternative that
    # matches, so "dickhead" has to be tried before "dick".  The secondary
    # alphabetical key keeps the pattern deterministic.
    unique_words = sorted({word for word in words if word},
                          key=lambda word: (-len(word), word))
    if not unique_words:
        return None
    alternation = '|'.join(re.escape(word) for word in unique_words)
    if whole_word:
        raw_pattern = r'\b(' + alternation + r')\b'
    else:
        raw_pattern = '(' + alternation + ')'
    return re.compile(raw_pattern, re.IGNORECASE)


def censor_line(line, whole_pattern, parts_pattern):
    """Mask every profanity found in *line*.

    Either pattern may be ``None`` (no words of that category).  Returns a
    ``(censored_line, replacement_count)`` tuple; the count is 0 when the
    line was left untouched.
    """
    censored = line
    total_hits = 0
    # Whole-word pass first, then the substring pass on its result -- the
    # same order the original script applied.
    for pattern in (whole_pattern, parts_pattern):
        if pattern is None:
            continue
        censored, hits = pattern.subn(CENSOR_MASK, censored)
        total_hits += hits
    return censored, total_hits


def main(argv=None):
    """Censor the file named by ``argv[1]``; return a process exit code."""
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        program = args[0] if args else 'censor_sources.py'
        sys.stderr.write('Usage: %s SOURCES_FILE\n' % program)
        return 2

    whole_words, part_words = load_profanities()
    whole_pattern = build_pattern(whole_words, whole_word=True)
    parts_pattern = build_pattern(part_words)

    with open(args[1]) as sources_file:
        for raw_line in sources_file:
            line = raw_line.rstrip()
            censored, hits = censor_line(line, whole_pattern, parts_pattern)
            print(censored)
            if hits:
                sys.stderr.write('Censored: %s to %s\n' % (line, censored))
    return 0


if __name__ == '__main__':
    sys.exit(main())