From ced8bd00b643ea1925b6993317baabc604b1fd7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Jaworski?=
Date: Thu, 7 Mar 2019 13:49:08 +0100
Subject: [PATCH] censor sources

---
Review note: the body of censor_sources.py below was revised during review,
so the blob id on its index line no longer matches the new content.

 fast-aligner/bad-words/whole.en |  3 ---
 fast-aligner/censor_sources.py  | 90 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 3 deletions(-)
 create mode 100755 fast-aligner/censor_sources.py

diff --git a/fast-aligner/bad-words/whole.en b/fast-aligner/bad-words/whole.en
index c98b06d..12a685c 100644
--- a/fast-aligner/bad-words/whole.en
+++ b/fast-aligner/bad-words/whole.en
@@ -21,7 +21,6 @@ coon
 crap
 cunt
 damn
-dick
 dicker
 dickerin
 dickhead
@@ -40,7 +39,6 @@ flange
 fuck
 fudgepacker
 Goddamn
-hell
 Horsedick
 jizz
 knobend
@@ -61,7 +59,6 @@ slut
 sluttish
 slutty
 spunk
-toss
 tosser
 turd
 twat
diff --git a/fast-aligner/censor_sources.py b/fast-aligner/censor_sources.py
new file mode 100755
index 0000000..25d4441
--- /dev/null
+++ b/fast-aligner/censor_sources.py
@@ -0,0 +1,90 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""Mask profanities in a text file, one line at a time.
+
+Usage: censor_sources.py SOURCES_FILE
+
+Word lists are read from the 'bad-words' directory, resolved against the
+current working directory, with one entry per line.  Entries from files
+whose name starts with 'whole' match only as whole words; entries from
+any other file match anywhere, including inside longer words.  Matching
+is case-insensitive.  Every input line is printed to stdout with matches
+replaced by '*****'; each changed line is also reported on stderr.
+"""
+
+import os
+import re
+import sys
+
+BAD_WORDS_DIR = 'bad-words'
+MASK = '*' * 5
+
+
+def read_words(path):
+    """Return the non-blank entries of the word-list file at path."""
+    with open(path) as word_file:
+        entries = (entry.rstrip() for entry in word_file)
+        # A blank entry would turn into an empty alternative, which
+        # matches at every position and would mask every line.
+        return [entry for entry in entries if entry]
+
+
+def compile_pattern(words, whole):
+    """Compile a case-insensitive alternation of literal words.
+
+    With whole set, matches must start and end on a word boundary.
+    Returns None for an empty word list, which has nothing to match.
+    """
+    if not words:
+        return None
+    # Longer entries go first so a short entry cannot mask just the
+    # beginning of a longer one; the secondary key keeps order stable.
+    ordered = sorted(set(words), key=lambda word: (-len(word), word))
+    # Entries are plain text, so regex metacharacters are escaped.
+    body = '|'.join(re.escape(word) for word in ordered)
+    if whole:
+        body = r'\b(?:' + body + r')\b'
+    return re.compile(body, re.IGNORECASE)
+
+
+def load_patterns(directory):
+    """Return the (whole-word, substring) patterns for a directory."""
+    whole_words = []
+    part_words = []
+    for name in sorted(os.listdir(directory)):
+        path = os.path.join(directory, name)
+        if not os.path.isfile(path):
+            continue
+        target = whole_words if name.startswith('whole') else part_words
+        target.extend(read_words(path))
+    return (compile_pattern(whole_words, True),
+            compile_pattern(part_words, False))
+
+
+def censor(line, patterns):
+    """Return line with every match of the given patterns masked."""
+    for pattern in patterns:
+        if pattern is not None:
+            line = pattern.sub(MASK, line)
+    return line
+
+
+def main(argv):
+    """Censor the file named by argv[1]; return the exit status."""
+    if len(argv) < 2:
+        sys.stderr.write('Usage: %s SOURCES_FILE\n' % argv[0])
+        return 2
+    patterns = load_patterns(BAD_WORDS_DIR)
+    with open(argv[1]) as sources_file:
+        for line in sources_file:
+            line = line.rstrip()
+            censored = censor(line, patterns)
+            print(censored)
+            if censored != line:
+                sys.stderr.write(
+                    'Censored: %s to %s\n' % (line, censored))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))