From ced8bd00b643ea1925b6993317baabc604b1fd7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Jaworski?= <rjawor@amu.edu.pl>
Date: Thu, 7 Mar 2019 13:49:08 +0100
Subject: [PATCH] censor sources

---
 fast-aligner/bad-words/whole.en |  3 ---
 fast-aligner/censor_sources.py  | 38 +++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100755 fast-aligner/censor_sources.py

diff --git a/fast-aligner/bad-words/whole.en b/fast-aligner/bad-words/whole.en
index c98b06d..12a685c 100644
--- a/fast-aligner/bad-words/whole.en
+++ b/fast-aligner/bad-words/whole.en
@@ -21,7 +21,6 @@ coon
 crap
 cunt
 damn
-dick
 dicker
 dickerin
 dickhead
@@ -40,7 +39,6 @@ flange
 fuck
 fudgepacker
 Goddamn
-hell
 Horsedick
 jizz
 knobend
@@ -61,7 +59,6 @@ slut
 sluttish
 slutty
 spunk
-toss
 tosser
 turd
 twat
diff --git a/fast-aligner/censor_sources.py b/fast-aligner/censor_sources.py
new file mode 100755
index 0000000..25d4441
--- /dev/null
+++ b/fast-aligner/censor_sources.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""Mask profanities in the text file given as the first argument.
+
+Every line is printed with matches replaced by '*****'; changes go to stderr.
+"""
+
+import sys, re, os
+
+if len(sys.argv) < 2:
+    sys.exit('Usage: %s SOURCES_FILE' % sys.argv[0])
+
+# Files in bad-words/ named 'whole*' hold whole words; all others hold fragments.
+whole_words, part_words = [], []
+for profanity_file_path in os.listdir('bad-words'):
+    bucket = whole_words if profanity_file_path.startswith('whole') else part_words
+    with open('bad-words/' + profanity_file_path) as pf:
+        # Drop blank lines: an empty alternative would match at every position.
+        bucket.extend(word for word in map(str.rstrip, pf) if word)
+
+# A category with no words is skipped instead of compiling a broken pattern.
+patterns = []
+if whole_words:
+    patterns.append(re.compile(r'\b(' + '|'.join(whole_words) + r')\b', re.IGNORECASE))
+if part_words:
+    patterns.append(re.compile('(' + '|'.join(part_words) + ')', re.IGNORECASE))
+
+with open(sys.argv[1]) as sources_file:
+    for line in sources_file:
+        line = line.rstrip()
+        censored, replaced = line, 0
+        for pattern in patterns:
+            # subn() reports how many matches it replaced, so no separate search.
+            censored, count = pattern.subn('*' * 5, censored)
+            replaced += count
+        print(censored)
+        if replaced:
+            sys.stderr.write('Censored: %s to %s\n' % (line, censored))