diff --git a/Makefile b/Makefile
index b00c303..3d3353c 100644
--- a/Makefile
+++ b/Makefile
@@ -7,12 +7,17 @@ JOBS := 100
 
 all: data
 
+# Re-crawl unfinished parishes: the script rebuilds spider-commands-add.txt from spider-commands.txt.
+.PHONY: data-add
+data-add: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt parishwebsites/deal-with-not-completed.sh
+	cd parishwebsites && ./deal-with-not-completed.sh && parallel --jobs $(JOBS) < spider-commands-add.txt
+
 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	rm -f parishwebsites/*processed.txt
 	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt
 
 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
-	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
+	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) | parishwebsites/remove_duplicate_commands.py > $@
 
 parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
 	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
diff --git a/annotator.py b/annotator.py
new file mode 100755
index 0000000..41bf6d1
--- /dev/null
+++ b/annotator.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+import jsonlines
+from extractor.find_hours import hours_iterator
+from parishwebsites.parish2text import Parish2Text
+import os
+import random
+
+parish2text = Parish2Text()  # module-level converter, used by process_parish_file
+
+CONTEXT = 100  # NOTE(review): not referenced anywhere in this file - confirm it is still needed
+
+
+def process_parish_page(parish_page):
+    """Pop 'content' from parish_page, print each colored utterance and break into ipdb after it."""
+    content = parish_page.pop('content')
+    for utterance, utterance_colored in hours_iterator(content):
+        print(utterance_colored)
+        import ipdb; ipdb.set_trace()
+
+
+def process_parish_file(parish_reader):
+    """Convert every page yielded by parish_reader to text and process it."""
+    for raw_page in parish_reader:
+        process_parish_page(parish2text.convert(raw_page))
+
+
+def process_directory(directory):
+    """Process every non-empty jsonlines file found under directory."""
+    for folder, _subdirs, filenames in os.walk(directory):
+        # Visit the files of each folder in sorted name order.
+        for path in (os.path.join(folder, name) for name in sorted(filenames)):
+            if os.path.getsize(path) > 0:  # empty files are skipped
+                with jsonlines.open(path) as parish_reader:
+                    process_parish_file(parish_reader)
+
+def main():  # entry point: process the hard-coded crawl data directory
+    process_directory('./parishwebsites/data')
+
+if __name__ == '__main__':
+    main()
diff --git a/extractor/extract.py b/extractor/extract.py
deleted file mode 100755
index 4acccfe..0000000
--- a/extractor/extract.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env python3
-from colorama import Fore, Back, Style
-import os
-import jsonlines
-import re
-import pprint
-
-
-class Extractor:
-    def __init__(self, page):
-        "docstring"
-        self.page = page
-        self.content = page['content']
-        self.header = self.wrap_with_name_group(
-            'header',
-            'porządek mszy (świętych|św|św\.)|msz[ea][ \n]+([śs]wi[eę]t[ea]|św|św\.)'
-        )
-
-        self.sunday_title = self.wrap_with_name_group(
-            'sunday_title',
-            'niedziel[a|e][ \n]+i[ \n]+(dni[ \n]+(świąteczne|św|św\.)|święta)'
-            '|niedziel[ea]'
-            '|porządek świąteczny')
-        #'|święta'
-        self.sunday_masses = self.wrap_with_name_group(
-            'sunday_masses', '.*[^\d]\d{1,2}[^\d].*?')
-        self.everyday_title = self.wrap_with_name_group(
-            'everyday_title', 'dzień powszedni'
-            '|dni powszednie'
-            '|w tygodniu'
-            '|porządek zwykły'
-            '|od poniedziałku do soboty')
-        self.everyday_masses = self.wrap_with_name_group(
-            'everyday_masses',
-            '(.*?[^\d\n]?\d{1,2}[^\d\n]?.*?\n)+')  # \n lub koniec stringa
-
-    def wrap_with_name_group(self, name, pattern):
-        return '(?P<{}>{})'.format(name, pattern)
-
-    def extract(self, search_space=None):
-        if not search_space:
-            search_space = self.content
-        header_match = re.search(self.header, search_space, re.I)
-        if not header_match:
-            return None
-        search_space = search_space[header_match.end():]
-
-        sunday_title_match = re.search(self.sunday_title, search_space, re.I)
-        if not sunday_title_match:
-            return None
-        if re.search(self.header, search_space[:sunday_title_match.start()],
-                     re.I):  # found header closer to sunday title
-            return self.extract(search_space)
-        if sunday_title_match.start() > 50:
-            return self.extract(search_space[sunday_title_match.end()])
-
-        everyday_title_match = re.search(self.everyday_title, search_space,
-                                         re.I)
-        if not everyday_title_match:
-            return None
-        sunday_masses_hours = search_space[sunday_title_match.end():
-                                           everyday_title_match.start()]
-        if not re.search(self.sunday_masses, sunday_masses_hours,
-                         re.DOTALL | re.I):
-            return None
-        if len(sunday_masses_hours) > 500:
-            return self.extract(search_space[sunday_title_match.end():])
-        everyday_masses_match = re.search(
-            self.everyday_masses, search_space[everyday_title_match.end():],
-            re.I)
-        if not everyday_masses_match:
-            return None
-        if everyday_masses_match.start() > 150:
-            return self.extract(search_space[sunday_title_match.end():])
-
-        whole_result = header_match.group(
-            0) + search_space[:everyday_masses_match.end() +
-                              everyday_title_match.end()]
-        groups = (header_match.group(0), sunday_title_match.group(0),
-                  sunday_masses_hours, everyday_title_match.group(0),
-                  everyday_masses_match.group(0))
-        # print(whole_result)
-        # print(groups)
-        # obsłużyć # TODO:
-        # w dni powszednie (w roku szkolnym) - górny kościół
-        # 6:30, 7:00, 8:00, 18:00
-        # w dni powszednie (czas wakacji) - górny kościół
-        # 7:00, 8:00, 18:00
-
-        print('url: {}\ndepth: {}\nbutton: {}'.format(self.page[
-            'url'], self.page['depth'], self.page['button_text']))
-        return whole_result, groups
-
-
-def process_directory(directory):
-    found = 0
-    not_found = 0
-    for root, dirs, files in os.walk(directory):
-        for fname in files:
-            filepath = os.path.join(root, fname)
-            if os.path.getsize(filepath) > 0:
-                with jsonlines.open(filepath) as reader:
-                    # print(filepath)
-                    if process_parish(reader):
-                        found += 1
-                    else:
-                        not_found += 1
-                    # print('found: {}\nnot_found: {}'.format(found, not_found))
-            else:
-                pass  # empty file
-
-
-def color_match(whole_match, groups, background, colors, style):
-    for i in range(len(groups)):
-        whole_match = whole_match.replace(
-            groups[i], colors[i] + background + style + groups[i] +
-            Style.RESET_ALL + background + style, 1)
-    return whole_match + Style.RESET_ALL
-
-
-def process_parish(reader):
-    for page in sorted(reader, key=lambda x: x['depth']):  #sort by depth
-        extractor = Extractor(page)
-        result = extractor.extract()
-        if result:
-            whole_result, groups = result
-            if whole_result not in page['content']:
-                import ipdb
-                ipdb.set_trace()
-            pretty_text = page['content'].replace(
-                whole_result,
-                color_match(whole_result, groups, Back.BLACK, [
-                    Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN
-                ], Style.BRIGHT))
-            print(pretty_text)
-            import ipdb
-            ipdb.set_trace()
-            return True
-        else:
-            return False
-            # import ipdb
-            # ipdb.set_trace()
-            pass
-
-
-def main():
-    process_directory('./parishwebsites/data-final')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/parishwebsites/convert_content2text.py b/parishwebsites/convert_content2text.py
deleted file mode 100755
index 754c0a7..0000000
--- a/parishwebsites/convert_content2text.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python3
-import jsonlines
-import sys
-import html2text
-
-
-def convert_html_to_text(parish, text_maker):
-    html = parish['content']
-    text = text_maker.handle(html)
-    parish['content'] = text
-    return parish
-
-
-def main():
-    text_maker = html2text.HTML2Text()
-    text_maker.ignore_links = True
-    text_maker.ignore_images = True
-    writer = jsonlines.Writer(sys.stdout)
-    # text_maker.wrap_links = False
-    # text_maker.strong_mark = ''
-    with jsonlines.open(sys.argv[1]) as reader:
-        for parish in reader:
-            parish = convert_html_to_text(parish, text_maker)
-            writer.write(parish)
-    writer.close()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/parishwebsites/deal-with-not-completed.sh b/parishwebsites/deal-with-not-completed.sh
new file mode 100755
index 0000000..035e757
--- /dev/null
+++ b/parishwebsites/deal-with-not-completed.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Prepare a re-crawl of parishes whose crawl did not finish: collect their
+# names in not-completed, drop their leftovers from processed.txt, data/ and
+# logs/, and write the matching commands to spider-commands-add.txt.
+
+./find-not-completed.sh > not-completed
+# Remove entries from processed.txt that did not truly finish.
+grep -v -f <(sed -e 's@^@\t@' -e 's@$@\$@' not-completed) processed.txt | sponge processed.txt
+
+# Append filenames from spider-commands.txt which are not in processed.txt.
+comm -13 <(cut -f2 processed.txt | sort -u) <(grep -o 'data/.*" 2>' spider-commands.txt | sed -Ee 's@data/|" 2>@@g' | sort) >> not-completed
+
+sort -u not-completed | sponge not-completed
+
+# Remove data connected with not-completed, e.g. logs/ and data/ files.
+# Subshells keep the working directory intact even when a cd fails.
+for dir in data logs; do
+    echo "$dir directory file count:" $(ls -1 "$dir" | wc -l)
+    (cd "$dir" && xargs rm -f < ../not-completed)
+    echo "$dir directory file count:" $(ls -1 "$dir" | wc -l)
+done
+
+# grep exits 1 when nothing matches; an empty command list is not an error.
+grep -f <(sed -e 's@^@"data/@' -e 's@$@"@' not-completed) spider-commands.txt > spider-commands-add.txt || [ $? -eq 1 ]
diff --git a/parishwebsites/find-not-completed.sh b/parishwebsites/find-not-completed.sh
new file mode 100755
index 0000000..7d81d25
--- /dev/null
+++ b/parishwebsites/find-not-completed.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# Print names of logs holding a line-initial disk-full OSError or a timestamped SIGTERM line, plus names of empty data files.
+(grep -rlE '^OSError.*No space left on device|^20.*Received SIGTERM' logs | sed -e 's@^logs/@@' &&\
+ find data -empty -type f | sed -e 's@data/@@'
+) | sort -u
diff --git a/parishwebsites/parish2text.py b/parishwebsites/parish2text.py
new file mode 100755
index 0000000..7fb93a0
--- /dev/null
+++ b/parishwebsites/parish2text.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+import jsonlines
+import sys
+import html2text
+import pprint
+import re
+
+
+class Parish2Text():
+    """Convert the HTML fields of a parish page record to plain text."""
+
+    def __init__(self):
+        """Create the html2text converter with its ignore_* options on and empty markup marks."""
+        self.text_maker = html2text.HTML2Text()
+        self.text_maker.ignore_links = True
+        self.text_maker.ignore_images = True
+        self.text_maker.images_to_alt = True
+        self.text_maker.strong_mark = ''
+        self.text_maker.ul_item_mark = ''
+        self.text_maker.emphasis_mark = ''
+        self.text_maker.ignore_tables = True
+
+    def convert(self, parish):
+        """Convert parish['content'] and parish['button_text'] in place.
+
+        button_text is additionally reduced to its word characters: every run
+        of non-word characters or underscores becomes a single space.
+        Returns the same parish dict.
+        """
+        parish['content'] = self.text_maker.handle(parish['content'])
+        button_text = self.text_maker.handle(parish['button_text'])
+        # Raw string: '\W' in a plain literal is an invalid escape sequence.
+        parish['button_text'] = ' '.join(re.sub(r'[\W_]+', ' ', button_text).split())
+        return parish
+
+
+def main():
+    """Read jsonlines records from stdin; print each record's metadata, then its text."""
+    parish2text = Parish2Text()
+    with jsonlines.Reader(line.rstrip('\n') for line in sys.stdin) as reader:
+        for parish in reader:
+            parish = parish2text.convert(parish)
+            parish_content = parish.pop('content')
+            pprint.pprint(parish)
+            print(parish_content)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/parishwebsites/parishwebsites/settings.py b/parishwebsites/parishwebsites/settings.py
index 3d340b8..bccc109 100644
--- a/parishwebsites/parishwebsites/settings.py
+++ b/parishwebsites/parishwebsites/settings.py
@@ -17,7 +17,7 @@ FEED_EXPORT_ENCODING = 'utf-8'
 LOG_LEVEL = 'DEBUG'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -85,7 +85,7 @@ AUTOTHROTTLE_TARGET_CONCURRENCY = 1
 # AUTOTHROTTLE_DEBUG = True
 
 RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
-RETRY_TIMES = 5
+RETRY_TIMES = 3
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 HTTPCACHE_ENABLED = True
@@ -93,6 +93,7 @@ HTTPCACHE_EXPIRATION_SECS = 1209600
 HTTPCACHE_DIR = 'httpcache'
 HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
 HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
+# HTTPCACHE_GZIP = True  # NOTE: per Scrapy docs this only affects FilesystemCacheStorage, not the Dbm storage set above
 DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
diff --git a/parishwebsites/remove_duplicate_commands.py b/parishwebsites/remove_duplicate_commands.py
new file mode 100755
index 0000000..6309e33
--- /dev/null
+++ b/parishwebsites/remove_duplicate_commands.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+"""Filter stdin: keep one command line per quoted "data/..." path, the last one read."""
+import sys
+import re
+
+OUTPUT_PATH = re.compile('"(data/.*)" 2>')
+last_command = {}
+for raw in sys.stdin:
+    command = raw.rstrip('\n')
+    last_command[OUTPUT_PATH.search(command).group(1)] = command
+
+for command in last_command.values():
+    print(command)
diff --git a/parishwebsites/view_raw_data.py b/parishwebsites/view_raw_data.py
deleted file mode 100755
index 86a3259..0000000
--- a/parishwebsites/view_raw_data.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env python3
-import jsonlines
-import sys
-import html2text
-import pprint
-
-
-def convert_html_to_text(parish, text_maker):
-    html = parish['content']
-    text = text_maker.handle(html)
-    parish['content'] = text
-    return parish
-
-
-def main():
-    text_maker = html2text.HTML2Text()
-    text_maker.ignore_links = True
-    text_maker.ignore_images = True
-    writer = jsonlines.Writer(sys.stdout)
-    # text_maker.wrap_links = False
-    text_maker.strong_mark = ''
-    with jsonlines.open(sys.argv[1]) as reader:
-        for parish in reader:
-            parish = convert_html_to_text(parish, text_maker)
-            parish_content = parish.pop('content')
-            pprint.pprint(parish)
-            print(parish_content)
-
-if __name__ == '__main__':
-    main()