Add robust recrawling of not-completed data.
Add annotator.py (highlighting hours within context done). Enhance parish2text.py (enable more flags, convert button text).
parent e9c4dcd743
commit 9b76f4e8aa
Makefile (7 changed lines)
@@ -7,12 +7,17 @@ JOBS := 100

all: data

data-add: parishwebsites/spider-commands-add.txt parishwebsites/domain-blacklist.txt parishwebsites/deal-with-not-completed.sh
	cd parishwebsites && ./deal-with-not-completed.sh
	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands-add.txt

data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
	rm -f parishwebsites/*processed.txt
	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt

parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) | parishwebsites/remove_duplicate_commands.py > $@

parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
annotator.py (new executable file, 40 lines)
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
import jsonlines
from extractor.find_hours import hours_iterator
from parishwebsites.parish2text import Parish2Text
import os
import random

parish2text = Parish2Text()

CONTEXT = 100


def process_parish_page(parish_page):
    content = parish_page.pop('content')
    for utterance, utterance_colored in hours_iterator(content):
        print(utterance_colored)
        import ipdb; ipdb.set_trace()


def process_parish_file(parish_reader):
    for parish_page in parish_reader:
        parish_page = parish2text.convert(parish_page)
        process_parish_page(parish_page)


def process_directory(directory):
    for root, dirs, files in os.walk(directory):
        # random.shuffle(files)
        for fname in sorted(files):
            filepath = os.path.join(root, fname)
            if os.path.getsize(filepath) > 0:
                with jsonlines.open(filepath) as parish_reader:
                    process_parish_file(parish_reader)


def main():
    process_directory('./parishwebsites/data')


if __name__ == '__main__':
    main()
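For context, annotator.py only relies on hours_iterator from extractor.find_hours yielding (utterance, utterance_colored) pairs, one per detected mass-hours passage. A minimal sketch of a compatible generator follows; it is illustrative only (the hour regex, the coloring choices, and the helper body are assumptions, not the project's actual extractor), but it shows what "highlighting hours within CONTEXT characters of context" means in practice:

import re
from colorama import Back, Fore, Style

CONTEXT = 100  # characters of surrounding text kept around each match (mirrors annotator.py)
HOUR_RE = re.compile(r'\b\d{1,2}[:.]\d{2}\b')  # illustrative pattern, not the real find_hours logic

def hours_iterator(content):
    """Yield (utterance, utterance_colored) pairs for each hour-like match."""
    for match in HOUR_RE.finditer(content):
        start = max(0, match.start() - CONTEXT)
        end = min(len(content), match.end() + CONTEXT)
        utterance = content[start:end]
        colored = (content[start:match.start()] +
                   Fore.RED + Back.BLACK + Style.BRIGHT + match.group(0) + Style.RESET_ALL +
                   content[match.end():end])
        yield utterance, colored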
@@ -1,151 +0,0 @@
#!/usr/bin/env python3
from colorama import Fore, Back, Style
import os
import jsonlines
import re
import pprint


class Extractor:
    def __init__(self, page):
        "docstring"
        self.page = page
        self.content = page['content']
        self.header = self.wrap_with_name_group(
            'header',
            'porządek mszy (świętych|św|św\.)|msz[ea][ \n]+([śs]wi[eę]t[ea]|św|św\.)'
        )

        self.sunday_title = self.wrap_with_name_group(
            'sunday_title',
            'niedziel[a|e][ \n]+i[ \n]+(dni[ \n]+(świąteczne|św|św\.)|święta)'
            '|niedziel[ea]'
            '|porządek świąteczny')
        #'|święta'
        self.sunday_masses = self.wrap_with_name_group(
            'sunday_masses', '.*[^\d]\d{1,2}[^\d].*?')
        self.everyday_title = self.wrap_with_name_group(
            'everyday_title', 'dzień powszedni'
            '|dni powszednie'
            '|w tygodniu'
            '|porządek zwykły'
            '|od poniedziałku do soboty')
        self.everyday_masses = self.wrap_with_name_group(
            'everyday_masses',
            '(.*?[^\d\n]?\d{1,2}[^\d\n]?.*?\n)+')  # \n or end of string

    def wrap_with_name_group(self, name, pattern):
        return '(?P<{}>{})'.format(name, pattern)

    def extract(self, search_space=None):
        if not search_space:
            search_space = self.content
        header_match = re.search(self.header, search_space, re.I)
        if not header_match:
            return None
        search_space = search_space[header_match.end():]

        sunday_title_match = re.search(self.sunday_title, search_space, re.I)
        if not sunday_title_match:
            return None
        if re.search(self.header, search_space[:sunday_title_match.start()],
                     re.I):  # found header closer to sunday title
            return self.extract(search_space)
        if sunday_title_match.start() > 50:
            return self.extract(search_space[sunday_title_match.end()])

        everyday_title_match = re.search(self.everyday_title, search_space,
                                         re.I)
        if not everyday_title_match:
            return None
        sunday_masses_hours = search_space[sunday_title_match.end():
                                           everyday_title_match.start()]
        if not re.search(self.sunday_masses, sunday_masses_hours,
                         re.DOTALL | re.I):
            return None
        if len(sunday_masses_hours) > 500:
            return self.extract(search_space[sunday_title_match.end():])
        everyday_masses_match = re.search(
            self.everyday_masses, search_space[everyday_title_match.end():],
            re.I)
        if not everyday_masses_match:
            return None
        if everyday_masses_match.start() > 150:
            return self.extract(search_space[sunday_title_match.end():])

        whole_result = header_match.group(
            0) + search_space[:everyday_masses_match.end() +
                              everyday_title_match.end()]
        groups = (header_match.group(0), sunday_title_match.group(0),
                  sunday_masses_hours, everyday_title_match.group(0),
                  everyday_masses_match.group(0))
        # print(whole_result)
        # print(groups)
        # TODO: handle cases like:
        # w dni powszednie (w roku szkolnym) - górny kościół
        # 6:30, 7:00, 8:00, 18:00
        # w dni powszednie (czas wakacji) - górny kościół
        # 7:00, 8:00, 18:00

        print('url: {}\ndepth: {}\nbutton: {}'.format(self.page[
            'url'], self.page['depth'], self.page['button_text']))
        return whole_result, groups


def process_directory(directory):
    found = 0
    not_found = 0
    for root, dirs, files in os.walk(directory):
        for fname in files:
            filepath = os.path.join(root, fname)
            if os.path.getsize(filepath) > 0:
                with jsonlines.open(filepath) as reader:
                    # print(filepath)
                    if process_parish(reader):
                        found += 1
                    else:
                        not_found += 1
                    # print('found: {}\nnot_found: {}'.format(found, not_found))
            else:
                pass  # empty file


def color_match(whole_match, groups, background, colors, style):
    for i in range(len(groups)):
        whole_match = whole_match.replace(
            groups[i], colors[i] + background + style + groups[i] +
            Style.RESET_ALL + background + style, 1)
    return whole_match + Style.RESET_ALL


def process_parish(reader):
    for page in sorted(reader, key=lambda x: x['depth']):  # sort by depth
        extractor = Extractor(page)
        result = extractor.extract()
        if result:
            whole_result, groups = result
            if whole_result not in page['content']:
                import ipdb
                ipdb.set_trace()
            pretty_text = page['content'].replace(
                whole_result,
                color_match(whole_result, groups, Back.BLACK, [
                    Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN
                ], Style.BRIGHT))
            print(pretty_text)
            import ipdb
            ipdb.set_trace()
            return True
        else:
            return False
        # import ipdb
        # ipdb.set_trace()
        pass


def main():
    process_directory('./parishwebsites/data-final')


if __name__ == '__main__':
    main()
@@ -1,29 +0,0 @@
#!/usr/bin/env python3
import jsonlines
import sys
import html2text


def convert_html_to_text(parish, text_maker):
    html = parish['content']
    text = text_maker.handle(html)
    parish['content'] = text
    return parish


def main():
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    writer = jsonlines.Writer(sys.stdout)
    # text_maker.wrap_links = False
    # text_maker.strong_mark = ''
    with jsonlines.open(sys.argv[1]) as reader:
        for parish in reader:
            parish = convert_html_to_text(parish, text_maker)
            writer.write(parish)
    writer.close()


if __name__ == '__main__':
    main()
parishwebsites/deal-with-not-completed.sh (new executable file, 24 lines)
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

./find-not-completed.sh > not-completed
# cat duplicate-data >> not-completed
# remove entries that are not truly finished from processed.txt
grep -v -f <(cat not-completed | sed -e 's@^@\t@' | sed -e 's@$@\$@') processed.txt | sponge processed.txt

# append filenames from spider-commands.txt which are not in processed.txt
comm -13 <(cut -f2 processed.txt | sort -u) <(grep -o 'data/.*" 2>' spider-commands.txt | sed -Ee 's@data/|" 2>@@g' | sort) >> not-completed

sort -u not-completed | sponge not-completed

# remove data connected with not-completed entries, e.g. in logs/ and data/

echo data directory file count: `ls -1 data | wc -l`
cd data && xargs rm -f < ../not-completed
cd ..
echo data directory file count: `ls -1 data | wc -l`
echo logs directory file count: `ls -1 logs | wc -l`
cd logs && xargs rm -f < ../not-completed
cd ..
echo logs directory file count: `ls -1 logs | wc -l`

grep -f <(cat not-completed | sed -e 's@^@"data/'@ | sed -e 's@$@"@') spider-commands.txt > spider-commands-add.txt
parishwebsites/find-not-completed.sh (new executable file, 5 lines)
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
(grep -r "No space left on device" logs | sort -u | grep "^logs/.*:OSError" -o | sed -Ee 's@^logs/|:OSError$@@g' | sort -u &&\
grep -r 'Received SIGTERM' logs/ | grep '^logs/.*:20' -o | sed -Ee 's@^logs/|:20$@@g' | sort -u &&\
find data -empty -type f | sed -e 's@data/@@' | sort
) | sort -u
parishwebsites/parish2text.py (new executable file, 40 lines)
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
import jsonlines
import sys
import html2text
import pprint
import re


class Parish2Text():
    def __init__(self):
        "docstring"
        self.text_maker = html2text.HTML2Text()
        self.text_maker.ignore_links = True
        self.text_maker.ignore_images = True
        self.text_maker.images_to_alt = True
        self.text_maker.strong_mark = ''
        self.text_maker.ul_item_mark = ''
        self.text_maker.emphasis_mark = ''
        self.text_maker.ignore_tables = True

    def convert(self, parish):
        parish['content'] = self.text_maker.handle(parish['content'])
        parish['button_text'] = self.text_maker.handle(parish['button_text'])
        parish['button_text'] = ' '.join(re.sub('[\W_]+', ' ', parish['button_text']).split())
        return parish


def main():
    parish2text = Parish2Text()
    writer = jsonlines.Writer(sys.stdout)
    # text_maker.wrap_links = False
    reader = jsonlines.Reader((line.rstrip('\n') for line in sys.stdin))
    for parish in reader:
        parish = parish2text.convert(parish)
        parish_content = parish.pop('content')
        pprint.pprint(parish)
        print(parish_content)
    reader.close()


if __name__ == '__main__':
    main()
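Besides the stdin-to-stdout mode above, the converter is also used as a library (annotator.py imports Parish2Text directly). A minimal usage sketch, assuming a crawled page record carrying the 'content' and 'button_text' fields that convert() expects; the URL and HTML below are made-up sample data, not real crawl output:

from parishwebsites.parish2text import Parish2Text

parish2text = Parish2Text()
page = {
    'url': 'http://example.com/msze',  # illustrative record
    'depth': 1,
    'button_text': '<b>Msze &nbsp; święte</b>',
    'content': '<html><body><h1>Porządek mszy</h1><p>Niedziela: 8:00, 10:00</p></body></html>',
}
page = parish2text.convert(page)
print(page['button_text'])  # "Msze święte": markup stripped, whitespace collapsed
print(page['content'])      # plain text rendered by html2text with the flags set above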
@@ -17,7 +17,7 @@ FEED_EXPORT_ENCODING = 'utf-8'
LOG_LEVEL = 'DEBUG'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
@@ -85,7 +85,7 @@ AUTOTHROTTLE_TARGET_CONCURRENCY = 1
# AUTOTHROTTLE_DEBUG = True

RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
RETRY_TIMES = 5
RETRY_TIMES = 3
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
@@ -93,6 +93,7 @@ HTTPCACHE_EXPIRATION_SECS = 1209600
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
# HTTPCACHE_GZIP = 'True'
DEPTH_LIMIT = 3
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
parishwebsites/remove_duplicate_commands.py (new executable file, 13 lines)
@@ -0,0 +1,13 @@
#!/usr/bin/env python3
import sys
import re

d = {}
for line in sys.stdin:
    line = line.rstrip('\n')
    id_ = re.search('"(data/.*)" 2>', line).group(1)
    d[id_] = line

for line in d.values():
    print(line)
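The script keys each spider command by its quoted data/ output path and keeps only the last command seen for a given path, so at most one crawl per output file remains. A small illustration of that behavior; the command strings are invented, only their '"data/..." 2>' shape is taken from the pipeline (generate_spider_commands.sh and the grep in deal-with-not-completed.sh):

import re

# illustrative commands: two different URLs mapping to the same data/ file
lines = [
    'scrapy crawl parishes -a url="http://parafia.example.pl" -o "data/parafia.example.pl" 2> "logs/parafia.example.pl"',
    'scrapy crawl parishes -a url="https://parafia.example.pl/" -o "data/parafia.example.pl" 2> "logs/parafia.example.pl"',
]
d = {}
for line in lines:
    id_ = re.search('"(data/.*)" 2>', line).group(1)  # same key extraction as the script
    d[id_] = line
print(list(d.values()))  # only the second command survives: one crawl per data/ file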
@@ -1,30 +0,0 @@
#!/usr/bin/env python3
import jsonlines
import sys
import html2text
import pprint


def convert_html_to_text(parish, text_maker):
    html = parish['content']
    text = text_maker.handle(html)
    parish['content'] = text
    return parish


def main():
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    writer = jsonlines.Writer(sys.stdout)
    # text_maker.wrap_links = False
    text_maker.strong_mark = ''
    with jsonlines.open(sys.argv[1]) as reader:
        for parish in reader:
            parish = convert_html_to_text(parish, text_maker)
            parish_content = parish.pop('content')
            pprint.pprint(parish)
            print(parish_content)


if __name__ == '__main__':
    main()