mass-scraper/test.py
siulkilulki 382666c563 Add test.py for data gathering (data for annotation)
Small changes to annotator.py (to be deleted in near future)
Add utils/iterator
Add redis to enviroment.yml
Rename, adapt and move rule based extractor.
Adapt find_hours.
Yapify webapp app (probably nothing more)
Rename buttons in index.html
2018-05-11 23:12:21 +02:00

74 lines
2.2 KiB
Python
Executable File

#!/usr/bin/env python3
import redis
from utils import iterator
from extractor.find_hours import hours_iterator
import re
# r = redis.StrictRedis(host='localhost', port=6379, db=0)
def add_utterances(content, utterances):
    """Append every utterance found in ``content`` to ``utterances``.

    Args:
        content: Page content, passed unchanged to ``hours_iterator``.
        utterances: Collection exposing ``append``; it is extended in place.

    Returns:
        int: How many utterances this call appended (0 when none were found).
    """
    added = 0
    # Counting from 1 turns the loop variable into a running total, so a
    # single utterance is reported as 1 instead of being indistinguishable
    # from "nothing found" (the zero-based index would be 0 in both cases).
    for added, utterance in enumerate(hours_iterator(content), start=1):
        utterances.append(utterance)
    return added
# Matches "msze"/"msza"/"mszy", "nabożeństw..." and "porządek"; the latter two
# are also accepted with the diacritics stripped.
# NOTE(review): the match is case-sensitive, so capitalised text only matches
# if callers lowercase it first -- confirm that this is intended.
_MASS_METADATA_RE = re.compile(r'msz[eay]|nabo[żz]e[ńn]stw|porz[ąa]dek')


def has_mass_metadata(url, button_text):
    """Tell whether the URL or the button text contains a mass keyword.

    Prints which of the two inputs matched (and the matched fragment) as a
    diagnostic before returning.

    Args:
        url: Page address to search; ``None`` or ``''`` counts as no match.
        button_text: Button label to search; ``None`` or ``''`` counts as no
            match.

    Returns:
        bool: True when at least one of the inputs matches, False otherwise.
    """
    # Guard against missing values: re.search raises TypeError on None.
    url_match = _MASS_METADATA_RE.search(url) if url else None
    button_match = (_MASS_METADATA_RE.search(button_text)
                    if button_text else None)

    if url_match and button_match:
        print('both - url_match: {}'.format(url_match.group(0)))
        print('button_match: {}'.format(button_match.group(0)))
    elif url_match:
        print('url_match: {}'.format(url_match.group(0)))
    elif button_match:
        print('button_match: {}'.format(button_match.group(0)))
    return bool(url_match or button_match)
def load_parishes(directory, *, max_page_nr=100, max_metadata_pages=1):
    """Walk the parish pages under ``directory`` and collect utterances.

    For every parish path yielded by ``iterator.parish_path_iterator`` the
    pages from ``iterator.parish_page_iterator`` are visited in order. The
    content of the first page and of every page flagged by
    ``has_mass_metadata`` is passed to ``add_utterances``. Progress is
    printed per parish.

    Args:
        directory: Location handed to ``iterator.parish_path_iterator``.
        max_page_nr: Zero-based page index after which a parish is abandoned.
        max_metadata_pages: Stop reading a parish once this many pages with
            mass metadata have been processed.

    Returns:
        dict: Always an empty dict.
        NOTE(review): the collected utterances are currently discarded; the
        return value is kept as-is so existing callers are unaffected.
    """
    utterances = []
    utterances_count = 0
    for file_nr, parish_path in enumerate(
            iterator.parish_path_iterator(directory)):
        print(parish_path)
        metadata_count = 0
        # Pre-set so the summary below is well defined for a parish without
        # pages (otherwise: NameError, or a stale value from the previous
        # parish).
        page_nr = None
        for page_nr, parish_page in enumerate(
                iterator.parish_page_iterator(parish_path)):
            content = parish_page.pop('content')
            is_mass_page = has_mass_metadata(parish_page['url'],
                                             parish_page['button_text'])
            # The first page is always harvested; a single combined check
            # keeps its content from being added twice when it is also a
            # mass-metadata page.
            if page_nr == 0 or is_mass_page:
                utterances_count += add_utterances(content, utterances)
            if is_mass_page:
                metadata_count += 1
                if metadata_count >= max_metadata_pages:
                    break
            if page_nr >= max_page_nr:
                print(utterances_count)
                break
        # NOTE(review): assumed to be a per-parish summary; the original
        # nesting was lost with the indentation -- confirm.
        print('file: {}, page: {}, utterances: {}'.format(
            file_nr, page_nr, utterances_count))
    return {}
# NOTE(review): with the indentation lost it is unclear whether this is a
# module-level placeholder or unreachable code following the `return` in
# load_parishes. No code visible in this file reads it -- confirm before
# removing.
utterances = {}
def main():
    """Run the data gathering over the './parishwebsites/data' directory."""
    data_directory = './parishwebsites/data'
    load_parishes(data_directory)
    # Disabled redis round-trip experiment:
    # r.set('foo', 'bar')
    # print(r.get('foo'))


# Only start the gathering when executed as a script, not on import.
if __name__ == '__main__':
    main()