74 lines
2.2 KiB
Python
74 lines
2.2 KiB
Python
|
#!/usr/bin/env python3
|
||
|
import redis
|
||
|
from utils import iterator
|
||
|
from extractor.find_hours import hours_iterator
|
||
|
import re
|
||
|
|
||
|
# r = redis.StrictRedis(host='localhost', port=6379, db=0)
|
||
|
|
||
|
|
||
|
def add_utterances(content, utterances):
    """Append every utterance extracted from ``content`` to ``utterances``.

    Args:
        content: Page content, passed unchanged to ``hours_iterator``.
        utterances: Collection exposing ``append``; it is extended in place.

    Returns:
        The number of utterances appended by this call (0 when
        ``hours_iterator`` yields nothing).
    """
    added = 0
    for utterance in hours_iterator(content):
        utterances.append(utterance)
        # Count the items explicitly. Returning the last enumerate() index
        # would be one short, and would report 0 both for "no utterances"
        # and for "exactly one utterance".
        added += 1
    return added
|
||
|
|
||
|
|
||
|
# Compiled once at import time instead of on every call.
# NOTE(review): matching is case-sensitive, so capitalised text such as
# 'Msze' does not match -- confirm whether callers lowercase their input.
_MASS_METADATA_RE = re.compile('msz[eay]|nabo[żz]e[ńn]stw|porz[ąa]dek')


def has_mass_metadata(url, button_text):
    """Tell whether a page's URL or button text matches the keyword pattern.

    Both strings are searched with the same pattern; each hit is printed
    for debugging, labelled with the field it was found in.

    Args:
        url: URL of the page. ``None`` is treated as an empty string.
        button_text: Text of the button associated with the page.
            ``None`` is treated as an empty string.

    Returns:
        ``True`` when at least one of the two strings matches,
        ``False`` otherwise.
    """
    # "or ''" keeps a missing field from raising TypeError in search().
    url_match = _MASS_METADATA_RE.search(url or '')
    button_match = _MASS_METADATA_RE.search(button_text or '')

    if url_match and button_match:
        print('both - url_match: {}'.format(url_match.group(0)))
        print('button_match: {}'.format(button_match.group(0)))
    elif url_match:
        print('url_match: {}'.format(url_match.group(0)))
    elif button_match:
        print('button_match: {}'.format(button_match.group(0)))

    return bool(url_match or button_match)
|
||
|
|
||
|
|
||
|
def load_parishes(directory):
    """Walk the parish data under ``directory`` and gather utterances.

    For each parish path, pages are visited in order. The first page
    (index 0) is always harvested, and so is any page accepted by
    ``has_mass_metadata``. Iteration over a parish stops after the first
    page accepted by ``has_mass_metadata`` or once page index 100 has been
    processed. Progress is printed along the way.

    Args:
        directory: Root directory passed to
            ``iterator.parish_path_iterator``.

    Returns:
        An empty dict.
    """
    # NOTE(review): the collected utterances are only counted and then
    # discarded; the function returns an empty dict. This looks like
    # unfinished work -- confirm what callers are meant to receive.
    utterances = []
    utterances_count = 0
    for file_nr, parish_path in enumerate(
            iterator.parish_path_iterator(directory)):
        print(parish_path)
        metadata_count = 0
        # Pre-bind so the summary print below cannot hit an unbound (first
        # parish) or stale (later parishes) value when there are no pages.
        page_nr = None
        for page_nr, parish_page in enumerate(
                iterator.parish_page_iterator(parish_path)):
            content = parish_page.pop('content')
            has_metadata = has_mass_metadata(parish_page['url'],
                                             parish_page['button_text'])
            if has_metadata:
                metadata_count += 1
            # Harvest each page at most once: a first page that also
            # matches the metadata check must not be added twice.
            if page_nr == 0 or has_metadata:
                utterances_count += add_utterances(content, utterances)

            if metadata_count == 1:
                break

            if page_nr == 100:
                print(utterances_count)
                break
        print('file: {}, page: {}, utterances: {}'.format(
            file_nr, page_nr, utterances_count))
    return {}
|
||
|
|
||
|
|
||
|
# Module-level dict. Nothing in the visible part of this file reads or
# writes it; load_parishes() builds its own local list of the same name.
utterances = dict()
|
||
|
|
||
|
|
||
|
def main():
    """Script entry point: run ``load_parishes`` on the parish data directory."""
    data_directory = './parishwebsites/data'
    load_parishes(data_directory)
    # Redis round-trip check, currently disabled:
    # r.set('foo', 'bar')
    # print(r.get('foo'))
|
||
|
|
||
|
|
||
|
# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|