#!/usr/bin/env python3
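"""Fetch parish website URLs with the Google Places API.

Reads a TSV of parishes (name, city, '|'-separated address, ...), looks
each one up via Google Places Text Search, and writes the rows extended
with url, place_id and lookup-result columns to stdout. If ./extended.tsv
already holds output from an earlier run, its line count is used to skip
rows processed before the API quota was exhausted.
"""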
import argparse
import logging
import sys
from collections import namedtuple
from enum import Enum, auto

from googleplaces import GooglePlaces, GooglePlacesError, lang

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class Result(Enum):
    """Outcome of a single parish lookup."""
    OK = auto()
    AMBIGUOUS = auto()
    NOT_FOUND = auto()


def _retrieve_parish_info(row, google_places, detailed=True):
    """Look up a parish in Google Places, falling back to a coarser query."""
    parish_name = row[0]
    city = row[1]
    address = row[2]
    street = address.split('|')[1]
    if detailed:
        query = '{} {} {}'.format(parish_name, street, city)
    else:
        query = '{} {}'.format(parish_name, city)
    query_result = google_places.text_search(
        query, language=lang.POLISH, radius=None)
    if len(query_result.places) == 1:
        place = query_result.places[0]
        place.get_details()
        return place, Result.OK
    elif len(query_result.places) > 1:
        # Log all candidates for later inspection, then take the first one.
        tmp_row = row[:]
        tmp_row.insert(0, 'AMBIGUOUS')
        tmp_row.insert(1, query)
        tmp_row.append(str(query_result.places))
        logging.info('\t' + '\t'.join(tmp_row))
        place = query_result.places[0]
        place.get_details()
        return place, Result.AMBIGUOUS
    else:
        if detailed:
            # Nothing found: retry once without the street part of the query.
            return _retrieve_parish_info(row, google_places, detailed=False)
        else:
            tmp_row = row[:]
            tmp_row.insert(0, 'NOT_FOUND')
            tmp_row.insert(1, query)
            logging.info('\t' + '\t'.join(tmp_row))
            return None, Result.NOT_FOUND
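

# Illustrative sketch of the assumed input row layout. Only the name at
# index 0, the city at index 1, and a '|'-separated address at index 2
# (with the street in its second field) are implied by the code above;
# the address fields shown here are hypothetical:
#
#   row = ['Parafia pw. sw. Jana', 'Poznan',
#          'wielkopolskie|ul. Przykladowa 1']
#   street = row[2].split('|')[1]  # -> 'ul. Przykladowa 1'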


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-a',
        '--apikey',
        dest='apikey',
        type=argparse.FileType(encoding='utf-8'),
        help='File with the Google Places API key inside',
        required=True)
    parser.add_argument(
        '-p',
        '--parishes',
        dest='parishes',
        type=argparse.FileType(encoding='utf-8'),
        nargs='?',
        default=sys.stdin,
        help='TSV parishes file (defaults to stdin)')
    return parser.parse_args()
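

# Example invocation (assumed workflow; the file names are hypothetical).
# The script writes to stdout and main() resumes by counting the lines
# already in ./extended.tsv, so appending with '>>' is what lets an
# interrupted run continue where it stopped:
#
#   ./get_parishes_urls.py -a apikey.txt -p parishes.tsv >> extended.tsv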


def add_parish_info(row, parish, result):
    """Extend a TSV row in place with the url, place_id and result name."""
    url = parish.website if parish.website else ''
    row.insert(2, url)
    row.append(parish.place_id)
    row.append(result.name)
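

# After add_parish_info() a row of [name, city, address, ...] becomes
# [name, city, url, address, ..., place_id, result], matching the header
# printed in main().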


def write_last_line_to_file(filepath, line_nr):
    """Checkpoint helper (currently unused): persist the last processed line."""
    with open(filepath, 'w') as f:
        print(line_nr, file=f)


def count_file_number_of_lines(filepath):
    """Return the number of lines in filepath, or 0 if it does not exist."""
    try:
        with open(filepath) as f:
            return sum(1 for _ in f)
    except FileNotFoundError:
        return 0


def main():
    args = get_args()
    apikey = args.apikey.read().rstrip('\n')
    args.apikey.close()

    outputfile_path = './extended.tsv'
    nr_of_outputfile_lines = count_file_number_of_lines(outputfile_path)

    header = next(args.parishes).rstrip('\n').split('\t')
    header.insert(2, 'url')
    if nr_of_outputfile_lines == 0:
        # Fresh run: emit the header including the two appended columns.
        print('\t'.join(header) + '\tplace_id\tresult')

    google_places = GooglePlaces(apikey)
    for line_nr, line in enumerate(args.parishes):
        # Resume support: skip rows already present in the output file
        # (the +1 accounts for its header line).
        if line_nr + 1 < nr_of_outputfile_lines:
            continue
        row = line.rstrip('\n').split('\t')
        try:
            parish, result = _retrieve_parish_info(row, google_places)
            if not parish:
                # Placeholder with empty fields for parishes not found.
                NullPlace = namedtuple('NullPlace', ['website', 'place_id'])
                parish = NullPlace('', '')
        except GooglePlacesError:
            logging.info('Probably limit exceeded. Exiting.')
            return
        add_parish_info(row, parish, result)
        print('\t'.join(row), flush=True)
    args.parishes.close()


if __name__ == '__main__':
    main()