mass-scraper/full_scrapper.py

import dill
from duckduckgo import DuckDuckGo
from urllib.parse import urlparse
import time
import random
# Module-level accumulators: TSV rows and matched root URLs, written out at the end.
tsv = ''
urls = ''

def check(parish, duck):
    """Record the parish if any search result shares the domain of its known URL."""
    global urls
    global tsv
    links = _urls(parish, duck)
    parish_root_url = urlparse(parish.url).netloc
    for link in links:
        if parish_root_url == urlparse(link).netloc:
            urls += parish_root_url + '\n'
            tsv += '\t'.join([parish.name, parish.city, parish.street,
                              parish.postal_code, parish_root_url,
                              parish.meta_url, parish.gps]) + '\n'
            print('added')
            # TODO: save links to txt file, one per line
            # TODO: wget -r -i file all links
            # TODO: save parishes to jsonline format
            return True  # mark as ok url
    return False
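
# A minimal sketch for the "jsonline" TODO in check(): persist parishes as JSON
# Lines, one object per line. save_parishes_jsonl and its field set are
# assumptions inferred from the attributes used above; this helper is not part
# of the original script.
import json

def save_parishes_jsonl(parishes, path='parishes.jsonl'):
    with open(path, 'w') as f:
        for p in parishes:
            f.write(json.dumps({
                'name': p.name,
                'city': p.city,
                'street': p.street,
                'postal_code': p.postal_code,
                'url': p.url,
                'meta_url': p.meta_url,
                'gps': p.gps,
            }) + '\n')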

def _urls(parish, duck):
    """Query DuckDuckGo for the parish; retry with a random back-off while empty."""
    query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
    links = duck.links(query)
    time.sleep(1)
    while not links:
        print('retry')
        # Randomised back-off between retries to avoid hammering the engine.
        time.sleep(random.randint(3, 10))
        links = duck.links(query)
    return links

def find_url(parish, duck):
    """Debug helper: fetch links for a parish and drop into ipdb to inspect them."""
    links = _urls(parish, duck)
    import ipdb
    ipdb.set_trace()
    print(links)

def main():
    with open('./parishes.dill', 'rb') as f:
        parishes = dill.load(f)
    duck = DuckDuckGo(language='pl-pl')
    print('Downloading proxies')
    duck.download_proxies()
    for i, parish in enumerate(parishes):
        print(str(round(100 * i / len(parishes), 1)) + '% done. Nr: ' + str(i))
        if parish.url:
            check(parish, duck)
    with open('urls.txt', 'w') as f:
        f.write(urls)
    with open('parishes.tsv', 'w') as f:
        f.write(tsv)
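
# A minimal sketch for the "wget -r -i" TODO in check(): recursively mirror
# every site listed in urls.txt. Assumes wget is installed and on PATH and that
# main() has already written urls.txt; mirror_all is an illustrative helper,
# not part of the original script. The equivalent shell step is:
#   wget -r -i urls.txt
import subprocess

def mirror_all(path='urls.txt'):
    # One URL per line in the input file; wget recurses into each site.
    subprocess.run(['wget', '-r', '-i', path], check=True)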

if __name__ == "__main__":
    main()