69 lines
1.7 KiB
Python
69 lines
1.7 KiB
Python
import dill
|
|
from duckduckgo import DuckDuckGo
|
|
from urllib.parse import urlparse
|
|
import time
|
|
import random
|
|
|
|
# Module-level text accumulators: check() appends to them and main() writes
# them to disk.
urls = ''  # one matched host name per line (written to urls.txt)
tsv = ''  # one tab-separated parish record per line (written to parishes.tsv)
|
|
|
|
|
|
def check(parish, duck):
    """Record the parish when a search result is hosted on its own site.

    The parish is looked up through ``_urls`` and the network location of
    every returned link is compared with the network location of
    ``parish.url``.  On the first match the host is appended to the
    module-level ``urls`` text and a tab-separated record is appended to the
    module-level ``tsv`` text.

    Args:
        parish: Object exposing ``url``, ``name``, ``city``, ``street``,
            ``postal_code``, ``meta_url`` and ``gps``; the record fields are
            concatenated as strings.
        duck: Search client, handed to ``_urls`` unchanged.

    Returns:
        ``True`` as soon as one link matches, ``False`` when none does.
    """
    global tsv, urls
    for candidate in _urls(parish, duck):
        own_host = urlparse(parish.url).netloc
        if urlparse(candidate).netloc != own_host:
            continue
        urls += own_host + '\n'
        record = (
            parish.name,
            parish.city,
            parish.street,
            parish.postal_code,
            own_host,
            parish.meta_url,
            parish.gps,
        )
        tsv += '\t'.join(record) + '\n'
        print('added')
        # TODO: save links to txt file, one per line
        # TODO: wget -r -i file all links
        # TODO: save parishes to jsonline format
        return True  # mark as ok url
    return False
|
|
|
|
|
|
def _urls(parish, duck):
|
|
query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
|
|
links = duck.links(query)
|
|
time.sleep(1)
|
|
while not links:
|
|
print('retry')
|
|
random.randint(3, 10)
|
|
time.sleep(10)
|
|
links = duck.links(query)
|
|
return links
|
|
|
|
|
|
def find_url(parish, duck=None):
    """Print and return the search-result links for a single parish.

    Args:
        parish: Parish object, passed to ``_urls`` unchanged.
        duck: Search client to use.  When omitted, a
            ``DuckDuckGo(language='pl-pl')`` client is created, the same
            way ``main`` creates its client.

    Returns:
        The links returned by ``_urls`` for this parish.
    """
    if duck is None:
        # NOTE(review): main() also calls duck.download_proxies() before
        # searching; confirm whether that step is required here as well.
        duck = DuckDuckGo(language='pl-pl')
    # _urls requires the search client; calling it with the parish alone
    # raised TypeError.
    links = _urls(parish, duck)
    print(links)
    return links
|
|
|
|
|
|
def main():
    """Check every parish that has a URL and write the collected results.

    Loads the parish collection from ``./parishes.dill``, prepares the search
    client, runs ``check`` on each parish whose ``url`` is truthy, and writes
    the module-level ``urls`` text to ``urls.txt`` and the module-level
    ``tsv`` text to ``parishes.tsv``.
    """
    # SECURITY: dill is pickle-based, so loading can execute arbitrary code
    # embedded in the file.  Only load a parishes.dill you produced yourself.
    with open('./parishes.dill', 'rb') as f:
        parishes = dill.load(f)

    duck = DuckDuckGo(language='pl-pl')
    print('Downloading proxies')
    duck.download_proxies()

    total = len(parishes)
    try:
        for done, parish in enumerate(parishes):
            # Scale the fraction to a real percentage before labelling it '%'.
            print('{:.2f}% done. Nr: {}'.format(100 * done / total, done))
            if parish.url:
                check(parish, duck)
    finally:
        # Write whatever has been collected even when the run is interrupted
        # or fails part-way, so a long crawl is not lost.  Explicit UTF-8
        # keeps non-ASCII parish names independent of the platform default.
        with open('urls.txt', 'w', encoding='utf-8') as f:
            f.write(urls)
        with open('parishes.tsv', 'w', encoding='utf-8') as f:
            f.write(tsv)
|
|
|
|
|
|
# Start the crawl only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()
|