fixed url checking
This commit is contained in:
parent
5ad2a36499
commit
9f1423b362
@ -1 +1 @@
|
||||
Parafia pod wezwaniem Miłosierdzia Bożego http://milosierdziegliwice.pl/ Gliwice Strzelnicza 5 44-100 Gliwice http://colaska.pl/index/parafia/id/10511 18.692078,50.322042
|
||||
Parafia pod wezwaniem NMP Królowej Aniołów http://www.adamowice.katowice.opoka.org.pl/ Adamowice ul. Poprzeczna 15 47-435 Raszyce http://colaska.pl/index/parafia/id/2 18.2955971,50.5078563
|
||||
|
@ -34,17 +34,15 @@ class DuckDuckGo(object):
|
||||
try:
|
||||
resp = requests.post(link, proxies=proxy_dict, timeout=2)
|
||||
print(proxy_dict)
|
||||
self._verbose_print()
|
||||
self.golden_proxies.append(proxy)
|
||||
return resp
|
||||
except:
|
||||
print('Nr of falitures: ' + str(self.falitures) + ' Proxies: '
|
||||
+ str(len(self.proxy_obj.proxies)) + ' Golden proxies: '
|
||||
+ str(len(self.golden_proxies)))
|
||||
self.proxy_obj.proxies.remove(proxy)
|
||||
proxy = self.proxy_obj.random()
|
||||
proxy_dict = self._proxy_to_dict(proxy)
|
||||
|
||||
self.falitures += 1
|
||||
self._verbose_print()
|
||||
total_nr_of_proxies = len(
|
||||
self.proxy_obj.proxies) + self.falitures
|
||||
if self.falitures > 0.95 * total_nr_of_proxies:
|
||||
@ -56,6 +54,11 @@ class DuckDuckGo(object):
|
||||
self.proxy_obj.proxies.extend(self.golden_proxies)
|
||||
del self.golden_proxies[:]
|
||||
|
||||
def _verbose_print(self):
    """Print a one-line status summary of the proxy bookkeeping.

    The line reports, in order: the ``self.falitures`` counter, the
    number of entries in ``self.proxy_obj.proxies`` and the number of
    entries in ``self.golden_proxies``.
    """
    failure_count = str(self.falitures)
    pool_size = str(len(self.proxy_obj.proxies))
    golden_size = str(len(self.golden_proxies))
    # Assemble the message from the same literal pieces, in the same order.
    message = ''.join((
        'Nr of falitures: ', failure_count,
        ' Proxies: ', pool_size,
        ' Golden proxies: ', golden_size,
    ))
    print(message)
|
||||
|
||||
def _proxy_to_dict(self, proxy):
|
||||
proxy_string = str(proxy[0]) + ':' + str(proxy[1])
|
||||
return {
|
||||
|
@ -2,6 +2,7 @@ from selenium import webdriver
|
||||
from selenium import common
|
||||
import re
|
||||
import random
|
||||
import time
|
||||
|
||||
|
||||
# TODO: automatically add the geckodriver/chromedriver location to PATH and bundle the driver with the project files
|
||||
@ -39,8 +40,7 @@ class Proxy():
|
||||
try:
|
||||
driver.execute_script('gp.pageClick(' + str(i) + ')')
|
||||
except common.exceptions.WebDriverException:
|
||||
import ipdb
|
||||
ipdb.set_trace()
|
||||
time.sleep(1)
|
||||
driver.execute_script('gp.pageClick(' + str(i) + ')')
|
||||
print(i)
|
||||
|
||||
|
@ -26,7 +26,7 @@ class ParishUrlChecker():
|
||||
return False
|
||||
for link in links:
|
||||
link = self._get_true_url(link)
|
||||
if parish_url == link:
|
||||
if self._compare_urls(parish_url, link):
|
||||
t_parish_url = parish_url + '\n'
|
||||
self.urls += t_parish_url
|
||||
t_tsv = parish['name'] + '\t' + parish_url + '\t' + parish['city'] + '\t' + parish['street'] + '\t' + parish['postal_code'] + '\t' + parish['meta_url'] + '\t' + parish['gps'] + '\n'
|
||||
@ -47,6 +47,9 @@ class ParishUrlChecker():
|
||||
#print(links)
|
||||
return False
|
||||
|
||||
def _compare_urls(self, url_1, url_2):
    """Return True when both URLs are equal after conversion.

    Each URL is passed through ``self._convert_url`` first, so the
    comparison is made on the converted forms rather than on the raw
    strings.
    """
    converted_first = self._convert_url(url_1)
    converted_second = self._convert_url(url_2)
    return converted_first == converted_second
|
||||
|
||||
def _convert_url(self, url):
|
||||
if url.endswith('/'):
|
||||
url = url[:-1]
|
||||
@ -67,7 +70,6 @@ class ParishUrlChecker():
|
||||
return new_url
|
||||
except:
|
||||
pass
|
||||
print('Falied url: ' + url)
|
||||
return ''
|
||||
|
||||
|
||||
@ -88,13 +90,9 @@ def main():
|
||||
i = 1
|
||||
switch = True
|
||||
for parish in parishes:
|
||||
print(i)
|
||||
if '10511' in parish['meta_url']:
|
||||
switch = False
|
||||
if switch:
|
||||
continue
|
||||
if parish['url']:
|
||||
urls_checker.check(parish, duck)
|
||||
if not urls_checker.check(parish, duck):
|
||||
print('Not found: ' + parish['url'])
|
||||
else:
|
||||
print('none')
|
||||
print(
|
||||
@ -104,10 +102,6 @@ def main():
|
||||
(urls_checker.added /
|
||||
(urls_checker.tried_urls or 1)) * 100) + '%')
|
||||
i += 1
|
||||
with open('urls_checked.txt', 'w') as f:
|
||||
f.write(urls)
|
||||
with open('parishes_checked.tsv', 'w') as f:
|
||||
f.write(tsv)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -1 +1 @@
|
||||
http://milosierdziegliwice.pl/
|
||||
http://www.adamowice.katowice.opoka.org.pl/
|
||||
|
Loading…
Reference in New Issue
Block a user