fixed url checking

This commit is contained in:
siulkilulki 2017-06-21 22:51:53 +02:00
parent 5ad2a36499
commit 9f1423b362
5 changed files with 17 additions and 20 deletions

View File

@ -1 +1 @@
Parafia pod wezwaniem Miłosierdzia Bożego http://milosierdziegliwice.pl/ Gliwice Strzelnicza 5 44-100 Gliwice http://colaska.pl/index/parafia/id/10511 18.692078,50.322042 Parafia pod wezwaniem NMP Królowej Aniołów http://www.adamowice.katowice.opoka.org.pl/ Adamowice ul. Poprzeczna 15 47-435 Raszyce http://colaska.pl/index/parafia/id/2 18.2955971,50.5078563

View File

@ -34,17 +34,15 @@ class DuckDuckGo(object):
try: try:
resp = requests.post(link, proxies=proxy_dict, timeout=2) resp = requests.post(link, proxies=proxy_dict, timeout=2)
print(proxy_dict) print(proxy_dict)
self._verbose_print()
self.golden_proxies.append(proxy) self.golden_proxies.append(proxy)
return resp return resp
except: except:
print('Nr of falitures: ' + str(self.falitures) + ' Proxies: '
+ str(len(self.proxy_obj.proxies)) + ' Golden proxies: '
+ str(len(self.golden_proxies)))
self.proxy_obj.proxies.remove(proxy) self.proxy_obj.proxies.remove(proxy)
proxy = self.proxy_obj.random() proxy = self.proxy_obj.random()
proxy_dict = self._proxy_to_dict(proxy) proxy_dict = self._proxy_to_dict(proxy)
self.falitures += 1 self.falitures += 1
self._verbose_print()
total_nr_of_proxies = len( total_nr_of_proxies = len(
self.proxy_obj.proxies) + self.falitures self.proxy_obj.proxies) + self.falitures
if self.falitures > 0.95 * total_nr_of_proxies: if self.falitures > 0.95 * total_nr_of_proxies:
@ -56,6 +54,11 @@ class DuckDuckGo(object):
self.proxy_obj.proxies.extend(self.golden_proxies) self.proxy_obj.proxies.extend(self.golden_proxies)
del self.golden_proxies[:] del self.golden_proxies[:]
def _verbose_print(self):
    """Print a one-line status report about the proxy rotation.

    The line contains, in order: the value of ``self.falitures``, the
    number of entries in ``self.proxy_obj.proxies`` and the number of
    entries in ``self.golden_proxies``.
    """
    # Build the message from labelled pieces and emit it in one print call.
    status_parts = (
        'Nr of falitures: ',
        str(self.falitures),
        ' Proxies: ',
        str(len(self.proxy_obj.proxies)),
        ' Golden proxies: ',
        str(len(self.golden_proxies)),
    )
    print(''.join(status_parts))
def _proxy_to_dict(self, proxy): def _proxy_to_dict(self, proxy):
proxy_string = str(proxy[0]) + ':' + str(proxy[1]) proxy_string = str(proxy[0]) + ':' + str(proxy[1])
return { return {

View File

@ -2,6 +2,7 @@ from selenium import webdriver
from selenium import common from selenium import common
import re import re
import random import random
import time
# TODO: export path with geckodriver or chromedriver automatically and put driver in project files # TODO: export path with geckodriver or chromedriver automatically and put driver in project files
@ -39,8 +40,7 @@ class Proxy():
try: try:
driver.execute_script('gp.pageClick(' + str(i) + ')') driver.execute_script('gp.pageClick(' + str(i) + ')')
except common.exceptions.WebDriverException: except common.exceptions.WebDriverException:
import ipdb time.sleep(1)
ipdb.set_trace()
driver.execute_script('gp.pageClick(' + str(i) + ')') driver.execute_script('gp.pageClick(' + str(i) + ')')
print(i) print(i)

View File

@ -26,7 +26,7 @@ class ParishUrlChecker():
return False return False
for link in links: for link in links:
link = self._get_true_url(link) link = self._get_true_url(link)
if parish_url == link: if self._compare_urls(parish_url, link):
t_parish_url = parish_url + '\n' t_parish_url = parish_url + '\n'
self.urls += t_parish_url self.urls += t_parish_url
t_tsv = parish['name'] + '\t' + parish_url + '\t' + parish['city'] + '\t' + parish['street'] + '\t' + parish['postal_code'] + '\t' + parish['meta_url'] + '\t' + parish['gps'] + '\n' t_tsv = parish['name'] + '\t' + parish_url + '\t' + parish['city'] + '\t' + parish['street'] + '\t' + parish['postal_code'] + '\t' + parish['meta_url'] + '\t' + parish['gps'] + '\n'
@ -47,6 +47,9 @@ class ParishUrlChecker():
#print(links) #print(links)
return False return False
def _compare_urls(self, url_1, url_2):
    """Return True when both URLs are equal after ``_convert_url``.

    Each argument is passed through ``self._convert_url`` first, so the
    comparison is made on the converted forms rather than the raw strings.
    """
    converted_first = self._convert_url(url_1)
    converted_second = self._convert_url(url_2)
    return converted_first == converted_second
def _convert_url(self, url): def _convert_url(self, url):
if url.endswith('/'): if url.endswith('/'):
url = url[:-1] url = url[:-1]
@ -67,7 +70,6 @@ class ParishUrlChecker():
return new_url return new_url
except: except:
pass pass
print('Falied url: ' + url)
return '' return ''
@ -88,13 +90,9 @@ def main():
i = 1 i = 1
switch = True switch = True
for parish in parishes: for parish in parishes:
print(i)
if '10511' in parish['meta_url']:
switch = False
if switch:
continue
if parish['url']: if parish['url']:
urls_checker.check(parish, duck) if not urls_checker.check(parish, duck):
print('Not found: ' + parish['url'])
else: else:
print('none') print('none')
print( print(
@ -104,10 +102,6 @@ def main():
(urls_checker.added / (urls_checker.added /
(urls_checker.tried_urls or 1)) * 100) + '%') (urls_checker.tried_urls or 1)) * 100) + '%')
i += 1 i += 1
with open('urls_checked.txt', 'w') as f:
f.write(urls)
with open('parishes_checked.tsv', 'w') as f:
f.write(tsv)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -1 +1 @@
http://milosierdziegliwice.pl/ http://www.adamowice.katowice.opoka.org.pl/