fixed url checking
This commit is contained in:
parent
5ad2a36499
commit
9f1423b362
@ -1 +1 @@
|
|||||||
Parafia pod wezwaniem Miłosierdzia Bożego http://milosierdziegliwice.pl/ Gliwice Strzelnicza 5 44-100 Gliwice http://colaska.pl/index/parafia/id/10511 18.692078,50.322042
|
Parafia pod wezwaniem NMP Królowej Aniołów http://www.adamowice.katowice.opoka.org.pl/ Adamowice ul. Poprzeczna 15 47-435 Raszyce http://colaska.pl/index/parafia/id/2 18.2955971,50.5078563
|
||||||
|
@ -34,17 +34,15 @@ class DuckDuckGo(object):
|
|||||||
try:
|
try:
|
||||||
resp = requests.post(link, proxies=proxy_dict, timeout=2)
|
resp = requests.post(link, proxies=proxy_dict, timeout=2)
|
||||||
print(proxy_dict)
|
print(proxy_dict)
|
||||||
|
self._verbose_print()
|
||||||
self.golden_proxies.append(proxy)
|
self.golden_proxies.append(proxy)
|
||||||
return resp
|
return resp
|
||||||
except:
|
except:
|
||||||
print('Nr of falitures: ' + str(self.falitures) + ' Proxies: '
|
|
||||||
+ str(len(self.proxy_obj.proxies)) + ' Golden proxies: '
|
|
||||||
+ str(len(self.golden_proxies)))
|
|
||||||
self.proxy_obj.proxies.remove(proxy)
|
self.proxy_obj.proxies.remove(proxy)
|
||||||
proxy = self.proxy_obj.random()
|
proxy = self.proxy_obj.random()
|
||||||
proxy_dict = self._proxy_to_dict(proxy)
|
proxy_dict = self._proxy_to_dict(proxy)
|
||||||
|
|
||||||
self.falitures += 1
|
self.falitures += 1
|
||||||
|
self._verbose_print()
|
||||||
total_nr_of_proxies = len(
|
total_nr_of_proxies = len(
|
||||||
self.proxy_obj.proxies) + self.falitures
|
self.proxy_obj.proxies) + self.falitures
|
||||||
if self.falitures > 0.95 * total_nr_of_proxies:
|
if self.falitures > 0.95 * total_nr_of_proxies:
|
||||||
@ -56,6 +54,11 @@ class DuckDuckGo(object):
|
|||||||
self.proxy_obj.proxies.extend(self.golden_proxies)
|
self.proxy_obj.proxies.extend(self.golden_proxies)
|
||||||
del self.golden_proxies[:]
|
del self.golden_proxies[:]
|
||||||
|
|
||||||
|
def _verbose_print(self):
|
||||||
|
print('Nr of falitures: ' + str(self.falitures) + ' Proxies: ' +
|
||||||
|
str(len(self.proxy_obj.proxies)) + ' Golden proxies: ' +
|
||||||
|
str(len(self.golden_proxies)))
|
||||||
|
|
||||||
def _proxy_to_dict(self, proxy):
|
def _proxy_to_dict(self, proxy):
|
||||||
proxy_string = str(proxy[0]) + ':' + str(proxy[1])
|
proxy_string = str(proxy[0]) + ':' + str(proxy[1])
|
||||||
return {
|
return {
|
||||||
|
@ -2,6 +2,7 @@ from selenium import webdriver
|
|||||||
from selenium import common
|
from selenium import common
|
||||||
import re
|
import re
|
||||||
import random
|
import random
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
# TODO: export path with geckodriver or chromedriver automatically and put driver in project files
|
# TODO: export path with geckodriver or chromedriver automatically and put driver in project files
|
||||||
@ -39,8 +40,7 @@ class Proxy():
|
|||||||
try:
|
try:
|
||||||
driver.execute_script('gp.pageClick(' + str(i) + ')')
|
driver.execute_script('gp.pageClick(' + str(i) + ')')
|
||||||
except common.exceptions.WebDriverException:
|
except common.exceptions.WebDriverException:
|
||||||
import ipdb
|
time.sleep(1)
|
||||||
ipdb.set_trace()
|
|
||||||
driver.execute_script('gp.pageClick(' + str(i) + ')')
|
driver.execute_script('gp.pageClick(' + str(i) + ')')
|
||||||
print(i)
|
print(i)
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ class ParishUrlChecker():
|
|||||||
return False
|
return False
|
||||||
for link in links:
|
for link in links:
|
||||||
link = self._get_true_url(link)
|
link = self._get_true_url(link)
|
||||||
if parish_url == link:
|
if self._compare_urls(parish_url, link):
|
||||||
t_parish_url = parish_url + '\n'
|
t_parish_url = parish_url + '\n'
|
||||||
self.urls += t_parish_url
|
self.urls += t_parish_url
|
||||||
t_tsv = parish['name'] + '\t' + parish_url + '\t' + parish['city'] + '\t' + parish['street'] + '\t' + parish['postal_code'] + '\t' + parish['meta_url'] + '\t' + parish['gps'] + '\n'
|
t_tsv = parish['name'] + '\t' + parish_url + '\t' + parish['city'] + '\t' + parish['street'] + '\t' + parish['postal_code'] + '\t' + parish['meta_url'] + '\t' + parish['gps'] + '\n'
|
||||||
@ -47,6 +47,9 @@ class ParishUrlChecker():
|
|||||||
#print(links)
|
#print(links)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _compare_urls(self, url_1, url_2):
|
||||||
|
return self._convert_url(url_1) == self._convert_url(url_2)
|
||||||
|
|
||||||
def _convert_url(self, url):
|
def _convert_url(self, url):
|
||||||
if url.endswith('/'):
|
if url.endswith('/'):
|
||||||
url = url[:-1]
|
url = url[:-1]
|
||||||
@ -67,7 +70,6 @@ class ParishUrlChecker():
|
|||||||
return new_url
|
return new_url
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
print('Falied url: ' + url)
|
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|
||||||
@ -88,13 +90,9 @@ def main():
|
|||||||
i = 1
|
i = 1
|
||||||
switch = True
|
switch = True
|
||||||
for parish in parishes:
|
for parish in parishes:
|
||||||
print(i)
|
|
||||||
if '10511' in parish['meta_url']:
|
|
||||||
switch = False
|
|
||||||
if switch:
|
|
||||||
continue
|
|
||||||
if parish['url']:
|
if parish['url']:
|
||||||
urls_checker.check(parish, duck)
|
if not urls_checker.check(parish, duck):
|
||||||
|
print('Not found: ' + parish['url'])
|
||||||
else:
|
else:
|
||||||
print('none')
|
print('none')
|
||||||
print(
|
print(
|
||||||
@ -104,10 +102,6 @@ def main():
|
|||||||
(urls_checker.added /
|
(urls_checker.added /
|
||||||
(urls_checker.tried_urls or 1)) * 100) + '%')
|
(urls_checker.tried_urls or 1)) * 100) + '%')
|
||||||
i += 1
|
i += 1
|
||||||
with open('urls_checked.txt', 'w') as f:
|
|
||||||
f.write(urls)
|
|
||||||
with open('parishes_checked.tsv', 'w') as f:
|
|
||||||
f.write(tsv)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -1 +1 @@
|
|||||||
http://milosierdziegliwice.pl/
|
http://www.adamowice.katowice.opoka.org.pl/
|
||||||
|
Loading…
Reference in New Issue
Block a user