From 9f1423b3620f4672e75dc09cde82234e8f05caa1 Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Wed, 21 Jun 2017 22:51:53 +0200
Subject: [PATCH] fixed url checking

---
 parishes_checked_a.txt |  2 +-
 scraper/duckduckgo.py  | 11 +++++++----
 scraper/proxy.py       |  4 ++--
 scraper/urlschecker.py | 18 ++++++------------
 urls_checked_a.txt     |  2 +-
 5 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/parishes_checked_a.txt b/parishes_checked_a.txt
index db09729..19f1f33 100644
--- a/parishes_checked_a.txt
+++ b/parishes_checked_a.txt
@@ -1 +1 @@
-Parafia pod wezwaniem Miłosierdzia Bożego http://milosierdziegliwice.pl/ Gliwice Strzelnicza 5 44-100 Gliwice http://colaska.pl/index/parafia/id/10511 18.692078,50.322042
+Parafia pod wezwaniem NMP Królowej Aniołów http://www.adamowice.katowice.opoka.org.pl/ Adamowice ul. Poprzeczna 15 47-435 Raszyce http://colaska.pl/index/parafia/id/2 18.2955971,50.5078563
diff --git a/scraper/duckduckgo.py b/scraper/duckduckgo.py
index 2a505e4..d0c6187 100644
--- a/scraper/duckduckgo.py
+++ b/scraper/duckduckgo.py
@@ -34,17 +34,15 @@ class DuckDuckGo(object):
             try:
                 resp = requests.post(link, proxies=proxy_dict, timeout=2)
                 print(proxy_dict)
+                self._verbose_print()
                 self.golden_proxies.append(proxy)
                 return resp
             except:
-                print('Nr of falitures: ' + str(self.falitures) + ' Proxies: '
-                      + str(len(self.proxy_obj.proxies)) + ' Golden proxies: '
-                      + str(len(self.golden_proxies)))
                 self.proxy_obj.proxies.remove(proxy)
                 proxy = self.proxy_obj.random()
                 proxy_dict = self._proxy_to_dict(proxy)
-
                 self.falitures += 1
+                self._verbose_print()
                 total_nr_of_proxies = len(
                     self.proxy_obj.proxies) + self.falitures
                 if self.falitures > 0.95 * total_nr_of_proxies:
@@ -56,6 +54,11 @@ class DuckDuckGo(object):
         self.proxy_obj.proxies.extend(self.golden_proxies)
         del self.golden_proxies[:]
 
+    def _verbose_print(self):
+        print('Nr of falitures: ' + str(self.falitures) + ' Proxies: '
+              + str(len(self.proxy_obj.proxies)) + ' Golden proxies: '
+              + str(len(self.golden_proxies)))
+
     def _proxy_to_dict(self, proxy):
         proxy_string = str(proxy[0]) + ':' + str(proxy[1])
         return {
diff --git a/scraper/proxy.py b/scraper/proxy.py
index 518523c..6b8c298 100644
--- a/scraper/proxy.py
+++ b/scraper/proxy.py
@@ -2,6 +2,7 @@ from selenium import webdriver
 from selenium import common
 import re
 import random
+import time
 
 # TODO: export path with geckodriver or chromedriver automatically and put driver in project files
 
@@ -39,8 +40,7 @@ class Proxy():
             try:
                 driver.execute_script('gp.pageClick(' + str(i) + ')')
             except common.exceptions.WebDriverException:
-                import ipdb
-                ipdb.set_trace()
+                time.sleep(1)
                 driver.execute_script('gp.pageClick(' + str(i) + ')')
 
             print(i)
diff --git a/scraper/urlschecker.py b/scraper/urlschecker.py
index a19fb69..853bc26 100644
--- a/scraper/urlschecker.py
+++ b/scraper/urlschecker.py
@@ -26,7 +26,7 @@ class ParishUrlChecker():
             return False
         for link in links:
             link = self._get_true_url(link)
-            if parish_url == link:
+            if self._compare_urls(parish_url, link):
                 t_parish_url = parish_url + '\n'
                 self.urls += t_parish_url
                 t_tsv = parish['name'] + '\t' + parish_url + '\t' + parish['city'] + '\t' + parish['street'] + '\t' + parish['postal_code'] + '\t' + parish['meta_url'] + '\t' + parish['gps'] + '\n'
@@ -47,6 +47,9 @@ class ParishUrlChecker():
         #print(links)
         return False
 
+    def _compare_urls(self, url_1, url_2):
+        return self._convert_url(url_1) == self._convert_url(url_2)
+
     def _convert_url(self, url):
         if url.endswith('/'):
             url = url[:-1]
@@ -67,7 +70,6 @@ class ParishUrlChecker():
             return new_url
         except:
             pass
-        print('Falied url: ' + url)
         return ''
 
 
@@ -88,13 +90,9 @@ def main():
     i = 1
     switch = True
     for parish in parishes:
-        print(i)
-        if '10511' in parish['meta_url']:
-            switch = False
-        if switch:
-            continue
         if parish['url']:
-            urls_checker.check(parish, duck)
+            if not urls_checker.check(parish, duck):
+                print('Not found: ' + parish['url'])
         else:
             print('none')
         print(
@@ -104,10 +102,6 @@ def main():
             (urls_checker.added / (urls_checker.tried_urls or 1)) * 100) + '%')
         i += 1
 
-    with open('urls_checked.txt', 'w') as f:
-        f.write(urls)
-    with open('parishes_checked.tsv', 'w') as f:
-        f.write(tsv)
 
 
 if __name__ == "__main__":
diff --git a/urls_checked_a.txt b/urls_checked_a.txt
index 11b4f1e..caaecb4 100644
--- a/urls_checked_a.txt
+++ b/urls_checked_a.txt
@@ -1 +1 @@
-http://milosierdziegliwice.pl/
+http://www.adamowice.katowice.opoka.org.pl/
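
Note on the new URL comparison: the patch shows only the first lines of _convert_url, so the snippet below is a minimal standalone sketch of the normalization-based matching that _compare_urls relies on, assuming _convert_url does no more than the trailing-slash stripping visible in the hunk. The names normalize and urls_match are illustrative only and are not part of the repository.

    # Minimal sketch (hypothetical helper names) mirroring the
    # _compare_urls/_convert_url pattern added in scraper/urlschecker.py.
    def normalize(url):
        # Drop a single trailing slash so 'http://example.pl/' and
        # 'http://example.pl' normalize to the same string.
        if url.endswith('/'):
            url = url[:-1]
        return url

    def urls_match(url_1, url_2):
        # Two URLs count as the same page if they normalize identically.
        return normalize(url_1) == normalize(url_2)

    # Example: trailing-slash variants of the Adamowice URL now compare equal.
    assert urls_match('http://www.adamowice.katowice.opoka.org.pl/',
                      'http://www.adamowice.katowice.opoka.org.pl')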