From bdf5732b700e92fb436c4bc2b3fe21717fed916a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Koz=C5=82owski?= Date: Tue, 3 Jan 2023 14:12:57 +0100 Subject: [PATCH] edit in config --- crawler.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/crawler.py b/crawler.py index 379095d..9d04feb 100644 --- a/crawler.py +++ b/crawler.py @@ -11,6 +11,7 @@ def main(): URL_GREEN = "https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona" def get_page_data(page_element): + time.sleep(0.5) doc = requests.get(MAIN_URL + page_element['href']) doc_soup = BeautifulSoup(doc.text, 'lxml', from_encoding="utf-8") text_elem = doc_soup.find("div", {"class": "pagetext"}).next_element @@ -18,22 +19,26 @@ def main(): image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src'] return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text,} - r = requests.get(URL_YELLOW) + r = requests.get(URL_GREEN) soup = BeautifulSoup(r.text, 'lxml') page_number = 1 result_list = [] max_len = "".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]) try: while True: + time.sleep(5) next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None) if next_page and page_number != 1: r = requests.get(MAIN_URL + next_page) - else: + elif not next_page: break if r.status_code != 200: print(r.__dict__) - break + time.sleep(10) + r = requests.get(MAIN_URL + next_page) + if r.status_code != 200: + break soup = BeautifulSoup(r.text, 'lxml') page_number += 1 links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")}) @@ -44,10 +49,10 @@ def main(): except Exception as e: print(e) df = pd.DataFrame(result_list) - df.to_csv("./yellow.tsv", sep="\t") + df.to_csv("./green.tsv", sep="\t") df = pd.DataFrame(result_list) df.to_csv("./yellow.tsv", sep="\t") if __name__ == "__main__": - main() \ No newline at end of file + main()