edit in config

Michał Kozłowski 2023-01-03 14:12:57 +01:00
parent c68c41e9b5
commit bdf5732b70


@@ -11,6 +11,7 @@ def main():
     URL_GREEN = "https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona"
 
     def get_page_data(page_element):
+        time.sleep(0.5)
         doc = requests.get(MAIN_URL + page_element['href'])
         doc_soup = BeautifulSoup(doc.text, 'lxml', from_encoding="utf-8")
         text_elem = doc_soup.find("div", {"class": "pagetext"}).next_element
@@ -18,22 +19,26 @@ def main():
         image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
         return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text,}
 
-    r = requests.get(URL_YELLOW)
+    r = requests.get(URL_GREEN)
     soup = BeautifulSoup(r.text, 'lxml')
     page_number = 1
     result_list = []
     max_len = "".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:])
 
     try:
         while True:
+            time.sleep(5)
             next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None)
             if next_page and page_number != 1:
                 r = requests.get(MAIN_URL + next_page)
-            else:
+            elif not next_page:
                 break
             if r.status_code != 200:
                 print(r.__dict__)
-                break
+                time.sleep(10)
+                r = requests.get(MAIN_URL + next_page)
+                if r.status_code != 200:
+                    break
             soup = BeautifulSoup(r.text, 'lxml')
             page_number += 1
             links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
@@ -44,10 +49,10 @@ def main():
     except Exception as e:
         print(e)
 
     df = pd.DataFrame(result_list)
-    df.to_csv("./yellow.tsv", sep="\t")
+    df.to_csv("./green.tsv", sep="\t")
     df = pd.DataFrame(result_list)
     df.to_csv("./yellow.tsv", sep="\t")
 
 if __name__ == "__main__":
     main()
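
The change swaps the hard `break` on a non-200 response for a pause and a single retry, and throttles both the per-page fetches (`time.sleep(0.5)`) and the listing loop (`time.sleep(5)`). Below is a minimal sketch of that throttle-and-retry pattern as a standalone helper, assuming only `requests`; the name `fetch_with_retry` is illustrative and not part of the script:

import time
import requests

def fetch_with_retry(url, retries=1, backoff=10, throttle=5):
    # Hypothetical helper mirroring the pattern this commit adds:
    # throttle before each fetch, then retry once after a pause
    # instead of aborting on the first non-200 response.
    time.sleep(throttle)        # be polite between requests to the wiki
    r = requests.get(url)
    for _ in range(retries):
        if r.status_code == 200:
            break
        time.sleep(backoff)     # wait before retrying the failed request
        r = requests.get(url)
    return r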