crawler init
commit c68c41e9b5
crawler.ipynb (324 lines, new file)
File diff suppressed because one or more lines are too long
crawler.py (53 lines, new file)
@@ -0,0 +1,53 @@
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import time

def main():
    MAIN_URL = "https://pl.wikisource.org/"
    # Category pages for proofread ("Skorygowana") and validated
    # ("Uwierzytelniona") page scans on Polish Wikisource.
    URL_YELLOW = "https://pl.wikisource.org/wiki/Kategoria:Skorygowana"
    URL_GREEN = "https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona"

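    # Fetch a single scan page; return its title, URL, scan image URL,
    # and extracted text.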
    def get_page_data(page_element):
        doc = requests.get(MAIN_URL + page_element['href'])
        # Pass raw bytes so from_encoding takes effect (it is ignored,
        # with a warning, when the markup is already a str).
        doc_soup = BeautifulSoup(doc.content, 'lxml', from_encoding="utf-8")
        text_elem = doc_soup.find("div", {"class": "pagetext"}).next_element
        # Pages built around <math> markup carry no usable plain text.
        text = text_elem.text if not text_elem.find("math") else "math image"
        image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
        return {
            "title": page_element['title'],
            "href": MAIN_URL + page_element['href'],
            "image_url": image_url,
            "text": text,
        }

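    # Walk the category listing page by page, scraping every linked scan.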
    r = requests.get(URL_YELLOW)
    soup = BeautifulSoup(r.text, 'lxml')
    page_number = 1
    result_list = []
    # Total number of pages in the category, read from the "mw-pages"
    # summary paragraph: strip non-breaking spaces, keep the digits, and
    # drop the first three, which belong to the per-page count of 200.
    max_len = "".join(re.findall(r"\d", re.sub("\xa0", "", soup.find("div", {"id": "mw-pages"}).find("p").text))[3:])
    try:
        while True:
            # "następna strona" ("next page") is rendered as a link only
            # while further listing pages exist.
            next_link = soup.find("a", {"href": re.compile(r"/w/index\.php.*")}, string="następna strona")
            next_page = next_link.get('href') if next_link else None
            # The first listing page was already fetched above.
            if page_number != 1:
                if not next_page:
                    break
                r = requests.get(MAIN_URL + next_page)
            if r.status_code != 200:
                print(r.__dict__)
                break
            soup = BeautifulSoup(r.text, 'lxml')
            # Each listing page links up to 200 individual scan pages.
            links = soup.find_all("a", {"href": re.compile(r"/wiki/Strona:.*")})
            for link in tqdm(links):
                result_list.append(get_page_data(link))
            print("Page number:", page_number)
            print("Number of elements:", 200 * page_number, "/", max_len)
            page_number += 1
    except Exception as e:
        print(e)

    # Persist whatever was collected, even if the crawl stopped early.
    df = pd.DataFrame(result_list)
    df.to_csv("./yellow.tsv", sep="\t")


if __name__ == "__main__":
    main()
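For quick inspection, the resulting TSV loads back with pandas (a minimal sketch, assuming the crawl above completed and wrote ./yellow.tsv):

import pandas as pd

# to_csv() above writes the DataFrame index, so read it back as column 0.
df = pd.read_csv("./yellow.tsv", sep="\t", index_col=0)
print(df[["title", "image_url", "text"]].head())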