update of crawler

mkozlowskiAzimuthe 2023-01-03 16:11:50 +01:00
parent 73857d897d
commit b904e64f01
2 changed files with 37 additions and 13382 deletions

@@ -6,10 +6,11 @@ from tqdm import tqdm
 import time
 import argparse
 
-def main():
-    MAIN_URL = "https://pl.wikisource.org/"
-    URL_YELLOW = "https://pl.wikisource.org/wiki/Kategoria:Skorygowana"
-    URL_GREEN = "https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona"
+MAIN_URL = "https://pl.wikisource.org/"
+
+def main(args):
+    category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
+    CATEGORY_URL = f"{MAIN_URL}/wiki/Kategoria:{category_dict[args.type]}"
 
     def get_page_data(page_element):
         time.sleep(0.5)
@@ -20,40 +21,47 @@ def main():
         image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
         return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text,}
 
-    r = requests.get(URL_GREEN)
+    r = requests.get(CATEGORY_URL)
     soup = BeautifulSoup(r.text, 'lxml')
     page_number = 1
+    data_number = 0
     result_list = []
-    max_len = "".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:])
+    max_len = int("".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]))
     try:
-        while True:
-            time.sleep(5)
-            next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None)
-            if next_page and page_number != 1:
-                r = requests.get(MAIN_URL + next_page)
-            elif not next_page:
-                break
-            if r.status_code != 200:
-                print(r.__dict__)
-                time.sleep(10)
-                r = requests.get(MAIN_URL + next_page)
-                if r.status_code != 200:
-                    break
-            soup = BeautifulSoup(r.text, 'lxml')
-            page_number += 1
-            links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
-            for link in tqdm(links):
-                result_list.append(get_page_data(link))
-            print("Page number:", page_number)
-            print("Number of elements:", 200 * page_number, "/", max_len)
+        with tqdm(total=max_len) as pbar:
+            while data_number < max_len:
+                pbar.set_description(f"Page number: {page_number}")
+                time.sleep(5)
+                next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None)
+                if next_page and page_number != 1:
+                    r = requests.get(MAIN_URL + next_page)
+                elif not next_page:
+                    break
+                if r.status_code != 200:
+                    print(r.__dict__)
+                    time.sleep(10)
+                    r = requests.get(MAIN_URL + next_page)
+                    if r.status_code != 200:
+                        break
+                soup = BeautifulSoup(r.text, 'lxml')
+                links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
+                page_number += 1
+                for link in links:
+                    result_list.append(get_page_data(link))
+                    data_number += 1
+                    pbar.update(1)
     except Exception as e:
         print(e)
     df = pd.DataFrame(result_list)
-    df.to_csv("./green.tsv", sep="\t")
+    df.to_csv(f"./{args.type}.tsv", sep="\t")
     df = pd.DataFrame(result_list)
-    df.to_csv("./yellow.tsv", sep="\t")
+    df.to_csv(f"./{args.type}.tsv", sep="\t")
 
 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--type", type=str, default='green', choices=["green", "yellow", "red"])
+    args, left_argv = parser.parse_known_args()
+    main(args)

yellow.tsv (13353 deletions)

File diff suppressed because it is too large
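
A minimal sketch of reading the crawler's output back into pandas, assuming the script was run with --type yellow; the script filename crawler.py and the output path are assumptions, while the separator and the title/href/image_url/text columns follow the to_csv call and the dict built in get_page_data() above.

    # Assumed invocation of the updated crawler (script name is hypothetical):
    #   python crawler.py --type yellow
    import pandas as pd

    # Read the TSV written by the crawler; index_col=0 skips the default
    # integer index that DataFrame.to_csv writes alongside the data.
    df = pd.read_csv("./yellow.tsv", sep="\t", index_col=0)
    print(df[["title", "href", "image_url"]].head())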