From c68c41e9b55bc1ff6b189d72c47a2a98444aef73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Koz=C5=82owski?= Date: Tue, 3 Jan 2023 10:53:34 +0100 Subject: [PATCH] crawler init --- crawler.ipynb | 324 ++++++++++++++++++++++++++++++++++++++++++++++++++ crawler.py | 53 +++++++++ 2 files changed, 377 insertions(+) create mode 100644 crawler.ipynb create mode 100644 crawler.py diff --git a/crawler.ipynb b/crawler.ipynb new file mode 100644 index 0000000..23f37cf --- /dev/null +++ b/crawler.ipynb @@ -0,0 +1,324 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import re\n", + "from tqdm import tqdm\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "MAIN_URL = \"https://pl.wikisource.org/\"\n", + "URL_YELLOW = \"https://pl.wikisource.org/wiki/Kategoria:Skorygowana\"\n", + "URL_GREEN = \"https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona\"" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "def get_page_data(page_element):\n", + " doc = requests.get(MAIN_URL + page_element['href'])\n", + " doc_soup = BeautifulSoup(doc.text, 'lxml', from_encoding=\"utf-8\")\n", + " text_elem = doc_soup.find(\"div\", {\"class\": \"pagetext\"}).next_element\n", + " text = text_elem.text if not text_elem.find(\"math\") else \"math image\"\n", + " image_url = doc_soup.find(\"div\", {\"class\": \"prp-page-image\"}).next_element['src']\n", + " return {\"title\": page_element['title'], \"href\": MAIN_URL + page_element['href'], \"image_url\": image_url, \"text\": text,}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'107472'" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\".join(re.findall(\"\\d\", re.sub(\"\\xa0\",'', soup.find(\"div\", {\"id\": \"mw-pages\"}).find(\"p\").text))[3:])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "links = soup.find_all(\"a\", {\"href\": re.compile(r\"\\/wiki\\/Strona:.*\")})" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "next_page = soup.find(\"a\", {\"href\": re.compile(r\"\\/w\\/index.php.*\")}, string=\"następna strona\")" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/200 [00:00