{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import time\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAIN_URL = \"https://pl.wikisource.org/\"\n",
    "URL_YELLOW = \"https://pl.wikisource.org/wiki/Kategoria:Skorygowana\"\n",
    "URL_GREEN = \"https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona\"\n",
    "# Seconds to wait for a response, so a stalled connection cannot hang the notebook.\n",
    "REQUEST_TIMEOUT = 30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_page_data(page_element, timeout=REQUEST_TIMEOUT):\n",
    "    \"\"\"Fetch the page a link points to and extract its text and image URL.\n",
    "\n",
    "    Args:\n",
    "        page_element: Link element supplying 'href' and 'title' via item access.\n",
    "        timeout: Seconds passed to requests.get; defaults to REQUEST_TIMEOUT.\n",
    "\n",
    "    Returns:\n",
    "        dict with keys 'title', 'href' (absolute page URL), 'image_url' and 'text'.\n",
    "        'text' is the literal 'math image' when the text node contains a <math> element.\n",
    "\n",
    "    Raises:\n",
    "        AttributeError: if the 'pagetext' or 'prp-page-image' div is not found.\n",
    "    \"\"\"\n",
    "    # MAIN_URL ends with '/', so concatenating a root-relative href such as\n",
    "    # '/wiki/Strona:...' would produce '//wiki/...'; urljoin resolves it cleanly.\n",
    "    page_url = urljoin(MAIN_URL, page_element['href'])\n",
    "    doc = requests.get(page_url, timeout=timeout)\n",
    "    # from_encoding only applies to bytes; given an already-decoded str it is\n",
    "    # ignored with a warning, so hand BeautifulSoup the raw response body.\n",
    "    doc_soup = BeautifulSoup(doc.content, 'lxml', from_encoding=\"utf-8\")\n",
    "    text_elem = doc_soup.find(\"div\", {\"class\": \"pagetext\"}).next_element\n",
    "    # NOTE(review): assumes the first node inside the div is a tag; on a bare text\n",
    "    # node .find would be str.find and return an int - confirm against live markup.\n",
    "    text = \"math image\" if text_elem.find(\"math\") else text_elem.text\n",
    "    image_url = doc_soup.find(\"div\", {\"class\": \"prp-page-image\"}).next_element['src']\n",
    "    return {\n",
    "        \"title\": page_element['title'],\n",
    "        \"href\": page_url,\n",
    "        \"image_url\": image_url,\n",
    "        \"text\": text,\n",
    "    }\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'107472'"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): `soup` is not defined in any cell above; it must already exist in the kernel.\n",
    "# NOTE(review): [3:] drops the first three digits found in the paragraph - presumably a\n",
    "# leading three-digit count; confirm against the page text.\n",
    "# The pattern is a raw string because \"\\d\" in a normal literal is an invalid escape sequence.\n",
    "\"\".join(re.findall(r\"\\d\", re.sub(\"\\xa0\", '', soup.find(\"div\", {\"id\": \"mw-pages\"}).find(\"p\").text))[3:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "links = soup.find_all(\"a\", {\"href\": re.compile(r\"\\/wiki\\/Strona:.*\")})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "next_page = soup.find(\"a\", {\"href\": re.compile(r\"\\/w\\/index.php.*\")}, string=\"następna strona\")"
   ]
  },
  {
"cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/200 [00:00