diff --git a/crawler.ipynb b/crawler.ipynb index a51da0d..0b005ab 100644 --- a/crawler.ipynb +++ b/crawler.ipynb @@ -343,7 +343,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:41:22) [MSC v.1929 64 bit (AMD64)]" }, "orig_nbformat": 4, "vscode": { diff --git a/image_download.ipynb b/image_download.ipynb index 36d6c05..f38e604 100644 --- a/image_download.ipynb +++ b/image_download.ipynb @@ -262,24 +262,31 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "b = Image.open(requests.get(f\"https:{a['image_url'][0]}\", stream=True).raw)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, "metadata": {}, "outputs": [ { - "ename": "UnidentifiedImageError", - "evalue": "cannot identify image file <_io.BytesIO object at 0x0000021DA1EA14F0>", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mUnidentifiedImageError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[95], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m b \u001b[39m=\u001b[39m Image\u001b[39m.\u001b[39;49mopen(requests\u001b[39m.\u001b[39;49mget(\u001b[39mf\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mhttps:\u001b[39;49m\u001b[39m{\u001b[39;49;00ma[\u001b[39m'\u001b[39;49m\u001b[39mimage_url\u001b[39;49m\u001b[39m'\u001b[39;49m][\u001b[39m4\u001b[39;49m]\u001b[39m}\u001b[39;49;00m\u001b[39m\"\u001b[39;49m, stream\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\u001b[39m.\u001b[39;49mraw)\n", - "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\PIL\\Image.py:3147\u001b[0m, in \u001b[0;36mopen\u001b[1;34m(fp, mode, formats)\u001b[0m\n\u001b[0;32m 3145\u001b[0m \u001b[39mfor\u001b[39;00m message \u001b[39min\u001b[39;00m accept_warnings:\n\u001b[0;32m 3146\u001b[0m warnings\u001b[39m.\u001b[39mwarn(message)\n\u001b[1;32m-> 3147\u001b[0m \u001b[39mraise\u001b[39;00m UnidentifiedImageError(\n\u001b[0;32m 3148\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcannot identify image file \u001b[39m\u001b[39m%r\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m (filename \u001b[39mif\u001b[39;00m filename \u001b[39melse\u001b[39;00m fp)\n\u001b[0;32m 3149\u001b[0m )\n", - "\u001b[1;31mUnidentifiedImageError\u001b[0m: cannot identify image file <_io.BytesIO object at 0x0000021DA1EA14F0>" - ] + "data": { + "text/plain": [ + "(1024, 1486)" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "b = Image.open(requests.get(f\"https:{a['image_url'][4]}\", stream=True).raw)" + "b." ] }, { diff --git a/join.ipynb b/join.ipynb new file mode 100644 index 0000000..f5cb0f1 --- /dev/null +++ b/join.ipynb @@ -0,0 +1,68 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "yellow = pd.read_csv(\"../wikisource-data/yellow.tsv\", sep=\"\\t\")\n", + "yellow_c = pd.read_csv(\"../wikisource-data/yellow-continue-yellow.tsv.tsv\", sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "whole = pd.concat([yellow, yellow_c], axis=0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "whole.to_csv(\"./yellow-full.tsv\", sep=\"\\t\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "um", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "876e189cbbe99a9a838ece62aae1013186c4bb7e0254a10cfa2f9b2381853efb" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}