diff --git a/README.md b/README.md new file mode 100644 index 0000000..0a760e5 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# Wikisource crawler and image downloader + +## Requirements: +Python 3.8 or newer + +## Install/setup: +`pip install -r requirements.txt` + +## Usage: crawler +`python crawler.py --type {green, yellow or red} --output_file_name {output tsv file name} --start_file_name {name of the file to start crawling from} --start_page_number {page of that file to start crawling from}` + +## Usage: image downloader +`python image_download.py --file_path {tsv file with data to download} --output_folder {folder to output images, default: ./images} --max_folder_size_mb {folder size limit in MB at which to stop; if not given, everything is downloaded} --from_checkpoint {True to resume from the checkpoint if a pickle file is available}` \ No newline at end of file diff --git a/image_download.py b/image_download.py index 99de7ea..882edb5 100644 --- a/image_download.py +++ b/image_download.py @@ -7,6 +7,7 @@ from tqdm import tqdm import pickle import time from pprint import pprint +import json headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'} @@ -30,32 +31,38 @@ def main(args): print("Starting from checkpoint, index: ", offset) df = df[offset:] - for n, row in enumerate(tqdm(df.iterrows(), total=len(df))): + pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb} MB") + for n, row in enumerate(pbar): try: time.sleep(0.2) r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers) if r.status_code != 200: pprint(r.__dict__) save_state(n, offset) - break + return image = Image.open(r.raw) if image.mode != "RGB": image = image.convert("RGB") title = row[1]['title'].replace("Strona:", "").replace("/", "-") image.save(f"{args.output_folder}/{title}.png") - if round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2) > args.max_folder_size_mb: + with open(f"{args.output_folder}/metadata.jsonl", 
mode='a', encoding='utf-8') as f: + f.write(str({"file_name": title, "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False)}) + "\n") + + dir_size = round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2) + + pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else ''} MB") + + if args.max_folder.size_mb and dir_size > args.max_folder_size_mb: print(f"Limit size of: {args.max_folder_size_mb}, exceeded") save_state(n, offset) - break - except Exception as e: - print(e) - save_state(n, offset) - break + return - except KeyboardInterrupt: + except (Exception, KeyboardInterrupt) as e: + print(f"Error: {str(e)} \n") + print(f"Row: {row}") save_state(n, offset) - break + return @@ -63,7 +70,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--file_path", type=str, required=True) parser.add_argument("--output_folder", type=str, default="./images") - parser.add_argument("--max_folder_size_mb", default=5000.0, type=float, required=False) + parser.add_argument("--max_folder_size_mb", type=float, required=False) parser.add_argument("--from_checkpoint", type=bool, required=False, default=False) args, left_argv = parser.parse_known_args() main(args) \ No newline at end of file diff --git a/notebooks/image_download.ipynb b/notebooks/image_download.ipynb index e69de29..45e6de8 100644 --- a/notebooks/image_download.ipynb +++ b/notebooks/image_download.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "a = pd.read_csv(\"../../wikisource-data/yellow-continue-yellow.tsv.tsv\", sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + 
{ + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0titlehrefimage_urltext
00Strona:Stanisław Antoni Wotowski - George Sand...https://pl.wikisource.org//wiki/Strona:Stanis%...//upload.wikimedia.org/wikipedia/commons/thumb...zmieniła się; piękne oczy są tak samo błyszczą...
11Strona:Stanisław Antoni Wotowski - George Sand...https://pl.wikisource.org//wiki/Strona:Stanis%...//upload.wikimedia.org/wikipedia/commons/thumb...najświetniejszej chociażby sławy... i po piętn...
22Strona:Stanisław Antoni Wotowski - George Sand...https://pl.wikisource.org//wiki/Strona:Stanis%...//upload.wikimedia.org/wikipedia/commons/thumb...Chopin gra. Ledwie dostrzegalnie muskają smuk...
33Strona:Stanisław Antoni Wotowski - George Sand...https://pl.wikisource.org//wiki/Strona:Stanis%...//upload.wikimedia.org/wikipedia/commons/thumb...\\nDZIWACZNE MAŁŻEŃSTWO.\\n\\n Był grudzień 1830 ...
44Strona:Stanisław Antoni Wotowski - George Sand...https://pl.wikisource.org//wiki/Strona:Stanis%...//upload.wikimedia.org/wikipedia/commons/thumb...Ale bliższego związku z panią Sand jakby się ...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 title \\\n", + "0 0 Strona:Stanisław Antoni Wotowski - George Sand... \n", + "1 1 Strona:Stanisław Antoni Wotowski - George Sand... \n", + "2 2 Strona:Stanisław Antoni Wotowski - George Sand... \n", + "3 3 Strona:Stanisław Antoni Wotowski - George Sand... \n", + "4 4 Strona:Stanisław Antoni Wotowski - George Sand... \n", + "\n", + " href \\\n", + "0 https://pl.wikisource.org//wiki/Strona:Stanis%... \n", + "1 https://pl.wikisource.org//wiki/Strona:Stanis%... \n", + "2 https://pl.wikisource.org//wiki/Strona:Stanis%... \n", + "3 https://pl.wikisource.org//wiki/Strona:Stanis%... \n", + "4 https://pl.wikisource.org//wiki/Strona:Stanis%... \n", + "\n", + " image_url \\\n", + "0 //upload.wikimedia.org/wikipedia/commons/thumb... \n", + "1 //upload.wikimedia.org/wikipedia/commons/thumb... \n", + "2 //upload.wikimedia.org/wikipedia/commons/thumb... \n", + "3 //upload.wikimedia.org/wikipedia/commons/thumb... \n", + "4 //upload.wikimedia.org/wikipedia/commons/thumb... \n", + "\n", + " text \n", + "0 zmieniła się; piękne oczy są tak samo błyszczą... \n", + "1 najświetniejszej chociażby sławy... i po piętn... \n", + "2  Chopin gra. Ledwie dostrzegalnie muskają smuk... \n", + "3 \\nDZIWACZNE MAŁŻEŃSTWO.\\n\\n Był grudzień 1830 ... \n", + "4  Ale bliższego związku z panią Sand jakby się ... 
" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "um", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "876e189cbbe99a9a838ece62aae1013186c4bb7e0254a10cfa2f9b2381853efb" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/join.ipynb b/notebooks/join.ipynb index f5cb0f1..531e9bb 100644 --- a/notebooks/join.ipynb +++ b/notebooks/join.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -11,30 +11,57 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "yellow = pd.read_csv(\"../wikisource-data/yellow.tsv\", sep=\"\\t\")\n", - "yellow_c = pd.read_csv(\"../wikisource-data/yellow-continue-yellow.tsv.tsv\", sep=\"\\t\")" + "green = pd.read_csv(\"../../wikisource-data/green.tsv\", sep=\"\\t\")\n" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "whole = pd.concat([yellow, yellow_c], axis=0)\n" + "green.tail()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "whole.to_csv(\"./yellow-full.tsv\", sep=\"\\t\")" + "green = pd.read_csv(\"../green-full.tsv\", sep=\"\\t\")\n", + "yellow = pd.read_csv(\"../yellow-full.tsv\", sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "whole = pd.concat([green, yellow], 
axis=0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(whole)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "whole.to_csv(\"./wikisource-full.tsv\", sep=\"\\t\")" ] } ],