diff --git a/README.md b/README.md new file mode 100644 index 0000000..0a760e5 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# Wikisource crawler and image downloader + +## Requirements: +Python >= 3.8 + +## Install/setup: +`pip install -r requirements.txt` + +## Usage crawler +`python crawler.py --type {green or yellow or red} --output_file_name {output tsv file name} --start_file_name {name of file to start crawling from} --start_page_number {page of file to start crawling}` + +## Usage image downloader +`python image_download.py --file_path {tsv file with data to download} --output_folder {folder to output images -> default images} --max_folder_size_mb {size in MB to stop, if not given will download all} --from_checkpoint {True to start from checkpoint if pickle available}` \ No newline at end of file diff --git a/image_download.py b/image_download.py index 99de7ea..882edb5 100644 --- a/image_download.py +++ b/image_download.py @@ -7,6 +7,7 @@ from tqdm import tqdm import pickle import time from pprint import pprint +import json headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'} @@ -30,32 +31,38 @@ def main(args): print("Starting from checkpoint, index: ", offset) df = df[offset:] - for n, row in enumerate(tqdm(df.iterrows(), total=len(df))): + pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb if args.max_folder_size_mb else ''} MB") + for n, row in enumerate(pbar): try: time.sleep(0.2) r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers) if r.status_code != 200: pprint(r.__dict__) save_state(n, offset) - break + return image = Image.open(r.raw) if image.mode != "RGB": image = image.convert("RGB") title = row[1]['title'].replace("Strona:", "").replace("/", "-") image.save(f"{args.output_folder}/{title}.png") - if round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2) > args.max_folder_size_mb: + with open(f"{args.output_folder}/metadata.jsonl", 
mode='a', encoding='utf-8') as f: + f.write(json.dumps({"file_name": title, "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False)}, ensure_ascii=False) + "\n") + + dir_size = round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2) + + pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else ''} MB") + + if args.max_folder_size_mb and dir_size > args.max_folder_size_mb: print(f"Limit size of: {args.max_folder_size_mb}, exceeded") save_state(n, offset) - break - except Exception as e: - print(e) - save_state(n, offset) - break + return - except KeyboardInterrupt: + except (Exception, KeyboardInterrupt) as e: + print(f"Error: {str(e)} \n") + print(f"Row: {row}") save_state(n, offset) - break + return @@ -63,7 +70,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--file_path", type=str, required=True) parser.add_argument("--output_folder", type=str, default="./images") - parser.add_argument("--max_folder_size_mb", default=5000.0, type=float, required=False) + parser.add_argument("--max_folder_size_mb", type=float, required=False) parser.add_argument("--from_checkpoint", type=bool, required=False, default=False) args, left_argv = parser.parse_known_args() main(args) \ No newline at end of file diff --git a/notebooks/image_download.ipynb b/notebooks/image_download.ipynb index e69de29..45e6de8 100644 --- a/notebooks/image_download.ipynb +++ b/notebooks/image_download.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "a = pd.read_csv(\"../../wikisource-data/yellow-continue-yellow.tsv.tsv\", sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ 
{ + "data": { + "text/html": [ + "
\n", + " | Unnamed: 0 | \n", + "title | \n", + "href | \n", + "image_url | \n", + "text | \n", + "
---|---|---|---|---|---|
0 | \n", + "0 | \n", + "Strona:Stanisław Antoni Wotowski - George Sand... | \n", + "https://pl.wikisource.org//wiki/Strona:Stanis%... | \n", + "//upload.wikimedia.org/wikipedia/commons/thumb... | \n", + "zmieniła się; piękne oczy są tak samo błyszczą... | \n", + "
1 | \n", + "1 | \n", + "Strona:Stanisław Antoni Wotowski - George Sand... | \n", + "https://pl.wikisource.org//wiki/Strona:Stanis%... | \n", + "//upload.wikimedia.org/wikipedia/commons/thumb... | \n", + "najświetniejszej chociażby sławy... i po piętn... | \n", + "
2 | \n", + "2 | \n", + "Strona:Stanisław Antoni Wotowski - George Sand... | \n", + "https://pl.wikisource.org//wiki/Strona:Stanis%... | \n", + "//upload.wikimedia.org/wikipedia/commons/thumb... | \n", + "Chopin gra. Ledwie dostrzegalnie muskają smuk... | \n", + "
3 | \n", + "3 | \n", + "Strona:Stanisław Antoni Wotowski - George Sand... | \n", + "https://pl.wikisource.org//wiki/Strona:Stanis%... | \n", + "//upload.wikimedia.org/wikipedia/commons/thumb... | \n", + "\\nDZIWACZNE MAŁŻEŃSTWO.\\n\\n Był grudzień 1830 ... | \n", + "
4 | \n", + "4 | \n", + "Strona:Stanisław Antoni Wotowski - George Sand... | \n", + "https://pl.wikisource.org//wiki/Strona:Stanis%... | \n", + "//upload.wikimedia.org/wikipedia/commons/thumb... | \n", + "Ale bliższego związku z panią Sand jakby się ... | \n", + "