diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0a760e5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+# Wikisource crawler and image downloader
+
+## Requirements:
+Python 3.8 or newer
+ 
+## Install/setup:
+`pip install -r requirements.txt`
+
+## Usage crawler
+`python crawler.py --type {green or yellow or red} --output_file_name {output tsv file name} --start_file_name {name of file to start crawling from} --start_page_number {page of file to start crawling}`
+
+## Usage image downloader
+`python image_download.py --file_path {tsv file with data to download} --output_folder {folder to write images to, default ./images} --max_folder_size_mb {folder size limit in MB at which to stop; if omitted, everything is downloaded} --from_checkpoint {True to resume from the saved checkpoint if a pickle is available}`
\ No newline at end of file
diff --git a/image_download.py b/image_download.py
index 99de7ea..882edb5 100644
--- a/image_download.py
+++ b/image_download.py
@@ -7,6 +7,7 @@ from tqdm import tqdm
 import pickle
 import time
 from pprint import pprint
+import json
 
 headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'}
 
@@ -30,32 +31,38 @@ def main(args):
             print("Starting from checkpoint, index: ", offset)
             df = df[offset:]
 
-    for n, row in enumerate(tqdm(df.iterrows(), total=len(df))):
+    pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb} MB")
+    for n, row in enumerate(pbar):
         try:
             time.sleep(0.2)
             r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
             if r.status_code != 200:
                 pprint(r.__dict__)
                 save_state(n, offset)
-                break
+                return
             image = Image.open(r.raw)
             if image.mode != "RGB":
                 image = image.convert("RGB")
             title = row[1]['title'].replace("Strona:", "").replace("/", "-")
             image.save(f"{args.output_folder}/{title}.png")
 
-            if round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2) > args.max_folder_size_mb:
+            # Append one valid JSON object per line; str(dict) would emit a Python repr, not JSON.
+            with open(f"{args.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
+                f.write(json.dumps({"file_name": title, "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False)}, ensure_ascii=False) + "\n")
+
+            dir_size = round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2)
+            pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else ''} MB")
+            # No size limit is enforced when --max_folder_size_mb is omitted (None) or 0.
+            if args.max_folder_size_mb and dir_size > args.max_folder_size_mb:
                 print(f"Limit size of: {args.max_folder_size_mb}, exceeded")
                 save_state(n, offset) 
-                break
-        except Exception as e:
-            print(e)
-            save_state(n, offset)
-            break
+                return
 
-        except KeyboardInterrupt:
+        except (Exception, KeyboardInterrupt) as e:
+            print(f"Error: {str(e)} \n")
+            print(f"Row: {row}")
             save_state(n, offset)
-            break
+            return
 
 
 
@@ -63,7 +70,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--file_path", type=str, required=True)
     parser.add_argument("--output_folder", type=str, default="./images")
-    parser.add_argument("--max_folder_size_mb", default=5000.0, type=float, required=False)
+    parser.add_argument("--max_folder_size_mb", type=float, required=False)
     parser.add_argument("--from_checkpoint", type=bool, required=False, default=False)
     args, left_argv = parser.parse_known_args()
     main(args)
\ No newline at end of file
diff --git a/notebooks/image_download.ipynb b/notebooks/image_download.ipynb
index e69de29..45e6de8 100644
--- a/notebooks/image_download.ipynb
+++ b/notebooks/image_download.ipynb
@@ -0,0 +1,166 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "a = pd.read_csv(\"../../wikisource-data/yellow-continue-yellow.tsv.tsv\", sep=\"\\t\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>title</th>\n",
+       "      <th>href</th>\n",
+       "      <th>image_url</th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
+       "      <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
+       "      <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
+       "      <td>zmieniła się; piękne oczy są tak samo błyszczą...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
+       "      <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
+       "      <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
+       "      <td>najświetniejszej chociażby sławy... i po piętn...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
+       "      <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
+       "      <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
+       "      <td>Chopin gra. Ledwie dostrzegalnie muskają smuk...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
+       "      <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
+       "      <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
+       "      <td>\\nDZIWACZNE MAŁŻEŃSTWO.\\n\\n Był grudzień 1830 ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
+       "      <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
+       "      <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
+       "      <td>Ale bliższego związku z panią Sand jakby się ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Unnamed: 0                                              title  \\\n",
+       "0           0  Strona:Stanisław Antoni Wotowski - George Sand...   \n",
+       "1           1  Strona:Stanisław Antoni Wotowski - George Sand...   \n",
+       "2           2  Strona:Stanisław Antoni Wotowski - George Sand...   \n",
+       "3           3  Strona:Stanisław Antoni Wotowski - George Sand...   \n",
+       "4           4  Strona:Stanisław Antoni Wotowski - George Sand...   \n",
+       "\n",
+       "                                                href  \\\n",
+       "0  https://pl.wikisource.org//wiki/Strona:Stanis%...   \n",
+       "1  https://pl.wikisource.org//wiki/Strona:Stanis%...   \n",
+       "2  https://pl.wikisource.org//wiki/Strona:Stanis%...   \n",
+       "3  https://pl.wikisource.org//wiki/Strona:Stanis%...   \n",
+       "4  https://pl.wikisource.org//wiki/Strona:Stanis%...   \n",
+       "\n",
+       "                                           image_url  \\\n",
+       "0  //upload.wikimedia.org/wikipedia/commons/thumb...   \n",
+       "1  //upload.wikimedia.org/wikipedia/commons/thumb...   \n",
+       "2  //upload.wikimedia.org/wikipedia/commons/thumb...   \n",
+       "3  //upload.wikimedia.org/wikipedia/commons/thumb...   \n",
+       "4  //upload.wikimedia.org/wikipedia/commons/thumb...   \n",
+       "\n",
+       "                                                text  \n",
+       "0  zmieniła się; piękne oczy są tak samo błyszczą...  \n",
+       "1  najświetniejszej chociażby sławy... i po piętn...  \n",
+       "2   Chopin gra. Ledwie dostrzegalnie muskają smuk...  \n",
+       "3  \\nDZIWACZNE MAŁŻEŃSTWO.\\n\\n Był grudzień 1830 ...  \n",
+       "4   Ale bliższego związku z panią Sand jakby się ...  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "a.head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "um",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.15"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "876e189cbbe99a9a838ece62aae1013186c4bb7e0254a10cfa2f9b2381853efb"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/join.ipynb b/notebooks/join.ipynb
index f5cb0f1..531e9bb 100644
--- a/notebooks/join.ipynb
+++ b/notebooks/join.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,30 +11,57 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "yellow = pd.read_csv(\"../wikisource-data/yellow.tsv\", sep=\"\\t\")\n",
-    "yellow_c = pd.read_csv(\"../wikisource-data/yellow-continue-yellow.tsv.tsv\", sep=\"\\t\")"
+    "green = pd.read_csv(\"../../wikisource-data/green.tsv\", sep=\"\\t\")\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "whole = pd.concat([yellow, yellow_c], axis=0)\n"
+    "green.tail()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "whole.to_csv(\"./yellow-full.tsv\", sep=\"\\t\")"
+    "green = pd.read_csv(\"../green-full.tsv\", sep=\"\\t\")\n",
+    "yellow = pd.read_csv(\"../yellow-full.tsv\", sep=\"\\t\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "whole = pd.concat([green, yellow], axis=0)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(whole)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "whole.to_csv(\"./wikisource-full.tsv\", sep=\"\\t\")"
    ]
   }
  ],