Compare commits

..

No commits in common. "master" and "data" have entirely different histories.
master ... data

12 changed files with 13393 additions and 711 deletions

6
.gitignore vendored
View File

@ -1,6 +0,0 @@
env
*.out
images*
*.tsv
env_wikicrawler
temp_images

View File

@ -1,13 +0,0 @@
# Wikisource crawler and image downloader
## Requirements:
Python >= 3.8
## Install/setup:
`pip install -r requirements.txt`
## Usage crawler
`python crawler.py --type {green or yellow or red} --output_file_name {output tsv file name} --start_file_name {name of file to start crawling from} --start_page_number {page of file to start crawling}`
## Usage image downloader
`python image_download.py --file_path {tsv file with data to download} --output_folder {folder to output images -> default images} --max_folder_size_mb {size in MB to stop, if not given will download all} --from_checkpoint {True to start from checkpoint if pickle available}`

View File

@ -343,7 +343,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:41:22) [MSC v.1929 64 bit (AMD64)]"
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {

View File

@ -6,43 +6,27 @@ from tqdm import tqdm
import time
import argparse
def main():
MAIN_URL = "https://pl.wikisource.org/"
URL_YELLOW = "https://pl.wikisource.org/wiki/Kategoria:Skorygowana"
URL_GREEN = "https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona"
def get_page_data(page_element):
time.sleep(0.5)
doc = requests.get(MAIN_URL + page_element['href'])
doc_soup = BeautifulSoup(doc.text, 'lxml')
text = doc_soup.find("div", {"class": "pagetext"}).next_element
image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text.text}
doc_soup = BeautifulSoup(doc.text, 'lxml', from_encoding="utf-8")
text_elem = doc_soup.find("div", {"class": "pagetext"}).next_element
text = text_elem.text
image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text,}
def save_data(file_name, data, args):
    """Write the collected records to ``./<file_name>.tsv`` as tab-separated values.

    Nothing is written when ``args.testing`` is truthy.

    Args:
        file_name: Output path without the ``.tsv`` extension.
        data: Records accepted by ``pd.DataFrame``.
        args: Parsed CLI namespace; only ``args.testing`` is read.
    """
    if args.testing:
        return
    target_path = f"./{file_name}.tsv"
    frame = pd.DataFrame(data)
    frame.to_csv(target_path, sep="\t")
def main(args):
category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
if args.start_file_name and args.start_page_number:
CATEGORY_URL = f"{MAIN_URL}/w/index.php?title=Kategoria:{category_dict[args.type]}&pagefrom={args.start_file_name}"
else:
CATEGORY_URL = f"{MAIN_URL}/wiki/Kategoria:{category_dict[args.type]}"
r = requests.get(CATEGORY_URL)
r = requests.get(URL_GREEN)
soup = BeautifulSoup(r.text, 'lxml')
page_number = 1 if not args.start_page_number else args.start_page_number
data_number = 0 if not args.start_page_number else args.start_page_number * 200
page_number = 1
result_list = []
max_len = int("".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]))
max_len = "".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:])
try:
with tqdm(total=max_len) as pbar:
if args.start_page_number:
pbar.update(data_number)
pbar.refresh()
while data_number < max_len:
pbar.set_description(f"Page number: {page_number}")
while True:
time.sleep(5)
next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None)
if next_page and page_number != 1:
@ -52,33 +36,24 @@ def main(args):
if r.status_code != 200:
print(r.__dict__)
time.sleep(60)
time.sleep(10)
r = requests.get(MAIN_URL + next_page)
if r.status_code != 200:
break
soup = BeautifulSoup(r.text, 'lxml')
links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
page_number += 1
for link in links:
links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
for link in tqdm(links):
result_list.append(get_page_data(link))
data_number += 1
pbar.update(1)
print("Page number:", page_number)
print("Number of elements:", 200 * page_number, "/", max_len)
except Exception as e:
print("Error:", e)
save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
except KeyboardInterrupt:
save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
print(e)
df = pd.DataFrame(result_list)
df.to_csv("./green.tsv", sep="\t")
df = pd.DataFrame(result_list)
df.to_csv("./yellow.tsv", sep="\t")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--wiki_type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
parser.add_argument("--output_file_name", type=str, required=True)
parser.add_argument("--start_file_name", type=str, required=False)
parser.add_argument("--start_page_number", type=int, required=False)
parser.add_argument("--testing", type=bool, required=False, default=False)
args, left_argv = parser.parse_known_args()
print(type(vars(args)))
# main(args)
main()

View File

@ -1,115 +0,0 @@
from bs4 import BeautifulSoup
import requests
import time
import os
from tqdm import tqdm
import csv
from collections import deque
import argparse
import re
import pandas as pd
MAIN_URL = "https://pl.wikisource.org/"
class WikiCrawler:
    """Crawl a pl.wikisource.org proofreading category into a TSV file.

    Every ``Strona:`` page linked from the category listing is fetched and
    appended to ``output_file_name`` as one tab-separated row (no header row):
    title, href, image_url, text, index, page_number. The last row of an
    existing output file is used as a checkpoint to resume the crawl.
    """

    def __init__(self, wiki_type: str, output_file_name: str):
        """Store the settings and restore progress from an existing output file.

        Args:
            wiki_type: One of "green", "yellow" or "red" (see ``_init_crawl``).
            output_file_name: Path of the TSV file that rows are appended to.
        """
        self.wiki_type = wiki_type
        self.output_file_name = output_file_name
        self.page_number = 1
        self.index = 1
        self.load_last_checkpoint()

    def load_last_checkpoint(self):
        """Restore ``start_file_name``, ``page_number`` and ``index`` from the last saved row.

        Leaves the defaults untouched when the output file is missing or
        contains no rows.
        """
        self.start_file_name = None
        has_content = (
            os.path.exists(self.output_file_name)
            and os.path.getsize(self.output_file_name) > 0
        )
        if has_content:
            try:
                # The file is written without a header row, so it must be read
                # with header=None; otherwise the first data row is consumed as
                # column names and a one-row file looks empty.
                df = pd.read_csv(self.output_file_name, encoding='utf-8', sep='\t', header=None)
            except pd.errors.EmptyDataError:
                # File holds only blank lines: nothing to resume from.
                df = None
            if df is not None and len(df) > 0:
                last_line = df.iloc[-1]
                # Explicit positional access: first column is the title, the
                # last two columns are index and page_number.
                self.start_file_name = last_line.iloc[0]
                self.page_number = int(last_line.iloc[-1])
                self.index = int(last_line.iloc[-2])
            del df
        print(f"Starting from index: {self.index}, page: {self.page_number}")

    def _init_crawl(self):
        """Request the first category listing page and set up the progress bar.

        Returns:
            Tuple ``(soup, request)``: the parsed listing page and its response.

        Raises:
            AssertionError: If the listing request does not return HTTP 200.
        """
        category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
        category_title = f"Kategoria:{category_dict[self.wiki_type]}"
        if self.start_file_name:
            # Resume from the checkpointed title. Passing it through `params`
            # percent-encodes characters such as '&' or '#' that would
            # otherwise cut the query string short.
            request = requests.get(
                f"{MAIN_URL}/w/index.php",
                params={"title": category_title, "pagefrom": self.start_file_name},
            )
        else:
            request = requests.get(f"{MAIN_URL}/wiki/{category_title}")
        assert request.status_code == 200, f"Status diffrent on main request, status: {request.status_code}"
        soup = BeautifulSoup(request.text, 'lxml')
        # Collect every digit of the summary paragraph and drop the first three;
        # presumably those are the per-page count shown before the category
        # total -- TODO confirm against the live page layout.
        summary = re.sub("\xa0", '', soup.find("div", {"id": "mw-pages"}).find("p").text)
        self.max_len = int("".join(re.findall(r"\d", summary)[3:]))
        self.pbar = tqdm(total=self.max_len)
        if self.start_file_name:
            self.pbar.update(self.index)
            self.pbar.refresh()
        return soup, request

    def save_page_data(self, page_element):
        """Fetch one page and append its title, URLs and text to the output file.

        Args:
            page_element: Link tag of the page; its ``href`` and ``title``
                attributes are read.

        Raises:
            AssertionError: If the page request does not return HTTP 200.
        """
        time.sleep(0.3)
        page_url = MAIN_URL + page_element['href']
        doc_request = requests.get(page_url)
        assert doc_request.status_code == 200, f"Wrong status on requesting doc link: {page_url}"
        doc_soup = BeautifulSoup(doc_request.text, 'lxml')
        text = doc_soup.find("div", {"class": "pagetext"}).next_element
        image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
        row_dict = {
            "title": page_element['title'],
            "href": page_url,
            "image_url": image_url,
            "text": text.text,
            "index": self.index,
            "page_number": self.page_number
        }
        # Explicit UTF-8 so the Polish text is written identically on every
        # platform and matches the encoding used when the checkpoint is read.
        with open(self.output_file_name, 'a', newline='', encoding='utf-8') as output_csv:
            writer = csv.DictWriter(output_csv, fieldnames=list(row_dict), delimiter='\t')
            writer.writerow(row_dict)

    def crawl(self):
        """Walk the category listing page by page and save every linked page.

        Stops when ``index`` reaches ``max_len`` or when no "następna strona"
        link is found on the current listing page.

        Raises:
            AssertionError: If a listing page still fails after one retry.
        """
        soup, r = self._init_crawl()
        first_search = True
        next_page_pattern = re.compile(r"\/w\/index.php.*")
        page_link_pattern = re.compile(r"\/wiki\/Strona:.*")
        while self.index < self.max_len:
            time.sleep(0.3)
            self.pbar.set_description(f"Page number: {self.page_number}")
            next_page = soup.find("a", {"href": next_page_pattern}, string="następna strona")
            if next_page:
                next_page = next_page.get('href', None)
            if next_page and not first_search:
                r = requests.get(MAIN_URL + next_page)
            elif not next_page:
                print(soup)
                print("\n\n\n", soup.text)
                print("End of pages, or next page not found")
                break

            # handle wrong request: wait a minute and retry exactly once
            if r.status_code != 200:
                print('Retry of request, request data: ', r.__dict__)
                time.sleep(60)
                r = requests.get(MAIN_URL + next_page)
                assert r.status_code == 200, f"Retry failed, request status: {r.status_code}, full_info: {r.__dict__}"

            soup = BeautifulSoup(r.text, 'lxml')
            links = soup.find_all("a", {"href": page_link_pattern})
            for link in links:
                self.save_page_data(link)
                self.index += 1
                self.pbar.update(1)
            self.page_number += 1
            # The first pass processes the listing returned by _init_crawl;
            # later passes follow the "next page" link.
            first_search = False
        print("Finished")
if __name__ == '__main__':
    # Command-line entry point: read the category colour and the output path,
    # then run the crawler. Unrecognised arguments are tolerated and ignored.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wiki_type",
        type=str,
        choices=["green", "yellow", "red"],
        required=True,
    )
    parser.add_argument("--output_file_name", type=str, required=True)
    args, left_argv = parser.parse_known_args()

    crawler = WikiCrawler(
        wiki_type=args.wiki_type,
        output_file_name=args.output_file_name,
    )
    crawler.crawl()

View File

@ -1,91 +0,0 @@
import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
import json
from datasets import load_dataset
from huggingface_hub import login
import shutil
headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (no.rp.mk.info@gmail.com) requests/2.28.1'}
class WikiImage:
    """Download page scans listed in a TSV file and push them to the Hugging Face Hub.

    Images are staged in ``output_folder`` next to a ``metadata.jsonl`` file.
    After every ``max_dataset_len`` rows the folder is loaded as a dataset,
    pushed as split ``train_<n>`` and emptied; whatever is staged when the
    input is exhausted is pushed as the ``validation`` split.
    """

    # Local Hugging Face datasets cache that is wiped around each upload.
    # Kept in one place so it can be overridden on the class or an instance.
    HF_CACHE_DIR = '/home/zombely/.cache/huggingface/datasets'

    def __init__(self, input_file_path: str, dataset_name: str, output_folder: str = 'temp_images', split_number: int = 1):
        """Load the input rows, reset the staging folder and log in to the Hub.

        Args:
            input_file_path: Tab-separated file with ``title``, ``image_url``
                and ``text`` columns.
            dataset_name: Repository name under the ``Zombely`` namespace.
            output_folder: Staging folder; it is deleted and recreated.
            split_number: 1-based split to start from; rows belonging to
                earlier splits are skipped.
        """
        self.input_file_path = input_file_path
        self.split_number = split_number
        self.max_dataset_len = 10000
        self.output_folder = output_folder
        self.dataset_name = dataset_name
        print("Loading input file")
        self.dataframe = pd.read_csv(self.input_file_path, sep='\t')[(self.split_number - 1) * self.max_dataset_len:]
        if os.path.exists(self.output_folder):
            print("Removing old dear")
            self._clear_hf_cache()
            shutil.rmtree(self.output_folder)
        os.mkdir(self.output_folder)
        self.pbar = tqdm(self.dataframe.iterrows(), total=len(self.dataframe), desc=f"Split: {self.split_number}")
        login(os.environ.get("HUG_TOKEN"), True)

    def _clear_hf_cache(self):
        """Delete the local datasets cache directory if it exists."""
        if os.path.exists(self.HF_CACHE_DIR):
            shutil.rmtree(self.HF_CACHE_DIR)

    def image_save(self, row):
        """Download one image, store it as PNG and append its metadata line.

        Rows whose image answers 404 or 500 are skipped with a message. Any
        other non-200 answer is retried once after 80 seconds.

        Args:
            row: ``(index, Series)`` pair as produced by ``DataFrame.iterrows``.

        Raises:
            AssertionError: If the retry does not return HTTP 200 either.
        """
        time.sleep(0.3)
        image_url = f"https:{row[1]['image_url']}"
        image_request = requests.get(image_url, stream=True, headers=headers)
        try:
            if image_request.status_code in [500, 404]:
                print(f"Image {row[1]['title']} is not reacheable")
                return
            if image_request.status_code != 200:
                # Release the failed streamed connection before waiting.
                image_request.close()
                time.sleep(80)
                image_request = requests.get(image_url, stream=True, headers=headers)
                assert image_request.status_code == 200, f"Response status is diffrent, status_code: {image_request.status_code}, full info: {image_request.__dict__}"
            image = Image.open(image_request.raw)
            if image.mode != "RGB":
                image = image.convert("RGB")
            title = row[1]['title'].replace("Strona:", "").replace("/", "-")
            image.save(f"{self.output_folder}/{title}.png")
        finally:
            # stream=True keeps the connection open until it is closed explicitly.
            image_request.close()

        with open(f"{self.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
            # ground_truth is itself a JSON string nested inside the JSON line.
            json.dump({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text'].replace('"', "'")}}, ensure_ascii=False)}, f, ensure_ascii=False)
            f.write("\n")

    def push_dataset(self, split_name: str):
        """Upload the staged folder as ``split_name`` and reset the staging area.

        Args:
            split_name: Name given to the split that ``load_dataset`` returns
                as ``train``.
        """
        print(f"Pushing split: {split_name}")
        dataset = load_dataset(self.output_folder)
        dataset[split_name] = dataset.pop('train')
        dataset.push_to_hub(f'Zombely/{self.dataset_name}')
        shutil.rmtree(self.output_folder)
        # Guarded removal: the cache directory may not exist on this machine.
        self._clear_hf_cache()
        os.mkdir(self.output_folder)
        del dataset
        print("Upload finished")

    def crawl(self):
        """Download every row and push a split after each ``max_dataset_len`` rows."""
        print("Start download")
        for index, row in enumerate(self.pbar):
            self.image_save(row)
            if (index + 1) % self.max_dataset_len == 0:
                self.push_dataset(f'train_{self.split_number}')
                self.split_number += 1
                self.pbar.set_description(f'Split: {self.split_number}')

        self.push_dataset('validation')
if __name__ == "__main__":
    # Command-line entry point: collect the uploader settings and start the
    # download/upload loop. Unrecognised arguments are tolerated and ignored.
    parser = argparse.ArgumentParser()
    for flag, options in (
        ("--input_file_path", {"type": str, "required": True}),
        ("--dataset_name", {"type": str, "required": True}),
        ("--output_folder", {"type": str, "required": False, "default": 'temp_images'}),
        ("--split_number", {"type": int, "required": False, "default": 1}),
    ):
        parser.add_argument(flag, **options)
    args, left_argv = parser.parse_known_args()

    crawler = WikiImage(
        input_file_path=args.input_file_path,
        dataset_name=args.dataset_name,
        output_folder=args.output_folder,
        split_number=args.split_number,
    )
    crawler.crawl()

View File

@ -1,79 +0,0 @@
import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
import json
headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'}
def save_state(index, offset):
    """Persist the absolute row position to ``./state.pickle`` and report it.

    Args:
        index: Position inside the current run.
        offset: Number of rows skipped before the current run started.
    """
    absolute_row = index + offset
    checkpoint = {"row_index": absolute_row}
    with open("./state.pickle", "wb") as state_file:
        pickle.dump(checkpoint, state_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("Saving state, index: ", absolute_row)
def main(args):
    """Download every image listed in a TSV file into ``args.output_folder``.

    Each image is stored as PNG and described by one JSON line appended to
    ``metadata.jsonl`` in the same folder. Progress is checkpointed to
    ``./state.pickle`` when the run stops early (size limit reached, request
    failure, any exception or Ctrl+C).

    Args:
        args: Parsed CLI namespace with ``file_path``, ``output_folder``,
            ``max_folder_size_mb`` and ``from_checkpoint``.
    """
    df = pd.read_csv(args.file_path, sep="\t")
    offset = 0
    if not os.path.exists(args.output_folder):
        print(f"Creating missing folder: {args.output_folder}")
        os.mkdir(args.output_folder)

    if args.from_checkpoint and os.path.exists("./state.pickle"):
        # NOTE(review): pickle.load is only safe because this file is written
        # by save_state; never point it at a file from an untrusted source.
        with open("state.pickle", "rb") as state:
            state_dict = pickle.load(state)
        offset = state_dict["row_index"]
        print("Starting from checkpoint, index: ", offset)
        df = df[offset:]

    size_limit = args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'
    pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{size_limit} MB")
    for n, row in enumerate(pbar):
        try:
            time.sleep(0.2)
            image_url = f"https:{row[1]['image_url']}"
            r = requests.get(image_url, stream=True, headers=headers)
            if r.status_code != 200:
                # Release the failed streamed connection, wait and retry once.
                r.close()
                time.sleep(80)
                r = requests.get(image_url, stream=True, headers=headers)
                if r.status_code != 200:
                    pprint(r.__dict__)
                    r.close()
                    save_state(n, offset)
                    return
            image = Image.open(r.raw)
            if image.mode != "RGB":
                image = image.convert("RGB")
            title = row[1]['title'].replace("Strona:", "").replace("/", "-")
            image.save(f"{args.output_folder}/{title}.png")
            # stream=True keeps the connection open until it is closed explicitly.
            r.close()

            with open(f"{args.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
                # Serialise with json.dumps: str(dict) emits a Python repr with
                # single-quoted keys, which JSON Lines readers reject.
                f.write(json.dumps({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False)}, ensure_ascii=False) + "\n")

            # Join against the folder itself so absolute output paths work too.
            dir_size = round(sum(os.path.getsize(os.path.join(args.output_folder, file)) for file in os.listdir(args.output_folder)) * 0.000001, 2)

            pbar.set_description(f"{dir_size}/{size_limit} MB")

            if args.max_folder_size_mb and dir_size > args.max_folder_size_mb:
                print(f"Limit size of: {args.max_folder_size_mb}, exceeded")
                # Row n is already stored, so the next run must start after it.
                save_state(n + 1, offset)
                return

        except (Exception, KeyboardInterrupt) as e:
            print(f"Error: {str(e)} \n")
            print(f"Row: {row}")
            # The failing row is saved as the resume point so it is retried.
            save_state(n, offset)
            return
if __name__ == "__main__":
    # Command-line entry point for the image downloader.
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, required=True)
    parser.add_argument("--output_folder", type=str, default="./images")
    parser.add_argument("--max_folder_size_mb", type=float, required=False)
    # type=bool would turn every non-empty string, including "False", into
    # True; parse the text explicitly so "--from_checkpoint False" disables it.
    parser.add_argument(
        "--from_checkpoint",
        type=lambda value: value.strip().lower() in ("1", "true", "yes", "y"),
        required=False,
        default=False,
    )
    args, left_argv = parser.parse_known_args()
    main(args)

View File

@ -1,8 +0,0 @@
import smtplib
def main():
    """Send a fixed test message through the SMTP server at 0.0.0.0:25.

    'mail' is passed as the local hostname announced to the server.
    """
    # The context manager closes the connection even if sendmail raises;
    # the original left it open.
    with smtplib.SMTP("0.0.0.0", 25, 'mail') as smtp:
        smtp.sendmail('info@zbhome.com', ['michalkozlowski936@gmail.com'], "Hello from zbhome")


if __name__ == "__main__":
    main()

View File

@ -1,238 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"a = pd.read_csv(\"../../wikisource-data/yellow-continue-yellow.tsv.tsv\", sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>title</th>\n",
" <th>href</th>\n",
" <th>image_url</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>zmieniła się; piękne oczy są tak samo błyszczą...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>najświetniejszej chociażby sławy... i po piętn...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>Chopin gra. Ledwie dostrzegalnie muskają smuk...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>\\nDZIWACZNE MAŁŻEŃSTWO.\\n\\nBył grudzień 1830 ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>Ale bliższego związku z panią Sand jakby się ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 title \\\n",
"0 0 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"1 1 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"2 2 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"3 3 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"4 4 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"\n",
" href \\\n",
"0 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"1 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"2 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"3 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"4 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"\n",
" image_url \\\n",
"0 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"1 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"2 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"3 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"4 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"\n",
" text \n",
"0 zmieniła się; piękne oczy są tak samo błyszczą... \n",
"1 najświetniejszej chociażby sławy... i po piętn... \n",
"2 Chopin gra. Ledwie dostrzegalnie muskają smuk... \n",
"3 \\nDZIWACZNE MAŁŻEŃSTWO.\\n\\nBył grudzień 1830 ... \n",
"4 Ale bliższego związku z panią Sand jakby się ... "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from huggingface_hub import login"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Resolving data files: 100%|██████████| 29/29 [00:00<?, ?it/s]\n",
"Using custom data configuration images-8b1ad802b6988161\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading and preparing dataset imagefolder/images to C:/Users/PC/.cache/huggingface/datasets/imagefolder/images-8b1ad802b6988161/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading data files: 0it [00:00, ?it/s]\n",
"Extracting data files: 0it [00:00, ?it/s]\n"
]
},
{
"ename": "ArrowInvalid",
"evalue": "JSON parse error: Missing a name for object member. in row 0",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mArrowInvalid\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m dataset \u001b[39m=\u001b[39m load_dataset(\u001b[39m\"\u001b[39;49m\u001b[39m../images\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\load.py:1741\u001b[0m, in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)\u001b[0m\n\u001b[0;32m 1738\u001b[0m try_from_hf_gcs \u001b[39m=\u001b[39m path \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[0;32m 1740\u001b[0m \u001b[39m# Download and prepare data\u001b[39;00m\n\u001b[1;32m-> 1741\u001b[0m builder_instance\u001b[39m.\u001b[39;49mdownload_and_prepare(\n\u001b[0;32m 1742\u001b[0m download_config\u001b[39m=\u001b[39;49mdownload_config,\n\u001b[0;32m 1743\u001b[0m download_mode\u001b[39m=\u001b[39;49mdownload_mode,\n\u001b[0;32m 1744\u001b[0m ignore_verifications\u001b[39m=\u001b[39;49mignore_verifications,\n\u001b[0;32m 1745\u001b[0m try_from_hf_gcs\u001b[39m=\u001b[39;49mtry_from_hf_gcs,\n\u001b[0;32m 1746\u001b[0m use_auth_token\u001b[39m=\u001b[39;49muse_auth_token,\n\u001b[0;32m 1747\u001b[0m num_proc\u001b[39m=\u001b[39;49mnum_proc,\n\u001b[0;32m 1748\u001b[0m )\n\u001b[0;32m 1750\u001b[0m \u001b[39m# Build dataset for splits\u001b[39;00m\n\u001b[0;32m 1751\u001b[0m keep_in_memory \u001b[39m=\u001b[39m (\n\u001b[0;32m 1752\u001b[0m keep_in_memory \u001b[39mif\u001b[39;00m keep_in_memory \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m is_small_dataset(builder_instance\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size)\n\u001b[0;32m 1753\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:822\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m 820\u001b[0m \u001b[39mif\u001b[39;00m num_proc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 821\u001b[0m prepare_split_kwargs[\u001b[39m\"\u001b[39m\u001b[39mnum_proc\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m num_proc\n\u001b[1;32m--> 822\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 823\u001b[0m dl_manager\u001b[39m=\u001b[39mdl_manager,\n\u001b[0;32m 824\u001b[0m verify_infos\u001b[39m=\u001b[39mverify_infos,\n\u001b[0;32m 825\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_split_kwargs,\n\u001b[0;32m 826\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdownload_and_prepare_kwargs,\n\u001b[0;32m 827\u001b[0m )\n\u001b[0;32m 828\u001b[0m \u001b[39m# Sync info\u001b[39;00m\n\u001b[0;32m 829\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size \u001b[39m=\u001b[39m \u001b[39msum\u001b[39m(split\u001b[39m.\u001b[39mnum_bytes \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39msplits\u001b[39m.\u001b[39mvalues())\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:1555\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m 1554\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_download_and_prepare\u001b[39m(\u001b[39mself\u001b[39m, dl_manager, verify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs):\n\u001b[1;32m-> 1555\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 1556\u001b[0m dl_manager, verify_infos, check_duplicate_keys\u001b[39m=\u001b[39mverify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs\n\u001b[0;32m 1557\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:891\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m 889\u001b[0m split_dict \u001b[39m=\u001b[39m SplitDict(dataset_name\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mname)\n\u001b[0;32m 890\u001b[0m split_generators_kwargs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_make_split_generators_kwargs(prepare_split_kwargs)\n\u001b[1;32m--> 891\u001b[0m split_generators \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_split_generators(dl_manager, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39msplit_generators_kwargs)\n\u001b[0;32m 893\u001b[0m \u001b[39m# Checksums verification\u001b[39;00m\n\u001b[0;32m 894\u001b[0m \u001b[39mif\u001b[39;00m verify_infos \u001b[39mand\u001b[39;00m dl_manager\u001b[39m.\u001b[39mrecord_checksums:\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:189\u001b[0m, in \u001b[0;36mFolderBasedBuilder._split_generators\u001b[1;34m(self, dl_manager)\u001b[0m\n\u001b[0;32m 186\u001b[0m metadata_ext \u001b[39m=\u001b[39m metadata_ext\u001b[39m.\u001b[39mpop()\n\u001b[0;32m 188\u001b[0m \u001b[39mfor\u001b[39;00m _, downloaded_metadata_file \u001b[39min\u001b[39;00m itertools\u001b[39m.\u001b[39mchain\u001b[39m.\u001b[39mfrom_iterable(metadata_files\u001b[39m.\u001b[39mvalues()):\n\u001b[1;32m--> 189\u001b[0m pa_metadata_table \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_metadata(downloaded_metadata_file)\n\u001b[0;32m 190\u001b[0m features_per_metadata_file\u001b[39m.\u001b[39mappend(\n\u001b[0;32m 191\u001b[0m (downloaded_metadata_file, datasets\u001b[39m.\u001b[39mFeatures\u001b[39m.\u001b[39mfrom_arrow_schema(pa_metadata_table\u001b[39m.\u001b[39mschema))\n\u001b[0;32m 192\u001b[0m )\n\u001b[0;32m 193\u001b[0m \u001b[39mfor\u001b[39;00m downloaded_metadata_file, metadata_features \u001b[39min\u001b[39;00m features_per_metadata_file:\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:260\u001b[0m, in \u001b[0;36mFolderBasedBuilder._read_metadata\u001b[1;34m(self, metadata_file)\u001b[0m\n\u001b[0;32m 258\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 259\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(metadata_file, \u001b[39m\"\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m--> 260\u001b[0m \u001b[39mreturn\u001b[39;00m paj\u001b[39m.\u001b[39;49mread_json(f)\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\_json.pyx:259\u001b[0m, in \u001b[0;36mpyarrow._json.read_json\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:144\u001b[0m, in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:100\u001b[0m, in \u001b[0;36mpyarrow.lib.check_status\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mArrowInvalid\u001b[0m: JSON parse error: Missing a name for object member. in row 0"
]
}
],
"source": [
"dataset = load_dataset(\"../images\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"login('',True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "um",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "876e189cbbe99a9a838ece62aae1013186c4bb7e0254a10cfa2f9b2381853efb"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,95 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"green = pd.read_csv(\"../../wikisource-data/green.tsv\", sep=\"\\t\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"green.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"green = pd.read_csv(\"../green-full.tsv\", sep=\"\\t\")\n",
"yellow = pd.read_csv(\"../yellow-full.tsv\", sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"whole = pd.concat([green, yellow], axis=0)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(whole)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"whole.to_csv(\"./wikisource-full.tsv\", sep=\"\\t\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "um",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "876e189cbbe99a9a838ece62aae1013186c4bb7e0254a10cfa2f9b2381853efb"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -5,7 +5,6 @@ idna==3.4
lxml==4.9.2
numpy==1.24.1
pandas==1.5.2
Pillow==9.4.0
python-dateutil==2.8.2
pytz==2022.7
requests==2.28.1

13353
yellow.tsv Normal file

File diff suppressed because it is too large Load Diff