Compare commits

..

25 Commits
data ... master

Author SHA1 Message Date
zzombely
da0e4f3263 crawler classes and image downloader 2023-03-12 15:57:35 +00:00
Michał Kozłowski
680c3d000c fixing 2023-01-10 22:57:22 +01:00
Michał Kozłowski
fe144a70b9 readme and update for mb lock 2023-01-10 19:05:56 +01:00
Michał Kozłowski
fe6f47122a gitignore update 2023-01-07 15:48:45 +01:00
Michał Kozłowski
2f648da50d notebooks folder 2023-01-07 15:47:02 +01:00
Michał Kozłowski
ea2ecf454c Merge branch 'master' of https://git.wmi.amu.edu.pl/s444415/wikisource-crawler 2023-01-07 15:45:16 +01:00
Michał Kozłowski
9ccb251801 output_folder fix 2023-01-07 15:45:14 +01:00
162aae6958 requirements update 2023-01-07 14:41:47 +00:00
Michał Kozłowski
61f83dd7e4 gitignore 2023-01-07 15:31:51 +01:00
Michał Kozłowski
9088a3dc10 fix in notebooks plus join 2023-01-07 15:21:57 +01:00
Michał Kozłowski
1f08432fbf crawler fix 2023-01-07 14:59:07 +01:00
Michał Kozłowski
9ee285cf95 fix 2023-01-07 14:41:26 +01:00
Michał Kozłowski
0f3a6eb9da qol update 2023-01-07 14:39:53 +01:00
Michał Kozłowski
bf6014ba98 header update 2023-01-07 14:32:45 +01:00
Michał Kozłowski
5c3611f972 gitignore 2023-01-07 14:23:59 +01:00
Michał Kozłowski
cc76e894fd headers fix 2023-01-07 14:23:16 +01:00
Michał Kozłowski
f55125f447 image download script 2023-01-07 14:19:01 +01:00
Michał Kozłowski
526eb7d5e4 fix save 2023-01-07 11:42:54 +01:00
Michał Kozłowski
68231dce0a counter update 2023-01-07 11:39:22 +01:00
Michał Kozłowski
6d7c08a9af fix 2023-01-07 11:37:32 +01:00
Michał Kozłowski
0a39807fec crawler update with continuation, files removed 2023-01-07 11:34:27 +01:00
683bc9e6fc green partial done 2023-01-07 09:12:06 +00:00
a16d660cf7 yellow data, crawler fixes, .gitignore added out files 2023-01-04 20:32:26 +00:00
Michał Kozłowski
f7ff1acec3 gitignore 2023-01-03 16:13:33 +01:00
mkozlowskiAzimuthe
b904e64f01 update of crawler 2023-01-03 16:11:50 +01:00
12 changed files with 711 additions and 13393 deletions

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
env
*.out
images*
*.tsv
env_wikicrawler
temp_images

13
README.md Normal file
View File

@ -0,0 +1,13 @@
# Wikisource crawler and image downloader
## Requirements:
Python 3.8>
## Install/setup:
`pip install -r requirements.txt`
## Usage crawler
`python crawler.py --type {green or yellow or red} --output_file_name {output tsv file name} --start_file_name {name of file to start crawling from} --start_page_number {page of file to start crawling}`
## Usage image downloader
`python image_download.py --file_path {tsv file with data to download} --output_folder {folder to output images -> default images} --max_folder_size_mb {size in MB to stop, if not given will download all} --from_checkpoint {True to start from checkpoint if pickle available}`

View File

@ -6,54 +6,79 @@ from tqdm import tqdm
import time import time
import argparse import argparse
def main(): MAIN_URL = "https://pl.wikisource.org/"
MAIN_URL = "https://pl.wikisource.org/"
URL_YELLOW = "https://pl.wikisource.org/wiki/Kategoria:Skorygowana"
URL_GREEN = "https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona"
def get_page_data(page_element): def get_page_data(page_element):
time.sleep(0.5) time.sleep(0.5)
doc = requests.get(MAIN_URL + page_element['href']) doc = requests.get(MAIN_URL + page_element['href'])
doc_soup = BeautifulSoup(doc.text, 'lxml', from_encoding="utf-8") doc_soup = BeautifulSoup(doc.text, 'lxml')
text_elem = doc_soup.find("div", {"class": "pagetext"}).next_element text = doc_soup.find("div", {"class": "pagetext"}).next_element
text = text_elem.text image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src'] return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text.text}
return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text,}
r = requests.get(URL_GREEN) def save_data(file_name, data, args):
if not args.testing:
df = pd.DataFrame(data)
df.to_csv(f"./{file_name}.tsv", sep="\t")
def main(args):
category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
if args.start_file_name and args.start_page_number:
CATEGORY_URL = f"{MAIN_URL}/w/index.php?title=Kategoria:{category_dict[args.type]}&pagefrom={args.start_file_name}"
else:
CATEGORY_URL = f"{MAIN_URL}/wiki/Kategoria:{category_dict[args.type]}"
r = requests.get(CATEGORY_URL)
soup = BeautifulSoup(r.text, 'lxml') soup = BeautifulSoup(r.text, 'lxml')
page_number = 1 page_number = 1 if not args.start_page_number else args.start_page_number
data_number = 0 if not args.start_page_number else args.start_page_number * 200
result_list = [] result_list = []
max_len = "".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]) max_len = int("".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]))
try: try:
while True: with tqdm(total=max_len) as pbar:
time.sleep(5) if args.start_page_number:
next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None) pbar.update(data_number)
if next_page and page_number != 1: pbar.refresh()
r = requests.get(MAIN_URL + next_page)
elif not next_page:
break
if r.status_code != 200: while data_number < max_len:
print(r.__dict__) pbar.set_description(f"Page number: {page_number}")
time.sleep(10) time.sleep(5)
r = requests.get(MAIN_URL + next_page) next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None)
if r.status_code != 200: if next_page and page_number != 1:
break r = requests.get(MAIN_URL + next_page)
soup = BeautifulSoup(r.text, 'lxml') elif not next_page:
page_number += 1 break
links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
for link in tqdm(links): if r.status_code != 200:
result_list.append(get_page_data(link)) print(r.__dict__)
print("Page number:", page_number) time.sleep(60)
print("Number of elements:", 200 * page_number, "/", max_len) r = requests.get(MAIN_URL + next_page)
if r.status_code != 200:
break
soup = BeautifulSoup(r.text, 'lxml')
links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
page_number += 1
for link in links:
result_list.append(get_page_data(link))
data_number += 1
pbar.update(1)
except Exception as e: except Exception as e:
print(e) print("Error:", e)
df = pd.DataFrame(result_list) save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
df.to_csv("./green.tsv", sep="\t") except KeyboardInterrupt:
save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
df = pd.DataFrame(result_list)
df.to_csv("./yellow.tsv", sep="\t")
if __name__ == "__main__": if __name__ == "__main__":
main() parser = argparse.ArgumentParser()
parser.add_argument("--wiki_type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
parser.add_argument("--output_file_name", type=str, required=True)
parser.add_argument("--start_file_name", type=str, required=False)
parser.add_argument("--start_page_number", type=int, required=False)
parser.add_argument("--testing", type=bool, required=False, default=False)
args, left_argv = parser.parse_known_args()
print(type(vars(args)))
# main(args)

115
crawler_class.py Normal file
View File

@ -0,0 +1,115 @@
from bs4 import BeautifulSoup
import requests
import time
import os
from tqdm import tqdm
import csv
from collections import deque
import argparse
import re
import pandas as pd
MAIN_URL = "https://pl.wikisource.org/"
class WikiCrawler:
def __init__(self, wiki_type: str, output_file_name: str):
self.wiki_type = wiki_type
self.output_file_name = output_file_name
self.page_number = 1
self.index = 1
self.load_last_checkpoint()
def load_last_checkpoint(self):
self.start_file_name = None
if os.path.exists(self.output_file_name):
df = pd.read_csv(self.output_file_name, encoding='utf-8', sep='\t')
last_line = df.tail(1).iloc()[0]
self.start_file_name = last_line[0]
self.page_number = int(last_line[-1])
self.index = int(last_line[-2])
del df
print(f"Starting from index: {self.index}, page: {self.page_number}")
def _init_crawl(self):
category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
CATEGORY_URL = f"{MAIN_URL}/wiki/Kategoria:{category_dict[self.wiki_type]}"
# if should start from other step
if self.start_file_name:
CATEGORY_URL = f"{MAIN_URL}/w/index.php?title=Kategoria:{category_dict[self.wiki_type]}&pagefrom={self.start_file_name}"
request = requests.get(CATEGORY_URL)
assert request.status_code == 200, f"Status diffrent on main request, status: {request.status_code}"
soup = BeautifulSoup(request.text, 'lxml')
self.max_len = int("".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]))
self.pbar = tqdm(total=self.max_len)
if self.start_file_name:
self.pbar.update(self.index)
self.pbar.refresh()
return soup, request
def save_page_data(self, page_element):
time.sleep(0.3)
doc_request = requests.get(MAIN_URL + page_element['href'])
assert doc_request.status_code == 200, f"Wrong status on requesting doc link: {MAIN_URL + page_element['href']}"
doc_soup = BeautifulSoup(doc_request.text, 'lxml')
text = doc_soup.find("div", {"class": "pagetext"}).next_element
image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
with open(self.output_file_name, 'a', newline='') as output_csv:
row_dict = {
"title": page_element['title'],
"href": MAIN_URL + page_element['href'],
"image_url": image_url,
"text": text.text,
"index": self.index,
"page_number": self.page_number
}
fields = list(row_dict.keys())
writer = csv.DictWriter(output_csv, fieldnames=list(row_dict.keys()), delimiter='\t')
writer.writerow(row_dict)
def crawl(self):
soup, r = self._init_crawl()
first_search = True
while self.index < self.max_len:
time.sleep(0.3)
self.pbar.set_description(f"Page number: {self.page_number}")
next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona")
if next_page:
next_page = next_page.get('href', None)
if next_page and not first_search:
r = requests.get(MAIN_URL + next_page)
elif not next_page:
print(soup)
print("\n\n\n", soup.text)
print("End of pages, or next page not found")
break
# handle wrong request
if r.status_code != 200:
print('Retry of request, request data: ', r.__dict__)
time.sleep(60)
r = requests.get(MAIN_URL + next_page)
assert r.status_code == 200, f"Retry failed, request status: {r.status_code}, full_info: {r.__dict__}"
soup = BeautifulSoup(r.text, 'lxml')
links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
for link in links:
self.save_page_data(link)
self.index += 1
self.pbar.update(1)
self.page_number += 1
first_search = False
print("Finished")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--wiki_type", type=str, choices=["green", "yellow", "red"], required=True)
parser.add_argument("--output_file_name", type=str, required=True)
args, left_argv = parser.parse_known_args()
crawler = WikiCrawler(**vars(args))
crawler.crawl()

91
image_class.py Normal file
View File

@ -0,0 +1,91 @@
import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
import json
from datasets import load_dataset
from huggingface_hub import login
import shutil
headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (no.rp.mk.info@gmail.com) requests/2.28.1'}
class WikiImage:
def __init__(self, input_file_path: str, dataset_name: str, output_folder: str = 'temp_images', split_number: int = 1):
self.input_file_path = input_file_path
self.split_number = split_number
self.max_dataset_len = 10000
self.output_folder = output_folder
self.dataset_name = dataset_name
print("Loading input file")
self.dataframe = pd.read_csv(self.input_file_path, sep='\t')[(self.split_number - 1) * self.max_dataset_len:]
if os.path.exists(self.output_folder):
print("Removing old dear")
if os.path.exists('/home/zombely/.cache/huggingface/datasets'):
shutil.rmtree('/home/zombely/.cache/huggingface/datasets')
shutil.rmtree(self.output_folder)
os.mkdir(self.output_folder)
self.pbar = tqdm(self.dataframe.iterrows(), total=len(self.dataframe), desc=f"Split: {self.split_number}")
login(os.environ.get("HUG_TOKEN"), True)
def image_save(self, row):
time.sleep(0.3)
image_request = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
if image_request.status_code in [500, 404]:
print(f"Image {row[1]['title']} is not reacheable")
return
if image_request.status_code != 200:
time.sleep(80)
image_request = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
assert image_request.status_code == 200, f"Response status is diffrent, status_code: {image_request.status_code}, full info: {image_request.__dict__}"
image = Image.open(image_request.raw)
if image.mode != "RGB":
image = image.convert("RGB")
title = row[1]['title'].replace("Strona:", "").replace("/", "-")
image.save(f"{self.output_folder}/{title}.png")
with open(f"{self.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
# f.write(str({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text'].replace('"', "'")}}, ensure_ascii=False)})+"\n")
json.dump({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text'].replace('"', "'")}}, ensure_ascii=False)}, f, ensure_ascii=False)
f.write("\n")
def push_dataset(self, split_name: str):
print(f"Pushing split: {split_name}")
dataset = load_dataset(self.output_folder)
dataset[split_name] = dataset.pop('train')
dataset.push_to_hub(f'Zombely/{self.dataset_name}')
shutil.rmtree(self.output_folder)
shutil.rmtree('/home/zombely/.cache/huggingface/datasets')
os.mkdir(self.output_folder)
del dataset
print("Upload finished")
def crawl(self):
print("Start download")
for index, row in enumerate(self.pbar):
self.image_save(row)
if (index + 1) % self.max_dataset_len == 0:
self.push_dataset(f'train_{self.split_number}')
self.split_number += 1
self.pbar.set_description(f'Split: {self.split_number}')
self.push_dataset('validation')
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--input_file_path", type=str, required=True)
parser.add_argument("--dataset_name", type=str, required=True)
parser.add_argument("--output_folder", type=str, required=False, default='temp_images')
parser.add_argument("--split_number", type=int, required=False, default=1)
args, left_argv = parser.parse_known_args()
crawler = WikiImage(**vars(args))
crawler.crawl()

79
image_download.py Normal file
View File

@ -0,0 +1,79 @@
import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
import json
headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'}
def save_state(index, offset):
with open("./state.pickle", "wb") as state_file:
pickle.dump({"row_index": index+offset}, state_file, protocol=pickle.HIGHEST_PROTOCOL)
print("Saving state, index: ", index+offset)
def main(args):
df = pd.read_csv(args.file_path, sep="\t")
offset = 0
if not os.path.exists(args.output_folder):
print(f"Creating missing folder: {args.output_folder}")
os.mkdir(args.output_folder)
if args.from_checkpoint and os.path.exists("./state.pickle"):
with open("state.pickle", "rb") as state:
state_dict = pickle.load(state)
offset = state_dict["row_index"]
print("Starting from checkpoint, index: ", offset)
df = df[offset:]
pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB")
for n, row in enumerate(pbar):
try:
time.sleep(0.2)
r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
if r.status_code != 200:
time.sleep(80)
r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
if r.status_code != 200:
pprint(r.__dict__)
save_state(n, offset)
return
image = Image.open(r.raw)
if image.mode != "RGB":
image = image.convert("RGB")
title = row[1]['title'].replace("Strona:", "").replace("/", "-")
image.save(f"{args.output_folder}/{title}.png")
with open(f"{args.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
f.write(str({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False)}) + "\n")
dir_size = round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2)
pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB")
if args.max_folder_size_mb and dir_size > args.max_folder_size_mb:
print(f"Limit size of: {args.max_folder_size_mb}, exceeded")
save_state(n, offset)
return
except (Exception, KeyboardInterrupt) as e:
print(f"Error: {str(e)} \n")
print(f"Row: {row}")
save_state(n, offset)
return
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--file_path", type=str, required=True)
parser.add_argument("--output_folder", type=str, default="./images")
parser.add_argument("--max_folder_size_mb", type=float, required=False)
parser.add_argument("--from_checkpoint", type=bool, required=False, default=False)
args, left_argv = parser.parse_known_args()
main(args)

8
mail_test.py Normal file
View File

@ -0,0 +1,8 @@
import smtplib
def main():
smtp = smtplib.SMTP("0.0.0.0", 25, 'mail')
smtp.sendmail('info@zbhome.com', ['michalkozlowski936@gmail.com'], "Hello from zbhome")
if __name__ == "__main__":
main()

View File

@ -343,7 +343,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.15" "version": "3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:41:22) [MSC v.1929 64 bit (AMD64)]"
}, },
"orig_nbformat": 4, "orig_nbformat": 4,
"vscode": { "vscode": {

View File

@ -0,0 +1,238 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"a = pd.read_csv(\"../../wikisource-data/yellow-continue-yellow.tsv.tsv\", sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>title</th>\n",
" <th>href</th>\n",
" <th>image_url</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>zmieniła się; piękne oczy są tak samo błyszczą...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>najświetniejszej chociażby sławy... i po piętn...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>Chopin gra. Ledwie dostrzegalnie muskają smuk...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>\\nDZIWACZNE MAŁŻEŃSTWO.\\n\\nBył grudzień 1830 ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>Ale bliższego związku z panią Sand jakby się ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 title \\\n",
"0 0 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"1 1 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"2 2 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"3 3 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"4 4 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"\n",
" href \\\n",
"0 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"1 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"2 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"3 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"4 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"\n",
" image_url \\\n",
"0 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"1 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"2 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"3 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"4 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"\n",
" text \n",
"0 zmieniła się; piękne oczy są tak samo błyszczą... \n",
"1 najświetniejszej chociażby sławy... i po piętn... \n",
"2 Chopin gra. Ledwie dostrzegalnie muskają smuk... \n",
"3 \\nDZIWACZNE MAŁŻEŃSTWO.\\n\\nBył grudzień 1830 ... \n",
"4 Ale bliższego związku z panią Sand jakby się ... "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from huggingface_hub import login"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Resolving data files: 100%|██████████| 29/29 [00:00<?, ?it/s]\n",
"Using custom data configuration images-8b1ad802b6988161\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading and preparing dataset imagefolder/images to C:/Users/PC/.cache/huggingface/datasets/imagefolder/images-8b1ad802b6988161/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading data files: 0it [00:00, ?it/s]\n",
"Extracting data files: 0it [00:00, ?it/s]\n"
]
},
{
"ename": "ArrowInvalid",
"evalue": "JSON parse error: Missing a name for object member. in row 0",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mArrowInvalid\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m dataset \u001b[39m=\u001b[39m load_dataset(\u001b[39m\"\u001b[39;49m\u001b[39m../images\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\load.py:1741\u001b[0m, in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)\u001b[0m\n\u001b[0;32m 1738\u001b[0m try_from_hf_gcs \u001b[39m=\u001b[39m path \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[0;32m 1740\u001b[0m \u001b[39m# Download and prepare data\u001b[39;00m\n\u001b[1;32m-> 1741\u001b[0m builder_instance\u001b[39m.\u001b[39;49mdownload_and_prepare(\n\u001b[0;32m 1742\u001b[0m download_config\u001b[39m=\u001b[39;49mdownload_config,\n\u001b[0;32m 1743\u001b[0m download_mode\u001b[39m=\u001b[39;49mdownload_mode,\n\u001b[0;32m 1744\u001b[0m ignore_verifications\u001b[39m=\u001b[39;49mignore_verifications,\n\u001b[0;32m 1745\u001b[0m try_from_hf_gcs\u001b[39m=\u001b[39;49mtry_from_hf_gcs,\n\u001b[0;32m 1746\u001b[0m use_auth_token\u001b[39m=\u001b[39;49muse_auth_token,\n\u001b[0;32m 1747\u001b[0m num_proc\u001b[39m=\u001b[39;49mnum_proc,\n\u001b[0;32m 1748\u001b[0m )\n\u001b[0;32m 1750\u001b[0m \u001b[39m# Build dataset for splits\u001b[39;00m\n\u001b[0;32m 1751\u001b[0m keep_in_memory \u001b[39m=\u001b[39m (\n\u001b[0;32m 1752\u001b[0m keep_in_memory \u001b[39mif\u001b[39;00m keep_in_memory \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m is_small_dataset(builder_instance\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size)\n\u001b[0;32m 1753\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:822\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m 820\u001b[0m \u001b[39mif\u001b[39;00m num_proc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 821\u001b[0m prepare_split_kwargs[\u001b[39m\"\u001b[39m\u001b[39mnum_proc\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m num_proc\n\u001b[1;32m--> 822\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 823\u001b[0m dl_manager\u001b[39m=\u001b[39mdl_manager,\n\u001b[0;32m 824\u001b[0m verify_infos\u001b[39m=\u001b[39mverify_infos,\n\u001b[0;32m 825\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_split_kwargs,\n\u001b[0;32m 826\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdownload_and_prepare_kwargs,\n\u001b[0;32m 827\u001b[0m )\n\u001b[0;32m 828\u001b[0m \u001b[39m# Sync info\u001b[39;00m\n\u001b[0;32m 829\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size \u001b[39m=\u001b[39m \u001b[39msum\u001b[39m(split\u001b[39m.\u001b[39mnum_bytes \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39msplits\u001b[39m.\u001b[39mvalues())\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:1555\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m 1554\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_download_and_prepare\u001b[39m(\u001b[39mself\u001b[39m, dl_manager, verify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs):\n\u001b[1;32m-> 1555\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 1556\u001b[0m dl_manager, verify_infos, check_duplicate_keys\u001b[39m=\u001b[39mverify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs\n\u001b[0;32m 1557\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:891\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m 889\u001b[0m split_dict \u001b[39m=\u001b[39m SplitDict(dataset_name\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mname)\n\u001b[0;32m 890\u001b[0m split_generators_kwargs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_make_split_generators_kwargs(prepare_split_kwargs)\n\u001b[1;32m--> 891\u001b[0m split_generators \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_split_generators(dl_manager, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39msplit_generators_kwargs)\n\u001b[0;32m 893\u001b[0m \u001b[39m# Checksums verification\u001b[39;00m\n\u001b[0;32m 894\u001b[0m \u001b[39mif\u001b[39;00m verify_infos \u001b[39mand\u001b[39;00m dl_manager\u001b[39m.\u001b[39mrecord_checksums:\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:189\u001b[0m, in \u001b[0;36mFolderBasedBuilder._split_generators\u001b[1;34m(self, dl_manager)\u001b[0m\n\u001b[0;32m 186\u001b[0m metadata_ext \u001b[39m=\u001b[39m metadata_ext\u001b[39m.\u001b[39mpop()\n\u001b[0;32m 188\u001b[0m \u001b[39mfor\u001b[39;00m _, downloaded_metadata_file \u001b[39min\u001b[39;00m itertools\u001b[39m.\u001b[39mchain\u001b[39m.\u001b[39mfrom_iterable(metadata_files\u001b[39m.\u001b[39mvalues()):\n\u001b[1;32m--> 189\u001b[0m pa_metadata_table \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_metadata(downloaded_metadata_file)\n\u001b[0;32m 190\u001b[0m features_per_metadata_file\u001b[39m.\u001b[39mappend(\n\u001b[0;32m 191\u001b[0m (downloaded_metadata_file, datasets\u001b[39m.\u001b[39mFeatures\u001b[39m.\u001b[39mfrom_arrow_schema(pa_metadata_table\u001b[39m.\u001b[39mschema))\n\u001b[0;32m 192\u001b[0m )\n\u001b[0;32m 193\u001b[0m \u001b[39mfor\u001b[39;00m downloaded_metadata_file, metadata_features \u001b[39min\u001b[39;00m features_per_metadata_file:\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:260\u001b[0m, in \u001b[0;36mFolderBasedBuilder._read_metadata\u001b[1;34m(self, metadata_file)\u001b[0m\n\u001b[0;32m 258\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 259\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(metadata_file, \u001b[39m\"\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m--> 260\u001b[0m \u001b[39mreturn\u001b[39;00m paj\u001b[39m.\u001b[39;49mread_json(f)\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\_json.pyx:259\u001b[0m, in \u001b[0;36mpyarrow._json.read_json\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:144\u001b[0m, in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:100\u001b[0m, in \u001b[0;36mpyarrow.lib.check_status\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mArrowInvalid\u001b[0m: JSON parse error: Missing a name for object member. in row 0"
]
}
],
"source": [
"dataset = load_dataset(\"../images\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"login('',True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "um",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "876e189cbbe99a9a838ece62aae1013186c4bb7e0254a10cfa2f9b2381853efb"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

95
notebooks/join.ipynb Normal file
View File

@ -0,0 +1,95 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"green = pd.read_csv(\"../../wikisource-data/green.tsv\", sep=\"\\t\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"green.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"green = pd.read_csv(\"../green-full.tsv\", sep=\"\\t\")\n",
"yellow = pd.read_csv(\"../yellow-full.tsv\", sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"whole = pd.concat([green, yellow], axis=0)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(whole)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"whole.to_csv(\"./wikisource-full.tsv\", sep=\"\\t\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "um",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "876e189cbbe99a9a838ece62aae1013186c4bb7e0254a10cfa2f9b2381853efb"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -5,6 +5,7 @@ idna==3.4
lxml==4.9.2 lxml==4.9.2
numpy==1.24.1 numpy==1.24.1
pandas==1.5.2 pandas==1.5.2
Pillow==9.4.0
python-dateutil==2.8.2 python-dateutil==2.8.2
pytz==2022.7 pytz==2022.7
requests==2.28.1 requests==2.28.1

13353
yellow.tsv

File diff suppressed because it is too large Load Diff