Compare commits

25 Commits (SHA1):
da0e4f3263
680c3d000c
fe144a70b9
fe6f47122a
2f648da50d
ea2ecf454c
9ccb251801
162aae6958
61f83dd7e4
9088a3dc10
1f08432fbf
9ee285cf95
0f3a6eb9da
bf6014ba98
5c3611f972
cc76e894fd
f55125f447
526eb7d5e4
68231dce0a
6d7c08a9af
0a39807fec
683bc9e6fc
a16d660cf7
f7ff1acec3
b904e64f01
.gitignore (vendored, new file, +6)

@@ -0,0 +1,6 @@
env
*.out
images*
*.tsv
env_wikicrawler
temp_images
README.md (new file, +13)

@@ -0,0 +1,13 @@
# Wikisource crawler and image downloader

## Requirements:
Python 3.8+

## Install/setup:
`pip install -r requirements.txt`

## Usage: crawler
`python crawler.py --type {green, yellow or red} --output_file_name {output tsv file name} --start_file_name {name of the file to start crawling from} --start_page_number {page number to resume crawling from}`

## Usage: image downloader
`python image_download.py --file_path {tsv file with data to download} --output_folder {folder to save images to, default: images} --max_folder_size_mb {size limit in MB at which to stop; if not given, everything is downloaded} --from_checkpoint {True to resume from the checkpoint if a pickle file is available}`
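For concreteness, a hypothetical pair of invocations (file names, page numbers and the size limit below are made-up placeholders; the flag spellings follow the README above, while the committed crawler.py registers the category flag as `--wiki_type`):

`python crawler.py --type yellow --output_file_name wiki_pages --start_file_name "Strona:Przykład.djvu/15" --start_page_number 3`

`python image_download.py --file_path wiki_pages-yellow.tsv --output_folder images --max_folder_size_mb 500 --from_checkpoint True`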
crawler.py (changed, 103)

@@ -6,54 +6,79 @@ from tqdm import tqdm

Before:

import time
import argparse


def main():
    MAIN_URL = "https://pl.wikisource.org/"
    URL_YELLOW = "https://pl.wikisource.org/wiki/Kategoria:Skorygowana"
    URL_GREEN = "https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona"

    def get_page_data(page_element):
        time.sleep(0.5)
        doc = requests.get(MAIN_URL + page_element['href'])
        doc_soup = BeautifulSoup(doc.text, 'lxml', from_encoding="utf-8")
        text_elem = doc_soup.find("div", {"class": "pagetext"}).next_element
        text = text_elem.text
        image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
        return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text,}

    r = requests.get(URL_GREEN)
    soup = BeautifulSoup(r.text, 'lxml')
    page_number = 1
    result_list = []
    max_len = "".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:])

    try:
        while True:
            time.sleep(5)
            next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None)
            if next_page and page_number != 1:
                r = requests.get(MAIN_URL + next_page)
            elif not next_page:
                break

            if r.status_code != 200:
                print(r.__dict__)
                time.sleep(10)
                r = requests.get(MAIN_URL + next_page)
                if r.status_code != 200:
                    break
            soup = BeautifulSoup(r.text, 'lxml')
            page_number += 1
            links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
            for link in tqdm(links):
                result_list.append(get_page_data(link))
            print("Page number:", page_number)
            print("Number of elements:", 200 * page_number, "/", max_len)
    except Exception as e:
        print(e)
        df = pd.DataFrame(result_list)
        df.to_csv("./green.tsv", sep="\t")

    df = pd.DataFrame(result_list)
    df.to_csv("./yellow.tsv", sep="\t")


if __name__ == "__main__":
    main()

After:

import time
import argparse

MAIN_URL = "https://pl.wikisource.org/"


def get_page_data(page_element):
    time.sleep(0.5)
    doc = requests.get(MAIN_URL + page_element['href'])
    doc_soup = BeautifulSoup(doc.text, 'lxml')
    text = doc_soup.find("div", {"class": "pagetext"}).next_element
    image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
    return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text.text}


def save_data(file_name, data, args):
    if not args.testing:
        df = pd.DataFrame(data)
        df.to_csv(f"./{file_name}.tsv", sep="\t")


def main(args):
    category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
    if args.start_file_name and args.start_page_number:
        CATEGORY_URL = f"{MAIN_URL}/w/index.php?title=Kategoria:{category_dict[args.type]}&pagefrom={args.start_file_name}"
    else:
        CATEGORY_URL = f"{MAIN_URL}/wiki/Kategoria:{category_dict[args.type]}"

    r = requests.get(CATEGORY_URL)
    soup = BeautifulSoup(r.text, 'lxml')
    page_number = 1 if not args.start_page_number else args.start_page_number
    data_number = 0 if not args.start_page_number else args.start_page_number * 200
    result_list = []
    max_len = int("".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]))

    try:
        with tqdm(total=max_len) as pbar:
            if args.start_page_number:
                pbar.update(data_number)
                pbar.refresh()

            while data_number < max_len:
                pbar.set_description(f"Page number: {page_number}")
                time.sleep(5)
                next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None)
                if next_page and page_number != 1:
                    r = requests.get(MAIN_URL + next_page)
                elif not next_page:
                    break

                if r.status_code != 200:
                    print(r.__dict__)
                    time.sleep(60)
                    r = requests.get(MAIN_URL + next_page)
                    if r.status_code != 200:
                        break
                soup = BeautifulSoup(r.text, 'lxml')
                links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
                page_number += 1
                for link in links:
                    result_list.append(get_page_data(link))
                    data_number += 1
                    pbar.update(1)
    except Exception as e:
        print("Error:", e)
        save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
    except KeyboardInterrupt:
        save_data(f"./{args.output_file_name}-{args.type}", result_list, args)

    save_data(f"./{args.output_file_name}-{args.type}", result_list, args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--wiki_type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
    parser.add_argument("--output_file_name", type=str, required=True)
    parser.add_argument("--start_file_name", type=str, required=False)
    parser.add_argument("--start_page_number", type=int, required=False)
    parser.add_argument("--testing", type=bool, required=False, default=False)
    args, left_argv = parser.parse_known_args()
    print(type(vars(args)))
    # main(args)
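Two details of the new entry point are easy to trip over: the parser registers the category flag as --wiki_type while main() and save_data() read args.type, and type=bool treats any non-empty string (including "False") as True; main(args) is also still commented out. A minimal sketch of an entry point that avoids both pitfalls, with argument names assumed from the code above, not a definitive fix:

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Register the flag under the name the function body actually reads (args.type),
    # and use store_true instead of type=bool so "--testing" is a real boolean switch.
    parser.add_argument("--type", type=str, default="green", choices=["green", "yellow", "red"])
    parser.add_argument("--output_file_name", type=str, required=True)
    parser.add_argument("--start_file_name", type=str)
    parser.add_argument("--start_page_number", type=int)
    parser.add_argument("--testing", action="store_true")
    args, left_argv = parser.parse_known_args()
    main(args)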
crawler_class.py (new file, +115)

@@ -0,0 +1,115 @@
from bs4 import BeautifulSoup
import requests
import time
import os
from tqdm import tqdm
import csv
from collections import deque
import argparse
import re
import pandas as pd

MAIN_URL = "https://pl.wikisource.org/"


class WikiCrawler:
    def __init__(self, wiki_type: str, output_file_name: str):
        self.wiki_type = wiki_type
        self.output_file_name = output_file_name
        self.page_number = 1
        self.index = 1
        self.load_last_checkpoint()

    def load_last_checkpoint(self):
        self.start_file_name = None
        if os.path.exists(self.output_file_name):
            df = pd.read_csv(self.output_file_name, encoding='utf-8', sep='\t')
            last_line = df.tail(1).iloc()[0]
            self.start_file_name = last_line[0]
            self.page_number = int(last_line[-1])
            self.index = int(last_line[-2])
            del df
            print(f"Starting from index: {self.index}, page: {self.page_number}")

    def _init_crawl(self):
        category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
        CATEGORY_URL = f"{MAIN_URL}/wiki/Kategoria:{category_dict[self.wiki_type]}"
        # resume from a later category page if a checkpoint was found
        if self.start_file_name:
            CATEGORY_URL = f"{MAIN_URL}/w/index.php?title=Kategoria:{category_dict[self.wiki_type]}&pagefrom={self.start_file_name}"
        request = requests.get(CATEGORY_URL)
        assert request.status_code == 200, f"Status different on main request, status: {request.status_code}"

        soup = BeautifulSoup(request.text, 'lxml')
        self.max_len = int("".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]))
        self.pbar = tqdm(total=self.max_len)

        if self.start_file_name:
            self.pbar.update(self.index)
            self.pbar.refresh()

        return soup, request

    def save_page_data(self, page_element):
        time.sleep(0.3)
        doc_request = requests.get(MAIN_URL + page_element['href'])
        assert doc_request.status_code == 200, f"Wrong status on requesting doc link: {MAIN_URL + page_element['href']}"
        doc_soup = BeautifulSoup(doc_request.text, 'lxml')
        text = doc_soup.find("div", {"class": "pagetext"}).next_element
        image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']

        with open(self.output_file_name, 'a', newline='') as output_csv:
            row_dict = {
                "title": page_element['title'],
                "href": MAIN_URL + page_element['href'],
                "image_url": image_url,
                "text": text.text,
                "index": self.index,
                "page_number": self.page_number
            }
            fields = list(row_dict.keys())
            writer = csv.DictWriter(output_csv, fieldnames=list(row_dict.keys()), delimiter='\t')
            writer.writerow(row_dict)

    def crawl(self):
        soup, r = self._init_crawl()
        first_search = True
        while self.index < self.max_len:
            time.sleep(0.3)
            self.pbar.set_description(f"Page number: {self.page_number}")
            next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona")
            if next_page:
                next_page = next_page.get('href', None)
            if next_page and not first_search:
                r = requests.get(MAIN_URL + next_page)
            elif not next_page:
                print(soup)
                print("\n\n\n", soup.text)
                print("End of pages, or next page not found")
                break

            # handle wrong request
            if r.status_code != 200:
                print('Retry of request, request data: ', r.__dict__)
                time.sleep(60)
                r = requests.get(MAIN_URL + next_page)
                assert r.status_code == 200, f"Retry failed, request status: {r.status_code}, full_info: {r.__dict__}"

            soup = BeautifulSoup(r.text, 'lxml')
            links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
            for link in links:
                self.save_page_data(link)
                self.index += 1
                self.pbar.update(1)
            self.page_number += 1
            first_search = False
        print("Finished")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--wiki_type", type=str, choices=["green", "yellow", "red"], required=True)
    parser.add_argument("--output_file_name", type=str, required=True)
    args, left_argv = parser.parse_known_args()

    crawler = WikiCrawler(**vars(args))
    crawler.crawl()
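A small caveat in save_page_data: csv.DictWriter.writerow never emits a header row, while load_last_checkpoint() reads the same TSV back with pd.read_csv, which treats the first row as the header. A minimal sketch of an append helper that writes the header exactly once, assuming the same row_dict fields as above (the helper name is made up):

import csv
import os

def append_row(output_file_name: str, row_dict: dict) -> None:
    # Write the header only when the TSV is first created, so that
    # load_last_checkpoint() can read the columns back by name with pd.read_csv.
    write_header = not os.path.exists(output_file_name)
    with open(output_file_name, 'a', newline='', encoding='utf-8') as output_csv:
        writer = csv.DictWriter(output_csv, fieldnames=list(row_dict.keys()), delimiter='\t')
        if write_header:
            writer.writeheader()
        writer.writerow(row_dict)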
image_class.py (new file, +91)

@@ -0,0 +1,91 @@
import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
import json
from datasets import load_dataset
from huggingface_hub import login
import shutil

headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (no.rp.mk.info@gmail.com) requests/2.28.1'}


class WikiImage:

    def __init__(self, input_file_path: str, dataset_name: str, output_folder: str = 'temp_images', split_number: int = 1):
        self.input_file_path = input_file_path
        self.split_number = split_number
        self.max_dataset_len = 10000
        self.output_folder = output_folder
        self.dataset_name = dataset_name
        print("Loading input file")
        self.dataframe = pd.read_csv(self.input_file_path, sep='\t')[(self.split_number - 1) * self.max_dataset_len:]
        if os.path.exists(self.output_folder):
            print("Removing old output folder")
            if os.path.exists('/home/zombely/.cache/huggingface/datasets'):
                shutil.rmtree('/home/zombely/.cache/huggingface/datasets')
            shutil.rmtree(self.output_folder)
        os.mkdir(self.output_folder)
        self.pbar = tqdm(self.dataframe.iterrows(), total=len(self.dataframe), desc=f"Split: {self.split_number}")

        login(os.environ.get("HUG_TOKEN"), True)

    def image_save(self, row):
        time.sleep(0.3)
        image_request = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
        if image_request.status_code in [500, 404]:
            print(f"Image {row[1]['title']} is not reachable")
            return
        if image_request.status_code != 200:
            time.sleep(80)
            image_request = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
            assert image_request.status_code == 200, f"Response status is different, status_code: {image_request.status_code}, full info: {image_request.__dict__}"

        image = Image.open(image_request.raw)
        if image.mode != "RGB":
            image = image.convert("RGB")
        title = row[1]['title'].replace("Strona:", "").replace("/", "-")
        image.save(f"{self.output_folder}/{title}.png")

        with open(f"{self.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
            # f.write(str({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text'].replace('"', "'")}}, ensure_ascii=False)})+"\n")
            json.dump({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text'].replace('"', "'")}}, ensure_ascii=False)}, f, ensure_ascii=False)
            f.write("\n")

    def push_dataset(self, split_name: str):
        print(f"Pushing split: {split_name}")
        dataset = load_dataset(self.output_folder)
        dataset[split_name] = dataset.pop('train')
        dataset.push_to_hub(f'Zombely/{self.dataset_name}')
        shutil.rmtree(self.output_folder)
        shutil.rmtree('/home/zombely/.cache/huggingface/datasets')
        os.mkdir(self.output_folder)
        del dataset
        print("Upload finished")

    def crawl(self):
        print("Start download")
        for index, row in enumerate(self.pbar):
            self.image_save(row)
            if (index + 1) % self.max_dataset_len == 0:
                self.push_dataset(f'train_{self.split_number}')
                self.split_number += 1
                self.pbar.set_description(f'Split: {self.split_number}')

        self.push_dataset('validation')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file_path", type=str, required=True)
    parser.add_argument("--dataset_name", type=str, required=True)
    parser.add_argument("--output_folder", type=str, required=False, default='temp_images')
    parser.add_argument("--split_number", type=int, required=False, default=1)
    args, left_argv = parser.parse_known_args()
    crawler = WikiImage(**vars(args))
    crawler.crawl()
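For reference, each image_save call appends one JSON line to {output_folder}/metadata.jsonl next to the saved PNG, which load_dataset(self.output_folder) then picks up through the imagefolder builder. A hypothetical line (file name and page text invented for illustration, key spelling kept as in the code) would look like:

{"file_name": "Przykładowa książka.djvu-5.png", "ground_truth": "{\"gt_parse\": {\"text_sequance\": \"Przykładowy tekst strony\"}}"}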
image_download.py (new file, +79)

@@ -0,0 +1,79 @@
import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
import json

headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'}


def save_state(index, offset):
    with open("./state.pickle", "wb") as state_file:
        pickle.dump({"row_index": index+offset}, state_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("Saving state, index: ", index+offset)


def main(args):
    df = pd.read_csv(args.file_path, sep="\t")
    offset = 0
    if not os.path.exists(args.output_folder):
        print(f"Creating missing folder: {args.output_folder}")
        os.mkdir(args.output_folder)

    if args.from_checkpoint and os.path.exists("./state.pickle"):
        with open("state.pickle", "rb") as state:
            state_dict = pickle.load(state)
        offset = state_dict["row_index"]
        print("Starting from checkpoint, index: ", offset)
        df = df[offset:]

    pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB")
    for n, row in enumerate(pbar):
        try:
            time.sleep(0.2)
            r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
            if r.status_code != 200:
                time.sleep(80)
                r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
                if r.status_code != 200:
                    pprint(r.__dict__)
                    save_state(n, offset)
                    return
            image = Image.open(r.raw)
            if image.mode != "RGB":
                image = image.convert("RGB")
            title = row[1]['title'].replace("Strona:", "").replace("/", "-")
            image.save(f"{args.output_folder}/{title}.png")

            with open(f"{args.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
                f.write(str({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False)}) + "\n")

            dir_size = round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2)

            pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB")

            if args.max_folder_size_mb and dir_size > args.max_folder_size_mb:
                print(f"Limit size of: {args.max_folder_size_mb}, exceeded")
                save_state(n, offset)
                return

        except (Exception, KeyboardInterrupt) as e:
            print(f"Error: {str(e)} \n")
            print(f"Row: {row}")
            save_state(n, offset)
            return


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, required=True)
    parser.add_argument("--output_folder", type=str, default="./images")
    parser.add_argument("--max_folder_size_mb", type=float, required=False)
    parser.add_argument("--from_checkpoint", type=bool, required=False, default=False)
    args, left_argv = parser.parse_known_args()
    main(args)
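One difference from image_class.py: the metadata line here is written with str(...) on a Python dict, which produces single-quoted repr output rather than valid JSON (the commented-out line in image_class.py is exactly this variant, and the ArrowInvalid "JSON parse error" in notebooks/image_download.ipynb is consistent with loading such a file through load_dataset). A minimal sketch of a valid-JSON writer with the same fields, keeping the source's text_sequance key spelling; the helper name is made up:

import json

def append_metadata(jsonl_path: str, file_name: str, text: str) -> None:
    # Same fields as in main(), but serialized with json.dumps so that
    # datasets/pyarrow can parse the metadata.jsonl file later.
    entry = {
        "file_name": file_name,
        "ground_truth": json.dumps({"gt_parse": {"text_sequance": text}}, ensure_ascii=False),
    }
    with open(jsonl_path, mode="a", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")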
mail_test.py (new file, +8)

@@ -0,0 +1,8 @@
import smtplib


def main():
    smtp = smtplib.SMTP("0.0.0.0", 25, 'mail')
    smtp.sendmail('info@zbhome.com', ['michalkozlowski936@gmail.com'], "Hello from zbhome")


if __name__ == "__main__":
    main()
@@ -343,7 +343,7 @@
     "name": "python",
     "nbconvert_exporter": "python",
     "pygments_lexer": "ipython3",
-    "version": "3.9.15"
+    "version": "3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:41:22) [MSC v.1929 64 bit (AMD64)]"
   },
   "orig_nbformat": 4,
   "vscode": {
notebooks/image_download.ipynb (new file, +238)

@@ -0,0 +1,238 @@
Notebook with six code cells (kernelspec "um", Python 3.9.15, nbformat 4):

Cell 1:
import pandas as pd

Cell 2:
a = pd.read_csv("../../wikisource-data/yellow-continue-yellow.tsv.tsv", sep="\t")

Cell 3:
a.head()

Output, first five rows of the crawled TSV:

| | Unnamed: 0 | title | href | image_url | text |
|---|---|---|---|---|---|
| 0 | 0 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | zmieniła się; piękne oczy są tak samo błyszczą... |
| 1 | 1 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | najświetniejszej chociażby sławy... i po piętn... |
| 2 | 2 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | Chopin gra. Ledwie dostrzegalnie muskają smuk... |
| 3 | 3 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | \nDZIWACZNE MAŁŻEŃSTWO.\n\n Był grudzień 1830 ... |
| 4 | 4 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | Ale bliższego związku z panią Sand jakby się ... |

Cell 4:
from datasets import load_dataset
from huggingface_hub import login

Cell 5:
dataset = load_dataset("../images")

Output:
Resolving data files: 100%|██████████| 29/29 [00:00<?, ?it/s]
Using custom data configuration images-8b1ad802b6988161
Downloading and preparing dataset imagefolder/images to C:/Users/PC/.cache/huggingface/datasets/imagefolder/images-8b1ad802b6988161/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]

ArrowInvalid Traceback (most recent call last)
Cell In[6], line 1: dataset = load_dataset("../images")
File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\load.py:1741, in load_dataset: builder_instance.download_and_prepare(...)
File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\builder.py:822, in DatasetBuilder.download_and_prepare: self._download_and_prepare(...)
File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\builder.py:1555, in GeneratorBasedBuilder._download_and_prepare
File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\builder.py:891, in DatasetBuilder._download_and_prepare: split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\packaged_modules\folder_based_builder\folder_based_builder.py:189, in FolderBasedBuilder._split_generators: pa_metadata_table = self._read_metadata(downloaded_metadata_file)
File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\packaged_modules\folder_based_builder\folder_based_builder.py:260, in FolderBasedBuilder._read_metadata: return paj.read_json(f)
File c:\Users\PC\anaconda3\envs\um\lib\site-packages\pyarrow\_json.pyx:259, in pyarrow._json.read_json()
File c:\Users\PC\anaconda3\envs\um\lib\site-packages\pyarrow\error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File c:\Users\PC\anaconda3\envs\um\lib\site-packages\pyarrow\error.pxi:100, in pyarrow.lib.check_status()
ArrowInvalid: JSON parse error: Missing a name for object member. in row 0

Cell 6:
login('',True)
notebooks/join.ipynb (new file, +95)

@@ -0,0 +1,95 @@
Notebook with seven code cells and no saved outputs (kernelspec "um", Python 3.9.15, nbformat 4):

Cell 1:
import pandas as pd

Cell 2:
green = pd.read_csv("../../wikisource-data/green.tsv", sep="\t")

Cell 3:
green.tail()

Cell 4:
green = pd.read_csv("../green-full.tsv", sep="\t")
yellow = pd.read_csv("../yellow-full.tsv", sep="\t")

Cell 5:
whole = pd.concat([green, yellow], axis=0)

Cell 6:
len(whole)

Cell 7:
whole.to_csv("./wikisource-full.tsv", sep="\t")
@@ -5,6 +5,7 @@ idna==3.4
 lxml==4.9.2
 numpy==1.24.1
 pandas==1.5.2
+Pillow==9.4.0
 python-dateutil==2.8.2
 pytz==2022.7
 requests==2.28.1
yellow.tsv (13353)

File diff suppressed because it is too large.