full text fix
This commit is contained in:
parent
bdf5732b70
commit
a9f7929fc6
File diff suppressed because one or more lines are too long
@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
|
|||||||
import re
|
import re
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import time
|
import time
|
||||||
|
import argparse
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
MAIN_URL = "https://pl.wikisource.org/"
|
MAIN_URL = "https://pl.wikisource.org/"
|
||||||
@ -15,7 +16,7 @@ def main():
|
|||||||
doc = requests.get(MAIN_URL + page_element['href'])
|
doc = requests.get(MAIN_URL + page_element['href'])
|
||||||
doc_soup = BeautifulSoup(doc.text, 'lxml', from_encoding="utf-8")
|
doc_soup = BeautifulSoup(doc.text, 'lxml', from_encoding="utf-8")
|
||||||
text_elem = doc_soup.find("div", {"class": "pagetext"}).next_element
|
text_elem = doc_soup.find("div", {"class": "pagetext"}).next_element
|
||||||
text = text_elem.text if not text_elem.find("math") else "math image"
|
text = text_elem.text
|
||||||
image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
|
image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
|
||||||
return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text,}
|
return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text,}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user