edit in config
This commit is contained in:
parent c68c41e9b5
commit bdf5732b70
crawler.py  15
@@ -11,6 +11,7 @@ def main():
     URL_GREEN = "https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona"

     def get_page_data(page_element):
+        time.sleep(0.5)
         doc = requests.get(MAIN_URL + page_element['href'])
         doc_soup = BeautifulSoup(doc.text, 'lxml', from_encoding="utf-8")
         text_elem = doc_soup.find("div", {"class": "pagetext"}).next_element
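The only functional change in this hunk is the half-second pause added at the top of get_page_data, which throttles the per-page fetches to roughly two requests per second against pl.wikisource.org. A minimal standalone sketch of the same idea, assuming requests and time as imported in crawler.py (the polite_get name and the 0.5 s default are illustrative, not part of this commit):

import time

import requests

def polite_get(url, delay=0.5):
    # Pause before every request so consecutive fetches stay ~delay seconds apart.
    time.sleep(delay)
    return requests.get(url)

# Usage inside the crawler would look like: response = polite_get(MAIN_URL + page_element['href'])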
@@ -18,22 +19,26 @@ def main():
         image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
         return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text,}

-    r = requests.get(URL_YELLOW)
+    r = requests.get(URL_GREEN)
     soup = BeautifulSoup(r.text, 'lxml')
     page_number = 1
     result_list = []
     max_len = "".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:])
     try:
         while True:
+            time.sleep(5)
             next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona").get('href', None)
             if next_page and page_number != 1:
                 r = requests.get(MAIN_URL + next_page)
-            else:
+            elif not next_page:
                 break

             if r.status_code != 200:
                 print(r.__dict__)
-                break
+                time.sleep(10)
+                r = requests.get(MAIN_URL + next_page)
+                if r.status_code != 200:
+                    break
             soup = BeautifulSoup(r.text, 'lxml')
             page_number += 1
             links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
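Two behavioural changes land in this hunk: the listing crawl now starts from URL_GREEN instead of URL_YELLOW, and a non-200 response no longer aborts the loop immediately; the crawler waits 10 seconds and retries the same listing page once before giving up, with an extra 5-second pause at the top of every iteration. A standalone sketch of that single-retry pattern, assuming requests is available (fetch_with_retry and its wait parameter are illustrative names, not part of the commit):

import time

import requests

def fetch_with_retry(url, wait=10):
    # First attempt; on a non-200 status dump the response, wait, and retry the same URL once.
    r = requests.get(url)
    if r.status_code != 200:
        print(r.__dict__)
        time.sleep(wait)
        r = requests.get(url)
    return r  # the caller still checks r.status_code and stops if the retry failed too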
@@ -44,10 +49,10 @@ def main():
     except Exception as e:
         print(e)
         df = pd.DataFrame(result_list)
-        df.to_csv("./yellow.tsv", sep="\t")
+        df.to_csv("./green.tsv", sep="\t")

     df = pd.DataFrame(result_list)
     df.to_csv("./yellow.tsv", sep="\t")

 if __name__ == "__main__":
     main()
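The partial-results dump in the except branch now goes to ./green.tsv, while a successful run still writes ./yellow.tsv at the end of main(). Both files are plain tab-separated tables with the columns returned by get_page_data (title, href, image_url, text), so either can be loaded back with pandas for a quick check; a small sketch, assuming the default integer index that to_csv writes when index=False is not passed:

import pandas as pd

# to_csv was called without index=False, so the first column holds the integer index.
df = pd.read_csv("./green.tsv", sep="\t", index_col=0)
print(df[["title", "href"]].head())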