# przemowoAnalizator/stalin_war_crawler.py
# 2020-01-06 14:46:37 +01:00
#
# 16 lines
# 766 B
# Python

import os
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Crawl the index of Stalin's wartime speeches on marxists.org, download every
# linked speech, and save the concatenated paragraph text of each one as a
# UTF-8 ``.txt`` file under OUTPUT_DIR.

INDEX_URL = "https://www.marxists.org/reference/archive/stalin/works/war/index.htm"
OUTPUT_DIR = "war"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the crawl

# Single-pass equivalent of the chained .replace() calls: quotes, colons, dots
# and commas are dropped and spaces become underscores.  "/" is additionally
# mapped to "_" so a slash in a title is not interpreted as a sub-directory.
_FILENAME_TABLE = str.maketrans({
    "'": None,
    '"': None,
    ":": None,
    ".": None,
    ",": None,
    " ": "_",
    "/": "_",
})


def speech_filename(title, fallback="untitled"):
    """Return the ``.txt`` file name used to store a speech.

    Args:
        title: Text of the page's ``<title>`` element; may be empty or None.
        fallback: String used instead (after the same sanitisation) when the
            title is missing or sanitises to nothing.

    Returns:
        The sanitised name with ``.txt`` appended; ``"untitled.txt"`` when
        neither *title* nor *fallback* yields a usable name.
    """
    for candidate in (title, fallback):
        stem = (candidate or "").translate(_FILENAME_TABLE)
        if stem:
            return stem + ".txt"
    return "untitled.txt"


def fetch_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            (via ``raise_for_status``) when the server answers 4xx/5xx.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of treating an HTTP error page as a speech.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    return BeautifulSoup(response.text, "html.parser")


def main():
    """Download every speech linked from the index page into OUTPUT_DIR."""
    index = fetch_soup(INDEX_URL)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for link in index.select("strong a"):
        href = link.get("href")
        if not href:
            # Anchor without a target: nothing to download.
            continue

        # Resolve the href relative to the index page; this handles "../x",
        # sibling and absolute links alike.
        url = urljoin(INDEX_URL, href)
        print(url)

        try:
            speech = fetch_soup(url)
        except requests.RequestException as err:
            # One unreachable page should not abort the rest of the crawl.
            print("skipping %s: %s" % (url, err), file=sys.stderr)
            continue

        text = "".join(p.text for p in speech.select("p"))
        print(text)

        title = speech.title.text if speech.title else ""
        path = os.path.join(OUTPUT_DIR, speech_filename(title, fallback=href))
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)


if __name__ == "__main__":
    main()