przemowoAnalizator/stalin_trockizm_crawler.py

16 lines
784 B
Python

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Crawl the marxists.org index of Stalin's writings on Trotskyism and save the
# paragraph text of every linked page as a UTF-8 .txt file under OUTPUT_DIR.
INDEX_URL = "https://www.marxists.org/reference/archive/stalin/works/subject/trotskyism/index.htm"
OUTPUT_DIR = "trockizm"
# requests waits indefinitely unless a timeout (in seconds) is given.
REQUEST_TIMEOUT = 30
# Title -> file-name mapping: a space becomes "_", quotes and punctuation are
# dropped.  Path separators are dropped too, so a page title can never make
# the output path point outside OUTPUT_DIR.
_TITLE_TO_FILENAME = str.maketrans({
    " ": "_",
    "'": None,
    '"': None,
    ":": None,
    ".": None,
    ",": None,
    "/": None,
    "\\": None,
})

index_response = requests.get(INDEX_URL, timeout=REQUEST_TIMEOUT)
# Without the index there is nothing to crawl, so fail loudly here.
index_response.raise_for_status()
index_src = index_response.text
# Name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed.
index = BeautifulSoup(index_src, "html.parser")
# Every anchor nested inside an element carrying the CSS class "fst".
links = index.select(".fst a")

# open(..., "w") does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for link in links:
    href = link.get("href")
    if not href:
        # Anchors without an href (e.g. named anchors) have nothing to fetch.
        continue

    # Resolve the href against the index page itself; this handles "../"
    # segments and absolute links, and the URL printed is the URL fetched.
    speech_url = urljoin(INDEX_URL, href)
    print(speech_url)

    try:
        response = requests.get(speech_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        # One unreachable page should not abort the whole crawl, and an
        # error page must not be stored as if it were a speech.
        print("skipping {}: {}".format(speech_url, err))
        continue

    speech = BeautifulSoup(response.text, "html.parser")
    paragraphs = speech.select("p")
    text = "".join(p.text for p in paragraphs)
    print(text)

    # Pages without a <title> fall back to the last URL path component.
    title = speech.title.text if speech.title is not None else ""
    filename = title.translate(_TITLE_TO_FILENAME)
    if not filename:
        filename = speech_url.rsplit("/", 1)[-1].translate(_TITLE_TO_FILENAME)

    with open(os.path.join(OUTPUT_DIR, filename + ".txt"), "w", encoding="utf-8") as f:
        f.write(text)