82 lines
3.2 KiB
Python
82 lines
3.2 KiB
Python
|
import requests
|
||
|
from bs4 import BeautifulSoup as bs
|
||
|
import time
|
||
|
|
||
|
|
||
|
def get_url_soup(url, timeout=None):
    """Download *url* and return its parsed BeautifulSoup tree.

    The server occasionally answers with an empty/error page, which parses
    to a document with ``len(soup) <= 1``; in that case the request is
    retried indefinitely with a 3-second pause between attempts.

    Args:
        url: Page URL to fetch.
        timeout: Optional per-request timeout in seconds passed to
            ``requests.get``. ``None`` (the default) preserves the original
            wait-forever behaviour.

    Returns:
        A ``bs4.BeautifulSoup`` tree parsed with the ``html.parser`` backend.
    """
    req = requests.get(url, timeout=timeout)
    soup = bs(req.text, 'html.parser')
    while len(soup) <= 1:
        # Back off BEFORE retrying. The original slept at the bottom of the
        # loop, i.e. also after a retry had already succeeded, adding a
        # pointless 3-second delay to every recovery.
        time.sleep(3)
        req = requests.get(url, timeout=timeout)
        soup = bs(req.text, 'html.parser')
    return soup
|
||
|
|
||
|
|
||
|
def clean_text(text):
    """Collapse every run of whitespace in *text* to a single space and trim."""
    tokens = text.split()
    return " ".join(tokens)
|
||
|
|
||
|
|
||
|
def get_forum(url, domains, file):
    """Recursively crawl one forum page and write its topic URLs to *file*.

    Each topic is written as a tab-separated line: the topic URL (with the
    'https://hyperreal.info/talk/' prefix stripped) followed by the category
    path accumulated in *domains*.

    Args:
        url: Forum page URL to crawl.
        domains: Mutable list of category names from the root down to this
            forum; sub-forum recursion pushes/pops onto it.
        file: Open writable text file receiving the TSV lines.

    Errors are caught broadly and logged to stdout so one broken page does
    not abort the whole crawl.
    """
    try:
        soup = get_url_soup(url)
        soup = soup.find('div', {'class': 'order-first'})
        # A page can contain both plain and shadowed cards; scan both kinds.
        cards = soup.find_all('div', {'class': 'card mb-3'}) + soup.find_all('div', {'class': 'card mb-3 shadow-sm'})
        for card in cards:
            title = card.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
            title = clean_text(title.text)
            if title == 'Forum':
                # Sub-forum listing: recurse into each sub-forum with its
                # name pushed onto the category path.
                forum_card = card.find_all('div', {'class': 'row-item'})
                for c in forum_card:
                    a = c.find_all('a')
                    if len(a) > 1:
                        # The second <a> is the sub-forum link (the first is
                        # typically an icon/status link).
                        a = c.find_all('a')[1]
                        t_url = a.get('href')
                        domain = clean_text(a.text)
                        domains.append(domain)
                        get_forum(t_url, domains, file)
                        domains.pop()
            elif title == 'Ogłoszenia':
                # Announcements card: topics are inline, no pagination.
                topics = card.find_all('a', {'class': 'topictitle fs-5'})
                for topic in topics:
                    file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains + ['Ogłoszenia']) + '\n')
            elif title == 'Tematy':
                # Regular topics card: paginated, 15 topics per page. The
                # 'fw-normal' span reads 'Strona 1 z N' (Polish: page 1 of N).
                pages = soup.find('span', {'class': 'fw-normal'})
                if pages is not None:
                    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))
                    for page in range(pages):
                        n = page * 15
                        # Page URLs are formed by replacing '.html' with
                        # '-<offset>.html'.
                        topics_url = f'{url[:-5]}-{n}.html'
                        topics_soup = get_url_soup(topics_url)
                        topics_soup = topics_soup.find('div', {'class': 'order-first'})
                        cards_topics = topics_soup.find_all('div', {'class': 'card mb-3 shadow-sm'})[-1]
                        topics = cards_topics.find_all('a', {'class': 'topictitle fs-5'})
                        for topic in topics:
                            file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains) + '\n')
                else:
                    # FIX: single-page forums have no pagination span; the
                    # original skipped them entirely and lost their topics.
                    # The topics are already present in the current card.
                    topics = card.find_all('a', {'class': 'topictitle fs-5'})
                    for topic in topics:
                        file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains) + '\n')
    except Exception as e:
        # Best-effort crawl: log and continue with the next page.
        print(f'\tERROR: {url} - {e}')
|
||
|
|
||
|
|
||
|
def get_main_domains(url, file):
    """Crawl every top-level category of the forum index at *url*.

    For each category card, every linked sub-forum is handed to
    ``get_forum`` with a two-element category path: the card's heading and
    the sub-forum's own name. Topic lines are written to *file*.

    Returns:
        An empty list (kept for interface compatibility).
    """
    index = get_url_soup(url).find('div', {'class': 'order-first'})
    for card in index.find_all('div', {'class': 'card mb-3'}):
        # Card heading is the top-level category name.
        heading = clean_text(card.find('div', {'class': 'col-lg-7 col-md-6 col-12'}).text)
        for row in card.find_all('div', {'class': 'row-item'}):
            # The second <a> in each row links to the sub-forum itself.
            link = row.find_all('a')[1]
            sub_name = clean_text(link.text)
            get_forum(link.get('href'), [heading, sub_name], file)
    return []
|
||
|
|
||
|
|
||
|
# Entry point: crawl the hyperreal.info forum index and dump every topic URL
# (with its category path) as tab-separated lines.
# NOTE: the unused module-level `tmp = []` was removed.
url = 'https://hyperreal.info/talk/'

if __name__ == '__main__':
    # utf-8 is explicit because category names contain Polish characters
    # (e.g. 'Ogłoszenia') and the platform default encoding may not cover them.
    with open('data/hyperreal_URLs.tsv', 'w', encoding='utf-8') as file:
        get_main_domains(url, file)
|