forum-scrapers/hyperreal_scrap_URLs.py

82 lines
3.2 KiB
Python
Raw Normal View History

2023-01-02 09:30:54 +01:00
import requests
from bs4 import BeautifulSoup as bs
import time
def get_url_soup(url, *, retry_delay=3, max_retries=None, timeout=30):
    """Download *url* and return it parsed with the ``html.parser`` backend.

    A response whose parsed document holds at most one top-level node is
    treated as a failed fetch and the page is requested again.

    Args:
        url: Address of the page to download.
        retry_delay: Seconds to wait before each repeated request.
        max_retries: Upper bound on repeated requests.  ``None`` (default)
            keeps retrying until a usable document arrives.
        timeout: Seconds ``requests`` may wait on the server before it
            raises; ``None`` disables the limit.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        RuntimeError: ``max_retries`` was given and every attempt returned
            a near-empty document.
        requests.RequestException: Propagated from ``requests.get``
            (connection failure, timeout, ...).
    """
    retries_done = 0
    while True:
        response = requests.get(url, timeout=timeout)
        soup = bs(response.text, 'html.parser')
        # len() of the soup is its number of top-level nodes; 0 or 1 is
        # what the original code treated as an unusable response.
        if len(soup) > 1:
            return soup
        if max_retries is not None and retries_done >= max_retries:
            raise RuntimeError(
                f'no usable document from {url} after {retries_done} retries'
            )
        retries_done += 1
        # Back off *before* asking again, so a successful fetch returns
        # immediately and a failing server is not hit twice in a row.
        time.sleep(retry_delay)
def clean_text(text: str) -> str:
    """Return *text* with every run of whitespace collapsed to one space.

    ``str.split()`` without a separator also discards leading and trailing
    whitespace, so the result never starts or ends with a space.
    """
    return " ".join(text.split())
def _write_topic_rows(file, topics, labels):
    """Write one TSV row per topic anchor: site-relative URL, then *labels*."""
    label_columns = '\t'.join(labels)
    for topic in topics:
        relative_url = topic.get('href').replace('https://hyperreal.info/talk/', '')
        file.write(relative_url + '\t' + label_columns + '\n')


def get_forum(url, domains, file):
    """Recursively walk one forum page and write its topic URLs to *file*.

    Every card inside the page's ``order-first`` container is dispatched on
    its heading text:

    * ``'Forum'``      - sub-forum list; each sub-forum is visited
      recursively with its name appended to the category path.
    * ``'Ogłoszenia'`` - announcements; written with an extra
      ``'Ogłoszenia'`` column after the category path.
    * ``'Tematy'``     - topics; every listing page of the forum is
      fetched and its topics are written with the category path.

    Args:
        url: Address of the forum page.
        domains: Category path leading to this forum (list of names).  It
            is not modified.
        file: Writable text stream receiving tab-separated rows.

    Any exception raised while processing the page is reported on stdout
    and swallowed, so one broken forum does not stop the whole crawl.
    """
    try:
        page = get_url_soup(url).find('div', {'class': 'order-first'})
        cards = (
            page.find_all('div', {'class': 'card mb-3'})
            + page.find_all('div', {'class': 'card mb-3 shadow-sm'})
        )
        for card in cards:
            heading = card.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
            if heading is None:
                # A card without a heading cannot be one of the three
                # handled kinds; skip it instead of aborting the page.
                continue
            title = clean_text(heading.text)

            if title == 'Forum':
                for row in card.find_all('div', {'class': 'row-item'}):
                    anchors = row.find_all('a')
                    if len(anchors) > 1:
                        # The second anchor of a row is the sub-forum link.
                        link = anchors[1]
                        # A fresh list per call replaces the append/pop
                        # pair and leaves the caller's list untouched.
                        get_forum(
                            link.get('href'),
                            domains + [clean_text(link.text)],
                            file,
                        )
            elif title == 'Ogłoszenia':
                _write_topic_rows(
                    file,
                    card.find_all('a', {'class': 'topictitle fs-5'}),
                    domains + ['Ogłoszenia'],
                )
            elif title == 'Tematy':
                pager = page.find('span', {'class': 'fw-normal'})
                # NOTE(review): the file's indentation was lost; the page
                # loop is assumed to sit inside this guard, since
                # range(None) would raise otherwise.  Forums without the
                # pagination span therefore yield no rows - confirm that
                # single-page forums always render it.
                if pager is not None:
                    # Pager text looks like "Strona 1 z N" ("Page 1 of N").
                    page_count = int(
                        clean_text(pager.text).replace('Strona 1 z ', '')
                    )
                    # Drops the last five characters (presumably ".html")
                    # so the topic offset can be appended.
                    base_url = url[:-5]
                    for page_index in range(page_count):
                        # Offsets advance in steps of 15 - presumably the
                        # number of topics per listing page.
                        offset = page_index * 15
                        listing = get_url_soup(f'{base_url}-{offset}.html')
                        listing = listing.find('div', {'class': 'order-first'})
                        # The topic list is taken from the last shadowed
                        # card on the listing page.
                        topic_card = listing.find_all(
                            'div', {'class': 'card mb-3 shadow-sm'}
                        )[-1]
                        _write_topic_rows(
                            file,
                            topic_card.find_all('a', {'class': 'topictitle fs-5'}),
                            domains,
                        )
    except Exception as e:
        # Per-forum boundary: log and carry on with the rest of the crawl.
        print(f'\tERROR: {url} - {e}')
def get_main_domains(url, file):
    """Crawl the board index at *url* and scrape every forum linked from it.

    Each ``card mb-3`` card in the ``order-first`` container is treated as
    a top-level category; each ``row-item`` inside it as a forum whose
    second anchor carries the forum's name and URL.  Every forum is handed
    to ``get_forum`` with the two-element path ``[category, forum]``.

    Args:
        url: Address of the board index page.
        file: Writable text stream passed through to ``get_forum``.

    Returns:
        Always an empty list.
    """
    index = get_url_soup(url).find('div', {'class': 'order-first'})
    for category in index.find_all('div', {'class': 'card mb-3'}):
        heading = category.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
        if heading is None:
            # No heading means no category name to record; skip the card
            # rather than crash the whole crawl.
            continue
        category_name = clean_text(heading.text)
        for row in category.find_all('div', {'class': 'row-item'}):
            anchors = row.find_all('a')
            # Same guard get_forum applies: rows with fewer than two
            # anchors carry no forum link.
            if len(anchors) < 2:
                continue
            link = anchors[1]
            get_forum(
                link.get('href'),
                [category_name, clean_text(link.text)],
                file,
            )
    return []
tmp = []
url = 'https://hyperreal.info/talk/'
# Rows contain non-ASCII category names (e.g. 'Ogłoszenia'), so the encoding
# is pinned instead of relying on the platform default.
with open('data/hyperreal_URLs.tsv', 'w', encoding='utf-8') as file:
    get_main_domains(url, file)