# forum-scrapers/hyperreal_scrap_URLs.py

import requests
from bs4 import BeautifulSoup as bs
import time


def get_url_soup(url):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup document.

    A parse result with at most one top-level node is treated as a failed
    fetch; the request is then repeated, with no upper bound on the number
    of attempts, until a larger document comes back.

    Args:
        url: Address of the page to download.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        Whatever ``requests.get`` raises; network errors are not handled here.
    """
    # NOTE(review): no timeout is passed to requests.get, so a stalled
    # connection can block indefinitely - confirm whether that is acceptable.
    while True:
        response = requests.get(url)
        soup = bs(response.text, 'html.parser')
        # len() of a soup is the number of its top-level nodes.
        if len(soup) > 1:
            return soup
        # Pause *before* the next attempt so a failing server is not hit
        # again immediately, and a successful fetch is returned without delay.
        time.sleep(3)


def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace disappears because ``str.split()``
    without arguments discards empty fields.
    """
    words = text.split()
    return " ".join(words)


def _write_topic_rows(file, topics, domains):
    """Write one TSV row per topic link: site-relative URL, then *domains*."""
    suffix = '\t' + '\t'.join(domains) + '\n'
    for topic in topics:
        file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + suffix)


def get_forum(url, domains, file):
    """Recursively collect topic URLs from a forum page into *file*.

    The page's ``order-first`` container is scanned for cards, and each card
    is handled according to its title text:

    * ``Forum`` - every row with at least two links is a sub-forum; its second
      link is followed recursively with the sub-forum name pushed on *domains*.
    * ``Ogłoszenia`` - each topic link is written with ``Ogłoszenia`` appended
      to the domain path.
    * ``Tematy`` - the page count is read from the ``fw-normal`` span and every
      listing page is fetched and its topic links written.

    Args:
        url: Address of the forum page to process.
        domains: List of category names leading to this forum. It is mutated
            during recursion but restored before the function returns.
        file: Writable text file object receiving tab-separated rows.

    Any exception is caught and reported on stdout so that one broken forum
    does not stop the crawl; nothing is raised to the caller.
    """
    try:
        soup = get_url_soup(url).find('div', {'class': 'order-first'})
        cards = (soup.find_all('div', {'class': 'card mb-3'})
                 + soup.find_all('div', {'class': 'card mb-3 shadow-sm'}))
        for card in cards:
            header = card.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
            if header is None:
                # A card without a title cell cannot be classified; skip it
                # instead of abandoning the remaining cards of this page.
                continue
            title = clean_text(header.text)
            if title == 'Forum':
                for row in card.find_all('div', {'class': 'row-item'}):
                    anchors = row.find_all('a')
                    if len(anchors) > 1:
                        # The second link of a row is the sub-forum link.
                        link = anchors[1]
                        domains.append(clean_text(link.text))
                        get_forum(link.get('href'), domains, file)
                        domains.pop()
            elif title == 'Ogłoszenia':
                announcements = card.find_all('a', {'class': 'topictitle fs-5'})
                _write_topic_rows(file, announcements, domains + ['Ogłoszenia'])
            elif title == 'Tematy':
                pager = soup.find('span', {'class': 'fw-normal'})
                # NOTE(review): when the pager span is absent nothing is
                # written for this forum - confirm single-page forums always
                # carry the span.
                if pager is not None:
                    # Pager text is expected to read 'Strona 1 z <N>'.
                    page_count = int(clean_text(pager.text).replace('Strona 1 z ', ''))
                    for page in range(page_count):
                        # Page URLs are built from a topic offset in steps
                        # of 15 (presumably the page size - verify).
                        offset = page * 15
                        # url[:-5] drops the last five characters, assumed
                        # to be the '.html' suffix.
                        topics_url = f'{url[:-5]}-{offset}.html'
                        topics_soup = get_url_soup(topics_url)
                        topics_soup = topics_soup.find('div', {'class': 'order-first'})
                        # Only the last shadowed card of a listing page is read.
                        topics_card = topics_soup.find_all('div', {'class': 'card mb-3 shadow-sm'})[-1]
                        topics = topics_card.find_all('a', {'class': 'topictitle fs-5'})
                        _write_topic_rows(file, topics, domains)
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on with the crawl.
        print(f'\tERROR: {url} - {e}')


def get_main_domains(url, file):
    """Walk the forum index at *url* and crawl every forum linked from it.

    Each ``card mb-3`` card inside the ``order-first`` container is treated
    as a top-level category. For every row of the card, the second link is
    taken as a forum, and ``get_forum`` is called with the two-element domain
    path ``[category, forum]``.

    Args:
        url: Address of the forum index page.
        file: Writable text file object passed through to ``get_forum``.

    Returns:
        Always an empty list.
    """
    soup = get_url_soup(url).find('div', {'class': 'order-first'})
    for section in soup.find_all('div', {'class': 'card mb-3'}):
        domain_0 = clean_text(section.find('div', {'class': 'col-lg-7 col-md-6 col-12'}).text)
        for row in section.find_all('div', {'class': 'row-item'}):
            anchors = row.find_all('a')
            if len(anchors) < 2:
                # Same guard as get_forum: rows without a second link carry
                # no forum URL, so indexing [1] would raise IndexError.
                continue
            link = anchors[1]
            get_forum(link.get('href'), [domain_0, clean_text(link.text)], file)
    return []


tmp = []
url = 'https://hyperreal.info/talk/'


# Crawl the whole forum starting at the index page and store the topic URLs
# as tab-separated rows. The 'data' directory must already exist. The encoding
# is set explicitly because the rows contain non-ASCII text (e.g.
# 'Ogłoszenia'), which a platform-default encoding may be unable to write.
with open('data/hyperreal_URLs.tsv', 'w', encoding='utf-8') as file:
    get_main_domains(url, file)