# forum-scrapers/hyperreal_scrap_URLs.py

import requests
from bs4 import BeautifulSoup as bs
import time


def get_url_soup(url, retry_delay=3, timeout=30):
    """Fetch *url* and return the response body parsed by BeautifulSoup.

    A parse result with at most one top-level node is treated as a failed
    fetch, and the request is repeated until a larger document comes back.

    Args:
        url: Address of the page to download.
        retry_delay: Seconds to wait before each retry.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection errors, timeouts, invalid URLs).
    """
    while True:
        response = requests.get(url, timeout=timeout)
        soup = bs(response.text, 'html.parser')
        # len() of a soup is its number of top-level nodes; 0 or 1 is taken
        # to mean the server did not return a usable page.
        if len(soup) > 1:
            return soup
        # Pause *before* asking again, so retries are actually spaced out and
        # a successful fetch is returned without an extra wait.
        time.sleep(retry_delay)


def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as a side effect of splitting.
    """
    words = text.split()
    return " ".join(words)


def get_forum(url, domains, file):
    """Recursively crawl one forum page and write its topic URLs to *file*.

    The page's cards are dispatched on their header text:

    * ``'Forum'`` - a list of sub-forums; each one is crawled recursively
      with its name appended to *domains* for the duration of the call.
    * ``'Ogłoszenia'`` (announcements) - topic links are written with an
      extra ``'Ogłoszenia'`` label after the domain path.
    * ``'Tematy'`` (topics) - the page count is read from the
      ``'Strona 1 z N'`` ("page 1 of N") marker and every listing page is
      fetched and written out. Nothing is written when the marker is absent.

    Each output row is the topic URL with the site prefix removed, followed
    by the tab-separated labels, terminated by a newline.

    Args:
        url: Address of the forum page to crawl.
        domains: List of category/forum names leading to this page. It is
            mutated while recursing but restored before returning.
        file: Writable text file object receiving the TSV rows.

    Any exception raised while processing the page is caught and reported on
    stdout, so one broken forum does not stop the rest of the crawl.
    """
    try:
        page = get_url_soup(url).find('div', {'class': 'order-first'})
        cards = (page.find_all('div', {'class': 'card mb-3'})
                 + page.find_all('div', {'class': 'card mb-3 shadow-sm'}))
        for card in cards:
            header = card.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
            title = clean_text(header.text)
            if title == 'Forum':
                for row in card.find_all('div', {'class': 'row-item'}):
                    links = row.find_all('a')
                    # The second link of a row is the one followed; rows
                    # with fewer links are skipped.
                    if len(links) > 1:
                        link = links[1]
                        domains.append(clean_text(link.text))
                        get_forum(link.get('href'), domains, file)
                        domains.pop()
            elif title == 'Ogłoszenia':
                announcements = card.find_all('a', {'class': 'topictitle fs-5'})
                _write_topic_rows(file, announcements, domains + ['Ogłoszenia'])
            elif title == 'Tematy':
                pager = page.find('span', {'class': 'fw-normal'})
                if pager is not None:
                    page_count = int(clean_text(pager.text).replace('Strona 1 z ', ''))
                    for page_index in range(page_count):
                        # Listing pages are addressed by topic offset, 15 per page.
                        offset = page_index * 15
                        # Drops the last five characters of the URL and
                        # re-adds '.html' - assumes url ends with '.html'.
                        listing_url = f'{url[:-5]}-{offset}.html'
                        listing = get_url_soup(listing_url).find('div', {'class': 'order-first'})
                        # Only the last shadowed card of a listing page is read.
                        topic_card = listing.find_all('div', {'class': 'card mb-3 shadow-sm'})[-1]
                        topics = topic_card.find_all('a', {'class': 'topictitle fs-5'})
                        _write_topic_rows(file, topics, domains)
    except Exception as e:
        print(f'\tERROR: {url} - {e}')


def _write_topic_rows(file, topics, labels):
    """Write one TSV row per topic link: site-relative URL, then *labels*."""
    row_tail = '\t' + '\t'.join(labels) + '\n'
    for topic in topics:
        relative_url = topic.get('href').replace('https://hyperreal.info/talk/', '')
        file.write(relative_url + row_tail)


def get_main_domains(url, file):
    """Crawl the forum index at *url* and write all discovered topics to *file*.

    Every ``card mb-3`` block on the index is treated as a top-level
    category; each of its rows links to a forum, which is handed to
    ``get_forum`` together with the two-level path
    ``[category name, forum name]``.

    Args:
        url: Address of the forum index page.
        file: Writable text file object passed through to ``get_forum``.

    Returns:
        Always an empty list.
    """
    index = get_url_soup(url).find('div', {'class': 'order-first'})
    for category in index.find_all('div', {'class': 'card mb-3'}):
        header = category.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
        domain_0 = clean_text(header.text)
        for row in category.find_all('div', {'class': 'row-item'}):
            links = row.find_all('a')
            # Same guard as get_forum: a row without a second link is
            # skipped instead of aborting the whole crawl with IndexError.
            if len(links) < 2:
                continue
            forum_link = links[1]
            domain_1 = clean_text(forum_link.text)
            get_forum(forum_link.get('href'), [domain_0, domain_1], file)
    return []


tmp = []  # NOTE(review): not referenced anywhere in the visible code
url = 'https://hyperreal.info/talk/'


# The rows contain non-ASCII text (e.g. the 'Ogłoszenia' label), so the
# encoding is pinned rather than left to the platform's locale default.
with open('data/hyperreal_URLs.tsv', 'w', encoding='utf-8') as file:
    get_main_domains(url, file)
upload scripts 2023-01-02 09:30:54 +01:00			`import requests`
			`from bs4 import BeautifulSoup as bs`
			`import time`


			`def get_url_soup(url):`
			`req = requests.get(url)`
			`soup = bs(req.text, 'html.parser')`
			`l = len(soup)`
			`while l <= 1:`
			`req = requests.get(url)`
			`soup = bs(req.text, 'html.parser')`
			`l = len(soup)`
			`time.sleep(3)`
			`return soup`


			`def clean_text(text):`
			`return " ".join(text.split())`


			`def get_forum(url, domains, file):`
			`try:`
			`soup = get_url_soup(url)`
			`soup = soup.find('div', {'class': 'order-first'})`
			`cards = soup.find_all('div', {'class': 'card mb-3'}) + soup.find_all('div', {'class': 'card mb-3 shadow-sm'})`
			`for card in cards:`
			`title = card.find('div', {'class': 'col-lg-7 col-md-6 col-12'})`
			`title = clean_text(title.text)`
			`if title == 'Forum':`
			`forum_card = card.find_all('div', {'class': 'row-item'})`
			`for c in forum_card:`
			`a = c.find_all('a')`
			`if len(a) > 1:`
			`a = c.find_all('a')[1]`
			`t_url = a.get('href')`
			`domain = clean_text(a.text)`
			`domains.append(domain)`
			`get_forum(t_url, domains, file)`
			`domains.pop()`
			`elif title == 'Ogłoszenia':`
			`topics = card.find_all('a', {'class': 'topictitle fs-5'})`
			`for topic in topics:`
			`file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains + ['Ogłoszenia']) + '\n')`
			`elif title == 'Tematy':`
			`pages = soup.find('span', {'class': 'fw-normal'})`
			`if pages != None:`
			`pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))`
			`for page in range(pages):`
			`n = page * 15`
			`topics_url = f'{url[:-5]}-{n}.html'`
			`topics_soup = get_url_soup(topics_url)`
			`topics_soup = topics_soup.find('div', {'class': 'order-first'})`
			`cards_topics = topics_soup.find_all('div', {'class': 'card mb-3 shadow-sm'})[-1]`
			`topics = cards_topics.find_all('a', {'class': 'topictitle fs-5'})`
			`for topic in topics:`
			`file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains) + '\n')`
			`except Exception as e:`
			`print(f'\tERROR: {url} - {e}')`


			`def get_main_domains(url, file):`
			`soup = get_url_soup(url)`
			`soup = soup.find('div', {'class': 'order-first'})`
			`domains = soup.find_all('div', {'class': 'card mb-3'})`
			`for domain in domains:`
			`domain_0 = clean_text(domain.find('div', {'class': 'col-lg-7 col-md-6 col-12'}).text)`
			`topics = domain.find_all('div', {'class': 'row-item'})`
			`for topic in topics:`
			`a = topic.find_all('a')[1]`
			`t_url = a.get('href')`
			`domain_1 = clean_text(a.text)`
			`get_forum(t_url, [domain_0, domain_1], file)`
			`return []`


			`tmp = []`
			`url = 'https://hyperreal.info/talk/'`


			`with open('data/hyperreal_URLs.tsv','w') as file:`
			`get_main_domains(url, file)`