"""Crawl the hyperreal.info forum index and record topic URLs in a TSV file.

Each output row has the form ``<topic path>\\t<label>\\t<label>...`` where the
topic path is the topic URL with the forum root stripped and the labels are
the names of the forum sections leading to that topic.
"""
import os
import time

import requests
from bs4 import BeautifulSoup as bs

# Prefix removed from every topic URL before it is written out.
_TOPIC_PREFIX = 'https://hyperreal.info/talk/'
# Offset step used in the URL of consecutive topic-listing pages.
_PAGE_STEP = 15


def get_url_soup(url, retries=5, delay=3, timeout=30):
    """Download *url* and return it parsed with BeautifulSoup.

    A fetch counts as failed when the request raises or when the parsed
    document has at most one top-level node (``len(soup) <= 1``), which is
    how the original code detected an empty/garbage response.

    Args:
        url: Address of the page to download.
        retries: Number of additional attempts after the first failure.
        delay: Seconds to wait before each retry.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the crawl forever.

    Returns:
        The parsed ``BeautifulSoup`` document.

    Raises:
        RuntimeError: If no usable page was obtained after all attempts.
    """
    last_error = None
    for attempt in range(retries + 1):
        if attempt:
            # Back off *before* retrying rather than after a success.
            time.sleep(delay)
        try:
            req = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            last_error = err
            continue
        soup = bs(req.text, 'html.parser')
        if len(soup) > 1:
            return soup
    raise RuntimeError(
        f'no usable response from {url} after {retries + 1} attempts'
    ) from last_error


def clean_text(text):
    """Collapse every run of whitespace in *text* to a single space."""
    return " ".join(text.split())


def _write_topic_rows(file, topics, labels):
    """Write one TSV row (relative topic path followed by *labels*) per link."""
    suffix = '\t'.join(labels)
    for topic in topics:
        path = topic.get('href').replace(_TOPIC_PREFIX, '')
        file.write(f'{path}\t{suffix}\n')


def get_forum(url, domains, file):
    """Recursively crawl the forum page at *url* and write its topics to *file*.

    The page's cards are dispatched on their title:

    * ``Forum``       - each row's second link is a sub-forum; recurse into it
      with the link text appended to the label path.
    * ``Ogłoszenia``  - announcement topics, written with an extra
      ``Ogłoszenia`` label.
    * ``Tematy``      - regular topics; every listing page is fetched and its
      topics are written with the current label path.

    Args:
        url: Forum page URL; expected to end in ``.html`` because the paged
            URLs are built from ``url[:-5]``.
        domains: List of section names leading to this forum. It is not
            modified.
        file: Writable text file object receiving the TSV rows.

    Any exception is reported on stdout and ends the processing of this forum
    only, so a single broken page does not stop the whole crawl.
    """
    try:
        soup = get_url_soup(url).find('div', {'class': 'order-first'})
        cards = (soup.find_all('div', {'class': 'card mb-3'})
                 + soup.find_all('div', {'class': 'card mb-3 shadow-sm'}))
        for card in cards:
            header = card.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
            if header is None:
                # A card without a title cannot be classified; skip it
                # instead of aborting the whole page.
                continue
            title = clean_text(header.text)

            if title == 'Forum':
                for row in card.find_all('div', {'class': 'row-item'}):
                    links = row.find_all('a')
                    if len(links) > 1:
                        link = links[1]
                        # Pass a new list so the caller's path is untouched.
                        get_forum(link.get('href'),
                                  domains + [clean_text(link.text)],
                                  file)

            elif title == 'Ogłoszenia':
                topics = card.find_all('a', {'class': 'topictitle fs-5'})
                _write_topic_rows(file, topics, domains + ['Ogłoszenia'])

            elif title == 'Tematy':
                pager = soup.find('span', {'class': 'fw-normal'})
                if pager is None:
                    # NOTE(review): forums without a pagination label are
                    # skipped, as before - confirm this is intended.
                    continue
                pages = int(
                    clean_text(pager.text).replace('Strona 1 z ', ''))
                for page in range(pages):
                    offset = page * _PAGE_STEP
                    page_soup = get_url_soup(f'{url[:-5]}-{offset}.html')
                    page_soup = page_soup.find(
                        'div', {'class': 'order-first'})
                    # The topic list is taken from the last matching card.
                    topic_card = page_soup.find_all(
                        'div', {'class': 'card mb-3 shadow-sm'})[-1]
                    topics = topic_card.find_all(
                        'a', {'class': 'topictitle fs-5'})
                    _write_topic_rows(file, topics, domains)
    except Exception as e:
        print(f'\tERROR: {url} - {e}')


def get_main_domains(url, file):
    """Crawl the forum index at *url* and every forum linked from it.

    Each ``card mb-3`` card on the index is a top-level section; the second
    link of each of its rows points at a forum, which is handed to
    :func:`get_forum` with the two-element label path
    ``[section name, forum name]``.

    Args:
        url: URL of the forum index page.
        file: Writable text file object receiving the TSV rows.

    Returns:
        Always an empty list (kept for compatibility with existing callers).
    """
    soup = get_url_soup(url).find('div', {'class': 'order-first'})
    for section in soup.find_all('div', {'class': 'card mb-3'}):
        header = section.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
        if header is None:
            continue
        domain_0 = clean_text(header.text)
        for row in section.find_all('div', {'class': 'row-item'}):
            links = row.find_all('a')
            if len(links) < 2:
                # Same guard as get_forum: rows without a forum link.
                continue
            link = links[1]
            get_forum(link.get('href'),
                      [domain_0, clean_text(link.text)],
                      file)
    return []


tmp = []  # unused; kept so the module's public names stay the same
url = 'https://hyperreal.info/talk/'

if __name__ == '__main__':
    os.makedirs('data', exist_ok=True)
    # Explicit UTF-8: the rows contain Polish characters (e.g. 'Ogłoszenia').
    with open('data/hyperreal_URLs.tsv', 'w', encoding='utf-8') as file:
        get_main_domains(url, file)