# forum-scrapers/hyperreal.py

import csv
import requests
from bs4 import BeautifulSoup as bs
import time
from datetime import datetime


def get_url_soup(url, max_retries=5, delay=3, timeout=30):
    """Download *url* and return it parsed with BeautifulSoup's 'html.parser'.

    The download is retried when the request fails or when the parsed
    document is empty.

    Args:
        url: Address of the page to fetch.
        max_retries: Maximum number of download attempts.
        delay: Seconds to wait between two attempts.
        timeout: Seconds to wait for the server before giving up on an attempt.

    Returns:
        The parsed document, or None when every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            req = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(e)
        else:
            soup = bs(req.text, 'html.parser')
            # len() of a soup is its number of top-level nodes; zero means
            # the response body was empty, so the page is fetched again.
            if len(soup) != 0:
                return soup
            print('sleep')
        if attempt < max_retries:
            time.sleep(delay)
    return None


def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed as well.
    """
    tokens = text.split()
    return " ".join(tokens)


# def get_content(soup):
#     blockquotes = soup.find_all('blockquote')
#     content = ' '.join([p.text.strip() for p in soup])
#     for bq in blockquotes:
#         t = ' '.join([p.text.strip() for p in bq])
#         if 'pisze:' in bq.text:
#             idx = bq.text.index('pisze:') + len('pisze:')
#             t = t[:idx] + ' <cite> ' + t[idx:] + ' </cite>'
#         else:
#             t = '<cite> ' + t + ' </cite>'
#         content = content.replace(bq.text, t)
#     return clean_text(content)


def get_content(soup):
    """Return the text of a post body as a single whitespace-normalised string.

    Posts without a <blockquote> are returned as plain text.  Posts that
    contain a <blockquote> get ' <cite> ' / ' </cite>' markers:

    * when a <cite> element exists and the text contains 'pisze:', the
      opening marker is inserted right after the first 'pisze:' and the
      closing marker is appended at the end of the text;
    * otherwise the whole text is wrapped in the markers.

    Args:
        soup: BeautifulSoup element holding the post content.

    Returns:
        The cleaned text, possibly with cite markers.
    """
    marker = 'pisze:'

    if not soup.find_all('blockquote'):
        return clean_text(' '.join([p.text.strip() for p in soup]))

    cite = soup.find('cite')
    if cite is None:
        return '<cite> ' + clean_text(soup.text) + ' </cite>'

    # Replace the <cite> element in the markup by its plain text and parse
    # the result again, so the cite text is joined like any other child.
    flattened = str(soup).replace(str(cite), ' '.join([p.text.strip() for p in cite]))
    content = clean_text(' '.join([p.text.strip() for p in bs(flattened, 'html.parser')]))

    pos = content.find(marker)
    if pos == -1:
        # A <cite> without the 'pisze:' marker used to raise ValueError here;
        # fall back to wrapping the whole text, as for posts without <cite>.
        return '<cite> ' + content + ' </cite>'
    idx = pos + len(marker)
    return content[:idx] + ' <cite> ' + content[idx:] + ' </cite>'


def _parse_author_details(soup, user, idx):
    """Return the profile fields of the author of post *idx*.

    Args:
        soup: Parsed page; the author's 'modal_post_<idx>' element is looked
            up in it.
        user: Element of the post profile holding the author name.
        idx: Post id used to locate the modal.

    Returns:
        Tuple ``(strong, posts, registration, sex, karma_received,
        karma_assigned, narkopedia_edits)`` of strings.
    """
    strong_tag = user.find('strong')
    strong = strong_tag.text if strong_tag is not None else ''

    modal = soup.find('div', {'id': f'modal_post_{idx}'}).find('div', {'class': 'col-8'})
    info = modal.find_all('div')
    posts = clean_text(info[0].text.replace('Posty:', ''))
    registration = clean_text(info[1].text.replace('Rejestracja:', ''))

    sex = ''
    if len(info) > 2:
        sex = clean_text(info[2].text.replace('Płeć:', ''))
        # Known values are shortened; anything else is kept as found.
        sex = {'chłopak': 'M', 'dziewka': 'F'}.get(sex, sex)

    # The last three list items of the modal hold the score counters.
    score = modal.find_all('li')[-3:]
    karma_received = clean_text(score[0].text.replace('Karma otrzymana', ''))
    karma_assigned = clean_text(score[1].text.replace('Karma przydzielona', ''))
    narkopedia_edits = clean_text(score[2].text.replace('Edycje Narkopedii', ''))
    return strong, posts, registration, sex, karma_received, karma_assigned, narkopedia_edits


def get_all_comments(url):
    """Scrape every post of the thread at *url*, following all its pages.

    Args:
        url: Address of the first page of the thread.

    Returns:
        A list with one row (list of strings) per post, laid out as
        ``[post id, id of the first post, title, username, strong,
        registration, posts, sex, karma_received, karma_assigned,
        narkopedia_edits, date, hour, '', position, content]``.
        In the first row the empty field before ``position`` is replaced by
        the number of posts that follow it.

    Raises:
        RuntimeError: A page of the thread could not be downloaded.
        ValueError: The page counter is missing or no post was found.
    """
    result = []
    soup = get_url_soup(url)
    if soup is None:
        raise RuntimeError(f'could not download {url}')
    pages = soup.find('span', {'class': 'fw-normal'})
    print()
    print(pages)
    print()
    if pages is None:
        raise ValueError(f'page counter not found at {url}')
    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))

    for page in range(pages):
        print(page + 1)
        if page != 0:
            # Later pages share the first page's address with the post
            # offset (10 posts per page) inserted before '.html'.
            n_url = f'{url[:-5]}-{page * 10}.html'
            soup = get_url_soup(n_url)
            if soup is None:
                raise RuntimeError(f'could not download {n_url}')
        cards = soup.find_all('div', {'class': 'card border-info mb-2 mb-md-3 shadow-sm'})
        for card in cards:
            # The card id is the post id prefixed by one character.
            idx = card.get('id')[1:]
            postprofile = card.find('div', {'class': 'postprofile'})
            user = postprofile.find('div', {'class': 'mt-1 float-start float-md-none'})
            profile_link = user.find('a')
            if profile_link is None:
                # No profile link: only the displayed name is available.
                username = clean_text(user.text)
                strong = posts = registration = sex = ''
                karma_received = karma_assigned = narkopedia_edits = ''
            else:
                username = clean_text(profile_link.text)
                (strong, posts, registration, sex, karma_received,
                 karma_assigned, narkopedia_edits) = _parse_author_details(soup, user, idx)

            body = card.find('div', {'id': f'post_content{idx}'})
            title = clean_text(body.find('a').text)
            datetime_str = body.find('time')['datetime']
            datetime_object = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S+00:00')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            content = get_content(body.find('div', {'class': 'content'}))
            # len(result) is the zero-based position of the post in the thread.
            result.append([idx, '', title, username, strong, registration, posts, sex,
                           karma_received, karma_assigned, narkopedia_edits,
                           date, hour, '', str(len(result)), content])

    if not result:
        raise ValueError(f'no posts found at {url}')
    # First row: number of posts that follow the opening post.
    result[0][-3] = str(len(result) - 1)
    first_post_id = result[0][0]
    for row in result:
        row[1] = first_post_id
    return result


# Scrape every thread listed in data/hyperreal_URLs.tsv and write one
# tab-separated line per post to data/hyperreal.tsv.  Each input row holds the
# thread path followed by up to six further columns, which are copied in
# front of every post row.  Threads that fail are appended, together with the
# error, to data/hyperreal_errors_2.tsv.
# Files are opened as UTF-8 explicitly because the data contains Polish text
# and the platform default encoding may not be able to represent it.
with open('data/hyperreal_URLs.tsv', encoding='utf-8', newline='') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/hyperreal.tsv', 'w', encoding='utf-8') as file:
        for data in tsv_file:
            if not data:
                # Blank line in the input table: nothing to scrape.
                continue
            url = f'https://hyperreal.info/talk/{data[0]}'
            print(url)
            domains = data[1:]
            m = len(domains)
            # Pad to six columns so every output row has the same layout.
            domains = domains + [''] * (6 - m)
            try:
                result = get_all_comments(url)
                for r in result:
                    r = [data[0]] + domains + r
                    file.write('\t'.join(r))
                    file.write('\n')
            except Exception as e:
                # Best effort: record the failing thread and carry on.
                print(f'error: {url}')
                with open('data/hyperreal_errors_2.tsv', 'a', encoding='utf-8') as f:
                    d = "\t".join(data)
                    f.write(f'{d}\t{e}\n')