# forum-scrapers/hyperreal.py
#
# 136 lines
# 5.3 KiB
# Python
# Raw Normal View History
#
# 2023-01-02 09:30:54 +01:00
import csv
import requests
from bs4 import BeautifulSoup as bs
import time
from datetime import datetime
def get_url_soup(url, max_retries=5, delay=3):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The request is retried when it fails or when the parsed document is
    empty, waiting ``delay`` seconds between attempts.

    Args:
        url: Address of the page to fetch.
        max_retries: Maximum number of download attempts.
        delay: Seconds to sleep between two attempts.

    Returns:
        The parsed, non-empty BeautifulSoup document.

    Raises:
        RuntimeError: If no non-empty document could be obtained within
            ``max_retries`` attempts.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            # A timeout keeps one stalled connection from hanging the run.
            req = requests.get(url, timeout=30)
            soup = bs(req.text, 'html.parser')
        except requests.RequestException as e:
            print(e)
            last_error = e
        else:
            # Use len() (number of top-level nodes) rather than truthiness:
            # bs4 tag objects are truthy even when they have no contents.
            if len(soup) > 0:
                return soup
        if attempt < max_retries:
            print('sleep')
            time.sleep(delay)
    raise RuntimeError(
        f'could not fetch {url} after {max_retries} attempts'
    ) from last_error
def clean_text(text):
    """Collapse every run of whitespace in *text* to one space and trim both ends."""
    tokens = text.split()
    return ' '.join(tokens)
# def get_content(soup):
# blockquotes = soup.find_all('blockquote')
# content = ' '.join([p.text.strip() for p in soup])
# for bq in blockquotes:
# t = ' '.join([p.text.strip() for p in bq])
# if 'pisze:' in bq.text:
# idx = bq.text.index('pisze:') + len('pisze:')
# t = t[:idx] + ' <cite> ' + t[idx:] + ' </cite>'
# else:
# t = '<cite> ' + t + ' </cite>'
# content = content.replace(bq.text, t)
# return clean_text(content)
def get_content(soup):
    """Return the whitespace-normalised text of a post body, marking quotes.

    Args:
        soup: The parsed element holding the post content.

    Returns:
        * No ``<blockquote>`` in the post: the plain cleaned text.
        * A ``<blockquote>`` and a ``<cite>`` whose text contains the
          attribution marker ``'pisze:'`` (Polish for "writes:"): the text
          with everything after the first marker wrapped in
          ``<cite> ... </cite>``.
        * Otherwise (quote without a usable attribution): the whole text
          wrapped in ``<cite> ... </cite>``.
    """
    if not soup.find_all('blockquote'):
        return clean_text(' '.join(p.text.strip() for p in soup))

    cite = soup.find('cite')
    if cite is None:
        return '<cite> ' + clean_text(soup.text) + ' </cite>'

    # Replace the first <cite> element by its plain text, then re-parse so
    # the attribution becomes ordinary text of the post.
    cite_text = ' '.join(p.text.strip() for p in cite)
    html = str(soup).replace(str(cite), cite_text)
    content = clean_text(
        ' '.join(p.text.strip() for p in bs(html, 'html.parser'))
    )

    marker = 'pisze:'
    pos = content.find(marker)
    if pos == -1:
        # A <cite> without the marker used to raise ValueError and abort the
        # whole thread; treat it like a quote with no attribution instead.
        return '<cite> ' + content + ' </cite>'
    idx = pos + len(marker)
    return content[:idx] + ' <cite> ' + content[idx:] + ' </cite>'
# CSS class string identifying one post card on a thread page.
_POST_CARD_CLASS = 'card border-info mb-2 mb-md-3 shadow-sm'
# Step used to build the URL suffix of page k (k * 10).
_POSTS_PER_PAGE = 10
# Profile "Płeć:" (sex) values mapped to the codes written to the output.
_SEX_CODES = {'chłopak': 'M', 'dziewka': 'F'}


def _parse_author(soup, card, idx):
    """Return the author columns for the post ``card`` with id suffix ``idx``.

    The result is ``[username, strong, registration, posts, sex,
    karma_received, karma_assigned, narkopedia_edits]``. When the author has
    no profile link only the username is filled; the rest are empty strings.
    """
    postprofile = card.find('div', {'class': 'postprofile'})
    user = postprofile.find('div', {'class': 'mt-1 float-start float-md-none'})
    link = user.find('a')
    if link is None:
        return [clean_text(user.text), '', '', '', '', '', '', '']

    username = clean_text(link.text)
    strong = user.find('strong')
    strong = strong.text if strong is not None else ''

    # Profile details live in a per-post modal elsewhere on the page.
    modal = soup.find('div', {'id': f'modal_post_{idx}'}).find('div', {'class': 'col-8'})
    info = modal.find_all('div')
    posts = clean_text(info[0].text.replace('Posty:', ''))
    registration = clean_text(info[1].text.replace('Rejestracja:', ''))

    # Default to '' so a profile without a third info row neither leaves
    # `sex` unbound nor reuses the value of the previous post's author.
    sex = ''
    if len(info) > 2:
        raw_sex = clean_text(info[2].text.replace('Płeć:', ''))
        sex = _SEX_CODES.get(raw_sex, '')

    # The last three list items of the modal hold the score counters.
    score = modal.find_all('li')[-3:]
    karma_received = clean_text(score[0].text.replace('Karma otrzymana', ''))
    karma_assigned = clean_text(score[1].text.replace('Karma przydzielona', ''))
    narkopedia_edits = clean_text(score[2].text.replace('Edycje Narkopedii', ''))
    return [username, strong, registration, posts, sex,
            karma_received, karma_assigned, narkopedia_edits]


def get_all_comments(url):
    """Scrape every post of the thread at *url*, following its pagination.

    Args:
        url: URL of the first page of the thread; expected to end in a
            five-character suffix such as ``.html`` (it is stripped to build
            the URLs of the following pages).

    Returns:
        A list with one row (list of ``str``) per post, in page order:
        ``[post id, id of the first post, title, username, strong,
        registration, posts, sex, karma_received, karma_assigned,
        narkopedia_edits, date, hour, reply count, position, content]``.
        The reply count (number of posts minus one) is filled in for the
        first row only and is ``''`` elsewhere; position is the zero-based
        index of the post within the thread.

    Raises:
        ValueError: If no post could be found in the thread.
    """
    soup = get_url_soup(url)
    pages = soup.find('span', {'class': 'fw-normal'})
    print()
    print(pages)
    print()
    # The pager text reads 'Strona 1 z N' ("page 1 of N").
    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))

    result = []
    for page in range(pages):
        print(page + 1)
        if page != 0:
            # The first page is already loaded; later pages insert an offset
            # before the extension.
            soup = get_url_soup(f'{url[:-5]}-{page * _POSTS_PER_PAGE}.html')
        for card in soup.find_all('div', {'class': _POST_CARD_CLASS}):
            # Drop the one-character prefix of the card's id attribute.
            idx = card.get('id')[1:]
            author = _parse_author(soup, card, idx)

            body = card.find('div', {'id': f'post_content{idx}'})
            title = clean_text(body.find('a').text)
            datetime_str = body.find('time')['datetime']
            datetime_object = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S+00:00')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            content = get_content(body.find('div', {'class': 'content'}))

            position = str(len(result))
            result.append([idx, '', title, *author, date, hour, '', position, content])

    if not result:
        raise ValueError(f'no posts found in thread: {url}')

    # First row: record how many replies follow the opening post.
    result[0][-3] = str(len(result) - 1)
    # Every row carries the id of the opening post.
    thread_id = result[0][0]
    for row in result:
        row[1] = thread_id
    return result
# Driver: read thread paths (plus up to six domain labels) from the URL list,
# scrape each thread and append its posts to the output TSV. Threads that
# fail are recorded, together with the error, in a separate TSV.
# Encodings are explicit because the scraped text is Polish and must not
# depend on the platform default.
with open('data/hyperreal_URLs.tsv', encoding='utf-8', newline='') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/hyperreal.tsv', 'w', encoding='utf-8') as file:
        for data in tsv_file:
            if not data:
                # Skip blank lines instead of crashing on data[0].
                continue
            url = f'https://hyperreal.info/talk/{data[0]}'
            print(url)
            # Pad the domain labels to six columns so rows stay aligned.
            domains = data[1:]
            domains = domains + [''] * (6 - len(domains))
            try:
                result = get_all_comments(url)
                for r in result:
                    row = [data[0]] + domains + r
                    file.write('\t'.join(row))
                    file.write('\n')
            except Exception as e:
                # Best effort: log the failed thread and continue with the next.
                print(f'error: {url}')
                with open('data/hyperreal_errors_2.tsv', 'a+', encoding='utf-8') as f:
                    d = "\t".join(data)
                    f.write(f'{d}\t{e}\n')