136 lines
5.3 KiB
Python
136 lines
5.3 KiB
Python
|
import csv
|
||
|
import requests
|
||
|
from bs4 import BeautifulSoup as bs
|
||
|
import time
|
||
|
from datetime import datetime
|
||
|
|
||
|
|
||
|
def get_url_soup(url, max_retries=5, delay=3):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The download is retried when ``requests`` raises an error or when the
    parsed document is empty, pausing *delay* seconds between attempts.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of download attempts.
        delay: Seconds to wait before the next attempt.

    Returns:
        The parsed document, or ``None`` when every attempt failed.
    """
    # A loop (rather than recursion) keeps the number of attempts bounded and
    # makes sure the document from a successful retry is what gets returned.
    for attempt in range(1, max_retries + 1):
        try:
            # Without a timeout a stalled connection would block forever.
            req = requests.get(url, timeout=30)
        except requests.RequestException as e:
            print(e)
        else:
            soup = bs(req.text, 'html.parser')
            # An empty document means the response carried nothing usable.
            if len(soup) != 0:
                return soup
        if attempt < max_retries:
            print('sleep')
            time.sleep(delay)
    return None
|
||
|
|
||
|
|
||
|
def clean_text(text):
    """Return *text* with every whitespace run collapsed to one space.

    Leading and trailing whitespace is removed as well.
    """
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so joining the pieces normalises the spacing.
    words = text.split()
    return " ".join(words)
|
||
|
|
||
|
|
||
|
# def get_content(soup):
|
||
|
# blockquotes = soup.find_all('blockquote')
|
||
|
# content = ' '.join([p.text.strip() for p in soup])
|
||
|
# for bq in blockquotes:
|
||
|
# t = ' '.join([p.text.strip() for p in bq])
|
||
|
# if 'pisze:' in bq.text:
|
||
|
# idx = bq.text.index('pisze:') + len('pisze:')
|
||
|
# t = t[:idx] + ' <cite> ' + t[idx:] + ' </cite>'
|
||
|
# else:
|
||
|
# t = '<cite> ' + t + ' </cite>'
|
||
|
# content = content.replace(bq.text, t)
|
||
|
# return clean_text(content)
|
||
|
|
||
|
|
||
|
def get_content(soup):
    """Return the cleaned text of a post body, marking quoted text with <cite>.

    Args:
        soup: BeautifulSoup element holding the post content.

    Returns:
        Whitespace-normalised text. Three cases:

        * no ``<blockquote>``: the plain text of the element's children;
        * ``<blockquote>`` but no ``<cite>``: the whole text wrapped in
          ``<cite> ... </cite>``;
        * ``<blockquote>`` and ``<cite>``: everything after the first
          ``'pisze:'`` marker wrapped in ``<cite> ... </cite>``
          (presumably the marker ends the quote's attribution line --
          TODO confirm against the forum markup).
    """
    if not soup.find_all('blockquote'):
        return clean_text(' '.join([p.text.strip() for p in soup]))

    cite = soup.find('cite')
    if cite is None:
        return '<cite> ' + clean_text(soup.text) + ' </cite>'

    # Replace the <cite> element with its plain text, then re-parse so the
    # remaining markup is flattened the same way as in the other branches.
    html = str(soup).replace(str(cite), ' '.join([p.text.strip() for p in cite]))
    content = clean_text(' '.join([p.text.strip() for p in bs(html, 'html.parser')]))

    marker = 'pisze:'
    pos = content.find(marker)
    if pos == -1:
        # No marker to split on: treat the text like the no-<cite> case
        # instead of raising ValueError and losing the whole thread.
        return '<cite> ' + content + ' </cite>'
    idx = pos + len(marker)
    return content[:idx] + ' <cite> ' + content[idx:] + ' </cite>'
|
||
|
|
||
|
|
||
|
# Maps the forum's profile labels to the one-letter codes stored in the output.
_SEX_CODES = {'chłopak': 'M', 'dziewka': 'F'}


def _parse_author(soup, card, idx):
    """Return the author columns for one post card.

    The list is ordered as the row expects: username, strong, registration,
    posts, sex, karma_received, karma_assigned, narkopedia_edits.
    """
    postprofile = card.find('div', {'class': 'postprofile'})
    user = postprofile.find('div', {'class': 'mt-1 float-start float-md-none'})
    link = user.find('a')
    if link is None:
        # No profile link: only the displayed name is available.
        return [clean_text(user.text), '', '', '', '', '', '', '']

    username = clean_text(link.text)
    strong = user.find('strong')
    strong = strong.text if strong is not None else ''

    # The profile details live in a separate modal keyed by the post id.
    modal = soup.find('div', {'id': f'modal_post_{idx}'}).find('div', {'class': 'col-8'})
    info = modal.find_all('div')
    posts = clean_text(info[0].text.replace('Posty:', ''))
    registration = clean_text(info[1].text.replace('Rejestracja:', ''))

    # Default to '' so a profile without this field never reuses the value
    # parsed for a previous post; unknown labels also map to ''.
    sex = ''
    if len(info) > 2:
        sex = _SEX_CODES.get(clean_text(info[2].text.replace('Płeć:', '')), '')

    score = modal.find_all('li')[-3:]
    karma_received = clean_text(score[0].text.replace('Karma otrzymana', ''))
    karma_assigned = clean_text(score[1].text.replace('Karma przydzielona', ''))
    narkopedia_edits = clean_text(score[2].text.replace('Edycje Narkopedii', ''))
    return [username, strong, registration, posts, sex,
            karma_received, karma_assigned, narkopedia_edits]


def _parse_post(card, idx):
    """Return ``(title, date, hour, content)`` for one post card."""
    body = card.find('div', {'id': f'post_content{idx}'})
    title = clean_text(body.find('a').text)
    datetime_str = body.find('time')['datetime']
    datetime_object = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S+00:00')
    content = get_content(body.find('div', {'class': 'content'}))
    return title, str(datetime_object.date()), str(datetime_object.time()), content


def get_all_comments(url):
    """Scrape every post of the thread at *url*, following its pagination.

    Args:
        url: Address of the thread's first page; expected to end in
            ``.html`` because later pages are addressed by replacing that
            suffix with ``-<offset>.html`` (offset = page index * 10).

    Returns:
        A list with one row (list of strings) per post, in page order:
        ``[post id, first post id, title, username, strong, registration,
        posts, sex, karma_received, karma_assigned, narkopedia_edits, date,
        hour, '', position, content]``. The ``''`` slot is filled only on
        the first row, with the total number of posts minus one.

    Raises:
        ConnectionError: A page could not be downloaded.
        ValueError: The page counter or the posts could not be found.
    """
    result = []
    soup = get_url_soup(url)
    if soup is None:
        raise ConnectionError(f'could not download {url}')

    pages = soup.find('span', {'class': 'fw-normal'})
    print()
    print(pages)
    print()
    if pages is None:
        raise ValueError(f'no page counter found at {url}')
    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))

    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            # url[:-5] strips the trailing '.html' before adding the offset.
            n_url = f'{url[:-5]}-{page * 10}.html'
            soup = get_url_soup(n_url)
            if soup is None:
                raise ConnectionError(f'could not download {n_url}')

        cards = soup.find_all('div', {'class': 'card border-info mb-2 mb-md-3 shadow-sm'})
        for card in cards:
            # The card id is the post id prefixed with one character.
            idx = card.get('id')[1:]
            author = _parse_author(soup, card, idx)
            title, date, hour, content = _parse_post(card, idx)
            result.append([idx, '', title, *author, date, hour, '', str(n), content])
            n += 1

    if not result:
        # Fail with a clear message instead of an IndexError on result[0].
        raise ValueError(f'no posts found at {url}')

    result[0][-3] = str(n - 1)
    first_id = result[0][0]
    for row in result:
        row[1] = first_id
    return result
|
||
|
|
||
|
|
||
|
# Scrape every thread listed in data/hyperreal_URLs.tsv (first column: thread
# path, remaining columns: domain labels) and write one TSV row per post to
# data/hyperreal.tsv. Threads that fail are appended, with the error message,
# to data/hyperreal_errors_2.tsv and the run continues.
#
# Encodings are explicit so the forum text is written the same way on every
# platform; newline='' is what the csv module expects from its input file.
with open('data/hyperreal_URLs.tsv', encoding='utf-8', newline='') as file_0, \
        open('data/hyperreal.tsv', 'w', encoding='utf-8') as file:
    for data in csv.reader(file_0, delimiter='\t'):
        if not data:
            # Blank line in the URL list: nothing to scrape.
            continue

        url = f'https://hyperreal.info/talk/{data[0]}'
        print(url)

        # Pad the domain labels to six columns so every row lines up.
        domains = data[1:]
        domains = domains + [''] * (6 - len(domains))

        try:
            result = get_all_comments(url)
            # Build the whole thread first and write it in one call, so an
            # error cannot leave a half-written thread in the output.
            lines = ['\t'.join([data[0]] + domains + r) + '\n' for r in result]
            file.write(''.join(lines))
        except Exception as e:
            # Top-level boundary: record the failure and move on.
            print(f'error: {url}')
            with open('data/hyperreal_errors_2.tsv', 'a+', encoding='utf-8') as f:
                d = "\t".join(data)
                f.write(f'{d}\t{e}\n')
|