# forum-scrapers/historycy.py
import csv
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
def get_url_soup(url):
    """Fetch a page with the shared Selenium driver and return a parsed soup."""
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')
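# Hedged usage sketch (the topic id below is hypothetical):
#   soup = get_url_soup('http://www.historycy.org/index.php?showtopic=12345')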
def clean_text(text):
    """Collapse whitespace and repair Polish letters from ISO-8859-2 pages mis-decoded as Latin-1."""
    text = " ".join(text.split())
    text = text.replace('¥', 'Ą').replace('¹', 'ą')
    text = text.replace('Æ', 'Ć').replace('æ', 'ć')
    text = text.replace('Ê', 'Ę').replace('ê', 'ę')
    text = text.replace('£', 'Ł').replace('³', 'ł')
    text = text.replace('Ñ', 'Ń').replace('ñ', 'ń')
    text = text.replace('¯', 'Ż').replace('¿', 'ż')
    text = text.replace('\x8c', 'Ś').replace('\x9c', 'ś')
    text = text.replace('\x8f', 'Ź').replace('\x9f', 'ź')
    text = text.replace('„', '').replace('”', '')  # strip typographic quotes
    text = text.replace('\x96', '')                # strip stray en-dash byte
    return text
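# Hedged example of the repair (the input string is illustrative):
#   clean_text('  Wroc³aw,   ¯ó³kiew  ')  # -> 'Wrocław, Żółkiew'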
def get_content(soup):
    """Flatten a post body to text, swapping visible URLs for their hrefs and wrapping quotes in <cite> markers."""
    links = soup.find_all('a')
    cites = soup.find_all('div', {'class': 'quotemain'})
    content = ' '.join([p.text.strip() for p in BeautifulSoup(str(soup), 'html.parser').find_all(text=True)])
    for link in links:
        if clean_text(link.text).startswith('http'):
            content = content.replace(str(link.text), f' {str(link.get("href"))} ')
    for cite in cites:
        c = ' '.join([p.text.strip() for p in BeautifulSoup(str(cite), 'html.parser').find_all(text=True)])
        content = content.replace(c, f' <cite> {c} </cite> ')
    return clean_text(content)
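# Hedged sketch of the transformation (hypothetical markup, not taken from the forum):
#   <div class="quotemain">stare dzieje</div> zgoda, zob. <a href="http://example.com/x">http://example.com/x</a>
# becomes roughly:
#   <cite> stare dzieje </cite> zgoda, zob. http://example.com/x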
def get_all_comments(url):
    """Walk every page of a topic and return one row per post."""
    print(url)
    result = []
    soup = get_url_soup(url)
    pages = soup.find('a', {'title': 'skocz do strony...'})
    if pages is not None:
        pages = int(clean_text(pages.text).replace(' Strony', ''))
    else:
        pages = 1
    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            n_url = f'{url}&st={page * 15}.html'  # the forum paginates 15 posts per page
            soup = get_url_soup(n_url)
        cards = soup.find('div', {'class': 'tableborder'})
        cards = cards.find_all('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
        for card in cards[:30]:
            if card.find('td', {'valign': 'top'}) and card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'}):
                card = card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
                card = card.find('tbody').find_all('tr')[1]
                info, content = card.find_all('td', {'class': 'row4'})
                username = clean_text(info.find('tr').text)
                score = info.find_all('tr')[1].find_all('td', {'class': 'posterinfo'})
                ranga = clean_text(score[0].text).replace(' ranga', '')
                group = clean_text(score[2].text).replace('Grupa: ', '')
                posts = clean_text(score[3].text).replace('Postów: ', '').replace('.', '')
                user_id = clean_text(score[4].text).replace('Nr użytkownika: ', '').replace('.', '')
                name_surname, degree, profession = '', '', ''
                if len(score) > 6:
                    for s in score[6:]:
                        if 'Stopień akademicki: ' in s.text:
                            degree = clean_text(s.text).replace('Stopień akademicki: ', '')
                        elif 'Zawód: ' in s.text:
                            profession = clean_text(s.text).replace('Zawód: ', '')
                        else:
                            name_surname = clean_text(s.text)
                post_time = clean_text(content.find('span', {'class': 'postdetails'}).text)
                # 'Dzisiaj'/'Wczoraj' ("today"/"yesterday") are pinned to the date of the scrape
                post_time = post_time.replace('Dzisiaj', '2/12/2022').replace('Wczoraj', '1/12/2022')
                datetime_object = datetime.strptime(post_time, '%d/%m/%Y, %H:%M')
                date = str(datetime_object.date())
                hour = str(datetime_object.time())
                content = content.find('span', {'class': 'postcolor'})
                content = get_content(content)
                result.append([username, user_id, name_surname, degree, ranga, profession, group, posts, date, hour, '', str(n), content])
                n += 1
    result[0][-3] = str(n - 1)  # store the reply count on the opening post
    return result
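# Each row returned above is laid out as:
#   [username, user_id, name_surname, degree, ranga, profession, group,
#    posts, date, hour, comments (filled only on the opening post), n, content]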
options = Options()
options.headless = True              # run Chrome without a visible window
options.add_argument('--no-sandbox')
options.add_argument('--incognito')  # Options has no `incognito` attribute; pass the Chrome flag instead
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
print()
with open('data/historycy_errors.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/historycy_2.tsv', 'w') as file:
        # header = '\t'.join(['topic_URL', 'domain_0', 'domain_1', 'domain_2', 'domain_3', 'domain_4', 'topic', 'views', 'username', 'user_id', 'name_surname', 'degree', 'position', 'profession', 'group', 'posts', 'date', 'hour', 'comments', 'n', 'content'])
        # file.write(header + '\n')
        for data in tsv_file:
            url = f'http://www.historycy.org/index.php?s={data[0]}'
            domains = data[3:-1]
            m = len(domains)
            domains = domains + [''] * (5 - m)  # pad to exactly five domain columns
            # try:
            result = get_all_comments(url)
            flag = True
            for r in result:
                # only the opening post carries the topic-level view count
                if flag:
                    views = data[2]
                r = [data[0]] + domains + [data[1]] + [views] + r
                if flag:
                    views = ''
                    flag = False
                file.write('\t'.join(r))
                file.write('\n')
            # except Exception as e:
            #     print(f'error: {url}')
            #     with open('data/historycy_errors.tsv', 'a+') as f:
            #         d = "\t".join(data)
            #         f.write(f'{d}\t{e}\n')
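# Close the headless browser once the whole errors file has been reprocessed.
driver.quit()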