forum-scrapers/historycy.py

137 lines
5.9 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import csv
import requests
# from bs4 import BeautifulSoup as bs
import time
from datetime import datetime
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
def get_url_soup(url):
    """Load *url* in the shared browser and return the parsed page.

    Navigates the module-level Selenium ``driver`` to *url*, then parses
    the resulting page source with BeautifulSoup's ``lxml`` parser.
    """
    driver.get(url)
    page_html = driver.page_source
    return BeautifulSoup(page_html, 'lxml')
# Single-character repairs applied by clean_text(). The keys look like
# Windows-1250 bytes that were decoded as Latin-1; each maps to the Polish
# letter it stands for. Characters mapped to None are deleted outright.
_CLEAN_TEXT_TABLE = str.maketrans({
    '¥': 'Ą', '¹': 'ą',
    'Æ': 'Ć', 'æ': 'ć',
    'Ê': 'Ę', 'ê': 'ę',
    '£': 'Ł', '³': 'ł',
    'Ñ': 'Ń', 'ñ': 'ń',
    '¯': 'Ż', '¿': 'ż',
    '\x8c': 'Ś', '\x9c': 'ś',
    '\x8f': 'Ź', '\x9f': 'ź',
    '„': None, '”': None,
    '\x96': None,
})


def clean_text(text):
    """Return *text* with whitespace collapsed and Polish letters repaired.

    Runs of whitespace become a single space and leading/trailing
    whitespace is dropped. Afterwards the mis-decoded characters are mapped
    to Polish letters and the quotation marks and ``\\x96`` are removed.
    Removal happens after the whitespace pass, so it can leave two adjacent
    spaces behind.
    """
    collapsed = " ".join(text.split())
    return collapsed.translate(_CLEAN_TEXT_TABLE)
def get_content(soup):
    """Flatten a post body into a single cleaned line of text.

    Args:
        soup: BeautifulSoup element holding the post body.

    Returns:
        The element's text nodes joined with spaces and passed through
        ``clean_text``. Links whose visible text starts with ``http`` are
        replaced by their ``href`` target, and the text of every
        ``div.quotemain`` block is wrapped in ``<cite> ... </cite>`` markers.
    """
    def joined_text(node):
        # Re-parse the node's markup with html.parser and join every
        # stripped text node with single spaces (same as the original).
        fragment = BeautifulSoup(str(node), 'html.parser')
        return ' '.join(
            piece.text.strip() for piece in fragment.find_all(string=True)
        )

    links = soup.find_all('a')
    cites = soup.find_all('div', {'class': 'quotemain'})
    content = joined_text(soup)

    for link in links:
        label = str(link.text)
        href = link.get('href')
        # An anchor without href would otherwise inject the literal "None".
        if href is None:
            continue
        if clean_text(label).startswith('http'):
            content = content.replace(label, f' {str(href)} ')

    for cite in cites:
        quoted = joined_text(cite)
        # str.replace('', x) inserts x between every character, so an empty
        # quote block must be skipped rather than "wrapped".
        if not quoted:
            continue
        content = content.replace(quoted, f' <cite> {quoted} </cite> ')

    return clean_text(content)
def get_all_comments(url, today=None):
    """Scrape every post of one forum topic, following its pagination.

    Args:
        url: Address of the topic's first page. Later pages are fetched
            from ``{url}&st={page * 15}.html``.
        today: Date substituted for the forum's relative "Dzisiaj" label;
            the day before it is substituted for "Wczoraj". Defaults to the
            current local date.
            NOTE(review): assumes the local date matches the forum's notion
            of "today" at scrape time - confirm.

    Returns:
        One list of strings per parsed post, in page order:
        ``[username, user_id, name_surname, degree, ranga, profession,
        group, posts, date, hour, comments, n, content]``.
        ``n`` is the zero-based index of the post within the topic.
        ``comments`` is empty on every row except the first, where it holds
        the number of parsed posts minus one. An empty list is returned
        when no post could be parsed.
    """
    # Local import: the file only imports the `datetime` class itself.
    from datetime import timedelta

    if today is None:
        today = datetime.now().date()
    today_label = today.strftime('%d/%m/%Y')
    yesterday_label = (today - timedelta(days=1)).strftime('%d/%m/%Y')

    # Attribute set identifying both the outer and the nested post tables.
    table_attrs = {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'}

    print(url)
    result = []
    soup = get_url_soup(url)

    # The pager link text is "<count> Strony"; topics without it have one page.
    pager = soup.find('a', {'title': 'skocz do strony...'})
    if pager is not None:
        pages = int(clean_text(pager.text).replace(' Strony', ''))
    else:
        pages = 1

    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            soup = get_url_soup(f'{url}&st={page * 15}.html')

        board = soup.find('div', {'class': 'tableborder'})
        cards = board.find_all('table', table_attrs)
        for card in cards[:30]:
            # Only tables that wrap a nested table of the same shape are posts.
            top_cell = card.find('td', {'valign': 'top'})
            if top_cell is None:
                continue
            post_table = top_cell.find('table', table_attrs)
            if post_table is None:
                continue

            post_row = post_table.find('tbody').find_all('tr')[1]
            info, body = post_row.find_all('td', {'class': 'row4'})

            username = clean_text(info.find('tr').text)
            score = info.find_all('tr')[1].find_all('td', {'class': 'posterinfo'})
            ranga = clean_text(score[0].text).replace(' ranga', '')
            group = clean_text(score[2].text).replace('Grupa: ', '')
            posts = clean_text(score[3].text).replace('Postów: ', '').replace('.', '')
            user_id = clean_text(score[4].text).replace('Nr użytkownika: ', '').replace('.', '')

            # Optional profile cells follow the fixed ones; anything that is
            # neither a degree nor a profession is taken as the user's name.
            name_surname, degree, profession = '', '', ''
            for extra in score[6:]:
                if 'Stopień akademicki: ' in extra.text:
                    degree = clean_text(extra.text).replace('Stopień akademicki: ', '')
                elif 'Zawód: ' in extra.text:
                    profession = clean_text(extra.text).replace('Zawód: ', '')
                else:
                    name_surname = clean_text(extra.text)

            # Resolve the relative day labels before parsing the timestamp.
            posted = clean_text(body.find('span', {'class': 'postdetails'}).text)
            posted = posted.replace('Dzisiaj', today_label).replace('Wczoraj', yesterday_label)
            posted_at = datetime.strptime(posted, '%d/%m/%Y, %H:%M')
            date = str(posted_at.date())
            hour = str(posted_at.time())

            content = get_content(body.find('span', {'class': 'postcolor'}))
            result.append([username, user_id, name_surname, degree, ranga, profession,
                           group, posts, date, hour, '', str(n), content])
            n += 1

    # Store the reply count on the first row; guard against an empty topic.
    if result:
        result[0][-3] = str(n - 1)
    return result
# Headless Chrome session; get_url_soup() reads it through the module-level
# name `driver`.
options = Options()
# Pass headless/incognito as Chrome switches: recent Selenium 4 releases
# dropped the `Options.headless` setter, and `incognito` is not an Options
# property, so plain attribute assignment can silently have no effect.
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--incognito')
driver = webdriver.Chrome(options=options)
print()
try:
    # Each input row is read as: topic id, topic title, view count, category
    # names, and a final column that is skipped (presumably the error
    # message appended by an earlier run - verify against the producer).
    with open('data/historycy_errors.tsv', encoding='utf-8', newline='') as file_0:
        tsv_file = csv.reader(file_0, delimiter='\t')
        with open('data/historycy_2.tsv', 'w', encoding='utf-8') as file:
            # Output columns (no header row is written):
            # topic_URL, domain_0..domain_4, topic, views, username, user_id,
            # name_surname, degree, position, profession, group, posts,
            # date, hour, comments, n, content
            for data in tsv_file:
                url = f'http://www.historycy.org/index.php?s={data[0]}'
                # Pad the category list to exactly five columns.
                domains = data[3:-1]
                domains = domains + [''] * (5 - len(domains))
                # NOTE: per-topic error handling is disabled, so one failing
                # topic aborts the whole run. The previous handler was:
                # try:
                #     ...
                # except Exception as e:
                #     print(f'error: {url}')
                #     with open('data/historycy_errors.tsv', 'a+') as f:
                #         d = "\t".join(data)
                #         f.write(f'{d}\t{e}\n')
                result = get_all_comments(url)
                for index, post in enumerate(result):
                    # The view count is written only on the topic's first row.
                    views = data[2] if index == 0 else ''
                    row = [data[0]] + domains + [data[1]] + [views] + post
                    file.write('\t'.join(row))
                    file.write('\n')
finally:
    # Always shut the browser down, even when a topic fails to parse.
    driver.quit()