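# Scraper for historycy.org: reads topic records from data/historycy_errors.tsv,
# renders each topic page with Selenium, extracts every post (author, user id,
# rank, group, post count, academic degree, profession, timestamp, body) and
# writes one tab-separated row per post to data/historycy_2.tsv.
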
import csv
import requests
# from bs4 import BeautifulSoup as bs
import time
from datetime import datetime
import numpy as np

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
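# Third-party dependencies: selenium, webdriver-manager, beautifulsoup4 and lxml
# (used as the BeautifulSoup parser below). Service and ChromeDriverManager are
# imported but not used; the driver is created directly with webdriver.Chrome().
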

def get_url_soup(url):
    # Load the page in the shared Selenium driver and parse the rendered HTML.
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def clean_text(text):
    # Collapse whitespace and repair Polish characters mangled by a
    # CP1250 / Latin-1 encoding mix-up on the forum pages.
    text = " ".join(text.split())
    text = text.replace('¥', 'Ą').replace('¹', 'ą')
    text = text.replace('Æ', 'Ć').replace('æ', 'ć')
    text = text.replace('Ê', 'Ę').replace('ê', 'ę')
    text = text.replace('£', 'Ł').replace('³', 'ł')
    text = text.replace('Ñ', 'Ń').replace('ñ', 'ń')
    text = text.replace('¯', 'Ż').replace('¿', 'ż')
    text = text.replace('\x8c', 'Ś').replace('\x9c', 'ś')
    text = text.replace('\x8f', 'Ź').replace('\x9f', 'ź')
    # The original arguments here were unprintable bytes; \x93/\x94 (the CP1250
    # curly-quote codepoints) are assumed to be the intended characters.
    text = text.replace('\x93', '„').replace('\x94', '”')
    text = text.replace('\x96', '–')
    return text

def get_content(soup):
    # Flatten the post HTML to plain text, substituting link labels that are
    # bare URLs with their href targets and wrapping quotes in <cite> markers.
    links = soup.find_all('a')
    cites = soup.find_all('div', {'class': 'quotemain'})
    content = ' '.join([p.text.strip() for p in BeautifulSoup(str(soup), 'html.parser').find_all(text=True)])

    for link in links:
        if clean_text(link.text).startswith('http'):
            content = content.replace(str(link.text), f' {str(link.get("href"))} ')

    for cite in cites:
        c = ' '.join([p.text.strip() for p in BeautifulSoup(str(cite), 'html.parser').find_all(text=True)])
        content = content.replace(c, f' <cite> {c} </cite> ')
    return clean_text(content)


def get_all_comments(url):
    # Scrape every post in a topic, following the pagination links
    # (15 posts per page).
    print(url)
    result = []
    soup = get_url_soup(url)
    pages = soup.find('a', {'title': 'skocz do strony...'})
    if pages is not None:
        pages = int(clean_text(pages.text).replace(' Strony', ''))
    else:
        pages = 1
    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            n_url = f'{url}&st={page * 15}.html'
            soup = get_url_soup(n_url)
        cards = soup.find('div', {'class': 'tableborder'})
        cards = cards.find_all('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
        for card in cards[:30]:
            if card.find('td', {'valign': 'top'}) and card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'}):
                card = card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
                card = card.find('tbody').find_all('tr')[1]
                info, content = card.find_all('td', {'class': 'row4'})
                username = clean_text(info.find('tr').text)
                score = info.find_all('tr')[1].find_all('td', {'class': 'posterinfo'})
                ranga = clean_text(score[0].text).replace(' ranga', '')
                group = clean_text(score[2].text).replace('Grupa: ', '')
                posts = clean_text(score[3].text).replace('Postów: ', '').replace('.', '')
                user_id = clean_text(score[4].text).replace('Nr użytkownika: ', '').replace('.', '')
                name_surname, degree, profession = '', '', ''
                if len(score) > 6:
                    for s in score[6:]:
                        if 'Stopień akademicki: ' in s.text:
                            degree = clean_text(s.text).replace('Stopień akademicki: ', '')
                        elif 'Zawód: ' in s.text:
                            profession = clean_text(s.text).replace('Zawód: ', '')
                        else:
                            name_surname = clean_text(s.text)
                # Timestamps use relative labels for the last two days; the
                # substitutions below assume the scrape ran on 2/12/2022.
                time = clean_text(content.find('span', {'class': 'postdetails'}).text)
                time = time.replace('Dzisiaj', '2/12/2022').replace('Wczoraj', '1/12/2022')
                datetime_object = datetime.strptime(time, '%d/%m/%Y, %H:%M')
                date = str(datetime_object.date())
                hour = str(datetime_object.time())
                content = content.find('span', {'class': 'postcolor'})
                content = get_content(content)

                result.append([username, user_id, name_surname, degree, ranga, profession, group, posts, date, hour, '', str(n), content])
                n += 1

    # Store the reply count (total posts minus the opening one) in the first row.
    result[0][-3] = str(n - 1)
    return result

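# A single module-level Chrome instance is shared by get_url_soup() above.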
options = Options()
# The original set options.headless / options.incognito as attributes; the
# equivalent command-line flags are passed instead, which work across
# Selenium versions.
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--incognito')
driver = webdriver.Chrome(options=options)
print()

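# Layout of data/historycy_errors.tsv assumed from the indexing below (not
# documented in the source): column 0 is the topic id used in the s= URL
# parameter, column 1 the topic title, column 2 the view count, columns 3 to
# the second-to-last the domain labels (padded to five below), and the final
# column (presumably an appended error message) is dropped.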
with open('data/historycy_errors.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/historycy_2.tsv', 'w') as file:
        # header = '\t'.join(['topic_URL', 'domain_0', 'domain_1', 'domain_2', 'domain_3', 'domain_4', 'topic', 'views', 'username', 'user_id', 'name_surname', 'degree', 'position', 'profession', 'group', 'posts', 'date', 'hour', 'comments', 'n', 'content'])
        # file.write(header + '\n')
        for data in tsv_file:
            url = f'http://www.historycy.org/index.php?s={data[0]}'
            # Pad the domain columns so every row carries exactly five of them.
            domains = data[3:-1]
            m = len(domains)
            domains = domains + [''] * (5 - m)
            # try:
            result = get_all_comments(url)
            # Only the first row of a topic carries the view count.
            flag = True
            for r in result:
                if flag:
                    views = data[2]
                r = [data[0]] + domains + [data[1]] + [views] + r
                if flag:
                    views = ''
                    flag = False
                file.write('\t'.join(r))
                file.write('\n')
            # except Exception as e:
            #     print(f'error: {url}')
            #     with open('data/historycy_errors.tsv', 'a+') as f:
            #         d = "\t".join(data)
            #         f.write(f'{d}\t{e}\n')

# Close the browser once every topic has been written out.
driver.quit()