import csv
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def get_url_soup(url):
    """Load a page in the shared Selenium driver and parse it with lxml."""
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def clean_text(text):
    # Collapse runs of whitespace, then repair Polish diacritics mangled by a
    # Windows-1250 -> Latin-1 mis-decoding of the forum pages.
    text = ' '.join(text.split())
    text = text.replace('¥', 'Ą').replace('¹', 'ą')
    text = text.replace('Æ', 'Ć').replace('æ', 'ć')
    text = text.replace('Ê', 'Ę').replace('ê', 'ę')
    text = text.replace('£', 'Ł').replace('³', 'ł')
    text = text.replace('Ñ', 'Ń').replace('ñ', 'ń')
    text = text.replace('¯', 'Ż').replace('¿', 'ż')
    text = text.replace('\x8c', 'Ś').replace('\x9c', 'ś')
    text = text.replace('\x8f', 'Ź').replace('\x9f', 'ź')
    text = text.replace('\x96', '–')
    return text


def get_content(soup):
    """Flatten a post body to plain text, swapping visible link text for the
    raw href and keeping quoted passages ('quotemain' blocks) inline."""
    links = soup.find_all('a')
    cites = soup.find_all('div', {'class': 'quotemain'})
    content = ' '.join([p.text.strip() for p in BeautifulSoup(str(soup), 'html.parser').find_all(text=True)])
    for link in links:
        if clean_text(link.text).startswith('http'):
            content = content.replace(str(link.text), f' {str(link.get("href"))} ')
    for cite in cites:
        c = ' '.join([p.text.strip() for p in BeautifulSoup(str(cite), 'html.parser').find_all(text=True)])
        content = content.replace(c, f' {c} ')
    return clean_text(content)


def get_all_comments(url):
    print(url)
    result = []
    soup = get_url_soup(url)

    # The pager link ('skocz do strony...' = 'jump to page...') reveals how
    # many pages the topic has; single-page topics have no pager.
    pages = soup.find('a', {'title': 'skocz do strony...'})
    if pages is not None:
        pages = int(clean_text(pages.text).replace(' Strony', ''))
    else:
        pages = 1

    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            # Later pages are addressed by post offset, 15 posts per page.
            n_url = f'{url}&st={page * 15}.html'
            soup = get_url_soup(n_url)
        cards = soup.find('div', {'class': 'tableborder'})
        cards = cards.find_all('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
        for card in cards[:30]:
            # Some posts are wrapped in one extra table layer; unwrap it.
            if card.find('td', {'valign': 'top'}) and card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'}):
                card = card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
            card = card.find('tbody').find_all('tr')[1]
            info, content = card.find_all('td', {'class': 'row4'})

            username = clean_text(info.find('tr').text)
            score = info.find_all('td', {'class': 'posterinfo'}) if False else info.find_all('tr')[1].find_all('td', {'class': 'posterinfo'})
            ranga = clean_text(score[0].text).replace(' ranga', '')
            group = clean_text(score[2].text).replace('Grupa: ', '')
            posts = clean_text(score[3].text).replace('Postów: ', '').replace('.', '')
            user_id = clean_text(score[4].text).replace('Nr użytkownika: ', '').replace('.', '')

            # Optional profile fields: academic degree, profession, real name.
            name_surname, degree, profession = '', '', ''
            if len(score) > 6:
                for s in score[6:]:
                    if 'Stopień akademicki: ' in s.text:
                        degree = clean_text(s.text).replace('Stopień akademicki: ', '')
                    elif 'Zawód: ' in s.text:
                        profession = clean_text(s.text).replace('Zawód: ', '')
                    else:
                        name_surname = clean_text(s.text)

            # Post timestamp; relative day names ('Dzisiaj' = today,
            # 'Wczoraj' = yesterday) are pinned to the scrape date.
            time_str = clean_text(content.find('span', {'class': 'postdetails'}).text)
            time_str = time_str.replace('Dzisiaj', '2/12/2022').replace('Wczoraj', '1/12/2022')
            datetime_object = datetime.strptime(time_str, '%d/%m/%Y, %H:%M')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())

            content = content.find('span', {'class': 'postcolor'})
            content = get_content(content)
            result.append([username, user_id, name_surname, degree, ranga, profession,
                           group, posts, date, hour, '', str(n), content])
            n += 1
    # Store the topic's reply count in the first row's 'comments' column.
    result[0][-3] = str(n - 1)
    return result


options = Options()
options.add_argument('--headless')    # 'options.headless = True' is deprecated in Selenium 4
options.add_argument('--no-sandbox')
options.add_argument('--incognito')   # 'options.incognito = True' was a no-op attribute
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
print()

with open('data/historycy_errors.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/historycy_2.tsv', 'w') as file:
        # header = '\t'.join(['topic_URL', 'domain_0', 'domain_1', 'domain_2', 'domain_3', 'domain_4', 'topic', 'views', 'username', 'user_id', 'name_surname', 'degree', 'position', 'profession', 'group', 'posts', 'date', 'hour', 'comments', 'n', 'content'])
        # file.write(header + '\n')
        for data in tsv_file:
            url = f'http://www.historycy.org/index.php?s={data[0]}'
            domains = data[3:-1]
            m = len(domains)
            domains = domains + [''] * (5 - m)  # pad to five domain columns
            # try:
            result = get_all_comments(url)
            # Only the topic's first row carries the view count.
            flag = True
            for r in result:
                if flag:
                    views = data[2]
                r = [data[0]] + domains + [data[1]] + [views] + r
                if flag:
                    views = ''
                    flag = False
                file.write('\t'.join(r))
                file.write('\n')
            # except Exception as e:
            #     print(f'error: {url}')
            #     with open('data/historycy_errors.tsv', 'a+') as f:
            #         d = "\t".join(data)
            #         f.write(f'{d}\t{e}\n')
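
# Not part of the original script: release the browser once the run is done.
# A minimal cleanup sketch, assuming no further scraping follows in this process.
driver.quit()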