"""Scrape forum threads from hyperreal.info/talk into a flat TSV file.

Thread paths (plus up to six topic-domain tags) are read from
data/hyperreal_URLs.tsv; one row per post is written to data/hyperreal.tsv.
"""
import csv
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup as bs


def get_url_soup(url):
    """Fetch `url` and return its parsed soup, retrying on empty or failed responses."""
    try:
        req = requests.get(url)
        soup = bs(req.text, 'html.parser')
        if len(soup) == 0:
            # Empty document (likely throttling): wait, then retry.
            print('sleep')
            time.sleep(3)
            return get_url_soup(url)
    except Exception as e:
        print(e)
        return get_url_soup(url)
    return soup


def clean_text(text):
    """Collapse all whitespace runs into single spaces."""
    return ' '.join(text.split())


def get_content(soup):
    """Extract a post's text, padding quoted blocks with spaces.

    'pisze:' is Polish for 'writes:' and marks the attribution line of a
    quote on the forum.
    """
    blockquotes = soup.find_all('blockquote')
    cite = soup.find('cite')
    if blockquotes:
        if cite is not None:
            # Replace the <cite> markup with its flattened text, then re-parse.
            html = str(soup).replace(str(cite), ' '.join([p.text.strip() for p in cite]))
            content = ' '.join([p.text.strip() for p in bs(html, 'html.parser')])
            content = clean_text(content)
            # Insert a space right after the 'pisze:' attribution marker.
            idx = content.index('pisze:') + len('pisze:')
            content = content[:idx] + ' ' + content[idx:] + ' '
        else:
            content = ' ' + clean_text(soup.text) + ' '
        return content
    return clean_text(' '.join([p.text.strip() for p in soup]))


def get_all_comments(url):
    """Scrape every post in a thread, following the 10-posts-per-page pagination."""
    result = []
    soup = get_url_soup(url)
    # The pagination span reads 'Strona 1 z N' ('Page 1 of N').
    pages = soup.find('span', {'class': 'fw-normal'})
    print()
    print(pages)
    print()
    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))
    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            # Later pages append the post offset: thread-10.html, thread-20.html, ...
            n_url = f'{url[:-5]}-{page * 10}.html'
            soup = get_url_soup(n_url)
        cards = soup.find_all('div', {'class': 'card border-info mb-2 mb-md-3 shadow-sm'})
        for card in cards:
            idx = card.get('id')[1:]
            postprofile = card.find('div', {'class': 'postprofile'})
            user = postprofile.find('div', {'class': 'mt-1 float-start float-md-none'})
            username = user.find('a')
            if username is None:
                # Deleted/anonymous account: only the plain-text name is available.
                username = clean_text(user.text)
                strong = posts = registration = sex = ''
                karma_received = karma_assigned = narkopedia_edits = ''
            else:
                username = clean_text(username.text)
                strong = user.find('strong')
                strong = strong.text if strong is not None else ''
                # Profile details live in a modal keyed by the post id.
                modal = soup.find('div', {'id': f'modal_post_{idx}'}).find('div', {'class': 'col-8'})
                info = modal.find_all('div')
                posts = clean_text(info[0].text.replace('Posty:', ''))               # 'Posts:'
                registration = clean_text(info[1].text.replace('Rejestracja:', ''))  # 'Registration:'
                if len(info) > 2:
                    sex = clean_text(info[2].text.replace('Płeć:', ''))              # 'Sex:'
                    if sex == 'chłopak':    # 'boy'
                        sex = 'M'
                    elif sex == 'dziewka':  # 'girl'
                        sex = 'F'
                else:
                    sex = ''
                score = modal.find_all('li')[-3:]
                karma_received = clean_text(score[0].text.replace('Karma otrzymana', ''))      # 'Karma received'
                karma_assigned = clean_text(score[1].text.replace('Karma przydzielona', ''))   # 'Karma assigned'
                narkopedia_edits = clean_text(score[2].text.replace('Edycje Narkopedii', ''))  # 'Narkopedia edits'
            body = card.find('div', {'id': f'post_content{idx}'})
            title = clean_text(body.find('a').text)
            datetime_str = body.find('time')['datetime']
            datetime_object = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S+00:00')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            content = get_content(body.find('div', {'class': 'content'}))
            result.append([idx, '', title, username, strong, registration, posts, sex,
                           karma_received, karma_assigned, narkopedia_edits,
                           date, hour, '', str(n), content])
            n += 1
    # On the opening post, record the reply count ...
    result[0][-3] = str(n - 1)
    # ... and stamp every row with the opening post's id as the thread key.
    for i in result:
        i[1] = result[0][0]
    return result

with open('data/hyperreal_URLs.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/hyperreal.tsv', 'w') as file:
        for data in tsv_file:
            url = f'https://hyperreal.info/talk/{data[0]}'
            print(url)
            # Pad the domain tags out to six columns so every row has the same width.
            domains = data[1:]
            domains = domains + [''] * (6 - len(domains))
            try:
                result = get_all_comments(url)
                for r in result:
                    r = [data[0]] + domains + r
                    file.write('\t'.join(r))
                    file.write('\n')
            except Exception as e:
                # Log failed threads (with the exception) for a later retry pass.
                print(f'error: {url}')
                with open('data/hyperreal_errors_2.tsv', 'a+') as f:
                    d = '\t'.join(data)
                    f.write(f'{d}\t{e}\n')