import csv
import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def get_url_soup(url):
    """Load a page with the shared Selenium driver and parse it with lxml."""
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def check_pages_num(soup):
    """Return the number of reply pages in a topic (1 if there is no pagination widget)."""
    pages = soup.find('select', {'class': 'reply-pagination'})
    if pages is not None:
        pages = pages.find_all('option')[-1]
        pages = pages.text.split('/')[-1]
        return int(pages)
    return 1


def clean_text(text):
    """Collapse all runs of whitespace into single spaces."""
    return " ".join(text.split())


def set_num(n, m):
    """Format a comment position as 'n' for top-level comments or 'n.m' for nested replies."""
    if m != 0:
        return f'{n}.{m}'
    return str(n)


def get_all_comments(url, domain_0, domain_1):
    """Scrape the opening post and every comment of a patient.info topic."""
    soup = get_url_soup(url)
    pages = check_pages_num(soup)
    comments_list = []
    n0, n = 0, 0  # n0: running count of all comments, n: count of top-level comments
    for i in range(pages):
        page_url = f'{url}?order=oldest&page={i}#topic-replies'
        if i == 0:
            # The first page also carries the opening post.
            main = soup.find('div', {'class': 'post__main'})
            main_title = clean_text(main.find('h1', {'class': 'post__title'}).text.strip())
            main_author = main.find('h5', {'class': 'author__info'}).text.strip()
            main_time = main.find('time', {'class': 'fuzzy'})['datetime']
            # The trailing '+%S:%f' soaks up the timezone offset of the timestamp.
            datetime_object = datetime.strptime(main_time, '%Y-%m-%dT%H:%M+%S:%f')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            likes, replies = soup.find_all('p', {'class': 'post__stats'})[1].text.strip().split(', ')
            likes = likes.replace(' likes', '').replace(' like', '')
            replies = replies.replace(' replies', '').replace(' reply', '')
            main_content = soup.find('div', {'class': 'post__content'}).find_all('p')[:-1]
            main_content = clean_text(' '.join([p.text.strip() for p in main_content]))
            main_id = url.split('-')[-1]
            followers = main.find('p', {'class': 'post__stats'}).find_all('span')[-1].text
            followers = followers.strip()
            if 'following' in followers:
                followers = followers.replace(' users are following.', '').replace(' user is following.', '')
            else:
                followers = ''
            main_data = [main_id, main_id, main_title, main_author, followers, date, hour,
                         likes, replies, '0', '0', main_content]
            comments_list += [main_data]
        else:
            soup = get_url_soup(page_url)
        comments = soup.find('div', {'class': 'comment-page'})
        if comments is not None:
            comments = comments.find_all('li', {'class': 'comment'})
            m = 0  # position within the current run of nested replies
            for comment in comments:
                classes = comment.get('class')
                header = comment.find('div', {'class': 'post__header'})
                likes = comment.find('a', {'class': 'post__like'})
                if likes is not None:
                    likes = clean_text(likes.text)
                else:
                    likes = ''
                content = comment.find('div', {'class': 'post__content'})
                if content is not None:
                    content = clean_text(' '.join([q.text for q in content.find_all(text=True)]))
                    # Strip the moderation widgets that leak into the comment text.
                    content = content.replace(f' Report / Delete {likes} Reply', '')
                    content = content.replace(' Report / Delete Reply', '')
                if header is not None:
                    if 'comment--nested' in classes:
                        m += 1
                    else:
                        m = 0
                        n += 1
                    n0 += 1
                    idx = comment['itemid'].split('#')[-1]
                    user = header.find('a', {'class': 'author__name'}).text.strip()
                    comment_time = comment.find('time', {'class': 'fuzzy'})['datetime']
                    datetime_object = datetime.strptime(comment_time, '%Y-%m-%dT%H:%M+%S:%f')
                    date = str(datetime_object.date())
                    hour = str(datetime_object.time())
                    likes = comment.find('a', {'class': 'post__like'})
                    if likes is not None:
                        likes = clean_text(likes.text)
                    else:
                        likes = ''
                    n_n = set_num(n, m)
                    comments_list += [[idx, main_id, main_title, user, '', date, hour,
                                       likes, '', str(n0), n_n, content]]
    # Overwrite the reply count of the opening post with the number of comments actually scraped.
    comments_list[0][8] = str(n0)
    return comments_list


DRIVER_PATH = '/usr/bin/chromedriver'
options = Options()
options.add_argument('--headless')   # run Chrome without a visible window
options.add_argument('--incognito')  # avoid reusing an existing browser profile
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

with open("data/patient_URLs.tsv") as in_file:
    tsv_file = csv.reader(in_file, delimiter="\t")
    with open('data/patient_3.tsv', 'w') as out_file:
        # out_file.write('\t'.join(['topic_URL', 'domain_0', 'domain_1', 'id', 'main_id', 'title',
        #                           'user', 'followers', 'date', 'hour', 'likes', 'comments',
        #                           'n', 'm', 'content']) + '\n')
        for topic_url, domain_0, domain_1 in list(tsv_file)[538775:]:  # resume after rows handled by an earlier run
            try:
                result = get_all_comments('https://patient.info/forums/discuss/' + topic_url,
                                          domain_0, domain_1)
                for r in result:
                    r = [topic_url, domain_0, domain_1] + r
                    out_file.write('\t'.join(r))
                    out_file.write('\n')
            except Exception as e:
                print(f'error: {topic_url}')
                with open('patient_error_2.tsv', 'a+') as f:
                    f.write(f'{topic_url}\t{e}\n')
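
# Not part of the original run script: a sketch of a cleanup step, assuming the
# scrape is meant to end once the URL list is exhausted. Quitting the driver
# releases the headless Chrome process it spawned.
driver.quit()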