# forum-scrapers/patient.py
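"""Scrape patient.info forum discussions with Selenium + BeautifulSoup.

Reads topic slugs from data/patient_URLs.tsv, fetches every page of each
discussion at https://patient.info/forums/discuss/<topic_URL>, and writes
one tab-separated row per post to data/patient_3.tsv. Failed topics are
logged to patient_error_2.tsv so they can be retried.
"""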

import csv
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def get_url_soup(url):
    """Load `url` in the shared Selenium driver and return its parsed DOM."""
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')

def check_pages_num(soup):
    """Return the number of reply pages in a topic (1 if unpaginated)."""
    pages = soup.find('select', {'class': 'reply-pagination'})
    if pages is not None:
        # The last <option> reads like '7/7'; the denominator is the page count.
        last_option = pages.find_all('option')[-1]
        return int(last_option.text.split('/')[-1])
    return 1
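# Example: a paginated topic whose last pagination <option> is '7/7' yields 7;
# a topic with no 'reply-pagination' <select> at all yields 1.
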
def clean_text(text):
    """Collapse all runs of whitespace into single spaces."""
    return " ".join(text.split())

def set_num(n, m):
    """Format a reply index: 'n.m' for nested replies, plain 'n' otherwise."""
    return f'{n}.{m}' if m != 0 else str(n)
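# Example: set_num(4, 0) -> '4' (top-level reply 4);
#          set_num(4, 2) -> '4.2' (second nested reply under reply 4).
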
def get_all_comments(url, domain_0, domain_1):
    """Scrape a topic's opening post and all replies into TSV-ready rows.

    `domain_0` and `domain_1` are passed through by the caller but unused here.
    Each row: [id, main_id, title, user, followers, date, hour, likes,
    replies, n, n.m, content].
    """
    soup = get_url_soup(url)
    pages = check_pages_num(soup)
    comments_list = []
    n0, n = 0, 0  # n0: running index over all replies; n: top-level reply counter
    for i in range(pages):
        page_url = f'{url}?order=oldest&page={i}#topic-replies'
        if i == 0:
            # The first page also carries the opening post; extract it first.
            main = soup.find('div', {'class': 'post__main'})
            main_title = clean_text(main.find('h1', {'class': 'post__title'}).text.strip())
            main_author = main.find('h5', {'class': 'author__info'}).text.strip()
            main_time = main.find('time', {'class': 'fuzzy'})['datetime']
            # Timestamps look like '2021-03-04T12:34+00:00'; %z parses the UTC offset.
            datetime_object = datetime.strptime(main_time, '%Y-%m-%dT%H:%M%z')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            likes, replies = soup.find_all('p', {'class': 'post__stats'})[1].text.strip().split(', ')
            likes = likes.replace(' likes', '').replace(' like', '')
            replies = replies.replace(' replies', '').replace(' reply', '')
            main_content = soup.find('div', {'class': 'post__content'}).find_all('p')[:-1]
            main_content = clean_text(' '.join(p.text.strip() for p in main_content))
            main_id = url.split('-')[-1]
            followers = main.find('p', {'class': 'post__stats'}).find_all('span')[-1].text.strip()
            if 'following' in followers:
                followers = followers.replace(' users are following.', '').replace(' user is following.', '')
            else:
                followers = ''
            comments_list.append([main_id, main_id, main_title, main_author, followers,
                                  date, hour, likes, replies, '0', '0', main_content])
        else:
            soup = get_url_soup(page_url)
        comments = soup.find('div', {'class': 'comment-page'})
        if comments is not None:
            comments = comments.find_all('li', {'class': 'comment'})
            m = 0  # nesting counter within the current top-level reply
            for comment in comments:
                classes = comment.get('class')
                header = comment.find('div', {'class': 'post__header'})
                likes = comment.find('a', {'class': 'post__like'})
                likes = clean_text(likes.text) if likes is not None else ''
                content = comment.find('div', {'class': 'post__content'})
                if content is not None:
                    content = clean_text(' '.join(content.find_all(string=True)))
                    # Drop the moderation widget text scraped along with the body.
                    content = content.replace(f' Report / Delete {likes} Reply', '')
                    content = content.replace(' Report / Delete Reply', '')
                if header is not None:
                    if 'comment--nested' in classes:
                        m += 1
                    else:
                        m = 0
                        n += 1
                    n0 += 1
                    idx = comment['itemid'].split('#')[-1]
                    user = header.find('a', {'class': 'author__name'}).text.strip()
                    stamp = comment.find('time', {'class': 'fuzzy'})['datetime']
                    datetime_object = datetime.strptime(stamp, '%Y-%m-%dT%H:%M%z')
                    date = str(datetime_object.date())
                    hour = str(datetime_object.time())
                    comments_list.append([idx, main_id, main_title, user, '', date, hour,
                                          likes, '', str(n0), set_num(n, m), content])
    # Backfill the opening post's reply count with the number actually scraped.
    comments_list[0][8] = str(n0)
    return comments_list
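# An illustrative row from get_all_comments (values are made up), before the
# caller prepends topic_URL, domain_0 and domain_1:
#   ['123456', '123456', 'Topic title', 'some_user', '12', '2021-03-04',
#    '12:34:00', '3', '17', '0', '0', 'Opening post text ...']
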
DRIVER_PATH = '/usr/bin/chromedriver'
options = Options()
options.add_argument('--headless')
options.add_argument('--incognito')
# Selenium 4 passes the chromedriver path via a Service object.
driver = webdriver.Chrome(service=Service(DRIVER_PATH), options=options)
with open("data/patient_URLs.tsv") as file:
tsv_file = csv.reader(file, delimiter="\t")
with open('data/patient_3.tsv','w') as file:
# file.write('\t'.join(['topic_URL', 'domain_0', 'domain_1', 'id', 'main_id', 'title', 'user', 'followers', 'date', 'hour', 'likes', 'comments', 'n', 'm', 'content']) + '\n')
for topic_url, domain_0, domain_1 in list(tsv_file)[538775:]:
try:
result = get_all_comments('https://patient.info/forums/discuss/' + topic_url, domain_0, domain_1)
for r in result:
r = [topic_url, domain_0, domain_1] + r
file.write('\t'.join(r))
file.write('\n')
except Exception as e:
print(f'error: {topic_url}')
with open('patient_error_2.tsv', 'a+') as f:
f.write(f'{topic_url}\t{e}\n')
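# Usage (assumes chromedriver at DRIVER_PATH and a data/patient_URLs.tsv with
# tab-separated rows of topic_URL, domain_0, domain_1):
#   python patient.py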