forum-scrapers/patient.py

122 lines
5.4 KiB
Python
Raw Normal View History

2023-01-02 09:30:54 +01:00
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from datetime import datetime
def get_url_soup(url):
driver.get(url)
return BeautifulSoup(driver.page_source, 'lxml')
def check_pages_num(soup):
pages = soup.find('select', {'class': 'reply-pagination'})
if pages != None:
pages = pages.find_all('option')[-1]
pages = pages.text.split('/')[-1]
return int(pages)
else:
return 1
def clean_text(text):
return " ".join(text.split())
def set_num(n, m):
if m != 0:
return str(f'{n}.{m}')
else: return str(n)
def get_all_comments(url, domain_0, domain_1):
result = []
soup = get_url_soup(url)
pages = check_pages_num(soup)
comments_list = []
n0, n = 0, 0
for i in range(pages):
page_url = f'{url}?order=oldest&page={i}#topic-replies'
if i == 0:
main = soup.find('div', {'class': 'post__main'})
main_title = clean_text(main.find('h1', {'class': 'post__title'}).text.strip())
main_author = main.find('h5', {'class': 'author__info'}).text.strip()
main_time = main.find('time', {'class': 'fuzzy'})['datetime']
datetime_object = datetime.strptime(main_time, '%Y-%m-%dT%H:%M+%S:%f')
date = str(datetime_object.date())
hour = str(datetime_object.time())
likes, replies = soup.find_all('p', {'class': 'post__stats'})[1].text.strip().split(', ')
likes = likes.replace(' likes', '').replace(' like', '')
replies = replies.replace(' replies', '').replace(' reply', '')
main_content = soup.find('div', {'class': 'post__content'}).find_all('p')[:-1]
main_content = clean_text(' '.join([p.text.strip() for p in main_content]))
main_id = url.split('-')[-1]
followers = main.find('p', {'class': 'post__stats'}).find_all('span')[-1].text
followers = followers.strip()
if 'following' in followers:
followers = followers.replace(' users are following.', '').replace(' user is following.', '')
else:
followers = ''
main_data = [main_id, main_id, main_title, main_author, followers, date, hour, likes, replies, '0', '0', main_content]
comments_list += [main_data]
else:
soup = get_url_soup(page_url)
comments = soup.find('div', {'class': 'comment-page'})
if comments != None:
comments = comments.find_all('li', {'class': 'comment'})
m = 0
for comment in comments:
classes = comment.get('class')
header = comment.find('div', {'class': 'post__header'})
likes = comment.find('a', {'class': 'post__like'})
if likes != None:
likes = clean_text(likes.text)
else: likes = ''
content = comment.find('div', {'class': 'post__content'})
if content != None:
content = clean_text(' '.join([q.text for q in content.findAll(text=True)]))
content = content.replace(f' Report / Delete {likes} Reply', '')
content = content.replace(f' Report / Delete Reply', '')
if header != None:
if 'comment--nested' in classes:
m += 1
else:
m = 0
n += 1
n0 += 1
idx = comment['itemid'].split('#')[-1]
user = header.find('a', {'class': 'author__name'}).text.strip()
time = comment.find('time', {'class': 'fuzzy'})['datetime']
datetime_object = datetime.strptime(time, '%Y-%m-%dT%H:%M+%S:%f')
date = str(datetime_object.date())
hour = str(datetime_object.time())
likes = comment.find('a', {'class': 'post__like'})
if likes != None:
likes = clean_text(likes.text)
else: likes = ''
n_n = set_num(n, m)
comments_list += [[idx, main_id, main_title, user, '', date, hour, likes, '', str(n0), n_n, content]]
comments_list[0][8] = str(n0)
return comments_list
DRIVER_PATH = '/usr/bin/chromedriver'
options = Options()
options.headless = True
options.incognito = True
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
with open("data/patient_URLs.tsv") as file:
tsv_file = csv.reader(file, delimiter="\t")
with open('data/patient_3.tsv','w') as file:
# file.write('\t'.join(['topic_URL', 'domain_0', 'domain_1', 'id', 'main_id', 'title', 'user', 'followers', 'date', 'hour', 'likes', 'comments', 'n', 'm', 'content']) + '\n')
for topic_url, domain_0, domain_1 in list(tsv_file)[538775:]:
try:
result = get_all_comments('https://patient.info/forums/discuss/' + topic_url, domain_0, domain_1)
for r in result:
r = [topic_url, domain_0, domain_1] + r
file.write('\t'.join(r))
file.write('\n')
except Exception as e:
print(f'error: {topic_url}')
with open('patient_error_2.tsv', 'a+') as f:
f.write(f'{topic_url}\t{e}\n')