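# Scraper for patient.info forum discussions.
# For each topic URL listed in data/patient_URLs.tsv it renders the thread with a
# headless Chrome/Selenium session, parses the opening post and every reply (across
# all pagination pages) with BeautifulSoup, and appends one tab-separated row per
# post to data/patient_3.tsv. Topics that fail are logged to patient_error_2.tsv.
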
import csv
import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def get_url_soup(url):
    # Render the page in the shared Selenium driver (created below) and parse the HTML.
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def check_pages_num(soup):
    # The reply pagination is a <select> whose last <option> ends in the total
    # page count (e.g. "1/7"); return that count, or 1 for single-page threads.
    pages = soup.find('select', {'class': 'reply-pagination'})
    if pages is not None:
        pages = pages.find_all('option')[-1]
        pages = pages.text.split('/')[-1]
        return int(pages)
    else:
        return 1


def clean_text(text):
    # Collapse all runs of whitespace (including newlines) into single spaces.
    return " ".join(text.split())


def set_num(n, m):
    # Position label for a post: "n" for a top-level reply, "n.m" for the
    # m-th nested reply under reply n.
    if m != 0:
        return f'{n}.{m}'
    else:
        return str(n)


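# Each row returned by get_all_comments() has 12 fields:
#   id, main_id, title, user, followers, date, hour, likes, comments, n, m, content
# (matching the commented-out header in the writing loop at the bottom of the
# script). The first row is the opening post; the remaining rows are replies,
# where "n" is a running post counter and "m" is the "reply.sub-reply" label
# produced by set_num().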
def get_all_comments(url, domain_0, domain_1):
    soup = get_url_soup(url)
    pages = check_pages_num(soup)
    comments_list = []
    n0, n = 0, 0  # n0: running post counter, n: top-level reply counter
    for i in range(pages):
        page_url = f'{url}?order=oldest&page={i}#topic-replies'
        if i == 0:
            # The first page also carries the opening post; parse its metadata.
            main = soup.find('div', {'class': 'post__main'})
            main_title = clean_text(main.find('h1', {'class': 'post__title'}).text.strip())
            main_author = main.find('h5', {'class': 'author__info'}).text.strip()
            main_time = main.find('time', {'class': 'fuzzy'})['datetime']
            # The datetime attribute appears to end in a "+HH:MM" offset; the format
            # string consumes those digits via the literal '+' and the %S:%f fields
            # instead of parsing them as a timezone.
            datetime_object = datetime.strptime(main_time, '%Y-%m-%dT%H:%M+%S:%f')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            likes, replies = soup.find_all('p', {'class': 'post__stats'})[1].text.strip().split(', ')
            likes = likes.replace(' likes', '').replace(' like', '')
            replies = replies.replace(' replies', '').replace(' reply', '')
            main_content = soup.find('div', {'class': 'post__content'}).find_all('p')[:-1]
            main_content = clean_text(' '.join([p.text.strip() for p in main_content]))
            main_id = url.split('-')[-1]
            followers = main.find('p', {'class': 'post__stats'}).find_all('span')[-1].text
            followers = followers.strip()
            if 'following' in followers:
                followers = followers.replace(' users are following.', '').replace(' user is following.', '')
            else:
                followers = ''
            main_data = [main_id, main_id, main_title, main_author, followers, date, hour, likes, replies, '0', '0', main_content]
            comments_list += [main_data]
        else:
            soup = get_url_soup(page_url)
        # Replies on the current page.
        comments = soup.find('div', {'class': 'comment-page'})
        if comments is not None:
            comments = comments.find_all('li', {'class': 'comment'})
            m = 0  # nested-reply counter, reset per page
            for comment in comments:
                classes = comment.get('class')
                header = comment.find('div', {'class': 'post__header'})
                likes = comment.find('a', {'class': 'post__like'})
                if likes is not None:
                    likes = clean_text(likes.text)
                else:
                    likes = ''
                content = comment.find('div', {'class': 'post__content'})
                if content is not None:
                    content = clean_text(' '.join([q.text for q in content.find_all(text=True)]))
                    # Strip the moderation widgets scraped along with the reply text.
                    content = content.replace(f' Report / Delete {likes} Reply', '')
                    content = content.replace(' Report / Delete Reply', '')
                if header is not None:
                    if 'comment--nested' in classes:
                        m += 1
                    else:
                        m = 0
                        n += 1
                    n0 += 1
                    idx = comment['itemid'].split('#')[-1]
                    user = header.find('a', {'class': 'author__name'}).text.strip()
                    time = comment.find('time', {'class': 'fuzzy'})['datetime']
                    datetime_object = datetime.strptime(time, '%Y-%m-%dT%H:%M+%S:%f')
                    date = str(datetime_object.date())
                    hour = str(datetime_object.time())
                    likes = comment.find('a', {'class': 'post__like'})
                    if likes is not None:
                        likes = clean_text(likes.text)
                    else:
                        likes = ''
                    n_n = set_num(n, m)
                    comments_list += [[idx, main_id, main_title, user, '', date, hour, likes, '', str(n0), n_n, content]]
    # Overwrite the reply count on the opening post with the number actually scraped.
    comments_list[0][8] = str(n0)
    return comments_list


# --- Script entry point: shared headless Chrome session used by get_url_soup(). ---
DRIVER_PATH = '/usr/bin/chromedriver'
options = Options()
# Pass headless/incognito as Chrome flags: `options.incognito` is not a real
# Options attribute, and assigning `options.headless` directly is deprecated.
options.add_argument('--headless')
options.add_argument('--incognito')
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

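# Note: the `executable_path` argument assumes an older Selenium release (it was
# removed in Selenium 4.10+). On newer versions the driver would instead be built
# through a Service object, e.g.:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service(DRIVER_PATH), options=options)
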
with open("data/patient_URLs.tsv") as file:
|
|
tsv_file = csv.reader(file, delimiter="\t")
|
|
with open('data/patient_3.tsv','w') as file:
|
|
# file.write('\t'.join(['topic_URL', 'domain_0', 'domain_1', 'id', 'main_id', 'title', 'user', 'followers', 'date', 'hour', 'likes', 'comments', 'n', 'm', 'content']) + '\n')
|
|
for topic_url, domain_0, domain_1 in list(tsv_file)[538775:]:
|
|
try:
|
|
result = get_all_comments('https://patient.info/forums/discuss/' + topic_url, domain_0, domain_1)
|
|
for r in result:
|
|
r = [topic_url, domain_0, domain_1] + r
|
|
file.write('\t'.join(r))
|
|
file.write('\n')
|
|
except Exception as e:
|
|
print(f'error: {topic_url}')
|
|
with open('patient_error_2.tsv', 'a+') as f:
|
|
f.write(f'{topic_url}\t{e}\n') |
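
# Shut down the shared Chrome session once the crawl is finished.
driver.quit()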