upload scripts
commit 04ac541e2d
historycy.py (new file, 137 lines)
@@ -0,0 +1,137 @@
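# Scrapes the posts of each historycy.org topic listed in data/historycy_errors.tsv
# (pages rendered with headless Chrome via Selenium) and appends one tab-separated
# row per post to data/historycy_2.tsv.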
import csv
import requests
# from bs4 import BeautifulSoup as bs
import time
from datetime import datetime
import numpy as np

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def get_url_soup(url):
    # Render the page in the module-level Selenium driver and parse it.
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def clean_text(text):
    # Collapse whitespace and map mis-encoded cp1250 bytes back to Polish characters.
    text = " ".join(text.split())
    text = text.replace('¥', 'Ą').replace('¹', 'ą')
    text = text.replace('Æ', 'Ć').replace('æ', 'ć')
    text = text.replace('Ê', 'Ę').replace('ê', 'ę')
    text = text.replace('£', 'Ł').replace('³', 'ł')
    text = text.replace('Ñ', 'Ń').replace('ñ', 'ń')
    text = text.replace('¯', 'Ż').replace('¿', 'ż')
    text = text.replace('\x8c', 'Ś').replace('\x9c', 'ś')
    text = text.replace('\x8f', 'Ź').replace('\x9f', 'ź')
    # non-printable quote bytes (assumed cp1250: \x84 and \x94)
    text = text.replace('\x84', '„').replace('\x94', '”')
    text = text.replace('\x96', '–')
    return text


def get_content(soup):
    # Flatten a post body to plain text, inlining raw URLs for visible links
    # and wrapping quoted blocks ('quotemain' divs) in <cite> markers.
    links = soup.find_all('a')
    cites = soup.find_all('div', {'class': 'quotemain'})
    content = ' '.join([p.text.strip() for p in BeautifulSoup(str(soup), 'html.parser').find_all(text=True)])

    for link in links:
        if clean_text(link.text).startswith('http'):
            content = content.replace(str(link.text), f' {str(link.get("href"))} ')

    for cite in cites:
        c = ' '.join([p.text.strip() for p in BeautifulSoup(str(cite), 'html.parser').find_all(text=True)])
        content = content.replace(c, f' <cite> {c} </cite> ')
    return clean_text(content)


def get_all_comments(url):
    # Scrape every post in a topic (15 posts per page) and return one row per post.
    print(url)
    result = []
    soup = get_url_soup(url)
    pages = soup.find('a', {'title': 'skocz do strony...'})
    if pages is not None:
        pages = int(clean_text(pages.text).replace(' Strony', ''))
    else:
        pages = 1
    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            n_url = f'{url}&st={page * 15}.html'
            soup = get_url_soup(n_url)
        cards = soup.find('div', {'class': 'tableborder'})
        cards = cards.find_all('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
        for card in cards[:30]:
            if card.find('td', {'valign': 'top'}) and card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'}):
                card = card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
                card = card.find('tbody').find_all('tr')[1]
                info, content = card.find_all('td', {'class': 'row4'})
                username = clean_text(info.find('tr').text)
                score = info.find_all('tr')[1].find_all('td', {'class': 'posterinfo'})
                ranga = clean_text(score[0].text).replace(' ranga', '')
                group = clean_text(score[2].text).replace('Grupa: ', '')
                posts = clean_text(score[3].text).replace('Postów: ', '').replace('.', '')
                user_id = clean_text(score[4].text).replace('Nr użytkownika: ', '').replace('.', '')
                name_surname, degree, profession = '', '', ''
                if len(score) > 6:
                    for s in score[6:]:
                        if 'Stopień akademicki: ' in s.text:
                            degree = clean_text(s.text).replace('Stopień akademicki: ', '')
                        elif 'Zawód: ' in s.text:
                            profession = clean_text(s.text).replace('Zawód: ', '')
                        else:
                            name_surname = clean_text(s.text)
                # 'Dzisiaj'/'Wczoraj' (today/yesterday) are pinned to the scrape date.
                time_str = clean_text(content.find('span', {'class': 'postdetails'}).text)
                time_str = time_str.replace('Dzisiaj', '2/12/2022').replace('Wczoraj', '1/12/2022')
                datetime_object = datetime.strptime(time_str, '%d/%m/%Y, %H:%M')
                date = str(datetime_object.date())
                hour = str(datetime_object.time())
                content = content.find('span', {'class': 'postcolor'})
                content = get_content(content)

                result.append([username, user_id, name_surname, degree, ranga, profession, group, posts, date, hour, '', str(n), content])
                n += 1

    # Store the total comment count in the first row's placeholder column.
    result[0][-3] = str(n - 1)
    return result


options = Options()
options.headless = True
options.add_argument('--no-sandbox')
options.add_argument('--incognito')
driver = webdriver.Chrome(options=options)
print()


with open('data/historycy_errors.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/historycy_2.tsv', 'w') as file:
        # header = '\t'.join(['topic_URL', 'domain_0', 'domain_1', 'domain_2', 'domain_3', 'domain_4', 'topic', 'views', 'username', 'user_id', 'name_surname', 'degree', 'position', 'profession', 'group', 'posts', 'date', 'hour', 'comments', 'n', 'content'])
        # file.write(header + '\n')
        for data in tsv_file:
            url = f'http://www.historycy.org/index.php?s={data[0]}'
            domains = data[3:-1]
            m = len(domains)
            domains = domains + [''] * (5 - m)
            # try:
            result = get_all_comments(url)
            flag = True
            for r in result:
                # Only the first row of a topic carries the view count.
                if flag:
                    views = data[2]
                r = [data[0]] + domains + [data[1]] + [views] + r
                if flag:
                    views = ''
                    flag = False
                file.write('\t'.join(r))
                file.write('\n')
            # except Exception as e:
            #     print(f'error: {url}')
            #     with open('data/historycy_errors.tsv', 'a+') as f:
            #         d = "\t".join(data)
            #         f.write(f'{d}\t{e}\n')
historycy_scrap_URLs.py (new file, 103 lines)
@@ -0,0 +1,103 @@
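# Crawls the historycy.org board index recursively (requests + BeautifulSoup) and writes
# one row per topic (URL id, title, view count, domain path) to data/historycy_URLs.tsv.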
import requests
from bs4 import BeautifulSoup as bs
import time


def get_url_soup(url):
    # Fetch and parse a page, retrying after a short sleep on empty or failed responses.
    try:
        req = requests.get(url)
        soup = bs(req.text, 'lxml')
        if len(soup) == 0:
            print('sleep')
            time.sleep(3)
            return get_url_soup(url)
    except Exception as e:
        print(e)
        return get_url_soup(url)
    return soup


def clean_text(text):
    # Collapse whitespace and map mis-encoded cp1250 bytes back to Polish characters.
    text = " ".join(text.split())
    text = text.replace('¥', 'Ą').replace('¹', 'ą')
    text = text.replace('Æ', 'Ć').replace('æ', 'ć')
    text = text.replace('Ê', 'Ę').replace('ê', 'ę')
    text = text.replace('£', 'Ł').replace('³', 'ł')
    text = text.replace('Ñ', 'Ń').replace('ñ', 'ń')
    text = text.replace('¯', 'Ż').replace('¿', 'ż')
    text = text.replace('\x8c', 'Ś').replace('\x9c', 'ś')
    text = text.replace('\x8f', 'Ź').replace('\x9f', 'ź')
    # non-printable quote bytes (assumed cp1250: \x84 and \x94)
    text = text.replace('\x84', '„').replace('\x94', '”')
    text = text.replace('\x96', '–')
    return text


def get_topics(url, soup, domains):
    # Write one TSV row per topic listed on a forum page: URL id, title, views, domain path.
    # Note: writes to the module-level `file` handle opened at the bottom of the script.
    table = soup.find_all('table', {'width': '100%', 'border': '0', 'cellspacing': '1', 'cellpadding': '4'})
    if table:  # find_all returns a (possibly empty) list, never None
        rows = table[-1].find_all('tr')
        for row in rows:
            cells = row.find_all('td')
            # skip the 'Nazwa tematu' header row
            if len(cells) == 7 and clean_text(cells[2].text) != 'Nazwa tematu':
                links = cells[2].find_all('a')
                link = links[0].get('href')
                title = clean_text(links[0].text)
                if link == '#':
                    link = links[1].get('href')
                    title = clean_text(links[1].text)
                doms = "\t".join(domains)
                link = link.replace('http://www.historycy.org/index.php?s=', '')
                views = cells[5].text.replace('.', '')
                file.write(f'{link}\t{title}\t{views}\t{doms}\n')
    return


def get_domains(url, domains, file):
    # Collect topics from every page of a (sub)forum, then recurse into its subforums.
    soup = get_url_soup(url)
    table = soup.find_all('div', {'class': 'tableborder'})
    pages = soup.find('a', {'title': 'skocz do strony...'})
    if pages is not None:
        pages = clean_text(pages.text).replace(' Strony', '')
        pages = int(pages)
        for page in range(pages):
            if page != 0:
                page_url = f'{url}&prune_day=100&sort_by=Z-A&sort_key=last_post&topicfilter=all&st={page * 100}'
                soup = get_url_soup(page_url)
                get_topics(page_url, soup, domains)
            else:
                get_topics(url, soup, domains)
    else:
        get_topics(url, soup, domains)
    if len(table) > 1:
        table = table[1]
        rows = table.find_all('tr')[1:-1]
        for row in rows:
            tds = row.find_all('td')
            if tds[0].find('img', {'alt': 'Redirect'}) is None:
                topic = tds[1].find('b')
                topic_name = clean_text(topic.text)
                topic_url = topic.find('a').get('href')
                get_domains(topic_url, domains + [topic_name], file)


def get_main_domains(url, file):
    # Iterate over the top-level categories on the board index; the 'Mównica' section is skipped.
    soup = get_url_soup(url)
    domains_0 = soup.find_all('div', {'class': 'tableborder'})[:6]
    for domain in domains_0:
        domain_0 = clean_text(domain.find('div', {'class': 'maintitle'}).text)
        domains_1 = domain.find_all('tr')[1:-1]
        for domain in domains_1:
            domain_1 = domain.find_all('td')[1].find('b')
            domain_1_name = clean_text(domain_1.text)
            domain_1_url = domain_1.find('a').get('href')
            if domain_1_name != 'Mównica':
                get_domains(domain_1_url, [domain_0, domain_1_name], file)
    return []


url = 'http://www.historycy.org/'

with open('data/historycy_URLs.tsv', 'w') as file:
    get_main_domains(url, file)
hyperreal.py (new file, 136 lines)
@@ -0,0 +1,136 @@
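# Scrapes the posts of each hyperreal.info/talk topic listed in data/hyperreal_URLs.tsv
# and writes one tab-separated row per post to data/hyperreal.tsv; topics that raise an
# exception are appended to data/hyperreal_errors_2.tsv together with the error message.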
import csv
import requests
from bs4 import BeautifulSoup as bs
import time
from datetime import datetime


def get_url_soup(url):
    # Fetch and parse a page, retrying after a short sleep on empty or failed responses.
    try:
        req = requests.get(url)
        soup = bs(req.text, 'html.parser')
        if len(soup) == 0:
            print('sleep')
            time.sleep(3)
            return get_url_soup(url)
    except Exception as e:
        print(e)
        return get_url_soup(url)
    return soup


def clean_text(text):
    return " ".join(text.split())


# def get_content(soup):
#     blockquotes = soup.find_all('blockquote')
#     content = ' '.join([p.text.strip() for p in soup])
#     for bq in blockquotes:
#         t = ' '.join([p.text.strip() for p in bq])
#         if 'pisze:' in bq.text:
#             idx = bq.text.index('pisze:') + len('pisze:')
#             t = t[:idx] + ' <cite> ' + t[idx:] + ' </cite>'
#         else:
#             t = '<cite> ' + t + ' </cite>'
#         content = content.replace(bq.text, t)
#     return clean_text(content)


def get_content(soup):
    # Flatten a post body to text, wrapping quoted material in <cite> markers.
    blockquotes = soup.find_all('blockquote')
    cite = soup.find('cite')
    if blockquotes:
        if cite is not None:
            # Quote with an author line: keep everything up to 'pisze:' ("writes:") outside the <cite>.
            soup = str(soup)
            soup = soup.replace(str(cite), ' '.join([p.text.strip() for p in cite]))
            content = ' '.join([p.text.strip() for p in bs(soup, 'html.parser')])
            content = clean_text(content)
            idx = content.index('pisze:') + len('pisze:')
            content = content[:idx] + ' <cite> ' + content[idx:] + ' </cite>'
        else:
            content = clean_text(soup.text)
            content = '<cite> ' + content + ' </cite>'
        return content
    return clean_text(' '.join([p.text.strip() for p in soup]))


def get_all_comments(url):
    # Scrape every post in a topic (10 posts per page) and return one row per post.
    result = []
    soup = get_url_soup(url)
    pages = soup.find('span', {'class': 'fw-normal'})
    print()
    print(pages)
    print()
    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))
    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            n_url = f'{url[:-5]}-{page * 10}.html'
            soup = get_url_soup(n_url)
        cards = soup.find_all('div', {'class': 'card border-info mb-2 mb-md-3 shadow-sm'})
        for card in cards:
            idx = card.get('id')[1:]
            postprofile = card.find('div', {'class': 'postprofile'})
            user = postprofile.find('div', {'class': 'mt-1 float-start float-md-none'})
            username = user.find('a')
            if username is None:
                # No linked profile: fall back to the plain text name and leave profile fields empty.
                username = clean_text(user.text)
                strong, posts, registration, karma_received, karma_assigned, narkopedia_edits, sex = '', '', '', '', '', '', ''
            else:
                username = clean_text(username.text)
                strong = user.find('strong')
                if strong is not None:
                    strong = strong.text
                else:
                    strong = ''
                modal = soup.find('div', {'id': f'modal_post_{idx}'}).find('div', {'class': 'col-8'})
                info = modal.find_all('div')
                posts = clean_text(info[0].text.replace('Posty:', ''))
                registration = clean_text(info[1].text.replace('Rejestracja:', ''))
                if len(info) > 2:
                    sex = clean_text(info[2].text.replace('Płeć:', ''))
                    if sex == 'chłopak':
                        sex = 'M'
                    elif sex == 'dziewka':
                        sex = 'F'
                else:
                    sex = ''
                score = modal.find_all('li')[-3:]
                karma_received = clean_text(score[0].text.replace('Karma otrzymana', ''))
                karma_assigned = clean_text(score[1].text.replace('Karma przydzielona', ''))
                narkopedia_edits = clean_text(score[2].text.replace('Edycje Narkopedii', ''))
            body = card.find('div', {'id': f'post_content{idx}'})
            title = clean_text(body.find('a').text)
            datetime_str = body.find('time')['datetime']
            datetime_object = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S+00:00')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            content = get_content(body.find('div', {'class': 'content'}))
            result.append([idx, '', title, username, strong, registration, posts, sex, karma_received, karma_assigned, narkopedia_edits, date, hour, '', str(n), content])
            n += 1
    # First row carries the total comment count; every row's second column gets the id of the first post.
    result[0][-3] = str(n - 1)
    for i in result:
        i[1] = result[0][0]
    return result


with open('data/hyperreal_URLs.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/hyperreal.tsv', 'w') as file:
        for data in tsv_file:
            url = f'https://hyperreal.info/talk/{data[0]}'
            print(url)
            domains = data[1:]
            m = len(domains)
            domains = domains + [''] * (6 - m)
            try:
                result = get_all_comments(url)
                for r in result:
                    r = [data[0]] + domains + r
                    file.write('\t'.join(r))
                    file.write('\n')
            except Exception as e:
                print(f'error: {url}')
                with open('data/hyperreal_errors_2.tsv', 'a+') as f:
                    d = "\t".join(data)
                    f.write(f'{d}\t{e}\n')
hyperreal_scrap_URLs.py (new file, 82 lines)
@@ -0,0 +1,82 @@
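# Walks the hyperreal.info/talk forum tree recursively and writes topic URLs together
# with their domain path to data/hyperreal_URLs.tsv.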
import requests
from bs4 import BeautifulSoup as bs
import time


def get_url_soup(url):
    # Fetch and parse a page; keep retrying (with a pause) while the response parses to nothing.
    req = requests.get(url)
    soup = bs(req.text, 'html.parser')
    l = len(soup)
    while l <= 1:
        req = requests.get(url)
        soup = bs(req.text, 'html.parser')
        l = len(soup)
        time.sleep(3)
    return soup


def clean_text(text):
    return " ".join(text.split())


def get_forum(url, domains, file):
    # Recursively walk a forum: descend into 'Forum' cards and write announcement
    # and topic URLs (with their domain path) to the output file.
    try:
        soup = get_url_soup(url)
        soup = soup.find('div', {'class': 'order-first'})
        cards = soup.find_all('div', {'class': 'card mb-3'}) + soup.find_all('div', {'class': 'card mb-3 shadow-sm'})
        for card in cards:
            title = card.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
            title = clean_text(title.text)
            if title == 'Forum':
                forum_card = card.find_all('div', {'class': 'row-item'})
                for c in forum_card:
                    a = c.find_all('a')
                    if len(a) > 1:
                        a = c.find_all('a')[1]
                        t_url = a.get('href')
                        domain = clean_text(a.text)
                        domains.append(domain)
                        get_forum(t_url, domains, file)
                        domains.pop()
            elif title == 'Ogłoszenia':
                topics = card.find_all('a', {'class': 'topictitle fs-5'})
                for topic in topics:
                    file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains + ['Ogłoszenia']) + '\n')
            elif title == 'Tematy':
                pages = soup.find('span', {'class': 'fw-normal'})
                if pages is not None:
                    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))
                    for page in range(pages):
                        n = page * 15
                        topics_url = f'{url[:-5]}-{n}.html'
                        topics_soup = get_url_soup(topics_url)
                        topics_soup = topics_soup.find('div', {'class': 'order-first'})
                        cards_topics = topics_soup.find_all('div', {'class': 'card mb-3 shadow-sm'})[-1]
                        topics = cards_topics.find_all('a', {'class': 'topictitle fs-5'})
                        for topic in topics:
                            file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains) + '\n')
    except Exception as e:
        print(f'\tERROR: {url} - {e}')


def get_main_domains(url, file):
    # Entry point: iterate over the main categories on the board index and crawl each subforum.
    soup = get_url_soup(url)
    soup = soup.find('div', {'class': 'order-first'})
    domains = soup.find_all('div', {'class': 'card mb-3'})
    for domain in domains:
        domain_0 = clean_text(domain.find('div', {'class': 'col-lg-7 col-md-6 col-12'}).text)
        topics = domain.find_all('div', {'class': 'row-item'})
        for topic in topics:
            a = topic.find_all('a')[1]
            t_url = a.get('href')
            domain_1 = clean_text(a.text)
            get_forum(t_url, [domain_0, domain_1], file)
    return []


tmp = []
url = 'https://hyperreal.info/talk/'


with open('data/hyperreal_URLs.tsv', 'w') as file:
    get_main_domains(url, file)
patient.py (new file, 122 lines)
@@ -0,0 +1,122 @@
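# Scrapes the discussions of each patient.info/forums topic listed in data/patient_URLs.tsv
# (rendered with headless Chrome via Selenium) and writes one tab-separated row per post to
# data/patient_3.tsv; failing topics are logged to patient_error_2.tsv.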
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from datetime import datetime


def get_url_soup(url):
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def check_pages_num(soup):
    # Number of pages in a thread, read from the reply pagination dropdown ('x / y').
    pages = soup.find('select', {'class': 'reply-pagination'})
    if pages is not None:
        pages = pages.find_all('option')[-1]
        pages = pages.text.split('/')[-1]
        return int(pages)
    else:
        return 1


def clean_text(text):
    return " ".join(text.split())


def set_num(n, m):
    # Position label: 'n' for a top-level reply, 'n.m' for the m-th nested reply under it.
    if m != 0:
        return str(f'{n}.{m}')
    else:
        return str(n)


def get_all_comments(url, domain_0, domain_1):
    result = []
    soup = get_url_soup(url)
    pages = check_pages_num(soup)
    comments_list = []
    n0, n = 0, 0
    for i in range(pages):
        page_url = f'{url}?order=oldest&page={i}#topic-replies'
        if i == 0:
            # First page: parse the opening post.
            main = soup.find('div', {'class': 'post__main'})
            main_title = clean_text(main.find('h1', {'class': 'post__title'}).text.strip())
            main_author = main.find('h5', {'class': 'author__info'}).text.strip()
            main_time = main.find('time', {'class': 'fuzzy'})['datetime']
            # Note: the format string consumes the trailing '+00:00' offset as seconds and microseconds.
            datetime_object = datetime.strptime(main_time, '%Y-%m-%dT%H:%M+%S:%f')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            likes, replies = soup.find_all('p', {'class': 'post__stats'})[1].text.strip().split(', ')
            likes = likes.replace(' likes', '').replace(' like', '')
            replies = replies.replace(' replies', '').replace(' reply', '')
            main_content = soup.find('div', {'class': 'post__content'}).find_all('p')[:-1]
            main_content = clean_text(' '.join([p.text.strip() for p in main_content]))
            main_id = url.split('-')[-1]
            followers = main.find('p', {'class': 'post__stats'}).find_all('span')[-1].text
            followers = followers.strip()
            if 'following' in followers:
                followers = followers.replace(' users are following.', '').replace(' user is following.', '')
            else:
                followers = ''
            main_data = [main_id, main_id, main_title, main_author, followers, date, hour, likes, replies, '0', '0', main_content]
            comments_list += [main_data]
        else:
            soup = get_url_soup(page_url)
        comments = soup.find('div', {'class': 'comment-page'})
        if comments is not None:
            comments = comments.find_all('li', {'class': 'comment'})
            m = 0
            for comment in comments:
                classes = comment.get('class')
                header = comment.find('div', {'class': 'post__header'})
                likes = comment.find('a', {'class': 'post__like'})
                if likes is not None:
                    likes = clean_text(likes.text)
                else:
                    likes = ''
                content = comment.find('div', {'class': 'post__content'})
                if content is not None:
                    content = clean_text(' '.join([q.text for q in content.findAll(text=True)]))
                    content = content.replace(f' Report / Delete {likes} Reply', '')
                    content = content.replace(' Report / Delete Reply', '')
                if header is not None:
                    # Nested replies keep the parent's counter n and advance m instead.
                    if 'comment--nested' in classes:
                        m += 1
                    else:
                        m = 0
                        n += 1
                    n0 += 1
                    idx = comment['itemid'].split('#')[-1]
                    user = header.find('a', {'class': 'author__name'}).text.strip()
                    time_str = comment.find('time', {'class': 'fuzzy'})['datetime']
                    datetime_object = datetime.strptime(time_str, '%Y-%m-%dT%H:%M+%S:%f')
                    date = str(datetime_object.date())
                    hour = str(datetime_object.time())
                    likes = comment.find('a', {'class': 'post__like'})
                    if likes is not None:
                        likes = clean_text(likes.text)
                    else:
                        likes = ''
                    n_n = set_num(n, m)
                    comments_list += [[idx, main_id, main_title, user, '', date, hour, likes, '', str(n0), n_n, content]]
    comments_list[0][8] = str(n0)
    return comments_list


DRIVER_PATH = '/usr/bin/chromedriver'
options = Options()
options.headless = True
options.add_argument('--incognito')
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)


with open("data/patient_URLs.tsv") as file:
|
||||
tsv_file = csv.reader(file, delimiter="\t")
|
||||
with open('data/patient_3.tsv','w') as file:
|
||||
# file.write('\t'.join(['topic_URL', 'domain_0', 'domain_1', 'id', 'main_id', 'title', 'user', 'followers', 'date', 'hour', 'likes', 'comments', 'n', 'm', 'content']) + '\n')
|
||||
for topic_url, domain_0, domain_1 in list(tsv_file)[538775:]:
|
||||
try:
|
||||
result = get_all_comments('https://patient.info/forums/discuss/' + topic_url, domain_0, domain_1)
|
||||
for r in result:
|
||||
r = [topic_url, domain_0, domain_1] + r
|
||||
file.write('\t'.join(r))
|
||||
file.write('\n')
|
||||
except Exception as e:
|
||||
print(f'error: {topic_url}')
|
||||
with open('patient_error_2.tsv', 'a+') as f:
|
||||
f.write(f'{topic_url}\t{e}\n')
|
patient_scrap_URLs.py (new file, 51 lines)
@@ -0,0 +1,51 @@
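# Collects discussion URLs from patient.info/forums with cloudscraper and writes each topic
# URL with its two domain levels to data/patient_URLs.tsv.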
import time

import requests
import cloudscraper
from bs4 import BeautifulSoup


def get_url_soup(url):
    # Plain-requests fetch helper with retry; not called below (the cloudscraper session is used instead).
    time.sleep(1)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    l = len(soup)
    while l <= 1:
        req = requests.get(url)
        soup = BeautifulSoup(req.text, 'html.parser')
        l = len(soup)
        time.sleep(3)
    return soup


scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0'})
url = 'https://patient.info/forums'
req = scraper.get(url)
soup = BeautifulSoup(req.text, 'lxml')
forums = [(a.get('href'), a.text.strip()) for a in soup.find_all('a', {'class': 'con-meds-lnk'})]

with open('data/patient_URLs.tsv', 'w') as file:
    for url, d0 in forums:
        # Level 0: forum category page.
        url = f'https://patient.info{url}'
        print(url)
        req = scraper.get(url)
        soup = BeautifulSoup(req.text, 'lxml')
        domains = soup.find_all('h3', {'class': 'title'})
        # domains = [d.find('a').get('href') for d in domains]
        for d in domains:
            # Level 1: discussion group; walk all of its pages and record every topic URL.
            d1 = d.text.strip()
            d = d.find('a').get('href')
            print('\t', d.replace('/forums/discuss/browse/', ''))
            url = f'https://patient.info{d}'
            req = scraper.get(url)
            soup = BeautifulSoup(req.text, 'lxml')
            pages = soup.find('select', {'class': 'reply-pagination'})
            if pages is not None:
                pages = pages.find_all('option')[-1]
                pages = pages.text.split('/')[-1]
                pages = int(pages)
            else:
                pages = 1
            for p in range(pages):
                page_url = f'https://patient.info{d}?page={p}#group-discussions'
                req = scraper.get(page_url)
                soup = BeautifulSoup(req.text, 'lxml')
                posts = soup.find_all('h3', {'class': 'post__title'})
                for post in posts:
                    href = post.find('a').get('href')
                    file.write(f'{href.replace("/forums/discuss/", "")}\t{d0}\t{d1}\n')
requirements.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
beautifulsoup4==4.11.1
cloudscraper==1.2.65
lxml==4.9.1
selenium==4.7.0
webdriver-manager==3.8.5
pandas