upload scripts

fraszosia@gmail.com 2023-01-02 09:30:54 +01:00
commit 04ac541e2d
7 changed files with 637 additions and 0 deletions

137
historycy.py Normal file

@@ -0,0 +1,137 @@
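# Scrapes every post from the historycy.org topics listed in data/historycy_errors.tsv
# (topics recorded by earlier failed runs) using Selenium/Chrome and writes one
# tab-separated row per post to data/historycy_2.tsv.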
import csv
import requests
# from bs4 import BeautifulSoup as bs
import time
from datetime import datetime
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def get_url_soup(url):
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def clean_text(text):
    text = " ".join(text.split())
    text = text.replace('¥', 'Ą').replace('¹', 'ą')
    text = text.replace('Æ', 'Ć').replace('æ', 'ć')
    text = text.replace('Ê', 'Ę').replace('ê', 'ę')
    text = text.replace('£', 'Ł').replace('³', 'ł')
    text = text.replace('Ñ', 'Ń').replace('ñ', 'ń')
    text = text.replace('¯', 'Ż').replace('¿', 'ż')
    text = text.replace('\x8c', 'Ś').replace('\x9c', 'ś')
    text = text.replace('\x8f', 'Ź').replace('\x9f', 'ź')
    text = text.replace('„', '').replace('”', '')
    text = text.replace('\x96', '')
    return text


def get_content(soup):
    links = soup.find_all('a')
    cites = soup.find_all('div', {'class': 'quotemain'})
    content = ' '.join([p.text.strip() for p in BeautifulSoup(str(soup), 'html.parser').find_all(text=True)])
    for link in links:
        if clean_text(link.text).startswith('http'):
            content = content.replace(str(link.text), f' {str(link.get("href"))} ')
    for cite in cites:
        c = ' '.join([p.text.strip() for p in BeautifulSoup(str(cite), 'html.parser').find_all(text=True)])
        content = content.replace(c, f' <cite> {c} </cite> ')
    return clean_text(content)


def get_all_comments(url):
    print(url)
    result = []
    soup = get_url_soup(url)
    pages = soup.find('a', {'title': 'skocz do strony...'})
    if pages != None:
        pages = int(clean_text(pages.text).replace(' Strony', ''))
    else:
        pages = 1
    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            n_url = f'{url}&st={page * 15}.html'
            soup = get_url_soup(n_url)
        cards = soup.find('div', {'class': 'tableborder'})
        cards = cards.find_all('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
        for card in cards[:30]:
            if card.find('td', {'valign': 'top'}) and card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'}):
                card = card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
                card = card.find('tbody').find_all('tr')[1]
                info, content = card.find_all('td', {'class': 'row4'})
                username = clean_text(info.find('tr').text)
                score = info.find_all('tr')[1].find_all('td', {'class': 'posterinfo'})
                ranga = clean_text(score[0].text).replace(' ranga', '')
                group = clean_text(score[2].text).replace('Grupa: ', '')
                posts = clean_text(score[3].text).replace('Postów: ', '').replace('.', '')
                user_id = clean_text(score[4].text).replace('Nr użytkownika: ', '').replace('.', '')
                name_surname, degree, profession = '', '', ''
                if len(score) > 6:
                    for s in score[6:]:
                        if 'Stopień akademicki: ' in s.text:
                            degree = clean_text(s.text).replace('Stopień akademicki: ', '')
                        elif 'Zawód: ' in s.text:
                            profession = clean_text(s.text).replace('Zawód: ', '')
                        else:
                            name_surname = clean_text(s.text)
                time = clean_text(content.find('span', {'class': 'postdetails'}).text)
                time = time.replace('Dzisiaj', '2/12/2022').replace('Wczoraj', '1/12/2022')
                datetime_object = datetime.strptime(time, '%d/%m/%Y, %H:%M')
                date = str(datetime_object.date())
                hour = str(datetime_object.time())
                content = content.find('span', {'class': 'postcolor'})
                content = get_content(content)
                result.append([username, user_id, name_surname, degree, ranga, profession, group, posts, date, hour, '', str(n), content])
                n += 1
    result[0][-3] = str(n - 1)
    return result


options = Options()
options.headless = True
options.add_argument('--no-sandbox')
options.incognito = True
driver = webdriver.Chrome(options=options)
print()

with open('data/historycy_errors.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/historycy_2.tsv', 'w') as file:
        # header = '\t'.join(['topic_URL', 'domain_0', 'domain_1', 'domain_2', 'domain_3', 'domain_4', 'topic', 'views', 'username', 'user_id', 'name_surname', 'degree', 'position', 'profession', 'group', 'posts', 'date', 'hour', 'comments', 'n', 'content'])
        # file.write(header + '\n')
        for data in tsv_file:
            url = f'http://www.historycy.org/index.php?s={data[0]}'
            domains = data[3:-1]
            m = len(domains)
            domains = domains + [''] * (5 - m)
            # try:
            result = get_all_comments(url)
            flag = True
            for r in result:
                if flag:
                    views = data[2]
                r = [data[0]] + domains + [data[1]] + [views] + r
                if flag:
                    views = ''
                    flag = False
                file.write('\t'.join(r))
                file.write('\n')
            # except Exception as e:
            #     print(f'error: {url}')
            #     with open('data/historycy_errors.tsv', 'a+') as f:
            #         d = "\t".join(data)
            #         f.write(f'{d}\t{e}\n')

103
historycy_scrap_URLs.py Normal file

@@ -0,0 +1,103 @@
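# Crawls the historycy.org forum tree with requests/BeautifulSoup (skipping the
# 'Mównica' forum) and writes one line per topic — URL id, title, view count, and
# domain path — to data/historycy_URLs.tsv.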
import requests
from bs4 import BeautifulSoup as bs
import time


def get_url_soup(url):
    try:
        req = requests.get(url)
        soup = bs(req.text, 'lxml')
        if len(soup) == 0:
            # Empty response: back off and retry.
            print('sleep')
            time.sleep(3)
            return get_url_soup(url)
    except Exception as e:
        print(e)
        return get_url_soup(url)
    return soup


def clean_text(text):
    text = " ".join(text.split())
    text = text.replace('¥', 'Ą').replace('¹', 'ą')
    text = text.replace('Æ', 'Ć').replace('æ', 'ć')
    text = text.replace('Ê', 'Ę').replace('ê', 'ę')
    text = text.replace('£', 'Ł').replace('³', 'ł')
    text = text.replace('Ñ', 'Ń').replace('ñ', 'ń')
    text = text.replace('¯', 'Ż').replace('¿', 'ż')
    text = text.replace('\x8c', 'Ś').replace('\x9c', 'ś')
    text = text.replace('\x8f', 'Ź').replace('\x9f', 'ź')
    text = text.replace('„', '').replace('”', '')
    text = text.replace('\x96', '')
    return text


def get_topics(url, soup, domains):
    # Writes matching topics to the module-level `file` handle opened at the bottom of the script.
    table = soup.find_all('table', {'width': '100%', 'border': '0', 'cellspacing': '1', 'cellpadding': '4'})
    if table != None:
        rows = table[-1].find_all('tr')
        for row in rows:
            cells = row.find_all('td')
            if len(cells) == 7 and cells[2] != 'Nazwa tematu':
                links = cells[2].find_all('a')
                link = links[0].get('href')
                title = clean_text(links[0].text)
                if link == '#':
                    link = links[1].get('href')
                    title = clean_text(links[1].text)
                doms = "\t".join(domains)
                link = link.replace('http://www.historycy.org/index.php?s=', '')
                views = cells[5].text.replace('.', '')
                file.write(f'{link}\t{title}\t{views}\t{doms}\n')
    return


def get_domains(url, domains, file):
    soup = get_url_soup(url)
    table = soup.find_all('div', {'class': 'tableborder'})
    pages = soup.find('a', {'title': 'skocz do strony...'})
    if pages != None:
        pages = clean_text(pages.text).replace(' Strony', '')
        pages = int(pages)
        for page in range(pages):
            if page != 0:
                page_url = f'{url}&prune_day=100&sort_by=Z-A&sort_key=last_post&topicfilter=all&st={page * 100}'
                soup = get_url_soup(page_url)
                get_topics(page_url, soup, domains)
            else:
                get_topics(url, soup, domains)
    else:
        get_topics(url, soup, domains)
    if len(table) > 1:
        table = table[1]
        rows = table.find_all('tr')[1:-1]
        for row in rows:
            tds = row.find_all('td')
            if tds[0].find('img', {'alt': 'Redirect'}) == None:
                topic = tds[1].find('b')
                topic_name = clean_text(topic.text)
                topic_url = topic.find('a').get('href')
                get_domains(topic_url, domains + [topic_name], file)


def get_main_domains(url, file):
    soup = get_url_soup(url)
    domains_0 = soup.find_all('div', {'class': 'tableborder'})[:6]
    for domain in domains_0:
        domain_0 = clean_text(domain.find('div', {'class': 'maintitle'}).text)
        domains_1 = domain.find_all('tr')[1:-1]
        for domain in domains_1:
            domain_1 = domain.find_all('td')[1].find('b')
            domain_1_name = clean_text(domain_1.text)
            domain_1_url = domain_1.find('a').get('href')
            if domain_1_name != 'Mównica':
                get_domains(domain_1_url, [domain_0, domain_1_name], file)
    return []


url = 'http://www.historycy.org/'
with open('data/historycy_URLs.tsv', 'w') as file:
    get_main_domains(url, file)

136
hyperreal.py Normal file

@@ -0,0 +1,136 @@
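# Scrapes every post from the hyperreal.info/talk topics listed in data/hyperreal_URLs.tsv
# with requests/BeautifulSoup and writes tab-separated rows to data/hyperreal.tsv;
# topics that raise an exception are appended to data/hyperreal_errors_2.tsv.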
import csv
import requests
from bs4 import BeautifulSoup as bs
import time
from datetime import datetime


def get_url_soup(url):
    try:
        req = requests.get(url)
        soup = bs(req.text, 'html.parser')
        if len(soup) == 0:
            # Empty response: back off and retry.
            print('sleep')
            time.sleep(3)
            return get_url_soup(url)
    except Exception as e:
        print(e)
        return get_url_soup(url)
    return soup


def clean_text(text):
    return " ".join(text.split())


# def get_content(soup):
#     blockquotes = soup.find_all('blockquote')
#     content = ' '.join([p.text.strip() for p in soup])
#     for bq in blockquotes:
#         t = ' '.join([p.text.strip() for p in bq])
#         if 'pisze:' in bq.text:
#             idx = bq.text.index('pisze:') + len('pisze:')
#             t = t[:idx] + ' <cite> ' + t[idx:] + ' </cite>'
#         else:
#             t = '<cite> ' + t + ' </cite>'
#         content = content.replace(bq.text, t)
#     return clean_text(content)


def get_content(soup):
    blockquotes = soup.find_all('blockquote')
    cite = soup.find('cite')
    if blockquotes != []:
        if cite != None:
            soup = str(soup)
            soup = soup.replace(str(cite), ' '.join([p.text.strip() for p in cite]))
            content = ' '.join([p.text.strip() for p in bs(soup, 'html.parser')])
            content = clean_text(content)
            idx = content.index('pisze:') + len('pisze:')
            content = content[:idx] + ' <cite> ' + content[idx:] + ' </cite>'
        else:
            content = clean_text(soup.text)
            content = '<cite> ' + content + ' </cite>'
        return content
    return clean_text(' '.join([p.text.strip() for p in soup]))


def get_all_comments(url):
    result = []
    soup = get_url_soup(url)
    pages = soup.find('span', {'class': 'fw-normal'})
    print()
    print(pages)
    print()
    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))
    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            n_url = f'{url[:-5]}-{page * 10}.html'
            soup = get_url_soup(n_url)
        cards = soup.find_all('div', {'class': 'card border-info mb-2 mb-md-3 shadow-sm'})
        for card in cards:
            idx = card.get('id')[1:]
            postprofile = card.find('div', {'class': 'postprofile'})
            user = postprofile.find('div', {'class': 'mt-1 float-start float-md-none'})
            username = user.find('a')
            if username == None:
                username = clean_text(user.text)
                strong, posts, registration, karma_received, karma_assigned, narkopedia_edits, sex = '', '', '', '', '', '', ''
            else:
                username = clean_text(username.text)
                strong = user.find('strong')
                if strong != None: strong = strong.text
                else: strong = ''
                modal = soup.find('div', {'id': f'modal_post_{idx}'}).find('div', {'class': 'col-8'})
                info = modal.find_all('div')
                posts = clean_text(info[0].text.replace('Posty:', ''))
                registration = clean_text(info[1].text.replace('Rejestracja:', ''))
                sex = ''  # default when the profile does not list 'Płeć'
                if len(info) > 2:
                    sex = clean_text(info[2].text.replace('Płeć:', ''))
                    if sex == 'chłopak': sex = 'M'
                    elif sex == 'dziewka': sex = 'F'
                    else: sex = ''
                score = modal.find_all('li')[-3:]
                karma_received = clean_text(score[0].text.replace('Karma otrzymana', ''))
                karma_assigned = clean_text(score[1].text.replace('Karma przydzielona', ''))
                narkopedia_edits = clean_text(score[2].text.replace('Edycje Narkopedii', ''))
            body = card.find('div', {'id': f'post_content{idx}'})
            title = clean_text(body.find('a').text)
            datetime_str = body.find('time')['datetime']
            datetime_object = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S+00:00')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            content = get_content(body.find('div', {'class': 'content'}))
            result.append([idx, '', title, username, strong, registration, posts, sex, karma_received, karma_assigned, narkopedia_edits, date, hour, '', str(n), content])
            n += 1
    result[0][-3] = str(n - 1)
    for i in result:
        i[1] = result[0][0]
    return result


with open('data/hyperreal_URLs.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/hyperreal.tsv', 'w') as file:
        for data in tsv_file:
            url = f'https://hyperreal.info/talk/{data[0]}'
            print(url)
            domains = data[1:]
            m = len(domains)
            domains = domains + [''] * (6 - m)
            try:
                result = get_all_comments(url)
                for r in result:
                    r = [data[0]] + domains + r
                    file.write('\t'.join(r))
                    file.write('\n')
            except Exception as e:
                print(f'error: {url}')
                with open('data/hyperreal_errors_2.tsv', 'a+') as f:
                    d = "\t".join(data)
                    f.write(f'{d}\t{e}\n')

82
hyperreal_scrap_URLs.py Normal file

@@ -0,0 +1,82 @@
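# Crawls the hyperreal.info/talk forum tree with requests/BeautifulSoup and writes
# one line per topic — URL plus its domain path — to data/hyperreal_URLs.tsv.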
import requests
from bs4 import BeautifulSoup as bs
import time


def get_url_soup(url):
    req = requests.get(url)
    soup = bs(req.text, 'html.parser')
    l = len(soup)
    while l <= 1:
        req = requests.get(url)
        soup = bs(req.text, 'html.parser')
        l = len(soup)
        time.sleep(3)
    return soup


def clean_text(text):
    return " ".join(text.split())


def get_forum(url, domains, file):
    try:
        soup = get_url_soup(url)
        soup = soup.find('div', {'class': 'order-first'})
        cards = soup.find_all('div', {'class': 'card mb-3'}) + soup.find_all('div', {'class': 'card mb-3 shadow-sm'})
        for card in cards:
            title = card.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
            title = clean_text(title.text)
            if title == 'Forum':
                forum_card = card.find_all('div', {'class': 'row-item'})
                for c in forum_card:
                    a = c.find_all('a')
                    if len(a) > 1:
                        a = c.find_all('a')[1]
                        t_url = a.get('href')
                        domain = clean_text(a.text)
                        domains.append(domain)
                        get_forum(t_url, domains, file)
                        domains.pop()
            elif title == 'Ogłoszenia':
                topics = card.find_all('a', {'class': 'topictitle fs-5'})
                for topic in topics:
                    file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains + ['Ogłoszenia']) + '\n')
            elif title == 'Tematy':
                pages = soup.find('span', {'class': 'fw-normal'})
                if pages != None:
                    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))
                    for page in range(pages):
                        n = page * 15
                        topics_url = f'{url[:-5]}-{n}.html'
                        topics_soup = get_url_soup(topics_url)
                        topics_soup = topics_soup.find('div', {'class': 'order-first'})
                        cards_topics = topics_soup.find_all('div', {'class': 'card mb-3 shadow-sm'})[-1]
                        topics = cards_topics.find_all('a', {'class': 'topictitle fs-5'})
                        for topic in topics:
                            file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains) + '\n')
    except Exception as e:
        print(f'\tERROR: {url} - {e}')


def get_main_domains(url, file):
    soup = get_url_soup(url)
    soup = soup.find('div', {'class': 'order-first'})
    domains = soup.find_all('div', {'class': 'card mb-3'})
    for domain in domains:
        domain_0 = clean_text(domain.find('div', {'class': 'col-lg-7 col-md-6 col-12'}).text)
        topics = domain.find_all('div', {'class': 'row-item'})
        for topic in topics:
            a = topic.find_all('a')[1]
            t_url = a.get('href')
            domain_1 = clean_text(a.text)
            get_forum(t_url, [domain_0, domain_1], file)
    return []


tmp = []
url = 'https://hyperreal.info/talk/'
with open('data/hyperreal_URLs.tsv', 'w') as file:
    get_main_domains(url, file)

122
patient.py Normal file

@@ -0,0 +1,122 @@
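# Scrapes patient.info discussions listed in data/patient_URLs.tsv (resuming from row
# 538775) with Selenium/Chrome and writes tab-separated rows to data/patient_3.tsv;
# topics that raise an exception are appended to patient_error_2.tsv.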
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from datetime import datetime


def get_url_soup(url):
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def check_pages_num(soup):
    pages = soup.find('select', {'class': 'reply-pagination'})
    if pages != None:
        pages = pages.find_all('option')[-1]
        pages = pages.text.split('/')[-1]
        return int(pages)
    else:
        return 1


def clean_text(text):
    return " ".join(text.split())


def set_num(n, m):
    if m != 0:
        return str(f'{n}.{m}')
    else:
        return str(n)


def get_all_comments(url, domain_0, domain_1):
    result = []
    soup = get_url_soup(url)
    pages = check_pages_num(soup)
    comments_list = []
    n0, n = 0, 0
    for i in range(pages):
        page_url = f'{url}?order=oldest&page={i}#topic-replies'
        if i == 0:
            main = soup.find('div', {'class': 'post__main'})
            main_title = clean_text(main.find('h1', {'class': 'post__title'}).text.strip())
            main_author = main.find('h5', {'class': 'author__info'}).text.strip()
            main_time = main.find('time', {'class': 'fuzzy'})['datetime']
            datetime_object = datetime.strptime(main_time, '%Y-%m-%dT%H:%M+%S:%f')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            likes, replies = soup.find_all('p', {'class': 'post__stats'})[1].text.strip().split(', ')
            likes = likes.replace(' likes', '').replace(' like', '')
            replies = replies.replace(' replies', '').replace(' reply', '')
            main_content = soup.find('div', {'class': 'post__content'}).find_all('p')[:-1]
            main_content = clean_text(' '.join([p.text.strip() for p in main_content]))
            main_id = url.split('-')[-1]
            followers = main.find('p', {'class': 'post__stats'}).find_all('span')[-1].text
            followers = followers.strip()
            if 'following' in followers:
                followers = followers.replace(' users are following.', '').replace(' user is following.', '')
            else:
                followers = ''
            main_data = [main_id, main_id, main_title, main_author, followers, date, hour, likes, replies, '0', '0', main_content]
            comments_list += [main_data]
        else:
            soup = get_url_soup(page_url)
        comments = soup.find('div', {'class': 'comment-page'})
        if comments != None:
            comments = comments.find_all('li', {'class': 'comment'})
            m = 0
            for comment in comments:
                classes = comment.get('class')
                header = comment.find('div', {'class': 'post__header'})
                likes = comment.find('a', {'class': 'post__like'})
                if likes != None:
                    likes = clean_text(likes.text)
                else:
                    likes = ''
                content = comment.find('div', {'class': 'post__content'})
                if content != None:
                    content = clean_text(' '.join([q.text for q in content.findAll(text=True)]))
                    content = content.replace(f' Report / Delete {likes} Reply', '')
                    content = content.replace(f' Report / Delete Reply', '')
                if header != None:
                    if 'comment--nested' in classes:
                        m += 1
                    else:
                        m = 0
                        n += 1
                    n0 += 1
                    idx = comment['itemid'].split('#')[-1]
                    user = header.find('a', {'class': 'author__name'}).text.strip()
                    time = comment.find('time', {'class': 'fuzzy'})['datetime']
                    datetime_object = datetime.strptime(time, '%Y-%m-%dT%H:%M+%S:%f')
                    date = str(datetime_object.date())
                    hour = str(datetime_object.time())
                    likes = comment.find('a', {'class': 'post__like'})
                    if likes != None:
                        likes = clean_text(likes.text)
                    else:
                        likes = ''
                    n_n = set_num(n, m)
                    comments_list += [[idx, main_id, main_title, user, '', date, hour, likes, '', str(n0), n_n, content]]
    comments_list[0][8] = str(n0)
    return comments_list


DRIVER_PATH = '/usr/bin/chromedriver'
options = Options()
options.headless = True
options.incognito = True
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

with open("data/patient_URLs.tsv") as file:
    tsv_file = csv.reader(file, delimiter="\t")
    with open('data/patient_3.tsv', 'w') as file:
        # file.write('\t'.join(['topic_URL', 'domain_0', 'domain_1', 'id', 'main_id', 'title', 'user', 'followers', 'date', 'hour', 'likes', 'comments', 'n', 'm', 'content']) + '\n')
        for topic_url, domain_0, domain_1 in list(tsv_file)[538775:]:
            try:
                result = get_all_comments('https://patient.info/forums/discuss/' + topic_url, domain_0, domain_1)
                for r in result:
                    r = [topic_url, domain_0, domain_1] + r
                    file.write('\t'.join(r))
                    file.write('\n')
            except Exception as e:
                print(f'error: {topic_url}')
                with open('patient_error_2.tsv', 'a+') as f:
                    f.write(f'{topic_url}\t{e}\n')

51
patient_scrap_URLs.py Normal file

@@ -0,0 +1,51 @@
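# Walks the patient.info forum index with cloudscraper and writes one line per
# discussion — URL, forum group, and subgroup — to data/patient_URLs.tsv.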
from bs4 import BeautifulSoup
import cloudscraper
# The imports below are needed only by the unused get_url_soup fallback helper.
import requests
import time
from bs4 import BeautifulSoup as bs


def get_url_soup(url):
    time.sleep(1)
    req = requests.get(url)
    soup = bs(req.text, 'html.parser')
    l = len(soup)
    while l <= 1:
        req = requests.get(url)
        soup = bs(req.text, 'html.parser')
        l = len(soup)
        time.sleep(3)
    return soup


scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0',})
url = 'https://patient.info/forums'
req = scraper.get(url)
soup = BeautifulSoup(req.text, 'lxml')
forums = [(a.get('href'), a.text.strip()) for a in soup.find_all('a', {'class': 'con-meds-lnk'})]

with open('data/patient_URLs.tsv', 'w') as file:
    for url, d0 in forums:
        url = f'https://patient.info{url}'
        print(url)
        req = scraper.get(url)
        soup = BeautifulSoup(req.text, 'lxml')
        domains = soup.find_all('h3', {'class': 'title'})
        # domains = [d.find('a').get('href') for d in domains]
        for d in domains:
            d1 = d.text.strip()
            d = d.find('a').get('href')
            print('\t', d.replace('/forums/discuss/browse/', ''))
            url = f'https://patient.info{d}'
            req = scraper.get(url)
            soup = BeautifulSoup(req.text, 'lxml')
            pages = soup.find('select', {'class': 'reply-pagination'})
            if pages != None:
                pages = pages.find_all('option')[-1]
                pages = pages.text.split('/')[-1]
                pages = int(pages)
            else:
                pages = 1
            for p in range(pages):
                page_url = f'https://patient.info{d}?page={p}#group-discussions'
                req = scraper.get(page_url)
                soup = BeautifulSoup(req.text, 'lxml')
                posts = soup.find_all('h3', {'class': 'post__title'})
                for post in posts:
                    href = post.find('a').get('href')
                    file.write(f'{href.replace("/forums/discuss/", "")}\t{d0}\t{d1}\n')

6
requirements.txt Normal file

@@ -0,0 +1,6 @@
beautifulsoup4==4.11.1
cloudscraper==1.2.65
lxml==4.9.1
selenium==4.7.0
webdriver-manager==3.8.5
pandas