51 lines
1.9 KiB
Python
51 lines
1.9 KiB
Python
|
import time

import cloudscraper
from bs4 import BeautifulSoup
|
||
|
|
||
|
def get_url_soup(url, max_retries=None):
    """Fetch *url* and return its HTML parsed with ``html.parser``.

    The response is treated as unusable while the parsed document has at
    most one top-level node (``len(soup) <= 1``); in that case the request
    is repeated after a 3-second pause.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of repeat requests after the first one.
            ``None`` (the default) keeps retrying until a usable document
            is returned.

    Returns:
        The ``BeautifulSoup`` document of the last response. When
        ``max_retries`` is exhausted this may still be an (almost) empty
        document.
    """
    # Use the HTTP client and parser this file actually imports; the names
    # previously used here were never imported, so any call raised NameError.
    session = cloudscraper.create_scraper()

    time.sleep(1)  # fixed pause before every fetch to throttle the crawl
    retries_done = 0
    while True:
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        if len(soup) > 1:
            return soup
        if max_retries is not None and retries_done >= max_retries:
            return soup
        retries_done += 1
        # Back off *before* asking again so retries are not fired
        # back-to-back with the failed request.
        time.sleep(3)
|
||
|
|
||
|
# Crawl patient.info: forum index -> category pages -> group pages ->
# paginated discussion listings, writing one TSV row per discussion:
#   <discussion slug> \t <category link text> \t <group heading text>
scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0',})
url = 'https://patient.info/forums'
req = scraper.get(url)
soup = BeautifulSoup(req.text, 'lxml')
forums = [(a.get('href'), a.text.strip()) for a in soup.find_all('a', {'class': 'con-meds-lnk'})]

# Explicit UTF-8 so titles with non-ASCII characters are written the same
# way regardless of the platform's default encoding.
with open('data/patient_URLs.tsv', 'w', encoding='utf-8') as file:
    for forum_href, d0 in forums:
        if not forum_href:
            # Anchor without an href: there is no category page to visit.
            continue
        forum_url = f'https://patient.info{forum_href}'
        print(forum_url)
        forum_soup = BeautifulSoup(scraper.get(forum_url).text, 'lxml')

        for heading in forum_soup.find_all('h3', {'class': 'title'}):
            d1 = heading.text.strip()
            heading_link = heading.find('a')
            group_href = heading_link.get('href') if heading_link is not None else None
            if not group_href:
                # Heading that is not a link to a group; skip it instead of crashing.
                continue
            print('\t', group_href.replace('/forums/discuss/browse/', ''))

            # The group's landing page is fetched only to read the pager.
            group_soup = BeautifulSoup(scraper.get(f'https://patient.info{group_href}').text, 'lxml')
            pager = group_soup.find('select', {'class': 'reply-pagination'})
            pages = 1  # no pager (or an unreadable one) means a single page
            if pager is not None:
                options = pager.find_all('option')
                if options:
                    # The page count is taken from the text after the last
                    # '/' in the final option.
                    try:
                        pages = int(options[-1].text.split('/')[-1])
                    except ValueError:
                        pages = 1

            # NOTE(review): page numbers are requested starting at 0, as in
            # the original script; confirm the site's numbering is zero-based.
            for p in range(pages):
                page_url = f'https://patient.info{group_href}?page={p}#group-discussions'
                page_soup = BeautifulSoup(scraper.get(page_url).text, 'lxml')
                for post in page_soup.find_all('h3', {'class': 'post__title'}):
                    post_link = post.find('a')
                    href = post_link.get('href') if post_link is not None else None
                    if not href:
                        continue
                    file.write(f'{href.replace("/forums/discuss/", "")}\t{d0}\t{d1}\n')
|