1
0
forum-scrapers/patient_scrap_URLs.py

51 lines
1.9 KiB
Python
Raw Normal View History

2023-01-02 09:30:54 +01:00
from bs4 import BeautifulSoup
import cloudscraper
def get_url_soup(url):
    """Fetch *url* and return it parsed with BeautifulSoup's 'html.parser'.

    The page is requested through the module-level ``scraper`` session (the
    same one the rest of this script uses).  If the parsed document comes
    back with ``len(soup) <= 1`` the request is repeated, pausing 3 seconds
    before each new attempt, until a larger document is received.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The ``BeautifulSoup`` object for the first response whose parsed
        length is greater than 1.

    Note:
        There is no retry limit: the loop keeps requesting for as long as
        the server keeps returning a near-empty document.
    """
    # Local import: this file does not import `time` at module level.
    import time

    # Short pause before every fetch so consecutive calls are spaced out.
    time.sleep(1)
    soup = BeautifulSoup(scraper.get(url).text, 'html.parser')
    while len(soup) <= 1:
        # Wait before asking again rather than re-requesting immediately.
        time.sleep(3)
        soup = BeautifulSoup(scraper.get(url).text, 'html.parser')
    return soup
# Crawl the patient.info forum index and write one TSV row per discussion:
#   <discussion path>\t<forum category name>\t<group name>
# Every request goes through a single shared cloudscraper session.
scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0',})


def _fetch_soup(page_url):
    """Download *page_url* with the shared scraper and parse the body with lxml."""
    return BeautifulSoup(scraper.get(page_url).text, 'lxml')


def _link_href(tag):
    """Return the href of the first <a> inside *tag*, or None if it has none."""
    anchor = tag.find('a')
    if anchor is None:
        return None
    return anchor.get('href')


# Step 1: collect (relative link, name) for every 'con-meds-lnk' anchor on the index.
url = 'https://patient.info/forums'
soup = _fetch_soup(url)
forums = [(a.get('href'), a.text.strip()) for a in soup.find_all('a', {'class': 'con-meds-lnk'})]

# The encoding is explicit so non-ASCII names are written identically on
# every platform instead of depending on the locale default.
with open('data/patient_URLs.tsv', 'w', encoding='utf-8') as file:
    for url, d0 in forums:
        # Step 2: open the category page and list its groups (<h3 class="title">).
        url = f'https://patient.info{url}'
        print(url)
        soup = _fetch_soup(url)
        domains = soup.find_all('h3', {'class': 'title'})
        for heading in domains:
            d1 = heading.text.strip()
            d = _link_href(heading)
            if d is None:
                # A heading without a usable link gives nothing to crawl.
                continue
            print('\t', d.replace('/forums/discuss/browse/', ''))

            # Step 3: read the page count from the group's pagination <select>.
            url = f'https://patient.info{d}'
            soup = _fetch_soup(url)
            pages = soup.find('select', {'class': 'reply-pagination'})
            if pages is not None:
                # The text after the last '/' in the final <option> is
                # parsed as the total number of pages.
                last_option = pages.find_all('option')[-1]
                pages = int(last_option.text.split('/')[-1])
            else:
                # No pagination control: treat the group as a single page.
                pages = 1

            # Step 4: visit each page (page numbers 0 .. pages-1) and record
            # the link of every <h3 class="post__title"> entry.
            for p in range(pages):
                page_url = f'https://patient.info{d}?page={p}#group-discussions'
                soup = _fetch_soup(page_url)
                posts = soup.find_all('h3', {'class': 'post__title'})
                for post in posts:
                    href = _link_href(post)
                    if href is None:
                        # Skip titles that carry no link instead of crashing.
                        continue
                    file.write(f'{href.replace("/forums/discuss/", "")}\t{d0}\t{d1}\n')