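# Scrape https://patient.info/forums: enumerate every forum, each forum's
# discussion groups, and every thread in each group, writing the thread
# URLs to data/patient_URLs.tsv.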
import time

import requests
import cloudscraper
from bs4 import BeautifulSoup
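

# Helper for pages that don't need the Cloudflare workaround: fetch with
# plain requests and retry until the response parses to a non-empty tree.
# (Defined but never called below; the forum pages all go through cloudscraper.)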
def get_url_soup(url):
    time.sleep(1)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    while len(soup) <= 1:
        # An (almost) empty parse tree means the response had no usable body,
        # so refetch, then back off before re-checking.
        req = requests.get(url)
        soup = BeautifulSoup(req.text, 'html.parser')
        time.sleep(3)
    return soup
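

# cloudscraper transparently solves Cloudflare's anti-bot challenge; delay=10
# waits out the challenge page, and browser={'custom': ...} sets a custom
# User-Agent string.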
scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0'})
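
# Index page: collect an (href, forum name) pair for every listed forum.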
url = 'https://patient.info/forums'
req = scraper.get(url)
soup = BeautifulSoup(req.text, 'lxml')
forums = [(a.get('href'), a.text.strip()) for a in soup.find_all('a', {'class': 'con-meds-lnk'})]
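
# Walk forum -> discussion group -> paginated thread list, writing one TSV
# row per thread.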
with open('data/patient_URLs.tsv', 'w') as file:
    for url, d0 in forums:
        url = f'https://patient.info{url}'
        print(url)
        req = scraper.get(url)
        soup = BeautifulSoup(req.text, 'lxml')
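        # Each forum page lists its discussion groups under <h3 class="title">.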
        domains = soup.find_all('h3', {'class': 'title'})
        for d in domains:
            d1 = d.text.strip()
            d = d.find('a').get('href')
            print('\t', d.replace('/forums/discuss/browse/', ''))
            url = f'https://patient.info{d}'
            req = scraper.get(url)
            soup = BeautifulSoup(req.text, 'lxml')
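            # Page count comes from the last pagination <option>, whose text
            # ends in the total after a "/"; no dropdown means a single page.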
            pages = soup.find('select', {'class': 'reply-pagination'})
            if pages is not None:
                pages = int(pages.find_all('option')[-1].text.split('/')[-1])
            else:
                pages = 1
            for p in range(pages):
                page_url = f'https://patient.info{d}?page={p}#group-discussions'
                req = scraper.get(page_url)
                soup = BeautifulSoup(req.text, 'lxml')
                posts = soup.find_all('h3', {'class': 'post__title'})
                for post in posts:
                    href = post.find('a').get('href')
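                    # Row: thread slug (common prefix stripped), forum name,
                    # group name.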
                    file.write(f'{href.replace("/forums/discuss/", "")}\t{d0}\t{d1}\n')