import os
import time

import requests
from bs4 import BeautifulSoup
import cloudscraper


def get_url_soup(url):
    """Fallback helper (not used below): fetch a URL with plain requests and
    retry until the page parses to non-empty soup."""
    time.sleep(1)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    # An empty or blocked response parses to almost no top-level nodes.
    while len(soup) <= 1:
        req = requests.get(url)
        soup = BeautifulSoup(req.text, 'html.parser')
        time.sleep(3)
    return soup


# patient.info sits behind Cloudflare, so use a cloudscraper session rather
# than plain requests for the actual crawl.
scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0'})

# Collect the top-level forum links from the conditions/medications index.
url = 'https://patient.info/forums'
req = scraper.get(url)
soup = BeautifulSoup(req.text, 'lxml')
forums = [(a.get('href'), a.text.strip())
          for a in soup.find_all('a', {'class': 'con-meds-lnk'})]

os.makedirs('data', exist_ok=True)
with open('data/patient_URLs.tsv', 'w') as file:
    for forum_href, d0 in forums:
        url = f'https://patient.info{forum_href}'
        print(url)
        req = scraper.get(url)
        soup = BeautifulSoup(req.text, 'lxml')

        # Each forum page lists its discussion groups as h3.title headings.
        domains = soup.find_all('h3', {'class': 'title'})
        for d in domains:
            d1 = d.text.strip()
            d = d.find('a').get('href')
            print('\t', d.replace('/forums/discuss/browse/', ''))
            url = f'https://patient.info{d}'
            req = scraper.get(url)
            soup = BeautifulSoup(req.text, 'lxml')

            # The pagination dropdown shows "current / total" pages; take the
            # total, or default to a single page when there is no dropdown.
            pages = soup.find('select', {'class': 'reply-pagination'})
            if pages is not None:
                pages = int(pages.find_all('option')[-1].text.split('/')[-1])
            else:
                pages = 1

            # Walk every page of the group and record each thread URL slug
            # together with its forum (d0) and group (d1) labels.
            for p in range(pages):
                page_url = f'https://patient.info{d}?page={p}#group-discussions'
                req = scraper.get(page_url)
                soup = BeautifulSoup(req.text, 'lxml')
                posts = soup.find_all('h3', {'class': 'post__title'})
                for post in posts:
                    href = post.find('a').get('href')
                    file.write(f'{href.replace("/forums/discuss/", "")}\t{d0}\t{d1}\n')
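
# --- Usage sketch (an assumption, not part of the original script) ---
# The TSV written above has three unnamed columns: the thread slug (the path
# after /forums/discuss/), the forum name (d0), and the discussion-group name
# (d1). One minimal way to read it back and rebuild the full thread URLs; the
# variable names here are illustrative only.
import csv

with open('data/patient_URLs.tsv') as f:
    reader = csv.reader(f, delimiter='\t')
    for slug, forum, group in reader:
        thread_url = f'https://patient.info/forums/discuss/{slug}'
        # ...fetch thread_url with the same cloudscraper session as above...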