upload scripts

fraszosia@gmail.com 2023-01-02 09:30:54 +01:00
commit 04ac541e2d
7 changed files with 637 additions and 0 deletions

137
historycy.py Normal file

@@ -0,0 +1,137 @@
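# Scrapes every post from the historycy.org topics listed in data/historycy_errors.tsv
# (topics recorded by earlier failed runs) using Selenium/Chrome and writes one
# tab-separated row per post to data/historycy_2.tsv.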
import csv
import requests
# from bs4 import BeautifulSoup as bs
import time
from datetime import datetime
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def get_url_soup(url):
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def clean_text(text):
    text = " ".join(text.split())
    text = text.replace('¥', 'Ą').replace('¹', 'ą')
    text = text.replace('Æ', 'Ć').replace('æ', 'ć')
    text = text.replace('Ê', 'Ę').replace('ê', 'ę')
    text = text.replace('£', 'Ł').replace('³', 'ł')
    text = text.replace('Ñ', 'Ń').replace('ñ', 'ń')
    text = text.replace('¯', 'Ż').replace('¿', 'ż')
    text = text.replace('\x8c', 'Ś').replace('\x9c', 'ś')
    text = text.replace('\x8f', 'Ź').replace('\x9f', 'ź')
    text = text.replace('„', '').replace('”', '')
    text = text.replace('\x96', '')
    return text


def get_content(soup):
    links = soup.find_all('a')
    cites = soup.find_all('div', {'class': 'quotemain'})
    content = ' '.join([p.text.strip() for p in BeautifulSoup(str(soup), 'html.parser').find_all(text=True)])
    for link in links:
        if clean_text(link.text).startswith('http'):
            content = content.replace(str(link.text), f' {str(link.get("href"))} ')
    for cite in cites:
        c = ' '.join([p.text.strip() for p in BeautifulSoup(str(cite), 'html.parser').find_all(text=True)])
        content = content.replace(c, f' <cite> {c} </cite> ')
    return clean_text(content)


def get_all_comments(url):
    print(url)
    result = []
    soup = get_url_soup(url)
    pages = soup.find('a', {'title': 'skocz do strony...'})
    if pages != None:
        pages = int(clean_text(pages.text).replace(' Strony', ''))
    else:
        pages = 1
    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            n_url = f'{url}&st={page * 15}.html'
            soup = get_url_soup(n_url)
        cards = soup.find('div', {'class': 'tableborder'})
        cards = cards.find_all('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
        for card in cards[:30]:
            if card.find('td', {'valign': 'top'}) and card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'}):
                card = card.find('td', {'valign': 'top'}).find('table', {'cellspacing': '0', 'cellpadding': '0', 'width': '100%', 'border': '0'})
                card = card.find('tbody').find_all('tr')[1]
                info, content = card.find_all('td', {'class': 'row4'})
                username = clean_text(info.find('tr').text)
                score = info.find_all('tr')[1].find_all('td', {'class': 'posterinfo'})
                ranga = clean_text(score[0].text).replace(' ranga', '')
                group = clean_text(score[2].text).replace('Grupa: ', '')
                posts = clean_text(score[3].text).replace('Postów: ', '').replace('.', '')
                user_id = clean_text(score[4].text).replace('Nr użytkownika: ', '').replace('.', '')
                name_surname, degree, profession = '', '', ''
                if len(score) > 6:
                    for s in score[6:]:
                        if 'Stopień akademicki: ' in s.text:
                            degree = clean_text(s.text).replace('Stopień akademicki: ', '')
                        elif 'Zawód: ' in s.text:
                            profession = clean_text(s.text).replace('Zawód: ', '')
                        else:
                            name_surname = clean_text(s.text)
                time = clean_text(content.find('span', {'class': 'postdetails'}).text)
                time = time.replace('Dzisiaj', '2/12/2022').replace('Wczoraj', '1/12/2022')
                datetime_object = datetime.strptime(time, '%d/%m/%Y, %H:%M')
                date = str(datetime_object.date())
                hour = str(datetime_object.time())
                content = content.find('span', {'class': 'postcolor'})
                content = get_content(content)
                result.append([username, user_id, name_surname, degree, ranga, profession, group, posts, date, hour, '', str(n), content])
                n += 1
    result[0][-3] = str(n - 1)
    return result


options = Options()
options.headless = True
options.add_argument('--no-sandbox')
options.incognito = True
driver = webdriver.Chrome(options=options)
print()

with open('data/historycy_errors.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/historycy_2.tsv', 'w') as file:
        # header = '\t'.join(['topic_URL', 'domain_0', 'domain_1', 'domain_2', 'domain_3', 'domain_4', 'topic', 'views', 'username', 'user_id', 'name_surname', 'degree', 'position', 'profession', 'group', 'posts', 'date', 'hour', 'comments', 'n', 'content'])
        # file.write(header + '\n')
        for data in tsv_file:
            url = f'http://www.historycy.org/index.php?s={data[0]}'
            domains = data[3:-1]
            m = len(domains)
            domains = domains + [''] * (5 - m)
            # try:
            result = get_all_comments(url)
            flag = True
            for r in result:
                if flag:
                    views = data[2]
                r = [data[0]] + domains + [data[1]] + [views] + r
                if flag:
                    views = ''
                    flag = False
                file.write('\t'.join(r))
                file.write('\n')
            # except Exception as e:
            #     print(f'error: {url}')
            #     with open('data/historycy_errors.tsv', 'a+') as f:
            #         d = "\t".join(data)
            #         f.write(f'{d}\t{e}\n')

103
historycy_scrap_URLs.py Normal file

@@ -0,0 +1,103 @@
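# Crawls the historycy.org forum tree with requests/BeautifulSoup (skipping the
# 'Mównica' forum) and writes one line per topic — URL id, title, view count, and
# domain path — to data/historycy_URLs.tsv.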
import requests
from bs4 import BeautifulSoup as bs
import time


def get_url_soup(url):
    try:
        req = requests.get(url)
        soup = bs(req.text, 'lxml')
        if len(soup) == 0:
            # Empty response: back off and retry.
            print('sleep')
            time.sleep(3)
            return get_url_soup(url)
    except Exception as e:
        print(e)
        return get_url_soup(url)
    return soup


def clean_text(text):
    text = " ".join(text.split())
    text = text.replace('¥', 'Ą').replace('¹', 'ą')
    text = text.replace('Æ', 'Ć').replace('æ', 'ć')
    text = text.replace('Ê', 'Ę').replace('ê', 'ę')
    text = text.replace('£', 'Ł').replace('³', 'ł')
    text = text.replace('Ñ', 'Ń').replace('ñ', 'ń')
    text = text.replace('¯', 'Ż').replace('¿', 'ż')
    text = text.replace('\x8c', 'Ś').replace('\x9c', 'ś')
    text = text.replace('\x8f', 'Ź').replace('\x9f', 'ź')
    text = text.replace('„', '').replace('”', '')
    text = text.replace('\x96', '')
    return text


def get_topics(url, soup, domains):
    # Writes matching topics to the module-level `file` handle opened at the bottom of the script.
    table = soup.find_all('table', {'width': '100%', 'border': '0', 'cellspacing': '1', 'cellpadding': '4'})
    if table != None:
        rows = table[-1].find_all('tr')
        for row in rows:
            cells = row.find_all('td')
            if len(cells) == 7 and cells[2] != 'Nazwa tematu':
                links = cells[2].find_all('a')
                link = links[0].get('href')
                title = clean_text(links[0].text)
                if link == '#':
                    link = links[1].get('href')
                    title = clean_text(links[1].text)
                doms = "\t".join(domains)
                link = link.replace('http://www.historycy.org/index.php?s=', '')
                views = cells[5].text.replace('.', '')
                file.write(f'{link}\t{title}\t{views}\t{doms}\n')
    return


def get_domains(url, domains, file):
    soup = get_url_soup(url)
    table = soup.find_all('div', {'class': 'tableborder'})
    pages = soup.find('a', {'title': 'skocz do strony...'})
    if pages != None:
        pages = clean_text(pages.text).replace(' Strony', '')
        pages = int(pages)
        for page in range(pages):
            if page != 0:
                page_url = f'{url}&prune_day=100&sort_by=Z-A&sort_key=last_post&topicfilter=all&st={page * 100}'
                soup = get_url_soup(page_url)
                get_topics(page_url, soup, domains)
            else:
                get_topics(url, soup, domains)
    else:
        get_topics(url, soup, domains)
    if len(table) > 1:
        table = table[1]
        rows = table.find_all('tr')[1:-1]
        for row in rows:
            tds = row.find_all('td')
            if tds[0].find('img', {'alt': 'Redirect'}) == None:
                topic = tds[1].find('b')
                topic_name = clean_text(topic.text)
                topic_url = topic.find('a').get('href')
                get_domains(topic_url, domains + [topic_name], file)


def get_main_domains(url, file):
    soup = get_url_soup(url)
    domains_0 = soup.find_all('div', {'class': 'tableborder'})[:6]
    for domain in domains_0:
        domain_0 = clean_text(domain.find('div', {'class': 'maintitle'}).text)
        domains_1 = domain.find_all('tr')[1:-1]
        for domain in domains_1:
            domain_1 = domain.find_all('td')[1].find('b')
            domain_1_name = clean_text(domain_1.text)
            domain_1_url = domain_1.find('a').get('href')
            if domain_1_name != 'Mównica':
                get_domains(domain_1_url, [domain_0, domain_1_name], file)
    return []


url = 'http://www.historycy.org/'
with open('data/historycy_URLs.tsv', 'w') as file:
    get_main_domains(url, file)

136
hyperreal.py Normal file

@@ -0,0 +1,136 @@
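# Scrapes every post from the hyperreal.info/talk topics listed in data/hyperreal_URLs.tsv
# with requests/BeautifulSoup and writes tab-separated rows to data/hyperreal.tsv;
# topics that raise an exception are appended to data/hyperreal_errors_2.tsv.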
import csv
import requests
from bs4 import BeautifulSoup as bs
import time
from datetime import datetime


def get_url_soup(url):
    try:
        req = requests.get(url)
        soup = bs(req.text, 'html.parser')
        if len(soup) == 0:
            # Empty response: back off and retry.
            print('sleep')
            time.sleep(3)
            return get_url_soup(url)
    except Exception as e:
        print(e)
        return get_url_soup(url)
    return soup


def clean_text(text):
    return " ".join(text.split())


# def get_content(soup):
#     blockquotes = soup.find_all('blockquote')
#     content = ' '.join([p.text.strip() for p in soup])
#     for bq in blockquotes:
#         t = ' '.join([p.text.strip() for p in bq])
#         if 'pisze:' in bq.text:
#             idx = bq.text.index('pisze:') + len('pisze:')
#             t = t[:idx] + ' <cite> ' + t[idx:] + ' </cite>'
#         else:
#             t = '<cite> ' + t + ' </cite>'
#         content = content.replace(bq.text, t)
#     return clean_text(content)


def get_content(soup):
    blockquotes = soup.find_all('blockquote')
    cite = soup.find('cite')
    if blockquotes != []:
        if cite != None:
            soup = str(soup)
            soup = soup.replace(str(cite), ' '.join([p.text.strip() for p in cite]))
            content = ' '.join([p.text.strip() for p in bs(soup, 'html.parser')])
            content = clean_text(content)
            idx = content.index('pisze:') + len('pisze:')
            content = content[:idx] + ' <cite> ' + content[idx:] + ' </cite>'
        else:
            content = clean_text(soup.text)
            content = '<cite> ' + content + ' </cite>'
        return content
    return clean_text(' '.join([p.text.strip() for p in soup]))


def get_all_comments(url):
    result = []
    soup = get_url_soup(url)
    pages = soup.find('span', {'class': 'fw-normal'})
    print()
    print(pages)
    print()
    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))
    n = 0
    for page in range(pages):
        print(page + 1)
        if page != 0:
            n_url = f'{url[:-5]}-{page * 10}.html'
            soup = get_url_soup(n_url)
        cards = soup.find_all('div', {'class': 'card border-info mb-2 mb-md-3 shadow-sm'})
        for card in cards:
            idx = card.get('id')[1:]
            postprofile = card.find('div', {'class': 'postprofile'})
            user = postprofile.find('div', {'class': 'mt-1 float-start float-md-none'})
            username = user.find('a')
            if username == None:
                username = clean_text(user.text)
                strong, posts, registration, karma_received, karma_assigned, narkopedia_edits, sex = '', '', '', '', '', '', ''
            else:
                username = clean_text(username.text)
                strong = user.find('strong')
                if strong != None: strong = strong.text
                else: strong = ''
                modal = soup.find('div', {'id': f'modal_post_{idx}'}).find('div', {'class': 'col-8'})
                info = modal.find_all('div')
                posts = clean_text(info[0].text.replace('Posty:', ''))
                registration = clean_text(info[1].text.replace('Rejestracja:', ''))
                sex = ''  # default when the profile does not list 'Płeć'
                if len(info) > 2:
                    sex = clean_text(info[2].text.replace('Płeć:', ''))
                    if sex == 'chłopak': sex = 'M'
                    elif sex == 'dziewka': sex = 'F'
                    else: sex = ''
                score = modal.find_all('li')[-3:]
                karma_received = clean_text(score[0].text.replace('Karma otrzymana', ''))
                karma_assigned = clean_text(score[1].text.replace('Karma przydzielona', ''))
                narkopedia_edits = clean_text(score[2].text.replace('Edycje Narkopedii', ''))
            body = card.find('div', {'id': f'post_content{idx}'})
            title = clean_text(body.find('a').text)
            datetime_str = body.find('time')['datetime']
            datetime_object = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S+00:00')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            content = get_content(body.find('div', {'class': 'content'}))
            result.append([idx, '', title, username, strong, registration, posts, sex, karma_received, karma_assigned, narkopedia_edits, date, hour, '', str(n), content])
            n += 1
    result[0][-3] = str(n - 1)
    for i in result:
        i[1] = result[0][0]
    return result


with open('data/hyperreal_URLs.tsv') as file_0:
    tsv_file = csv.reader(file_0, delimiter='\t')
    with open('data/hyperreal.tsv', 'w') as file:
        for data in tsv_file:
            url = f'https://hyperreal.info/talk/{data[0]}'
            print(url)
            domains = data[1:]
            m = len(domains)
            domains = domains + [''] * (6 - m)
            try:
                result = get_all_comments(url)
                for r in result:
                    r = [data[0]] + domains + r
                    file.write('\t'.join(r))
                    file.write('\n')
            except Exception as e:
                print(f'error: {url}')
                with open('data/hyperreal_errors_2.tsv', 'a+') as f:
                    d = "\t".join(data)
                    f.write(f'{d}\t{e}\n')

82
hyperreal_scrap_URLs.py Normal file

@@ -0,0 +1,82 @@
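# Crawls the hyperreal.info/talk forum tree with requests/BeautifulSoup and writes
# one line per topic — URL plus its domain path — to data/hyperreal_URLs.tsv.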
import requests
from bs4 import BeautifulSoup as bs
import time


def get_url_soup(url):
    req = requests.get(url)
    soup = bs(req.text, 'html.parser')
    l = len(soup)
    while l <= 1:
        req = requests.get(url)
        soup = bs(req.text, 'html.parser')
        l = len(soup)
        time.sleep(3)
    return soup


def clean_text(text):
    return " ".join(text.split())


def get_forum(url, domains, file):
    try:
        soup = get_url_soup(url)
        soup = soup.find('div', {'class': 'order-first'})
        cards = soup.find_all('div', {'class': 'card mb-3'}) + soup.find_all('div', {'class': 'card mb-3 shadow-sm'})
        for card in cards:
            title = card.find('div', {'class': 'col-lg-7 col-md-6 col-12'})
            title = clean_text(title.text)
            if title == 'Forum':
                forum_card = card.find_all('div', {'class': 'row-item'})
                for c in forum_card:
                    a = c.find_all('a')
                    if len(a) > 1:
                        a = c.find_all('a')[1]
                        t_url = a.get('href')
                        domain = clean_text(a.text)
                        domains.append(domain)
                        get_forum(t_url, domains, file)
                        domains.pop()
            elif title == 'Ogłoszenia':
                topics = card.find_all('a', {'class': 'topictitle fs-5'})
                for topic in topics:
                    file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains + ['Ogłoszenia']) + '\n')
            elif title == 'Tematy':
                pages = soup.find('span', {'class': 'fw-normal'})
                if pages != None:
                    pages = int(clean_text(pages.text).replace('Strona 1 z ', ''))
                    for page in range(pages):
                        n = page * 15
                        topics_url = f'{url[:-5]}-{n}.html'
                        topics_soup = get_url_soup(topics_url)
                        topics_soup = topics_soup.find('div', {'class': 'order-first'})
                        cards_topics = topics_soup.find_all('div', {'class': 'card mb-3 shadow-sm'})[-1]
                        topics = cards_topics.find_all('a', {'class': 'topictitle fs-5'})
                        for topic in topics:
                            file.write(topic.get('href').replace('https://hyperreal.info/talk/', '') + '\t' + '\t'.join(domains) + '\n')
    except Exception as e:
        print(f'\tERROR: {url} - {e}')


def get_main_domains(url, file):
    soup = get_url_soup(url)
    soup = soup.find('div', {'class': 'order-first'})
    domains = soup.find_all('div', {'class': 'card mb-3'})
    for domain in domains:
        domain_0 = clean_text(domain.find('div', {'class': 'col-lg-7 col-md-6 col-12'}).text)
        topics = domain.find_all('div', {'class': 'row-item'})
        for topic in topics:
            a = topic.find_all('a')[1]
            t_url = a.get('href')
            domain_1 = clean_text(a.text)
            get_forum(t_url, [domain_0, domain_1], file)
    return []


tmp = []
url = 'https://hyperreal.info/talk/'
with open('data/hyperreal_URLs.tsv', 'w') as file:
    get_main_domains(url, file)

122
patient.py Normal file

@@ -0,0 +1,122 @@
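# Scrapes patient.info discussions listed in data/patient_URLs.tsv (resuming from row
# 538775) with Selenium/Chrome and writes tab-separated rows to data/patient_3.tsv;
# topics that raise an exception are appended to patient_error_2.tsv.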
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from datetime import datetime


def get_url_soup(url):
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'lxml')


def check_pages_num(soup):
    pages = soup.find('select', {'class': 'reply-pagination'})
    if pages != None:
        pages = pages.find_all('option')[-1]
        pages = pages.text.split('/')[-1]
        return int(pages)
    else:
        return 1


def clean_text(text):
    return " ".join(text.split())


def set_num(n, m):
    if m != 0:
        return str(f'{n}.{m}')
    else:
        return str(n)


def get_all_comments(url, domain_0, domain_1):
    result = []
    soup = get_url_soup(url)
    pages = check_pages_num(soup)
    comments_list = []
    n0, n = 0, 0
    for i in range(pages):
        page_url = f'{url}?order=oldest&page={i}#topic-replies'
        if i == 0:
            main = soup.find('div', {'class': 'post__main'})
            main_title = clean_text(main.find('h1', {'class': 'post__title'}).text.strip())
            main_author = main.find('h5', {'class': 'author__info'}).text.strip()
            main_time = main.find('time', {'class': 'fuzzy'})['datetime']
            datetime_object = datetime.strptime(main_time, '%Y-%m-%dT%H:%M+%S:%f')
            date = str(datetime_object.date())
            hour = str(datetime_object.time())
            likes, replies = soup.find_all('p', {'class': 'post__stats'})[1].text.strip().split(', ')
            likes = likes.replace(' likes', '').replace(' like', '')
            replies = replies.replace(' replies', '').replace(' reply', '')
            main_content = soup.find('div', {'class': 'post__content'}).find_all('p')[:-1]
            main_content = clean_text(' '.join([p.text.strip() for p in main_content]))
            main_id = url.split('-')[-1]
            followers = main.find('p', {'class': 'post__stats'}).find_all('span')[-1].text
            followers = followers.strip()
            if 'following' in followers:
                followers = followers.replace(' users are following.', '').replace(' user is following.', '')
            else:
                followers = ''
            main_data = [main_id, main_id, main_title, main_author, followers, date, hour, likes, replies, '0', '0', main_content]
            comments_list += [main_data]
        else:
            soup = get_url_soup(page_url)
        comments = soup.find('div', {'class': 'comment-page'})
        if comments != None:
            comments = comments.find_all('li', {'class': 'comment'})
            m = 0
            for comment in comments:
                classes = comment.get('class')
                header = comment.find('div', {'class': 'post__header'})
                likes = comment.find('a', {'class': 'post__like'})
                if likes != None:
                    likes = clean_text(likes.text)
                else:
                    likes = ''
                content = comment.find('div', {'class': 'post__content'})
                if content != None:
                    content = clean_text(' '.join([q.text for q in content.findAll(text=True)]))
                    content = content.replace(f' Report / Delete {likes} Reply', '')
                    content = content.replace(f' Report / Delete Reply', '')
                if header != None:
                    if 'comment--nested' in classes:
                        m += 1
                    else:
                        m = 0
                        n += 1
                    n0 += 1
                    idx = comment['itemid'].split('#')[-1]
                    user = header.find('a', {'class': 'author__name'}).text.strip()
                    time = comment.find('time', {'class': 'fuzzy'})['datetime']
                    datetime_object = datetime.strptime(time, '%Y-%m-%dT%H:%M+%S:%f')
                    date = str(datetime_object.date())
                    hour = str(datetime_object.time())
                    likes = comment.find('a', {'class': 'post__like'})
                    if likes != None:
                        likes = clean_text(likes.text)
                    else:
                        likes = ''
                    n_n = set_num(n, m)
                    comments_list += [[idx, main_id, main_title, user, '', date, hour, likes, '', str(n0), n_n, content]]
    comments_list[0][8] = str(n0)
    return comments_list


DRIVER_PATH = '/usr/bin/chromedriver'
options = Options()
options.headless = True
options.incognito = True
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

with open("data/patient_URLs.tsv") as file:
    tsv_file = csv.reader(file, delimiter="\t")
    with open('data/patient_3.tsv', 'w') as file:
        # file.write('\t'.join(['topic_URL', 'domain_0', 'domain_1', 'id', 'main_id', 'title', 'user', 'followers', 'date', 'hour', 'likes', 'comments', 'n', 'm', 'content']) + '\n')
        for topic_url, domain_0, domain_1 in list(tsv_file)[538775:]:
            try:
                result = get_all_comments('https://patient.info/forums/discuss/' + topic_url, domain_0, domain_1)
                for r in result:
                    r = [topic_url, domain_0, domain_1] + r
                    file.write('\t'.join(r))
                    file.write('\n')
            except Exception as e:
                print(f'error: {topic_url}')
                with open('patient_error_2.tsv', 'a+') as f:
                    f.write(f'{topic_url}\t{e}\n')

51
patient_scrap_URLs.py Normal file

@@ -0,0 +1,51 @@
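# Walks the patient.info forum index with cloudscraper and writes one line per
# discussion — URL, forum group, and subgroup — to data/patient_URLs.tsv.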
from bs4 import BeautifulSoup
import cloudscraper
# The imports below are needed only by the unused get_url_soup fallback helper.
import requests
import time
from bs4 import BeautifulSoup as bs


def get_url_soup(url):
    time.sleep(1)
    req = requests.get(url)
    soup = bs(req.text, 'html.parser')
    l = len(soup)
    while l <= 1:
        req = requests.get(url)
        soup = bs(req.text, 'html.parser')
        l = len(soup)
        time.sleep(3)
    return soup


scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0',})
url = 'https://patient.info/forums'
req = scraper.get(url)
soup = BeautifulSoup(req.text, 'lxml')
forums = [(a.get('href'), a.text.strip()) for a in soup.find_all('a', {'class': 'con-meds-lnk'})]

with open('data/patient_URLs.tsv', 'w') as file:
    for url, d0 in forums:
        url = f'https://patient.info{url}'
        print(url)
        req = scraper.get(url)
        soup = BeautifulSoup(req.text, 'lxml')
        domains = soup.find_all('h3', {'class': 'title'})
        # domains = [d.find('a').get('href') for d in domains]
        for d in domains:
            d1 = d.text.strip()
            d = d.find('a').get('href')
            print('\t', d.replace('/forums/discuss/browse/', ''))
            url = f'https://patient.info{d}'
            req = scraper.get(url)
            soup = BeautifulSoup(req.text, 'lxml')
            pages = soup.find('select', {'class': 'reply-pagination'})
            if pages != None:
                pages = pages.find_all('option')[-1]
                pages = pages.text.split('/')[-1]
                pages = int(pages)
            else:
                pages = 1
            for p in range(pages):
                page_url = f'https://patient.info{d}?page={p}#group-discussions'
                req = scraper.get(page_url)
                soup = BeautifulSoup(req.text, 'lxml')
                posts = soup.find_all('h3', {'class': 'post__title'})
                for post in posts:
                    href = post.find('a').get('href')
                    file.write(f'{href.replace("/forums/discuss/", "")}\t{d0}\t{d1}\n')

6
requirements.txt Normal file

@@ -0,0 +1,6 @@
beautifulsoup4==4.11.1
cloudscraper==1.2.65
lxml==4.9.1
selenium==4.7.0
webdriver-manager==3.8.5
pandas