"""Crawl Wikipedia "See also" sections breadth-first from a start article.

Required libraries:
    pip install beautifulsoup4
    pip install requests

Usage:
    python crawler_see_also.py https://en.wikipedia.org/wiki/Online_chat

Every link found in the "See also" list of each visited article is printed
to stdout, and each article is fetched at most once.
"""
import sys

import requests
from bs4 import BeautifulSoup

# Template that turns a site-relative href ("/wiki/...") into a full URL.
base_wiki_url = "https://en.wikipedia.org{0}"

# Seconds to wait for a response before giving up on a page; without a
# timeout a single stalled connection would hang the crawl forever.
REQUEST_TIMEOUT = 10

# URLs to crawl, in discovery order.  start() appends to it while iterating.
queue = []


def _see_also_links(doc):
    """Return full URLs of the links listed in *doc*'s "See also" section.

    *doc* is a parsed BeautifulSoup document.  An empty list is returned
    when the page has no element with id ``See_also`` or no ``<ul>``
    follows it.  List items without an anchor, anchors without ``href``,
    and hrefs that are not site-relative (absolute, protocol-relative,
    fragment-only) are skipped, because formatting them into
    ``base_wiki_url`` would produce a malformed URL.
    """
    heading = doc.select_one("#See_also")
    if heading is None:
        return []
    listing = heading.parent.find_next("ul")
    if listing is None:
        return []

    links = []
    for item in listing.find_all("li"):
        anchor = item.a
        href = anchor.get("href") if anchor is not None else None
        if not href or not href.startswith("/") or href.startswith("//"):
            continue
        links.append(base_wiki_url.format(href))
    return links


def start():
    """Crawl every URL in ``queue``, following "See also" links.

    Each discovered link is printed.  Links not seen before are appended
    to ``queue`` so they get crawled in turn; already-seen links are not
    re-queued, which keeps mutually linked articles from being fetched
    over and over.  Pages that cannot be fetched are reported on stderr
    and skipped.
    """
    seen = set(queue)
    # Iterating a list that grows by append() is well-defined in Python:
    # the loop picks up the newly appended URLs, giving breadth-first order.
    for url in queue:
        try:
            raw_src = requests.get(url, timeout=REQUEST_TIMEOUT).text
        except requests.RequestException as err:
            sys.stderr.write("skipping {0}: {1}\n".format(url, err))
            continue

        doc = BeautifulSoup(raw_src, "html.parser")
        for full_link in _see_also_links(doc):
            print(full_link)
            if full_link not in seen:
                seen.add(full_link)
                queue.append(full_link)


def main():
    """Seed ``queue`` from the first command-line argument and crawl."""
    if len(sys.argv) < 2:
        sys.exit("usage: python crawler_see_also.py <wikipedia-article-url>")
    queue.append(sys.argv[1])
    start()


if __name__ == "__main__":
    main()