Add 'crawler_see_also.py'

This commit is contained in:
Emil Markiewicz 2019-05-29 11:12:55 +00:00
parent aafd978b1c
commit d26bd7375c
1 changed files with 30 additions and 0 deletions

30
crawler_see_also.py Normal file
View File

@ -0,0 +1,30 @@
# Required libraries:
#   pip install beautifulsoup4
#   pip install requests
# Usage:
#   python crawler_see_also.py https://en.wikipedia.org/wiki/Online_chat
from bs4 import BeautifulSoup
import requests
import sys
# Template that turns a site-relative href (e.g. "/wiki/Foo") into an
# absolute English-Wikipedia URL.
base_wiki_url = "https://en.wikipedia.org{0}"

# The start URL is a required command-line argument; fail with a usage
# message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: python crawler_see_also.py <wikipedia-article-url>")

# Work list of page URLs, seeded with the start URL from the command line.
queue = [sys.argv[1]]
def start():
    """Crawl "See also" links breadth-first, starting from the URLs in ``queue``.

    For every page in the module-level ``queue`` the function downloads the
    HTML, locates the element with id ``See_also``, takes the first ``<ul>``
    that follows that element's parent, and walks its ``<li>`` items.  Each
    site-relative link found there is expanded with ``base_wiki_url``,
    printed, and appended to ``queue`` so that it is crawled in turn.

    Differences from a naive crawl:

    * every URL is queued (and printed) at most once, so "See also" cycles
      between articles cannot make the crawl loop forever;
    * list items without an ``<a href=...>`` and links that are not
      site-relative (external, protocol-relative, fragment-only) are skipped
      rather than crashing or producing a mangled URL;
    * pages that cannot be downloaded are reported on stderr and skipped.

    Returns:
        None.  Results are printed to stdout and accumulated in ``queue``.
    """
    # URLs already scheduled; prevents re-queueing the same article.
    seen = set(queue)

    # Walk by index because ``queue`` grows while it is being processed.
    position = 0
    while position < len(queue):
        url = queue[position]
        position += 1

        try:
            # A timeout keeps one unresponsive server from hanging the crawl.
            raw_src = requests.get(url, timeout=30).text
        except requests.RequestException as err:
            sys.stderr.write("skipping {0}: {1}\n".format(url, err))
            continue

        doc = BeautifulSoup(raw_src, 'html.parser')

        headings = doc.select("#See_also")
        if not headings:
            # Page has no "See also" section.
            continue

        see_also_list = headings[0].parent.find_next("ul")
        if see_also_list is None:
            # Section heading exists but no list follows it.
            continue

        for item in see_also_list.find_all('li'):
            anchor = item.a
            href = anchor.get('href') if anchor is not None else None
            # Only site-relative paths can be combined with base_wiki_url.
            if not href or not href.startswith('/') or href.startswith('//'):
                continue

            full_link = base_wiki_url.format(href)
            if full_link in seen:
                continue
            seen.add(full_link)

            print(full_link)
            queue.append(full_link)
# Run the crawl only when this file is executed as a script, so that merely
# importing the module does not start issuing network requests.
if __name__ == "__main__":
    start()