Add 'crawler_see_also.py'
This commit is contained in:
parent
aafd978b1c
commit
d26bd7375c
30
crawler_see_also.py
Normal file
30
crawler_see_also.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
# The following libraries must be installed:
|
||||||
|
# pip install beautifulsoup4
|
||||||
|
# pip install requests
|
||||||
|
|
||||||
|
# Usage: python crawler_see_also.py https://en.wikipedia.org/wiki/Online_chat
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# URL template for English Wikipedia: an href value is substituted for {0}.
base_wiki_url = "https://en.wikipedia.org{0}"

# Work list of page URLs to crawl, seeded with the first command-line argument.
queue = [sys.argv[1]]
|
||||||
|
def start():
    """Crawl Wikipedia "See also" sections, starting from the URLs in ``queue``.

    Each URL in the module-level ``queue`` is downloaded and parsed. The
    first ``<ul>`` found after the parent of the element with id
    ``See_also`` is taken as the "See also" list. Every link in that list is
    printed as an absolute URL (built with ``base_wiki_url``) and, if it has
    not been scheduled before, appended to ``queue`` so it is crawled later.

    Pages that cannot be downloaded, that have no ``See_also`` element, or
    whose "See also" heading is not followed by a list are skipped. List
    items without a usable ``href`` are ignored.

    Returns:
        None. Results are written to standard output; ``queue`` is extended
        in place.
    """
    # URLs that have already been scheduled. Without this, pages whose
    # "See also" lists reference each other would be queued and fetched
    # over and over, and the crawl could never finish.
    seen = set(queue)

    # ``queue`` grows while it is being iterated: a list iterator also yields
    # items appended during the loop, so newly found pages are visited in the
    # order they were discovered.
    for url in queue:
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            raw_src = requests.get(url, timeout=10).text
        except requests.RequestException as err:
            # One unreachable page should not abort the remaining crawl.
            sys.stderr.write("skipping {0}: {1}\n".format(url, err))
            continue

        doc = BeautifulSoup(raw_src, 'html.parser')

        headings = doc.select("#See_also")
        if not headings:
            # The page has no "See also" section.
            continue

        s = headings[0].parent.find_next("ul")
        if s is None:
            # The heading exists but no list follows it.
            continue

        for link in s.find_all('li'):
            anchor = link.a
            # Plain-text list items have no <a>; anchors may lack an href.
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue

            full_link = base_wiki_url.format(href)
            print(full_link)

            if full_link not in seen:
                seen.add(full_link)
                queue.append(full_link)
|
||||||
|
|
||||||
|
# Run the crawl as soon as this file is executed; the call sits at module
# level, so it also runs if the file is imported.
start()
|
Loading…
Reference in New Issue
Block a user