Update 08_basic_email_web_crawler.py
This is a much simpler, more easily understandable version of the script.
This commit is contained in:
parent
436f3119f9
commit
761e0ecec2
@ -1,45 +1,26 @@
|
|||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
try:
|
|
||||||
from urllib.parse import urljoin
|
|
||||||
except ImportError:
|
|
||||||
from urlparse import urljoin
|
|
||||||
|
|
||||||
# regex
|
# get the URL from the user
|
||||||
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
|
# url = input('Enter a URL (include 'http://'):')  -- this is wrong: the inner single quotes end the string literal early
|
||||||
link_re = re.compile(r'href="(.*?)"')
|
url = input('Enter a URL (include `http://`): ')
|
||||||
|
|
||||||
|
|
||||||
def crawl(url):
|
#connect to the url
|
||||||
|
website=requests.get(url)
|
||||||
|
|
||||||
result = set()
|
#read html
|
||||||
|
html=website.text
|
||||||
|
|
||||||
req = requests.get(url)
|
|
||||||
|
|
||||||
# Check if successful
|
# use re.findall to grab all the quoted absolute http(s)/ftp(s) links
|
||||||
if(req.status_code != 200):
|
links = re.findall('"((http|ftp)s?://.*?)"', html)
|
||||||
return []
|
|
||||||
|
|
||||||
# Find links
|
emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
|
||||||
links = link_re.findall(req.text)
|
|
||||||
|
|
||||||
print("\nFound {} links".format(len(links)))
|
|
||||||
|
|
||||||
# Search links for emails
|
#prints the number of links in the list
|
||||||
for link in links:
|
print("\nFound {} links".format(len(links)))
|
||||||
|
|
||||||
# Get an absolute URL for a link
|
for email in emails:
|
||||||
link = urljoin(url, link)
|
print(email)
|
||||||
|
|
||||||
# Find all emails on current page
|
|
||||||
result.update(email_re.findall(req.text))
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
emails = crawl('http://www.realpython.com')
|
|
||||||
|
|
||||||
print("\nScrapped e-mail addresses:")
|
|
||||||
for email in emails:
|
|
||||||
print(email)
|
|
||||||
print("\n")
|
|
||||||
|
Loading…
Reference in New Issue
Block a user