Update 08_basic_email_web_crawler.py
This is a much simpler, more easily understandable version of the script.
This commit is contained in:
parent
436f3119f9
commit
761e0ecec2
@@ -1,45 +1,26 @@
|
||||
import requests
|
||||
import re
|
||||
try:
|
||||
from urllib.parse import urljoin
|
||||
except ImportError:
|
||||
from urlparse import urljoin
|
||||
|
||||
# regex
|
||||
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
|
||||
link_re = re.compile(r'href="(.*?)"')
|
||||
#get url
|
||||
#url=input('Enter a URL (include 'http://'):')--this is wrong
|
||||
url = input('Enter a URL (include `http://`): ')
|
||||
|
||||
|
||||
def crawl(url):
|
||||
#connect to the url
|
||||
website=requests.get(url)
|
||||
|
||||
result = set()
|
||||
#read html
|
||||
html=website.text
|
||||
|
||||
req = requests.get(url)
|
||||
|
||||
# Check if successful
|
||||
if(req.status_code != 200):
|
||||
return []
|
||||
#use re.findall to grab all the links
|
||||
links = re.findall('"((http|ftp)s?://.*?)"', html)
|
||||
|
||||
# Find links
|
||||
links = link_re.findall(req.text)
|
||||
emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
|
||||
|
||||
print("\nFound {} links".format(len(links)))
|
||||
|
||||
# Search links for emails
|
||||
for link in links:
|
||||
#prints the number of links in the list
|
||||
print("\nFound {} links".format(len(links)))
|
||||
|
||||
# Get an absolute URL for a link
|
||||
link = urljoin(url, link)
|
||||
|
||||
# Find all emails on current page
|
||||
result.update(email_re.findall(req.text))
|
||||
|
||||
return result
|
||||
|
||||
if __name__ == '__main__':
|
||||
emails = crawl('http://www.realpython.com')
|
||||
|
||||
print("\nScrapped e-mail addresses:")
|
||||
for email in emails:
|
||||
print(email)
|
||||
print("\n")
|
||||
for email in emails:
|
||||
print(email)
|
||||
|
Loading…
Reference in New Issue
Block a user