Merge pull request #5 from RajuKoushik/patch-1

Update 08_basic_email_web_crawler.py
This commit is contained in:
Michael Herman 2016-02-19 00:07:19 +02:00
commit 780cad2988

View File

@ -1,45 +1,26 @@
import requests
import re
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
# Pre-compiled patterns, hoisted to module level so they are built once:
#   email_re — loose e-mail matcher (note: also admits ',' and '.' runs)
#   link_re  — captures the target of every double-quoted href attribute
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
link_re = re.compile(r'href="(.*?)"')
def crawl(url):
    """Crawl *url* and collect e-mail addresses from every page it links to.

    Fetches the start page, extracts all href targets, resolves each one
    against *url*, fetches it, and scans the fetched page for e-mail
    addresses. Unreachable links are skipped (best-effort crawl).

    Returns:
        set[str]: unique e-mail addresses found; empty set when the start
        page cannot be fetched.
    """
    result = set()
    req = requests.get(url)
    # Bail out early when the start page is unreachable; return an empty
    # set (not a list) so the return type is consistent on every path.
    if req.status_code != 200:
        return result
    # Find links on the start page.
    links = link_re.findall(req.text)
    print("\nFound {} links".format(len(links)))
    # Search each linked page for e-mail addresses.
    for link in links:
        # Resolve relative hrefs against the start URL.
        link = urljoin(url, link)
        try:
            page = requests.get(link)
        except requests.exceptions.RequestException:
            continue  # skip links that cannot be fetched
        result.update(email_re.findall(page.text))
    return result
if __name__ == '__main__':
    # Demo run against a fixed start URL; crawl() yields unique addresses,
    # so a single print loop suffices (the duplicate loop printed each
    # address twice).
    emails = crawl('http://www.realpython.com')
    print("\nScraped e-mail addresses:")
    for email in emails:
        print(email)
    print("\n")