From 6613b13d4dba5bd7b1dfbe6759e4f9f6773be63d Mon Sep 17 00:00:00 2001 From: Michael Herman Date: Thu, 18 Feb 2016 15:08:55 -0700 Subject: [PATCH] updated email crawler --- 08_basic_email_web_crawler.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/08_basic_email_web_crawler.py b/08_basic_email_web_crawler.py index a7dbbce..b56c747 100644 --- a/08_basic_email_web_crawler.py +++ b/08_basic_email_web_crawler.py @@ -1,26 +1,21 @@ import requests import re -#get url -#url=input('Enter a URL (include 'http://'):')--this is wrong +# get url url = input('Enter a URL (include `http://`): ') +# connect to the url +website = requests.get(url) -#connect to the url -website=requests.get(url) +# read html +html = website.text -#read html -html=website.text - - -#use re.findall to grab all the links +# use re.findall to grab all the links links = re.findall('"((http|ftp)s?://.*?)"', html) - -emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html) +emails = re.findall('([\w\.,]+@[\w\.,]+\.\w+)', html) -#prints the number of links in the list +# print the number of links in the list print("\nFound {} links".format(len(links))) - for email in emails: - print(email) + print(email)