updated email crawler

This commit is contained in:
Michael Herman 2016-02-18 15:08:55 -07:00
parent 780cad2988
commit 6613b13d4d

View File

@@ -1,26 +1,21 @@
import requests import requests
import re import re
#get url # get url
#url=input('Enter a URL (include 'http://'):')--this is wrong
url = input('Enter a URL (include `http://`): ') url = input('Enter a URL (include `http://`): ')
# connect to the url
website = requests.get(url)
#connect to the url # read html
website=requests.get(url) html = website.text
#read html # use re.findall to grab all the links
html=website.text
#use re.findall to grab all the links
links = re.findall('"((http|ftp)s?://.*?)"', html) links = re.findall('"((http|ftp)s?://.*?)"', html)
emails = re.findall('([\w\.,]+@[\w\.,]+\.\w+)', html)
emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
#prints the number of links in the list # print the number of links in the list
print("\nFound {} links".format(len(links))) print("\nFound {} links".format(len(links)))
for email in emails: for email in emails:
print(email) print(email)