updated email crawler
This commit is contained in:
parent
780cad2988
commit
6613b13d4d
@ -1,26 +1,21 @@
|
|||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Email crawler: ask for a URL, download the page, report how many quoted
# absolute links it contains and print every email address found in it.

# Double-quoted absolute links ("http://", "https://", "ftp://", "ftps://").
# The scheme alternation is non-capturing so findall() returns plain URL
# strings rather than (url, scheme) tuples.
LINK_PATTERN = re.compile(r'"((?:http|ftp)s?://.*?)"')

# Email-like tokens. Commas are deliberately NOT part of either character
# class: with them, "a@x.com,b@y.com" yields ",b@y.com". Hyphens (and "+" in
# the local part) are accepted so "john-doe@my-site.com" is matched.
EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')


def extract_links(html: str) -> list:
    """Return every double-quoted http(s)/ftp(s) URL in *html*, in order."""
    return LINK_PATTERN.findall(html)


def extract_emails(html: str) -> list:
    """Return every email-like token in *html*, in order of appearance.

    Duplicates are kept, so an address that occurs twice is returned twice.
    """
    return EMAIL_PATTERN.findall(html)


def fetch_html(url: str, timeout: float = 10) -> str:
    """Download *url* and return the response body as text.

    Exits the script with a readable message when the request fails, times
    out, or the server answers with an HTTP error status, instead of hanging
    or scanning an error page.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        raise SystemExit("Could not fetch {}: {}".format(url, err)) from err
    return response.text


def main() -> None:
    """Prompt for a URL, then print the link count and each email found."""
    url = input('Enter a URL (include `http://`): ')
    html = fetch_html(url)

    links = extract_links(html)
    emails = extract_emails(html)

    print("\nFound {} links".format(len(links)))
    for email in emails:
        print(email)


if __name__ == "__main__":
    main()
|
||||||
|
Loading…
Reference in New Issue
Block a user