python-scripts/08_basic_email_web_crawler.py

27 lines
491 B
Python
Raw Normal View History

2014-05-14 02:48:46 +02:00
import requests
import re
# Ask the user which page to scan; the scheme must be part of what they type.
url = input("Enter a URL (include `http://`): ")
2014-05-14 02:48:46 +02:00
2014-05-18 16:29:23 +02:00
# Connect to the URL. The explicit timeout keeps the script from hanging
# forever when the server never answers; requests waits indefinitely unless
# a timeout is passed.
website = requests.get(url, timeout=10)
2014-05-14 02:48:46 +02:00
# Keep the decoded response body; both regex searches below run against it.
html = website.text
2014-05-14 02:48:46 +02:00
# Use re.findall to grab every double-quoted http(s)/ftp(s) URL in the page.
# The scheme group is non-capturing so each match is the URL string itself
# rather than a (url, scheme) tuple; the number of matches is unaffected.
links = re.findall(r'"((?:http|ftp)s?://.*?)"', html)
2014-05-14 02:48:46 +02:00
# Grab everything that looks like an email address. The pattern is a raw
# string so that \w and \. reach the regex engine unchanged instead of being
# treated as (invalid) Python string escapes.
# NOTE(review): both character classes also accept commas - confirm intended.
emails = re.findall(r'([\w\.,]+@[\w\.,]+\.\w+)', html)
2014-05-14 02:48:46 +02:00
# Report how many link matches were collected.
link_total = len(links)
print("\nFound {} links".format(link_total))
2014-05-14 02:48:46 +02:00
# Print each matched email address on its own line.
for email in emails:
    print(email)