# python-scripts/scripts/08_basic_email_web_crawler.py
#
# Repository viewer metadata (not part of the program):
#   22 lines, 439 B, Python
#   first blame timestamp: 2014-05-14 02:48:46 +02:00

import requests
import re
# (blame-view revision timestamp: 2016-02-18 23:08:55 +01:00)
# Regex patterns are raw strings so that regex escapes such as `\w` and `\.`
# are not parsed as (invalid) Python string escapes, which emit a
# SyntaxWarning on recent Python versions.
#
# The scheme alternation uses a non-capturing group so that `re.findall`
# returns plain URL strings instead of (url, scheme) tuples; the number of
# matches is unchanged.
LINK_PATTERN = r'"((?:http|ftp)s?://.*?)"'
EMAIL_PATTERN = r'([\w\.,]+@[\w\.,]+\.\w+)'

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def find_links(html):
    """Return every double-quoted http(s)/ftp(s) URL found in *html*.

    Only URLs wrapped in double quotes (e.g. attribute values such as
    ``href="http://..."``) are matched. The surrounding quotes are not
    included in the returned strings.
    """
    return re.findall(LINK_PATTERN, html)


def find_emails(html):
    """Return every email-like substring found in *html*.

    NOTE(review): both character classes accept commas as well as word
    characters and dots, so text such as ``a,b@c.com`` matches as a whole.
    This is kept as-is to preserve existing results -- confirm it is intended.
    """
    return re.findall(EMAIL_PATTERN, html)


def main():
    """Prompt for a URL, fetch it, and report the links and emails found."""
    # get url
    url = input('Enter a URL (include `http://`): ')

    # connect to the url (with a timeout so a dead server cannot block forever)
    website = requests.get(url, timeout=REQUEST_TIMEOUT)

    # read html
    html = website.text

    # use re.findall to grab all the links and email addresses
    links = find_links(html)
    emails = find_emails(html)

    # print the number of links in the list, then each email on its own line
    print("\nFound {} links".format(len(links)))
    for email in emails:
        print(email)


if __name__ == "__main__":
    main()