python-scripts/scripts/08_basic_email_web_crawler.py

import requests
import re

# Seconds to wait on the server before giving up. Without a timeout,
# ``requests.get`` may block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 30

# A double-quoted absolute URL whose scheme is http, https, ftp or ftps.
# Group 1 is the full URL; group 2 is only the bare scheme.
LINK_PATTERN = re.compile(r'"((http|ftp)s?://.*?)"')

# Raw string so that ``\w`` and ``\.`` reach the regex engine untouched
# instead of being parsed as (invalid) string escapes, which newer Python
# versions warn about.
EMAIL_PATTERN = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')


def extract_links(html):
    """Return the double-quoted http(s)/ftp(s) URLs found in *html*.

    Args:
        html: Text to scan, typically the body of a fetched page.

    Returns:
        A list of URL strings in order of appearance (duplicates kept).
        The surrounding double quotes are not included.
    """
    # Take group 1 explicitly: ``findall`` would return (url, scheme)
    # tuples because the pattern has two capture groups.
    return [match.group(1) for match in LINK_PATTERN.finditer(html)]


def extract_emails(html):
    """Return the email-like strings found in *html*.

    Args:
        html: Text to scan, typically the body of a fetched page.

    Returns:
        A list of matched address strings in order of appearance
        (duplicates kept).
    """
    return EMAIL_PATTERN.findall(html)


def main():
    """Prompt for a URL, fetch it, and report the links and emails found.

    Prints the number of links, then each email address on its own line.

    Raises:
        requests.exceptions.RequestException: If the URL is malformed, the
            connection fails, or the server does not answer within
            ``REQUEST_TIMEOUT`` seconds.
    """
    # get url
    url = input('Enter a URL (include `http://`): ')

    # connect to the url
    website = requests.get(url, timeout=REQUEST_TIMEOUT)

    # read html
    html = website.text

    # grab all the links and email addresses
    links = extract_links(html)
    emails = extract_emails(html)

    # print the number of links, then every email address found
    print("\nFound {} links".format(len(links)))
    for email in emails:
        print(email)


# Only run the interactive crawler when executed as a script, so the
# extraction helpers can be imported without prompting or network access.
if __name__ == "__main__":
    main()
added web crawler 2014-05-14 02:48:46 +02:00			`import requests`
			`import re`

updated email crawler 2016-02-18 23:08:55 +01:00			`# get url`
Update 08_basic_email_web_crawler.py This is a much simpler version of the script(easily understandable). 2015-12-02 17:21:55 +01:00			url = input('Enter a URL (include `http://`): ')
added web crawler 2014-05-14 02:48:46 +02:00
updated email crawler 2016-02-18 23:08:55 +01:00			`# connect to the url`
			`website = requests.get(url)`
added link scraper 2014-05-18 16:29:23 +02:00
updated email crawler 2016-02-18 23:08:55 +01:00			`# read html`
			`html = website.text`
added web crawler 2014-05-14 02:48:46 +02:00
updated email crawler 2016-02-18 23:08:55 +01:00			`# use re.findall to grab all the links`
Update 08_basic_email_web_crawler.py This is a much simpler version of the script(easily understandable). 2015-12-02 17:21:55 +01:00			`links = re.findall('"((http\|ftp)s?://.*?)"', html)`
updated email crawler 2016-02-18 23:08:55 +01:00			`emails = re.findall('([\w\.,]+@[\w\.,]+\.\w+)', html)`
added web crawler 2014-05-14 02:48:46 +02:00

updated email crawler 2016-02-18 23:08:55 +01:00			`# print the number of links in the list`
Update 08_basic_email_web_crawler.py This is a much simpler version of the script(easily understandable). 2015-12-02 17:21:55 +01:00			`print("\nFound {} links".format(len(links)))`
			`for email in emails:`
updated email crawler 2016-02-18 23:08:55 +01:00			`print(email)`