Update 08_basic_email_web_crawler.py

This is a much simpler version of the script (easily understandable).
This commit is contained in:
RayKon∞ 2015-12-02 21:51:55 +05:30
parent 436f3119f9
commit 761e0ecec2

View File

@@ -1,45 +1,26 @@
import requests
import re
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
# Regexes compiled once at module level: one for e-mail addresses, one
# for href="..." link targets.
# Fix: the previous e-mail pattern used [\w\.,]+ — the comma let matches
# absorb trailing punctuation (e.g. "a@b.com," in running text), and
# hyphenated domains (user@my-site.com) were missed because \w does not
# match '-'. Dots and hyphens are literal inside a character class, so
# no escaping is needed.
email_re = re.compile(r'([\w.-]+@[\w.-]+\.\w+)')
link_re = re.compile(r'href="(.*?)"')

# Ask the user for the start URL; the scheme (http:// or https://) must
# be included because requests cannot guess it.
url = input('Enter a URL (include `http://`): ')
def crawl(url):
    """Fetch *url* and return the set of e-mail addresses found in its HTML.

    Parameters:
        url: absolute URL (including scheme) of the page to scan.

    Returns:
        A set of e-mail address strings; empty when the page cannot be
        retrieved (non-200 status) or contains no addresses.

    NOTE(review): despite the name, this scans only the start page — the
    links found are reported but not fetched. Confirm whether a deeper
    crawl was intended before extending it.
    """
    result = set()

    # Fix: the merged version fetched the URL twice (once into an unused
    # `website`/`html` pair, once into `req`); fetch it exactly once.
    req = requests.get(url)

    # Guard clause: bail out on any non-success status.
    # Fix: previously returned [] here but a set on success; return the
    # empty set so callers always get one type.
    if req.status_code != 200:
        return result

    # Find candidate links via the module-level compiled pattern.
    # Fix: an unused duplicate findall with an inline '"((http|ftp)s?://.*?)"'
    # pattern against a stale `html` variable was removed.
    links = link_re.findall(req.text)
    print("\nFound {} links".format(len(links)))

    # Fix: the old loop resolved each link with urljoin but then re-scanned
    # the SAME page text once per link (never fetching the resolved link),
    # so the loop only did redundant work. Scan the page text once instead;
    # `set.update` keeps the addresses de-duplicated.
    result.update(email_re.findall(req.text))
    return result
if __name__ == '__main__':
    # Fix: the script prompted the user for a URL (module-level `url`)
    # and then ignored it, always crawling a hard-coded site. Crawl the
    # URL the user actually entered.
    emails = crawl(url)
    # Fix: "Scrapped" (discarded) -> "Scraped" (extracted) in the report.
    print("\nScraped e-mail addresses:")
    for email in emails:
        print(email)
    print("\n")