From 761e0ecec220daaf3a0c7dfcd1f36949e670ef2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?RayKon=E2=88=9E?=
Date: Wed, 2 Dec 2015 21:51:55 +0530
Subject: [PATCH] Update 08_basic_email_web_crawler.py

This is a much simpler version of the script (easily understandable).

---
 08_basic_email_web_crawler.py | 47 +++++++++++------------------------
 1 file changed, 14 insertions(+), 33 deletions(-)

diff --git a/08_basic_email_web_crawler.py b/08_basic_email_web_crawler.py
index 9c6c58f..a7dbbce 100644
--- a/08_basic_email_web_crawler.py
+++ b/08_basic_email_web_crawler.py
@@ -1,45 +1,26 @@
 import requests
 import re
-try:
-    from urllib.parse import urljoin
-except ImportError:
-    from urlparse import urljoin
 
-# regex
-email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
-link_re = re.compile(r'href="(.*?)"')
+#get url
+#url=input('Enter a URL (include 'http://'):')--this is wrong
+url = input('Enter a URL (include `http://`): ')
 
-def crawl(url):
+#connect to the url
+website=requests.get(url)
 
-    result = set()
+#read html
+html=website.text
 
-    req = requests.get(url)
-    # Check if successful
-    if(req.status_code != 200):
-        return []
+#use re.findall to grab all the links
+links = re.findall('"((http|ftp)s?://.*?)"', html)
 
-    # Find links
-    links = link_re.findall(req.text)
+emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
 
-    print("\nFound {} links".format(len(links)))
-    # Search links for emails
-    for link in links:
+#prints the number of links in the list
+print("\nFound {} links".format(len(links)))
 
-        # Get an absolute URL for a link
-        link = urljoin(url, link)
-
-        # Find all emails on current page
-        result.update(email_re.findall(req.text))
-
-    return result
-
-if __name__ == '__main__':
-    emails = crawl('http://www.realpython.com')
-
-    print("\nScrapped e-mail addresses:")
-    for email in emails:
-        print(email)
-    print("\n")
+for email in emails:
+    print(email)
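
Two caveats with the simplified version. First, the new link pattern contains
two capture groups, so re.findall() returns (url, scheme) tuples rather than
plain strings; that is fine for len(links), but surprising if the URLs are
used later. Second, the rewrite drops the status_code check that the old
crawl() performed before parsing. A minimal sketch addressing both points,
keeping the patch's regexes otherwise; the SystemExit handling below is
illustrative, not part of the patch:

    import requests
    import re

    url = input('Enter a URL (include `http://`): ')

    # the old version checked the response before parsing; worth keeping
    website = requests.get(url)
    if website.status_code != 200:
        # illustrative error handling -- any strategy works here
        raise SystemExit('Request failed: {}'.format(website.status_code))

    html = website.text

    # (?:...) is a non-capturing group, so findall returns plain URL
    # strings instead of (url, scheme) tuples
    links = re.findall(r'"((?:http|ftp)s?://.*?)"', html)
    emails = re.findall(r'([\w\.,]+@[\w\.,]+\.\w+)', html)

    print("\nFound {} links".format(len(links)))
    for email in emails:
        print(email)

With the pattern as committed, each element of links is a tuple and the URL
itself would be match[0] for each match.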