From 53da94f078490eb1a47b9eb66776c8419d1e080f Mon Sep 17 00:00:00 2001
From: Michael Herman
Date: Sun, 18 May 2014 12:16:56 -0600
Subject: [PATCH] updates

---
 08_basic_email_web_crawler.py | 44 ++++++++++++++++-------------------
 09_basic_link_web_crawler.py  | 42 ++++++++++++---------------------
 readme.md                     |  4 ++--
 3 files changed, 37 insertions(+), 53 deletions(-)

diff --git a/08_basic_email_web_crawler.py b/08_basic_email_web_crawler.py
index 30a2282..faca75f 100644
--- a/08_basic_email_web_crawler.py
+++ b/08_basic_email_web_crawler.py
@@ -7,40 +7,36 @@
 email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
 link_re = re.compile(r'href="(.*?)"')
 
-def crawl(url, maxlevel):
+def crawl(url):
 
     result = set()
 
-    while maxlevel > 0:
+    req = requests.get(url)
 
-        # Get the webpage
-        req = requests.get(url)
+    # Check if successful
+    if(req.status_code != 200):
+        return []
 
-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    # Find links
+    links = link_re.findall(req.text)
 
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
+    print "\nFound {} links".format(len(links))
 
-            # Find all emails on current page
-            result.update(email_re.findall(req.text))
+    # Search links for emails
+    for link in links:
 
-        print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
 
-        # new level
-        maxlevel -= 1
-
-        # recurse
-        crawl(link, maxlevel)
+        # Find all emails on the linked page
+        result.update(email_re.findall(requests.get(link).text))
 
     return result
 
-emails = crawl('http://www.website_goes_here_dot_com', 2)
+if __name__ == '__main__':
+    emails = crawl('http://www.realpython.com')
 
-print "\nScrapped e-mail addresses:"
-for email in emails:
-    print email
+    print "\nScraped e-mail addresses:"
+    for email in emails:
+        print email
+    print "\n"
diff --git a/09_basic_link_web_crawler.py b/09_basic_link_web_crawler.py
index 47be8a1..95849d2 100644
--- a/09_basic_link_web_crawler.py
+++ b/09_basic_link_web_crawler.py
@@ -6,39 +6,27 @@ import urlparse
 
 link_re = re.compile(r'href="(.*?)"')
 
-def crawl(url, maxlevel):
+def crawl(url):
 
-    result = set()
+    req = requests.get(url)
 
-    while maxlevel > 0:
+    # Check if successful
+    if(req.status_code != 200):
+        return []
 
-        # Get the webpage
-        req = requests.get(url)
+    # Find links
+    links = link_re.findall(req.text)
 
-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    print "\nFound {} links".format(len(links))
 
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
-            # add links to result set
-            result.update(link)
+    # Print all the links
+    for link in links:
 
-        print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
 
-        # new level
-        maxlevel -= 1
+        print link
 
-        # recurse
-        crawl(link, maxlevel)
-    return result
-
-emails = crawl('http://www.website_goes_here_dot_com', 2)
-
-print "\nScrapped links:"
-for link in links:
-    print link
+if __name__ == '__main__':
+    crawl('http://www.realpython.com')
diff --git a/readme.md b/readme.md
index 217a3f2..412f67f 100644
--- a/readme.md
+++ b/readme.md
@@ -7,6 +7,6 @@
 1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
 1. **06_execution_time.py**: class used for timing execution of code
 1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
-1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
-1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website recursively
+1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website
+1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website
 1. **10_find_files_recursively.py**: recursively grab files from a directory
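
Note: the patched scripts target Python 2 (print statements, the urlparse module). A rough sketch of the refactored email crawler under Python 3 follows; it is an assumed port, not part of the commit, requiring only the third-party requests package, and it reuses the example target URL and regexes from the patch.

    # Rough Python 3 sketch of 08_basic_email_web_crawler.py (assumed port,
    # not part of the patch): print becomes a function, and urlparse.urljoin
    # moves to urllib.parse.urljoin.
    import re

    import requests
    from urllib.parse import urljoin

    email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
    link_re = re.compile(r'href="(.*?)"')


    def crawl(url):
        result = set()

        req = requests.get(url)

        # Check if successful
        if req.status_code != 200:
            return result

        # Find links
        links = link_re.findall(req.text)
        print("\nFound {} links".format(len(links)))

        # Search each linked page for emails
        for link in links:
            # Get an absolute URL for the link
            link = urljoin(url, link)

            # Skip mailto:/javascript: hrefs that requests cannot fetch
            if not link.startswith('http'):
                continue

            result.update(email_re.findall(requests.get(link).text))

        return result


    if __name__ == '__main__':
        for email in crawl('http://www.realpython.com'):
            print(email)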