updates

2014-05-18 12:16:56 -06:00 · 2014-05-18 12:16:56 -06:00 · 53da94f078
commit 53da94f078
parent 239a0ff7ff
3 changed files with 37 additions and 53 deletions
--- a/08_basic_email_web_crawler.py
+++ b/08_basic_email_web_crawler.py
@@ -7,40 +7,36 @@ email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
 link_re = re.compile(r'href="(.*?)"')


-def crawl(url, maxlevel):
+def crawl(url):

    result = set()

-    while maxlevel > 0:
+    req = requests.get(url)

-        # Get the webpage
-        req = requests.get(url)
+    # Check if successful
+    if(req.status_code != 200):
+        return []

-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    # Find links
+    links = link_re.findall(req.text)

-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
+    print "\nFound {} links".format(len(links))

-            # Find all emails on current page
-            result.update(email_re.findall(req.text))
+    # Search links for emails
+    for link in links:

-            print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)

-            # new level
-            maxlevel -= 1
-
-            # recurse
-            crawl(link, maxlevel)
+        # Find all emails on current page
+        result.update(email_re.findall(req.text))

    return result

-emails = crawl('http://www.website_goes_here_dot_com', 2)
+if __name__ == '__main__':
+    emails = crawl('http://www.realpython.com')

-print "\nScrapped e-mail addresses:"
-for email in emails:
-    print email
+    print "\nScrapped e-mail addresses:"
+    for email in emails:
+        print email
+    print "\n"
--- a/09_basic_link_web_crawler.py
+++ b/09_basic_link_web_crawler.py
@@ -6,39 +6,27 @@ import urlparse
 link_re = re.compile(r'href="(.*?)"')


-def crawl(url, maxlevel):
+def crawl(url):

-    result = set()
+    req = requests.get(url)

-    while maxlevel > 0:
+    # Check if successful
+    if(req.status_code != 200):
+        return []

-        # Get the webpage
-        req = requests.get(url)
+    # Find links
+    links = link_re.findall(req.text)

-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    print "\nFound {} links".format(len(links))

-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
-            # add links to result set
-            result.update(link)
+    # Print each link as an absolute URL
+    for link in links:

-            print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)

-            # new level
-            maxlevel -= 1
+        print link

-            # recurse
-            crawl(link, maxlevel)

-    return result
-
-emails = crawl('http://www.website_goes_here_dot_com', 2)
-
-print "\nScrapped links:"
-for link in links:
-    print link
+if __name__ == '__main__':
+    crawl('http://www.realpython.com')
--- a/readme.md
+++ b/readme.md
@@ -7,6 +7,6 @@
 1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
 1. **06_execution_time.py**: class used for timing execution of code
 1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
-1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
-1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website recursively
+1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website
+1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website
 1. **10_find_files_recursively.py**: recursively grab files from a directory