added web crawler
This commit is contained in:
parent
7a6597cf8c
commit
7409f60137
45
08_basic_email_web_crawler.py
Normal file
45
08_basic_email_web_crawler.py
Normal file
@ -0,0 +1,45 @@
|
||||
import requests
|
||||
import re
|
||||
import urlparse
|
||||
|
||||
# Pre-compiled patterns, built once at import time.
# email_re: e-mail addresses. The character class is [\w.-] — dots and
# hyphens are legal in local parts and domains (e.g. john.doe@my-site.com);
# the original [\w\.,]+ wrongly admitted commas and rejected hyphens, so
# hyphenated domains were never matched at all.
email_re = re.compile(r'([\w.-]+@[\w.-]+\.\w+)')
# link_re: the target of every href="..." attribute (non-greedy up to the
# closing quote).
link_re = re.compile(r'href="(.*?)"')
|
||||
|
||||
def crawl(url, maxlevel):
|
||||
|
||||
result = set()
|
||||
|
||||
while maxlevel > 0:
|
||||
|
||||
# Get the webpage
|
||||
req = requests.get(url)
|
||||
|
||||
# Check if successful
|
||||
if(req.status_code != 200):
|
||||
return []
|
||||
|
||||
# Find and follow all the links
|
||||
links = link_re.findall(req.text)
|
||||
for link in links:
|
||||
# Get an absolute URL for a link
|
||||
link = urlparse.urljoin(url, link)
|
||||
|
||||
# Find all emails on current page
|
||||
result.update(email_re.findall(req.text))
|
||||
|
||||
print "Crawled level: {}".format(maxlevel)
|
||||
|
||||
# new level
|
||||
maxlevel -= 1
|
||||
|
||||
# recurse
|
||||
crawl(link, maxlevel)
|
||||
|
||||
return result
|
||||
|
||||
emails = crawl('http://www.website_goes_here_dot_com', 2)
|
||||
|
||||
print "\nScrapped e-mail addresses:"
|
||||
for email in emails:
|
||||
print email
|
@ -7,3 +7,4 @@
|
||||
1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
|
||||
1. **06_execution_time.py**: class used for timing execution of code
|
||||
1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
|
||||
1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
|
||||
|
Loading…
Reference in New Issue
Block a user