added web crawler

2014-05-13 17:48:46 -07:00 · 2014-05-13 17:48:46 -07:00 · 7409f60137
commit 7409f60137
parent 7a6597cf8c
2 changed files with 46 additions and 0 deletions
--- a/08_basic_email_web_crawler.py
+++ b/08_basic_email_web_crawler.py
@ -0,0 +1,45 @@
 import requests
 import re
 import urlparse
 # Regular expressions used to scrape a page's HTML.
 #
 # email_re: local part of word characters, dots, plus and hyphen; domain of
 # one or more dot-separated labels that may contain hyphens. Commas are
 # deliberately NOT part of either side, so comma-separated address lists
 # ("a@x.com,b@y.com" or "a,b@c.com") are not glued into a single match.
 # The single capturing group makes findall() return plain strings.
 email_re = re.compile(r'([\w.+-]+@[\w-]+(?:\.[\w-]+)+)')
 # link_re: value of every double-quoted href attribute (non-greedy).
 link_re = re.compile(r'href="(.*?)"')
 def crawl(url, maxlevel):
    """Collect e-mail addresses from ``url`` and the pages it links to.

    The crawl is breadth-first: level ``maxlevel`` is the start page, the
    next level is every page it links to, and so on until ``maxlevel``
    levels have been fetched or there is nothing left to follow.

    Args:
        url: Absolute URL of the page to start from.
        maxlevel: Number of link levels to crawl, counting the start page
            as one level. Values <= 0 fetch nothing.

    Returns:
        A set of the e-mail address strings matched by ``email_re`` on
        every successfully fetched page. Pages that cannot be fetched or
        do not answer with HTTP 200 are skipped, so the set may be empty.
    """
    result = set()
    visited = set()      # URLs already requested; avoids refetching and cycles
    frontier = [url]     # pages to fetch at the current level
    level = maxlevel
    while level > 0 and frontier:
        next_frontier = []
        for page_url in frontier:
            if page_url in visited:
                continue
            visited.add(page_url)
            # Get the webpage; one unreachable page must not abort the crawl
            try:
                req = requests.get(page_url)
            except requests.exceptions.RequestException:
                continue
            # Check if successful
            if req.status_code != 200:
                continue
            # Find all emails on current page
            result.update(email_re.findall(req.text))
            # Links are only needed when another level will be crawled
            if level > 1:
                for link in link_re.findall(req.text):
                    # Get an absolute URL for a link, without its #fragment
                    # so the same page is not queued under several names
                    link = urlparse.urldefrag(urlparse.urljoin(page_url, link))[0]
                    # Skip mailto:, javascript:, ftp: ... which cannot be fetched
                    if urlparse.urlparse(link).scheme in ('http', 'https'):
                        next_frontier.append(link)
        print("Crawled level: {}".format(level))
        # new level
        level -= 1
        frontier = next_frontier
    return result
 # Run the crawl only when this file is executed as a script, so that
 # importing the module does not trigger network requests.
 if __name__ == '__main__':
    # Placeholder start URL; crawl two levels (start page + its links).
    emails = crawl('http://www.website_goes_here_dot_com', 2)
    print("\nScraped e-mail addresses:")
    for address in emails:
        print(address)
--- a/readme.md
+++ b/readme.md
@ -7,3 +7,4 @@
 1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
 1. **06_execution_time.py**: class used for timing execution of code
 1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
 1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively