Michael Herman 2014-05-18 12:16:56 -06:00
parent 239a0ff7ff
commit 53da94f078
3 changed files with 37 additions and 53 deletions

08_basic_email_web_crawler.py

@@ -7,40 +7,36 @@ email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
 link_re = re.compile(r'href="(.*?)"')

-def crawl(url, maxlevel):
+def crawl(url):
     result = set()
-    while maxlevel > 0:
-        # Get the webpage
-        req = requests.get(url)
-        # Check if successful
-        if(req.status_code != 200):
-            return []
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        print "\nFound {} links".format(len(links))
-        # Search links for emails
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
-            # Find all emails on current page
-            result.update(email_re.findall(req.text))
-        print "Crawled level: {}".format(maxlevel)
-        # new level
-        maxlevel -= 1
-        # recurse
-        crawl(link, maxlevel)
+    req = requests.get(url)
+    # Check if successful
+    if(req.status_code != 200):
+        return []
+    # Find links
+    links = link_re.findall(req.text)
+    for link in links:
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
+        # Find all emails on current page
+        result.update(email_re.findall(req.text))
     return result

-emails = crawl('http://www.website_goes_here_dot_com', 2)
-
-print "\nScrapped e-mail addresses:"
-for email in emails:
-    print email
-print "\n"
+if __name__ == '__main__':
+    emails = crawl('http://www.realpython.com')
+    print "\nScrapped e-mail addresses:"
+    for email in emails:
+        print email
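For reference, here is a minimal runnable sketch of the email crawler as it stands after this change, ported to Python 3 (urlparse becomes urllib.parse and print becomes a function; the "Scrapped" typo is also corrected here). The target URL is just the placeholder from the commit, and `requests` must be installed:

import re
import urllib.parse

import requests

email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
link_re = re.compile(r'href="(.*?)"')


def crawl(url):
    result = set()
    req = requests.get(url)
    # Bail out if the page could not be fetched
    if req.status_code != 200:
        return result
    # Resolve each link to an absolute URL; note that with the recursion
    # removed in this commit, the loop only rescans the current page
    for link in link_re.findall(req.text):
        link = urllib.parse.urljoin(url, link)
        result.update(email_re.findall(req.text))
    return result


if __name__ == '__main__':
    emails = crawl('http://www.realpython.com')
    print("\nScraped e-mail addresses:")
    for email in emails:
        print(email)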

09_basic_link_web_crawler.py

@@ -6,39 +6,27 @@ import urlparse
 link_re = re.compile(r'href="(.*?)"')

-def crawl(url, maxlevel):
-    result = set()
-    while maxlevel > 0:
-        # Get the webpage
-        req = requests.get(url)
-        # Check if successful
-        if(req.status_code != 200):
-            return []
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        print "\nFound {} links".format(len(links))
-        # Search links for emails
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
-            # add links to result set
-            result.update(link)
-        print "Crawled level: {}".format(maxlevel)
-        # new level
-        maxlevel -= 1
-        # recurse
-        crawl(link, maxlevel)
-    return result
-
-emails = crawl('http://www.website_goes_here_dot_com', 2)
-
-print "\nScrapped links:"
-for link in links:
-    print link
+def crawl(url):
+    req = requests.get(url)
+    # Check if successful
+    if(req.status_code != 200):
+        return []
+    # Find links
+    links = link_re.findall(req.text)
+    for link in links:
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
+        print link
+
+if __name__ == '__main__':
+    crawl('http://www.realpython.com')
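And a matching Python 3 sketch of the simplified link crawler, under the same assumptions (placeholder URL, `requests` installed); after this commit it prints each absolute link found on the starting page instead of recursing:

import re
import urllib.parse

import requests

link_re = re.compile(r'href="(.*?)"')


def crawl(url):
    req = requests.get(url)
    # Bail out if the page could not be fetched
    if req.status_code != 200:
        return
    # Print every href on the page as an absolute URL
    for link in link_re.findall(req.text):
        print(urllib.parse.urljoin(url, link))


if __name__ == '__main__':
    crawl('http://www.realpython.com')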

README.md

@@ -7,6 +7,6 @@
 1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
 1. **06_execution_time.py**: class used for timing execution of code
 1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
-1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
-1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website recursively
+1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website
+1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website
 1. **10_find_files_recursively.py**: recursively grab files from a directory