updates

2014-05-18 12:16:56 -06:00 · 2014-05-18 12:16:56 -06:00 · 53da94f078
commit 53da94f078
parent 239a0ff7ff
3 changed files with 37 additions and 53 deletions
--- a/08_basic_email_web_crawler.py
+++ b/08_basic_email_web_crawler.py
@ -7,40 +7,36 @@ email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
 link_re = re.compile(r'href="(.*?)"')


-def crawl(url, maxlevel):
+def crawl(url):

    result = set()

-    while maxlevel > 0:
-
-        # Get the webpage
    req = requests.get(url)

    # Check if successful
    if(req.status_code != 200):
        return []

-        # Find and follow all the links
+    # Find links
    links = link_re.findall(req.text)
+
+    print "\nFound {} links".format(len(links))
+
+    # Collect emails (note: this rescans the already-fetched page once per link; the links themselves are not requested)
    for link in links:
+
        # Get an absolute URL for a link
        link = urlparse.urljoin(url, link)

        # Find all emails on current page
        result.update(email_re.findall(req.text))

-            print "Crawled level: {}".format(maxlevel)
-
-            # new level
-            maxlevel -= 1
-
-            # recurse
-            crawl(link, maxlevel)
-
    return result

-emails = crawl('http://www.website_goes_here_dot_com', 2)
+if __name__ == '__main__':
+    emails = crawl('http://www.realpython.com')

    print "\nScrapped e-mail addresses:"
    for email in emails:
        print email
+    print "\n"
--- a/09_basic_link_web_crawler.py
+++ b/09_basic_link_web_crawler.py
@ -6,39 +6,27 @@ import urlparse
 link_re = re.compile(r'href="(.*?)"')


-def crawl(url, maxlevel):
+def crawl(url):

-    result = set()
-
-    while maxlevel > 0:
-
-        # Get the webpage
    req = requests.get(url)

    # Check if successful
    if(req.status_code != 200):
        return []

-        # Find and follow all the links
+    # Find links
    links = link_re.findall(req.text)
+
+    print "\nFound {} links".format(len(links))
+
+    # Print each link as an absolute URL
    for link in links:
+
        # Get an absolute URL for a link
        link = urlparse.urljoin(url, link)
-            # add links to result set
-            result.update(link)

-            print "Crawled level: {}".format(maxlevel)
-
-            # new level
-            maxlevel -= 1
-
-            # recurse
-            crawl(link, maxlevel)
-
-    return result
-
-emails = crawl('http://www.website_goes_here_dot_com', 2)
-
-print "\nScrapped links:"
-for link in links:
        print link
+
+
+if __name__ == '__main__':
+    crawl('http://www.realpython.com')
--- a/readme.md
+++ b/readme.md
@ -7,6 +7,6 @@
 1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
 1. **06_execution_time.py**: class used for timing execution of code
 1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
-1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
-1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website recursively
+1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website
+1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website
 1. **10_find_files_recursively.py**: recursively grab files from a directory