From 53da94f078490eb1a47b9eb66776c8419d1e080f Mon Sep 17 00:00:00 2001
From: Michael Herman
Date: Sun, 18 May 2014 12:16:56 -0600
Subject: [PATCH] updates

---
 08_basic_email_web_crawler.py | 44 ++++++++++++++++-------------------
 09_basic_link_web_crawler.py  | 42 ++++++++++++---------------------
 readme.md                     |  4 ++--
 3 files changed, 37 insertions(+), 53 deletions(-)

diff --git a/08_basic_email_web_crawler.py b/08_basic_email_web_crawler.py
index 30a2282..faca75f 100644
--- a/08_basic_email_web_crawler.py
+++ b/08_basic_email_web_crawler.py
@@ -7,40 +7,36 @@
 email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
 link_re = re.compile(r'href="(.*?)"')
 
-def crawl(url, maxlevel):
+def crawl(url):
 
     result = set()
 
-    while maxlevel > 0:
+    req = requests.get(url)
 
-        # Get the webpage
-        req = requests.get(url)
+    # Check if successful
+    if(req.status_code != 200):
+        return []
 
-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    # Find links
+    links = link_re.findall(req.text)
 
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
+    print "\nFound {} links".format(len(links))
 
-            # Find all emails on current page
-            result.update(email_re.findall(req.text))
+    # Search links for emails
+    for link in links:
 
-        print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
 
-        # new level
-        maxlevel -= 1
-
-        # recurse
-        crawl(link, maxlevel)
+        # Find all emails on the linked page
+        result.update(email_re.findall(requests.get(link).text))
 
     return result
 
-emails = crawl('http://www.website_goes_here_dot_com', 2)
+if __name__ == '__main__':
+    emails = crawl('http://www.realpython.com')
 
-print "\nScrapped e-mail addresses:"
-for email in emails:
-    print email
+    print "\nScraped e-mail addresses:"
+    for email in emails:
+        print email
+    print "\n"
diff --git a/09_basic_link_web_crawler.py b/09_basic_link_web_crawler.py
index 47be8a1..95849d2 100644
--- a/09_basic_link_web_crawler.py
+++ b/09_basic_link_web_crawler.py
@@ -6,39 +6,27 @@ import urlparse
 
 link_re = re.compile(r'href="(.*?)"')
 
-def crawl(url, maxlevel):
+def crawl(url):
 
-    result = set()
+    req = requests.get(url)
 
-    while maxlevel > 0:
+    # Check if successful
+    if(req.status_code != 200):
+        return []
 
-        # Get the webpage
-        req = requests.get(url)
+    # Find links
+    links = link_re.findall(req.text)
 
-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    print "\nFound {} links".format(len(links))
 
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
-            # add links to result set
-            result.update(link)
+    # Print all the links
+    for link in links:
 
-        print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
 
-        # new level
-        maxlevel -= 1
+        print link
 
-        # recurse
-        crawl(link, maxlevel)
-    return result
-
-emails = crawl('http://www.website_goes_here_dot_com', 2)
-
-print "\nScrapped links:"
-for link in links:
-    print link
+if __name__ == '__main__':
+    crawl('http://www.realpython.com')
diff --git a/readme.md b/readme.md
index 217a3f2..412f67f 100644
--- a/readme.md
+++ b/readme.md
@@ -7,6 +7,6 @@
 1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
 1. **06_execution_time.py**: class used for timing execution of code
 1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
-1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
-1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website recursively
+1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website
+1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website
 1. **10_find_files_recursively.py**: recursively grab files from a directory
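
Note: the patched scripts target Python 2 (print statements, the urlparse module). A rough sketch of the refactored email crawler under Python 3 follows; it is an assumed port, not part of the commit, requiring only the third-party requests package, and it reuses the example target URL and regexes from the patch.

    # Rough Python 3 sketch of 08_basic_email_web_crawler.py (assumed port,
    # not part of the patch): print becomes a function, and urlparse.urljoin
    # moves to urllib.parse.urljoin.
    import re

    import requests
    from urllib.parse import urljoin

    email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
    link_re = re.compile(r'href="(.*?)"')


    def crawl(url):
        result = set()

        req = requests.get(url)

        # Check if successful
        if req.status_code != 200:
            return result

        # Find links
        links = link_re.findall(req.text)
        print("\nFound {} links".format(len(links)))

        # Search each linked page for emails
        for link in links:
            # Get an absolute URL for the link
            link = urljoin(url, link)

            # Skip mailto:/javascript: hrefs that requests cannot fetch
            if not link.startswith('http'):
                continue

            result.update(email_re.findall(requests.get(link).text))

        return result


    if __name__ == '__main__':
        for email in crawl('http://www.realpython.com'):
            print(email)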