added web crawler
This commit is contained in:
parent
7a6597cf8c
commit
7409f60137
45
08_basic_email_web_crawler.py
Normal file
45
08_basic_email_web_crawler.py
Normal file
@ -0,0 +1,45 @@
|
||||
import requests
|
||||
import re
|
||||
import urlparse
|
||||
|
||||
# Pre-compiled patterns, built once at import time.
# email_re: e-mail addresses. The character class is [\w.-] — dots and
# hyphens are legal in local parts and domains (e.g. john.doe@my-site.com);
# the original [\w\.,]+ wrongly admitted commas and rejected hyphens, so
# hyphenated domains were never matched at all.
email_re = re.compile(r'([\w.-]+@[\w.-]+\.\w+)')
# link_re: the target of every href="..." attribute (non-greedy up to the
# closing quote).
link_re = re.compile(r'href="(.*?)"')
|
||||
|
||||
def crawl(url, maxlevel):
|
||||
|
||||
result = set()
|
||||
|
||||
while maxlevel > 0:
|
||||
|
||||
# Get the webpage
|
||||
req = requests.get(url)
|
||||
|
||||
# Check if successful
|
||||
if(req.status_code != 200):
|
||||
return []
|
||||
|
||||
# Find and follow all the links
|
||||
links = link_re.findall(req.text)
|
||||
for link in links:
|
||||
# Get an absolute URL for a link
|
||||
link = urlparse.urljoin(url, link)
|
||||
|
||||
# Find all emails on current page
|
||||
result.update(email_re.findall(req.text))
|
||||
|
||||
print "Crawled level: {}".format(maxlevel)
|
||||
|
||||
# new level
|
||||
maxlevel -= 1
|
||||
|
||||
# recurse
|
||||
crawl(link, maxlevel)
|
||||
|
||||
return result
|
||||
|
||||
emails = crawl('http://www.website_goes_here_dot_com', 2)
|
||||
|
||||
print "\nScrapped e-mail addresses:"
|
||||
for email in emails:
|
||||
print email
|
@ -7,3 +7,4 @@
|
||||
1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
|
||||
1. **06_execution_time.py**: class used for timing execution of code
|
||||
1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
|
||||
1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
|
||||
|
Loading…
Reference in New Issue
Block a user