updates
This commit is contained in:
parent
239a0ff7ff
commit
53da94f078
@ -7,40 +7,36 @@ email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
|
|||||||
link_re = re.compile(r'href="(.*?)"')
|
link_re = re.compile(r'href="(.*?)"')
|
||||||
|
|
||||||
|
|
||||||
def crawl(url, maxlevel):
|
def crawl(url):
|
||||||
|
|
||||||
result = set()
|
result = set()
|
||||||
|
|
||||||
while maxlevel > 0:
|
|
||||||
|
|
||||||
# Get the webpage
|
|
||||||
req = requests.get(url)
|
req = requests.get(url)
|
||||||
|
|
||||||
# Check if successful
|
# Check if successful
|
||||||
if(req.status_code != 200):
|
if(req.status_code != 200):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Find and follow all the links
|
# Find links
|
||||||
links = link_re.findall(req.text)
|
links = link_re.findall(req.text)
|
||||||
|
|
||||||
|
print "\nFound {} links".format(len(links))
|
||||||
|
|
||||||
|
# Search links for emails
|
||||||
for link in links:
|
for link in links:
|
||||||
|
|
||||||
# Get an absolute URL for a link
|
# Get an absolute URL for a link
|
||||||
link = urlparse.urljoin(url, link)
|
link = urlparse.urljoin(url, link)
|
||||||
|
|
||||||
# Find all emails on current page
|
# Find all emails on current page
|
||||||
result.update(email_re.findall(req.text))
|
result.update(email_re.findall(req.text))
|
||||||
|
|
||||||
print "Crawled level: {}".format(maxlevel)
|
|
||||||
|
|
||||||
# new level
|
|
||||||
maxlevel -= 1
|
|
||||||
|
|
||||||
# recurse
|
|
||||||
crawl(link, maxlevel)
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
emails = crawl('http://www.website_goes_here_dot_com', 2)
|
if __name__ == '__main__':
|
||||||
|
emails = crawl('http://www.realpython.com')
|
||||||
|
|
||||||
print "\nScrapped e-mail addresses:"
|
print "\nScrapped e-mail addresses:"
|
||||||
for email in emails:
|
for email in emails:
|
||||||
print email
|
print email
|
||||||
|
print "\n"
|
||||||
|
@ -6,39 +6,27 @@ import urlparse
|
|||||||
link_re = re.compile(r'href="(.*?)"')
|
link_re = re.compile(r'href="(.*?)"')
|
||||||
|
|
||||||
|
|
||||||
def crawl(url, maxlevel):
|
def crawl(url):
|
||||||
|
|
||||||
result = set()
|
|
||||||
|
|
||||||
while maxlevel > 0:
|
|
||||||
|
|
||||||
# Get the webpage
|
|
||||||
req = requests.get(url)
|
req = requests.get(url)
|
||||||
|
|
||||||
# Check if successful
|
# Check if successful
|
||||||
if(req.status_code != 200):
|
if(req.status_code != 200):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Find and follow all the links
|
# Find links
|
||||||
links = link_re.findall(req.text)
|
links = link_re.findall(req.text)
|
||||||
|
|
||||||
|
print "\nFound {} links".format(len(links))
|
||||||
|
|
||||||
|
# Search links for emails
|
||||||
for link in links:
|
for link in links:
|
||||||
|
|
||||||
# Get an absolute URL for a link
|
# Get an absolute URL for a link
|
||||||
link = urlparse.urljoin(url, link)
|
link = urlparse.urljoin(url, link)
|
||||||
# add links to result set
|
|
||||||
result.update(link)
|
|
||||||
|
|
||||||
print "Crawled level: {}".format(maxlevel)
|
|
||||||
|
|
||||||
# new level
|
|
||||||
maxlevel -= 1
|
|
||||||
|
|
||||||
# recurse
|
|
||||||
crawl(link, maxlevel)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
emails = crawl('http://www.website_goes_here_dot_com', 2)
|
|
||||||
|
|
||||||
print "\nScrapped links:"
|
|
||||||
for link in links:
|
|
||||||
print link
|
print link
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
crawl('http://www.realpython.com')
|
||||||
|
@ -7,6 +7,6 @@
|
|||||||
1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
|
1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
|
||||||
1. **06_execution_time.py**: class used for timing execution of code
|
1. **06_execution_time.py**: class used for timing execution of code
|
||||||
1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
|
1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
|
||||||
1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
|
1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website
|
||||||
1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website recursively
|
1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website
|
||||||
1. **10_find_files_recursively.py**: recursively grab files from a directory
|
1. **10_find_files_recursively.py**: recursively grab files from a directory
|
||||||
|
Loading…
Reference in New Issue
Block a user