Update 08_basic_email_web_crawler.py

This is a much simpler version of the script (easily understandable).
This commit is contained in:
RayKon∞ 2015-12-02 21:51:55 +05:30
parent 436f3119f9
commit 761e0ecec2

View File

@ -1,45 +1,26 @@
import requests import requests
import re import re
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
# regex #get url
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)') #url=input('Enter a URL (include 'http://'):')--this is wrong
link_re = re.compile(r'href="(.*?)"') url = input('Enter a URL (include `http://`): ')
def crawl(url): #connect to the url
website=requests.get(url)
result = set() #read html
html=website.text
req = requests.get(url)
# Check if successful #use re.findall to grab all the links
if(req.status_code != 200): links = re.findall('"((http|ftp)s?://.*?)"', html)
return []
# Find links emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
links = link_re.findall(req.text)
print("\nFound {} links".format(len(links)))
# Search links for emails #prints the number of links in the list
for link in links: print("\nFound {} links".format(len(links)))
# Get an absolute URL for a link for email in emails:
link = urljoin(url, link) print(email)
# Find all emails on current page
result.update(email_re.findall(req.text))
return result
if __name__ == '__main__':
emails = crawl('http://www.realpython.com')
print("\nScrapped e-mail addresses:")
for email in emails:
print(email)
print("\n")