diff --git a/08_basic_email_web_crawler.py b/08_basic_email_web_crawler.py
index 9c6c58f..a7dbbce 100644
--- a/08_basic_email_web_crawler.py
+++ b/08_basic_email_web_crawler.py
@@ -1,45 +1,23 @@
 import requests
 import re
-try:
-    from urllib.parse import urljoin
-except ImportError:
-    from urlparse import urljoin
 
-# regex
-email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
-link_re = re.compile(r'href="(.*?)"')
+# get url
+url = input('Enter a URL (include `http://`): ')
 
-
-def crawl(url):
+# connect to the url
+website = requests.get(url)
 
-    result = set()
+# read html
+html = website.text
 
-    req = requests.get(url)
+# use re.findall to grab all the links
+links = re.findall('"((http|ftp)s?://.*?)"', html)
 
-    # Check if successful
-    if(req.status_code != 200):
-        return []
+# use re.findall to grab all the email addresses
+emails = re.findall(r'([\w\.,]+@[\w\.,]+\.\w+)', html)
 
-    # Find links
-    links = link_re.findall(req.text)
-
-    print("\nFound {} links".format(len(links)))
+# print the number of links in the list
+print("\nFound {} links".format(len(links)))
 
-    # Search links for emails
-    for link in links:
-
-        # Get an absolute URL for a link
-        link = urljoin(url, link)
-
-        # Find all emails on current page
-        result.update(email_re.findall(req.text))
-
-    return result
-
-if __name__ == '__main__':
-    emails = crawl('http://www.realpython.com')
-
-    print("\nScrapped e-mail addresses:")
-    for email in emails:
-        print(email)
-    print("\n")
+for email in emails:
+    print(email)
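
Note (not part of the patch): the rewrite only scans the single page the user enters. The removed crawl() looped over the discovered links but re-scanned req.text on every iteration and never fetched the links themselves, so it never collected emails from linked pages either. If one-level crawling is actually wanted, a minimal sketch along these lines could work; it reuses the same regexes, but switches the link pattern to a non-capturing group so findall() yields plain strings instead of (url, scheme) tuples. This is an illustration under those assumptions, not part of this change:

import re
import requests

url = input('Enter a URL (include `http://`): ')
html = requests.get(url).text

# Non-capturing inner group: findall() then returns URL strings, not tuples.
link_re = re.compile(r'"((?:http|ftp)s?://.*?)"')
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')

# Emails on the start page itself.
emails = set(email_re.findall(html))

# Follow each discovered link once and collect emails from those pages too.
for link in link_re.findall(html):
    try:
        emails.update(email_re.findall(requests.get(link).text))
    except requests.RequestException:
        pass  # skip links that fail to load

for email in sorted(emails):
    print(email)

Since the link pattern only matches absolute http/ftp URLs, the urljoin() call from the removed code is not needed here; it would only matter if relative href values were being followed as well.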