Update 08_basic_email_web_crawler.py

This is a much simpler, easily understandable version of the script.
This commit is contained in:
RayKon∞ 2015-12-02 21:51:55 +05:30
parent 436f3119f9
commit 761e0ecec2

View File

@ -1,45 +1,26 @@
"""Basic email/link web crawler.

Prompts for a URL, downloads that page, prints how many hyperlinks it
contains, then prints every email address found on the page.
"""
import re


# Compiled once at module level; raw strings so backslashes reach the
# regex engine untouched.
# Non-capturing (?:...) group so findall() yields URL strings, not
# (url, scheme) tuples as the old inner capturing group did.
LINK_RE = re.compile(r'"((?:http|ftp)s?://.*?)"')
# No comma in the character classes: the old pattern [\w\.,]+ would fuse
# "a@b.com,c@d.com" into a single bogus address.
EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')


def find_links(html):
    """Return every quoted http/https/ftp/ftps URL in *html*."""
    return LINK_RE.findall(html)


def find_emails(html):
    """Return every email address found in *html*."""
    return EMAIL_RE.findall(html)


def main():
    """Prompt for a URL, fetch it, and report its links and emails."""
    # Local import keeps the module importable (and the helpers usable)
    # even where the third-party `requests` package is absent.
    import requests

    url = input('Enter a URL (include `http://`): ')

    # Connect to the URL and fail loudly on 4xx/5xx instead of silently
    # scraping an error page (the old version returned [] on != 200).
    response = requests.get(url)
    response.raise_for_status()
    html = response.text

    # Report the number of links found on the page.
    links = find_links(html)
    print("\nFound {} links".format(len(links)))

    for email in find_emails(html):
        print(email)


if __name__ == '__main__':
    main()