Merge pull request #5 from RajuKoushik/patch-1

Update 08_basic_email_web_crawler.py
This commit is contained in:
Michael Herman 2016-02-19 00:07:19 +02:00
commit 780cad2988

View File

@@ -1,45 +1,26 @@
import requests import requests
import re import re
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
# regex #get url
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)') #url=input('Enter a URL (include 'http://'):')--this is wrong
link_re = re.compile(r'href="(.*?)"') url = input('Enter a URL (include `http://`): ')
def crawl(url): #connect to the url
website=requests.get(url)
result = set() #read html
html=website.text
req = requests.get(url)
# Check if successful #use re.findall to grab all the links
if(req.status_code != 200): links = re.findall('"((http|ftp)s?://.*?)"', html)
return []
# Find links emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
links = link_re.findall(req.text)
#prints the number of links in the list
print("\nFound {} links".format(len(links))) print("\nFound {} links".format(len(links)))
# Search links for emails
for link in links:
# Get an absolute URL for a link
link = urljoin(url, link)
# Find all emails on current page
result.update(email_re.findall(req.text))
return result
if __name__ == '__main__':
emails = crawl('http://www.realpython.com')
print("\nScrapped e-mail addresses:")
for email in emails: for email in emails:
print(email) print(email)
print("\n")