added stock scraper, converted all scripts to python 2/3 compatibility

2015-05-17 03:49:35 -06:00 · 2015-05-17 03:49:35 -06:00 · 0bb4c8c255
commit 0bb4c8c255
parent 5bb3679901
22 changed files with 129 additions and 72 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,5 @@
 .pyc
 .DS_Store
 _tmp
+env
+__pycache__
--- a/02_find_all_links.py
+++ b/02_find_all_links.py
@ -1,18 +1,18 @@
-import urllib2
+import requests
 import re

 # get url
-url =raw_input('Enter a URL (include `http://`): ')
+url = input('Enter a URL (include `http://`): ')

 # connect to the url
-website = urllib2.urlopen(url)
+website = requests.get(url)

 # read html
-html = website.read()
+html = website.text

 # use re.findall to grab all the links
 links = re.findall('"((http|ftp)s?://.*?)"', html)

 # output links
 for link in links:
-	print link[0]
+    print(link[0])
--- a/03_simple_twitter_manager.py
+++ b/03_simple_twitter_manager.py
@ -16,10 +16,13 @@ twitter_api = twitter.Api(
 if __name__ == '__main__':
    follower_ids = twitter_api.GetFollowerIDs()
    following_ids = twitter_api.GetFriendIDs()
-    zombie_follows = [following_id for following_id in following_ids if following_id not in follower_ids]
+    zombie_follows = [following_id for following_id in
+                      following_ids if following_id not in follower_ids]

-    confirm = raw_input("Are you sure you want to unfollow %s tweeps [y|n]? " % (len(zombie_follows)))
+    confirm = raw_input(
+        "Are you sure you want to unfollow {0} tweeps [y|n]? ".format(
+            (len(zombie_follows))))
    if confirm.lower() == 'y':
        for id in zombie_follows:
            user = twitter_api.DestroyFriendship(user_id=id)
-            print "Unfollowed %s" % (user.screen_name)
+            print("Unfollowed {0}".format(user.screen_name))
--- a/04_rename_with_slice.py
+++ b/04_rename_with_slice.py
@ -8,7 +8,7 @@ for file in glob.glob("*.json"):
    new_file_name = file_name[:-6] + extension
    try:
        os.rename(file, new_file_name)
-    except OSError, e:
-        print e
+    except OSError as e:
+        print(e)
    else:
-        print "Renamed {} to {}".format(file, new_file_name)
+        print("Renamed {} to {}".format(file, new_file_name))
--- a/05_load_json_without_dupes.py
+++ b/05_load_json_without_dupes.py
@ -1,11 +1,9 @@
-import json
-
 def dict_raise_on_duplicates(ordered_pairs):
    """reject duplicate keys"""
    my_dict = dict()
    for key, values in ordered_pairs:
        if key in my_dict:
-           raise ValueError("Duplicate key: {}".format(key,))
+            raise ValueError("Duplicate key: {}".format(key,))
        else:
-           my_dict[key] = values
+            my_dict[key] = values
    return my_dict
--- a/06_execution_time.py
+++ b/06_execution_time.py
@ -13,6 +13,7 @@ For example:


 import time
+import random


 class ExecutionTime:
@ -25,9 +26,9 @@ class ExecutionTime:

 # ---- run code ---- #

-import random

 timer = ExecutionTime()
 sample_list = list()
-my_list = [random.randint(1, 888898) for num in xrange(1, 1000000) if num % 2 == 0]
-print 'Finished in {} seconds.'.format(timer.duration())
+my_list = [random.randint(1, 888898) for num in
+           range(1, 1000000) if num % 2 == 0]
+print('Finished in {} seconds.'.format(timer.duration()))
--- a/07_benchmark_permissions_loading_django.py
+++ b/07_benchmark_permissions_loading_django.py
@ -14,8 +14,8 @@ def timeit(method):
        te = time.time()
        all_times.append(te - ts)

-        print all_times
-        print numpy.mean(all_times)
+        print(all_times)
+        print(numpy.mean(all_times))
        return result

    return timed
--- a/08_basic_email_web_crawler.py
+++ b/08_basic_email_web_crawler.py
@ -1,6 +1,9 @@
 import requests
 import re
-import urlparse
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin

 # regex
 email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
@ -20,13 +23,13 @@ def crawl(url):
    # Find links
    links = link_re.findall(req.text)

-    print "\nFound {} links".format(len(links))
+    print("\nFound {} links".format(len(links)))

    # Search links for emails
    for link in links:

        # Get an absolute URL for a link
-        link = urlparse.urljoin(url, link)
+        link = urljoin(url, link)

        # Find all emails on current page
        result.update(email_re.findall(req.text))
@ -36,7 +39,7 @@ def crawl(url):
 if __name__ == '__main__':
    emails = crawl('http://www.realpython.com')

-    print "\nScrapped e-mail addresses:"
+    print("\nScrapped e-mail addresses:")
    for email in emails:
-        print email
-    print "\n"
+        print(email)
+    print("\n")
--- a/09_basic_link_web_crawler.py
+++ b/09_basic_link_web_crawler.py
@ -1,6 +1,9 @@
 import requests
 import re
-import urlparse
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin

 # regex
 link_re = re.compile(r'href="(.*?)"')
@ -17,17 +20,15 @@ def crawl(url):
    # Find links
    links = link_re.findall(req.text)

-    print "\nFound {} links".format(len(links))
+    print("\nFound {} links".format(len(links)))

    # Search links for emails
    for link in links:

        # Get an absolute URL for a link
-        link = urlparse.urljoin(url, link)
-
-        print link
+        link = urljoin(url, link)

+        print(link)

 if __name__ == '__main__':
    crawl('http://www.realpython.com')
-
--- a/10_find_files_recursively.py
+++ b/10_find_files_recursively.py
@ -2,7 +2,7 @@ import fnmatch
 import os

 # constants
-PATH = '/../../../..'
+PATH = './'
 PATTERN = '*.py'


@ -14,17 +14,17 @@ def get_file_names(filepath, pattern):
                # matches.append(os.path.join(root, filename))  # full path
                matches.append(os.path.join(filename))  # just file name
        if matches:
-            print "Found {} files:".format(len(matches))
+            print("Found {} files:".format(len(matches)))
            output_files(matches)
        else:
-            print "No files found."
+            print("No files found.")
    else:
-        print "Sorry that path does not exist. Try again."
+        print("Sorry that path does not exist. Try again.")


 def output_files(list_of_files):
    for filename in list_of_files:
-        print filename
+        print(filename)


 if __name__ == '__main__':
--- a/11_optimize_images_with_wand.py
+++ b/11_optimize_images_with_wand.py
@ -1,9 +1,9 @@
 import fnmatch
 import os

-# sudo pip install Wand
+# pip install Wand
 from wand.image import Image
-# sudo pip install http://pypi.python.org/packages/source/h/hurry.filesize/hurry.filesize-0.9.tar.gz
+# pip install http://pypi.python.org/packages/source/h/hurry.filesize/hurry.filesize-0.9.tar.gz
 from hurry.filesize import size


@ -19,12 +19,13 @@ def get_image_file_names(filepath, pattern):
            for filename in fnmatch.filter(filenames, pattern):
                matches.append(os.path.join(root, filename))  # full path
        if matches:
-            print "Found {} files, with a total file size of {}.".format(len(matches), get_total_size(matches))
+            print("Found {} files, with a total file size of {}.".format(
+                len(matches), get_total_size(matches)))
            return matches
        else:
-            print "No files found."
+            print("No files found.")
    else:
-        print "Sorry that path does not exist. Try again."
+        print("Sorry that path does not exist. Try again.")


 def get_total_size(list_of_image_names):
@ -35,7 +36,7 @@ def get_total_size(list_of_image_names):


 def resize_images(list_of_image_names):
-    print "Optimizing ... "
+    print("Optimizing ... ")
    for index, image_name in enumerate(list_of_image_names):
        with open(image_name) as f:
            image_binary = f.read()
@ -43,7 +44,7 @@ def resize_images(list_of_image_names):
            if img.height >= 600:
                img.transform(resize='x600')
                img.save(filename=image_name)
-    print "Optimization complete."
+    print("Optimization complete.")


 if __name__ == '__main__':
--- a/12_csv_split.py
+++ b/12_csv_split.py
@ -117,10 +117,10 @@ def parse_file(arguments):
                writer = writer.writerows(chunk)

            # Output info
-            print ""
-            print "Chunk # {}:".format(current_chunk)
-            print "Filepath: {}".format(current_output)
-            print "# of rows: {}".format(len(chunk))
+            print("")
+            print("Chunk # {}:".format(current_chunk))
+            print("Filepath: {}".format(current_output))
+            print("# of rows: {}".format(len(chunk)))

            # Create new chunk
            current_chunk += 1
--- a/13_random_name_generator.py
+++ b/13_random_name_generator.py
@ -10,7 +10,7 @@ def random_name_generator(first, second, x):
         - number of random names
    """
    names = []
-    for i in xrange(0, int(x)):
+    for i in range(0, int(x)):
        random_first = randint(0, len(first)-1)
        random_last = randint(0, len(second)-1)
        names.append("{0} {1}".format(
@ -23,4 +23,4 @@ def random_name_generator(first, second, x):
 first_names = ["Drew", "Mike", "Landon", "Jeremy", "Tyler", "Tom", "Avery"]
 last_names = ["Smith", "Jones", "Brighton", "Taylor"]
 names = random_name_generator(first_names, last_names, 5)
-print '\n'.join(names)
+print('\n'.join(names))
--- a/15_check_my_environment.py
+++ b/15_check_my_environment.py
@ -11,7 +11,7 @@ class Main:
        pass

    def process(self):
-        print "ok"
+        print("ok")

 if __name__ == "__main__":
    m = Main(some_script.CONFIGFILE)
@ -39,7 +39,7 @@ CONFIGFILE = get_config_file()
 if CONFIGFILE is None:
    sys.exit("Configuration error! Unknown environment set. \
              Edit config.py and set appropriate environment")
-print "Config file: {}".format(CONFIGFILE)
+print("Config file: {}".format(CONFIGFILE))
 if not os.path.exists(CONFIGFILE):
    sys.exit("Configuration error! Config file does not exist")
-print "Config ok ...."
+print("Config ok ....")
--- a/18_zipper.py
+++ b/18_zipper.py
@ -3,7 +3,7 @@ from datetime import datetime
 from zipfile import ZipFile


-#set file name and time of creation
+# set file name and time of creation
 today = datetime.now()
 file_name = 'zipper_' + today.strftime('%Y.%m.%dh%H%M') + '.zip'
 dir_name = 'tmp/'  # update path
--- a/20_restore_file_from_git.py
+++ b/20_restore_file_from_git.py
@ -1,9 +1,9 @@
 from subprocess import check_output, call


-file_name = str(raw_input('Enter the file name: '))
+file_name = str(input('Enter the file name: '))
 commit = check_output(["git", "rev-list", "-n", "1", "HEAD", "--", file_name])
-print str(commit).rstrip()
+print(str(commit).rstrip())
 call(["git", "checkout", str(commit).rstrip()+"~1", file_name])


--- a/22_git_tag.py
+++ b/22_git_tag.py
@ -10,5 +10,5 @@ if len(sys.argv) == 3:
    subprocess.call(command, shell=True)
    subprocess.call('git push --tags', shell=True)
 else:
-    print 'usage: tag.py TAG_NAME COMMIT'
+    print('usage: tag.py TAG_NAME COMMIT')
    sys.exit(1)
--- a/24_sql2csv.py
+++ b/24_sql2csv.py
@ -3,7 +3,7 @@ import csv
 import sqlite3

 if len(sys.argv) < 3:
-    print "Use: {0} DATABASE_NAME TABLE_NAME".format(sys.argv[0])
+    print("Use: {0} DATABASE_NAME TABLE_NAME".format(sys.argv[0]))
    exit()

 conn = sqlite3.connect(sys.argv[1])
--- a/25_ip2geolocation.py
+++ b/25_ip2geolocation.py
@ -9,7 +9,7 @@ def get_addresses(filename):
    row info from the csv file.
    """
    all_addresses = []
-    with open(filename, 'rb') as f:
+    with open(filename, 'rt') as f:
        reader = csv.reader(f)
        for row in reader:
            all_addresses.append(row)
@ -29,7 +29,7 @@ def get_geolocation(all_the_ip_address):
    header_row.extend(['Country', 'City'])
    # get geolocation
    for line in all_the_ip_address:
-        print "Grabbing geo info for row # {0}".format(counter)
+        print("Grabbing geo info for row # {0}".format(counter))
        r = requests.get('https://freegeoip.net/json/{0}'.format(line[0]))
        line.extend([str(r.json()['country_name']), str(r.json()['city'])])
        updated_addresses.append(line)
@ -43,10 +43,15 @@ def create_csv(updated_address_list):
    Given the updated lists of lists from `get_geolocation()`, this function
    creates a new CSV.
    """
-    with open('output.csv', 'wb') as f:
+    import sys
+    if sys.version_info >= (3, 0, 0):
+        f = open('output.csv', 'w', newline='')
+    else:
+        f = open('output.csv', 'wb')
+    with f:
        writer = csv.writer(f)
        writer.writerows(updated_address_list)
-    print "All done!"
+    print("All done!")


 if __name__ == '__main__':
--- a/26_stock_scraper.py
+++ b/26_stock_scraper.py
@ -0,0 +1,32 @@
+import requests
+from lxml import html
+from collections import defaultdict
+
+
+def get_stocks(url):
+    # Make Request
+    page = requests.get(url)
+    # Parse/Scrape
+    tree = html.fromstring(page.text)
+    xpath = '//*[@id="mw-content-text"]/table[1]'
+    rows = tree.xpath(xpath)[0].findall("tr")
+    rows = [(row.getchildren()[0], row.getchildren()[3]) for row in rows[1:]]
+    rows = [(row[0].getchildren()[0].text, row[1].text) for row in rows]
+    industries = defaultdict(list)
+    for row in rows:
+        industries[row[1]].append(row[0])
+    return industries
+
+
+def output_data(data_dict):
+    for industry in data_dict:
+        print('\n'+industry)
+        print('-'*len(industry))
+        for ticker in data_dict[industry]:
+            print(ticker)
+
+
+if __name__ == '__main__':
+    url = 'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
+    scraped_data = get_stocks(url)
+    output_data(scraped_data)
--- a/readme.md
+++ b/readme.md
@ -25,3 +25,4 @@
 1. **23_flask_session_test.py**: Just a simple app to see if the sessions are working
 1. **24_sql2csv.py**: SQL to CSV.
 1. **25_ip2geolocation.py**: Given a CSV file with an ip address (see sample - *25_sample_csv.csv*), return the geolocation based on the ip.
+1. **26_stock_scraper.py**: Scrape the S&P 500 Companies list from Wikipedia, then output the data.
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,10 @@
+Flask==0.10.1
+Jinja2==2.7.3
+MarkupSafe==0.23
+Wand==0.4.0
+Werkzeug==0.10.4
+hurry.filesize==0.9
+itsdangerous==0.24
+lxml==3.4.4
+numpy==1.9.2
+requests==2.7.0