added stock scraper, converted all scripts to python 2/3 compatibility

This commit is contained in:
Michael Herman 2015-05-17 03:49:35 -06:00
parent 5bb3679901
commit 0bb4c8c255
22 changed files with 129 additions and 72 deletions

2
.gitignore vendored
View File

@ -1,3 +1,5 @@
.pyc .pyc
.DS_Store .DS_Store
_tmp _tmp
env
__pycache__

View File

@ -1,18 +1,18 @@
import urllib2 import requests
import re import re
# get url # get url
url =raw_input('Enter a URL (include `http://`): ') url = input('Enter a URL (include `http://`): ')
# connect to the url # connect to the url
website = urllib2.urlopen(url) website = requests.get(url)
# read html # read html
html = website.read() html = website.text
# use re.findall to grab all the links # use re.findall to grab all the links
links = re.findall('"((http|ftp)s?://.*?)"', html) links = re.findall('"((http|ftp)s?://.*?)"', html)
# output links # output links
for link in links: for link in links:
print link[0] print(link[0])

View File

@ -16,10 +16,13 @@ twitter_api = twitter.Api(
if __name__ == '__main__': if __name__ == '__main__':
follower_ids = twitter_api.GetFollowerIDs() follower_ids = twitter_api.GetFollowerIDs()
following_ids = twitter_api.GetFriendIDs() following_ids = twitter_api.GetFriendIDs()
zombie_follows = [following_id for following_id in following_ids if following_id not in follower_ids] zombie_follows = [following_id for following_id in
following_ids if following_id not in follower_ids]
confirm = raw_input("Are you sure you want to unfollow %s tweeps [y|n]? " % (len(zombie_follows))) confirm = raw_input(
"Are you sure you want to unfollow {0} tweeps [y|n]? ".format(
(len(zombie_follows))))
if confirm.lower() == 'y': if confirm.lower() == 'y':
for id in zombie_follows: for id in zombie_follows:
user = twitter_api.DestroyFriendship(user_id=id) user = twitter_api.DestroyFriendship(user_id=id)
print "Unfollowed %s" % (user.screen_name) print("Unfollowed {0}".format(user.screen_name))

View File

@ -8,7 +8,7 @@ for file in glob.glob("*.json"):
new_file_name = file_name[:-6] + extension new_file_name = file_name[:-6] + extension
try: try:
os.rename(file, new_file_name) os.rename(file, new_file_name)
except OSError, e: except OSError as e:
print e print(e)
else: else:
print "Renamed {} to {}".format(file, new_file_name) print("Renamed {} to {}".format(file, new_file_name))

View File

@ -1,5 +1,3 @@
import json
def dict_raise_on_duplicates(ordered_pairs): def dict_raise_on_duplicates(ordered_pairs):
"""reject duplicate keys""" """reject duplicate keys"""
my_dict = dict() my_dict = dict()

View File

@ -13,6 +13,7 @@ For example:
import time import time
import random
class ExecutionTime: class ExecutionTime:
@ -25,9 +26,9 @@ class ExecutionTime:
# ---- run code ---- # # ---- run code ---- #
import random
timer = ExecutionTime() timer = ExecutionTime()
sample_list = list() sample_list = list()
my_list = [random.randint(1, 888898) for num in xrange(1, 1000000) if num % 2 == 0] my_list = [random.randint(1, 888898) for num in
print 'Finished in {} seconds.'.format(timer.duration()) range(1, 1000000) if num % 2 == 0]
print('Finished in {} seconds.'.format(timer.duration()))

View File

@ -14,8 +14,8 @@ def timeit(method):
te = time.time() te = time.time()
all_times.append(te - ts) all_times.append(te - ts)
print all_times print(all_times)
print numpy.mean(all_times) print(numpy.mean(all_times))
return result return result
return timed return timed

View File

@ -1,6 +1,9 @@
import requests import requests
import re import re
import urlparse try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
# regex # regex
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)') email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
@ -20,13 +23,13 @@ def crawl(url):
# Find links # Find links
links = link_re.findall(req.text) links = link_re.findall(req.text)
print "\nFound {} links".format(len(links)) print("\nFound {} links".format(len(links)))
# Search links for emails # Search links for emails
for link in links: for link in links:
# Get an absolute URL for a link # Get an absolute URL for a link
link = urlparse.urljoin(url, link) link = urljoin(url, link)
# Find all emails on current page # Find all emails on current page
result.update(email_re.findall(req.text)) result.update(email_re.findall(req.text))
@ -36,7 +39,7 @@ def crawl(url):
if __name__ == '__main__': if __name__ == '__main__':
emails = crawl('http://www.realpython.com') emails = crawl('http://www.realpython.com')
print "\nScrapped e-mail addresses:" print("\nScrapped e-mail addresses:")
for email in emails: for email in emails:
print email print(email)
print "\n" print("\n")

View File

@ -1,6 +1,9 @@
import requests import requests
import re import re
import urlparse try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
# regex # regex
link_re = re.compile(r'href="(.*?)"') link_re = re.compile(r'href="(.*?)"')
@ -17,17 +20,15 @@ def crawl(url):
# Find links # Find links
links = link_re.findall(req.text) links = link_re.findall(req.text)
print "\nFound {} links".format(len(links)) print("\nFound {} links".format(len(links)))
# Search links for emails # Search links for emails
for link in links: for link in links:
# Get an absolute URL for a link # Get an absolute URL for a link
link = urlparse.urljoin(url, link) link = urljoin(url, link)
print link
print(link)
if __name__ == '__main__': if __name__ == '__main__':
crawl('http://www.realpython.com') crawl('http://www.realpython.com')

View File

@ -2,7 +2,7 @@ import fnmatch
import os import os
# constants # constants
PATH = '/../../../..' PATH = './'
PATTERN = '*.py' PATTERN = '*.py'
@ -14,17 +14,17 @@ def get_file_names(filepath, pattern):
# matches.append(os.path.join(root, filename)) # full path # matches.append(os.path.join(root, filename)) # full path
matches.append(os.path.join(filename)) # just file name matches.append(os.path.join(filename)) # just file name
if matches: if matches:
print "Found {} files:".format(len(matches)) print("Found {} files:".format(len(matches)))
output_files(matches) output_files(matches)
else: else:
print "No files found." print("No files found.")
else: else:
print "Sorry that path does not exist. Try again." print("Sorry that path does not exist. Try again.")
def output_files(list_of_files): def output_files(list_of_files):
for filename in list_of_files: for filename in list_of_files:
print filename print(filename)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,9 +1,9 @@
import fnmatch import fnmatch
import os import os
# sudo pip install Wand # pip install Wand
from wand.image import Image from wand.image import Image
# sudo pip install http://pypi.python.org/packages/source/h/hurry.filesize/hurry.filesize-0.9.tar.gz # pip install http://pypi.python.org/packages/source/h/hurry.filesize/hurry.filesize-0.9.tar.gz
from hurry.filesize import size from hurry.filesize import size
@ -19,12 +19,13 @@ def get_image_file_names(filepath, pattern):
for filename in fnmatch.filter(filenames, pattern): for filename in fnmatch.filter(filenames, pattern):
matches.append(os.path.join(root, filename)) # full path matches.append(os.path.join(root, filename)) # full path
if matches: if matches:
print "Found {} files, with a total file size of {}.".format(len(matches), get_total_size(matches)) print("Found {} files, with a total file size of {}.".format(
len(matches), get_total_size(matches)))
return matches return matches
else: else:
print "No files found." print("No files found.")
else: else:
print "Sorry that path does not exist. Try again." print("Sorry that path does not exist. Try again.")
def get_total_size(list_of_image_names): def get_total_size(list_of_image_names):
@ -35,7 +36,7 @@ def get_total_size(list_of_image_names):
def resize_images(list_of_image_names): def resize_images(list_of_image_names):
print "Optimizing ... " print("Optimizing ... ")
for index, image_name in enumerate(list_of_image_names): for index, image_name in enumerate(list_of_image_names):
with open(image_name) as f: with open(image_name) as f:
image_binary = f.read() image_binary = f.read()
@ -43,7 +44,7 @@ def resize_images(list_of_image_names):
if img.height >= 600: if img.height >= 600:
img.transform(resize='x600') img.transform(resize='x600')
img.save(filename=image_name) img.save(filename=image_name)
print "Optimization complete." print("Optimization complete.")
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -117,10 +117,10 @@ def parse_file(arguments):
writer = writer.writerows(chunk) writer = writer.writerows(chunk)
# Output info # Output info
print "" print("")
print "Chunk # {}:".format(current_chunk) print("Chunk # {}:".format(current_chunk))
print "Filepath: {}".format(current_output) print("Filepath: {}".format(current_output))
print "# of rows: {}".format(len(chunk)) print("# of rows: {}".format(len(chunk)))
# Create new chunk # Create new chunk
current_chunk += 1 current_chunk += 1

View File

@ -10,7 +10,7 @@ def random_name_generator(first, second, x):
- number of random names - number of random names
""" """
names = [] names = []
for i in xrange(0, int(x)): for i in range(0, int(x)):
random_first = randint(0, len(first)-1) random_first = randint(0, len(first)-1)
random_last = randint(0, len(second)-1) random_last = randint(0, len(second)-1)
names.append("{0} {1}".format( names.append("{0} {1}".format(
@ -23,4 +23,4 @@ def random_name_generator(first, second, x):
first_names = ["Drew", "Mike", "Landon", "Jeremy", "Tyler", "Tom", "Avery"] first_names = ["Drew", "Mike", "Landon", "Jeremy", "Tyler", "Tom", "Avery"]
last_names = ["Smith", "Jones", "Brighton", "Taylor"] last_names = ["Smith", "Jones", "Brighton", "Taylor"]
names = random_name_generator(first_names, last_names, 5) names = random_name_generator(first_names, last_names, 5)
print '\n'.join(names) print('\n'.join(names))

View File

@ -11,7 +11,7 @@ class Main:
pass pass
def process(self): def process(self):
print "ok" print("ok")
if __name__ == "__main__": if __name__ == "__main__":
m = Main(some_script.CONFIGFILE) m = Main(some_script.CONFIGFILE)
@ -39,7 +39,7 @@ CONFIGFILE = get_config_file()
if CONFIGFILE is None: if CONFIGFILE is None:
sys.exit("Configuration error! Unknown environment set. \ sys.exit("Configuration error! Unknown environment set. \
Edit config.py and set appropriate environment") Edit config.py and set appropriate environment")
print "Config file: {}".format(CONFIGFILE) print("Config file: {}".format(CONFIGFILE))
if not os.path.exists(CONFIGFILE): if not os.path.exists(CONFIGFILE):
sys.exit("Configuration error! Config file does not exist") sys.exit("Configuration error! Config file does not exist")
print "Config ok ...." print("Config ok ....")

View File

@ -3,7 +3,7 @@ from datetime import datetime
from zipfile import ZipFile from zipfile import ZipFile
#set file name and time of creation # set file name and time of creation
today = datetime.now() today = datetime.now()
file_name = 'zipper_' + today.strftime('%Y.%m.%dh%H%M') + '.zip' file_name = 'zipper_' + today.strftime('%Y.%m.%dh%H%M') + '.zip'
dir_name = 'tmp/' # update path dir_name = 'tmp/' # update path

View File

@ -1,9 +1,9 @@
from subprocess import check_output, call from subprocess import check_output, call
file_name = str(raw_input('Enter the file name: ')) file_name = str(input('Enter the file name: '))
commit = check_output(["git", "rev-list", "-n", "1", "HEAD", "--", file_name]) commit = check_output(["git", "rev-list", "-n", "1", "HEAD", "--", file_name])
print str(commit).rstrip() print(str(commit).rstrip())
call(["git", "checkout", str(commit).rstrip()+"~1", file_name]) call(["git", "checkout", str(commit).rstrip()+"~1", file_name])

View File

@ -10,5 +10,5 @@ if len(sys.argv) == 3:
subprocess.call(command, shell=True) subprocess.call(command, shell=True)
subprocess.call('git push --tags', shell=True) subprocess.call('git push --tags', shell=True)
else: else:
print 'usage: tag.py TAG_NAME COMMIT' print('usage: tag.py TAG_NAME COMMIT')
sys.exit(1) sys.exit(1)

View File

@ -3,7 +3,7 @@ import csv
import sqlite3 import sqlite3
if len(sys.argv) < 3: if len(sys.argv) < 3:
print "Use: {0} DATABASE_NAME TABLE_NAME".format(sys.argv[0]) print("Use: {0} DATABASE_NAME TABLE_NAME".format(sys.argv[0]))
exit() exit()
conn = sqlite3.connect(sys.argv[1]) conn = sqlite3.connect(sys.argv[1])

View File

@ -9,7 +9,7 @@ def get_addresses(filename):
row info from the csv file. row info from the csv file.
""" """
all_addresses = [] all_addresses = []
with open(filename, 'rb') as f: with open(filename, 'rt') as f:
reader = csv.reader(f) reader = csv.reader(f)
for row in reader: for row in reader:
all_addresses.append(row) all_addresses.append(row)
@ -29,7 +29,7 @@ def get_geolocation(all_the_ip_address):
header_row.extend(['Country', 'City']) header_row.extend(['Country', 'City'])
# get geolocation # get geolocation
for line in all_the_ip_address: for line in all_the_ip_address:
print "Grabbing geo info for row # {0}".format(counter) print("Grabbing geo info for row # {0}".format(counter))
r = requests.get('https://freegeoip.net/json/{0}'.format(line[0])) r = requests.get('https://freegeoip.net/json/{0}'.format(line[0]))
line.extend([str(r.json()['country_name']), str(r.json()['city'])]) line.extend([str(r.json()['country_name']), str(r.json()['city'])])
updated_addresses.append(line) updated_addresses.append(line)
@ -43,10 +43,15 @@ def create_csv(updated_address_list):
Given the updated lists of lists from `get_geolocation()`, this function Given the updated lists of lists from `get_geolocation()`, this function
creates a new CSV. creates a new CSV.
""" """
with open('output.csv', 'wb') as f: import sys
if sys.version_info >= (3, 0, 0):
f = open('output.csv', 'w', newline='')
else:
f = open('output.csv', 'wb')
with f:
writer = csv.writer(f) writer = csv.writer(f)
writer.writerows(updated_address_list) writer.writerows(updated_address_list)
print "All done!" print("All done!")
if __name__ == '__main__': if __name__ == '__main__':

32
26_stock_scraper.py Normal file
View File

@ -0,0 +1,32 @@
import requests
from lxml import html
from collections import defaultdict
def get_stocks(url):
    """Scrape the S&P 500 company table from the given Wikipedia URL.

    Fetches the page, parses the first table under the article body, and
    groups company ticker symbols by their industry column.

    Returns a ``defaultdict(list)`` mapping industry name -> list of tickers.
    NOTE(review): assumes the table layout (ticker in column 0, industry in
    column 3) matches the page at scrape time — confirm against the live page.
    """
    response = requests.get(url)
    document = html.fromstring(response.text)
    # First table inside the article body holds the company listing.
    table = document.xpath('//*[@id="mw-content-text"]/table[1]')[0]
    industries = defaultdict(list)
    for tr in table.findall("tr")[1:]:  # skip the header row
        cells = tr.getchildren()
        ticker = cells[0].getchildren()[0].text  # ticker is wrapped in a link
        industry = cells[3].text
        industries[industry].append(ticker)
    return industries
def output_data(data_dict):
    """Print each industry as an underlined heading followed by its tickers.

    ``data_dict`` maps industry name -> list of ticker symbols; output goes
    to stdout, one ticker per line. Returns None.
    """
    for industry, tickers in data_dict.items():
        print('\n' + industry)
        print('-' * len(industry))  # underline sized to the heading
        for ticker in tickers:
            print(ticker)
if __name__ == '__main__':
    # Scrape the S&P 500 listing from Wikipedia and print it grouped by industry.
    sp500_url = 'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    output_data(get_stocks(sp500_url))

View File

@ -25,3 +25,4 @@
1. **23_flask_session_test.py**: Just a simple app to see if the sessions are working 1. **23_flask_session_test.py**: Just a simple app to see if the sessions are working
1. **24_sql2csv.py**: SQL to CSV. 1. **24_sql2csv.py**: SQL to CSV.
1. **25_ip2geolocation.py**: Given a CSV file with an ip address (see sample - *25_sample_csv.csv*), return the geolocation based on the ip. 1. **25_ip2geolocation.py**: Given a CSV file with an ip address (see sample - *25_sample_csv.csv*), return the geolocation based on the ip.
1. **26_stock_scraper.py**: Scrape the S&P 500 Companies list from Wikipedia, then output the data.

10
requirements.txt Normal file
View File

@ -0,0 +1,10 @@
Flask==0.10.1
Jinja2==2.7.3
MarkupSafe==0.23
Wand==0.4.0
Werkzeug==0.10.4
hurry.filesize==0.9
itsdangerous==0.24
lxml==3.4.4
numpy==1.9.2
requests==2.7.0