added stock scraper, converted all scripts to python 2/3 compatibility
This commit is contained in:
parent
5bb3679901
commit
0bb4c8c255
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,3 +1,5 @@
|
|||||||
.pyc
|
.pyc
|
||||||
.DS_Store
|
.DS_Store
|
||||||
_tmp
|
_tmp
|
||||||
|
env
|
||||||
|
__pycache__
|
@ -1,18 +1,18 @@
|
|||||||
import urllib2
|
import requests
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# get url
|
# get url
|
||||||
url =raw_input('Enter a URL (include `http://`): ')
|
url = input('Enter a URL (include `http://`): ')
|
||||||
|
|
||||||
# connect to the url
|
# connect to the url
|
||||||
website = urllib2.urlopen(url)
|
website = requests.get(url)
|
||||||
|
|
||||||
# read html
|
# read html
|
||||||
html = website.read()
|
html = website.text
|
||||||
|
|
||||||
# use re.findall to grab all the links
|
# use re.findall to grab all the links
|
||||||
links = re.findall('"((http|ftp)s?://.*?)"', html)
|
links = re.findall('"((http|ftp)s?://.*?)"', html)
|
||||||
|
|
||||||
# output links
|
# output links
|
||||||
for link in links:
|
for link in links:
|
||||||
print link[0]
|
print(link[0])
|
||||||
|
@ -16,10 +16,13 @@ twitter_api = twitter.Api(
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
follower_ids = twitter_api.GetFollowerIDs()
|
follower_ids = twitter_api.GetFollowerIDs()
|
||||||
following_ids = twitter_api.GetFriendIDs()
|
following_ids = twitter_api.GetFriendIDs()
|
||||||
zombie_follows = [following_id for following_id in following_ids if following_id not in follower_ids]
|
zombie_follows = [following_id for following_id in
|
||||||
|
following_ids if following_id not in follower_ids]
|
||||||
|
|
||||||
confirm = raw_input("Are you sure you want to unfollow %s tweeps [y|n]? " % (len(zombie_follows)))
|
confirm = raw_input(
|
||||||
|
"Are you sure you want to unfollow {0} tweeps [y|n]? ".format(
|
||||||
|
(len(zombie_follows))))
|
||||||
if confirm.lower() == 'y':
|
if confirm.lower() == 'y':
|
||||||
for id in zombie_follows:
|
for id in zombie_follows:
|
||||||
user = twitter_api.DestroyFriendship(user_id=id)
|
user = twitter_api.DestroyFriendship(user_id=id)
|
||||||
print "Unfollowed %s" % (user.screen_name)
|
print("Unfollowed {0}".format(user.screen_name))
|
||||||
|
@ -8,7 +8,7 @@ for file in glob.glob("*.json"):
|
|||||||
new_file_name = file_name[:-6] + extension
|
new_file_name = file_name[:-6] + extension
|
||||||
try:
|
try:
|
||||||
os.rename(file, new_file_name)
|
os.rename(file, new_file_name)
|
||||||
except OSError, e:
|
except OSError as e:
|
||||||
print e
|
print(e)
|
||||||
else:
|
else:
|
||||||
print "Renamed {} to {}".format(file, new_file_name)
|
print("Renamed {} to {}".format(file, new_file_name))
|
||||||
|
@ -1,11 +1,9 @@
|
|||||||
import json
|
|
||||||
|
|
||||||
def dict_raise_on_duplicates(ordered_pairs):
|
def dict_raise_on_duplicates(ordered_pairs):
|
||||||
"""reject duplicate keys"""
|
"""reject duplicate keys"""
|
||||||
my_dict = dict()
|
my_dict = dict()
|
||||||
for key, values in ordered_pairs:
|
for key, values in ordered_pairs:
|
||||||
if key in my_dict:
|
if key in my_dict:
|
||||||
raise ValueError("Duplicate key: {}".format(key,))
|
raise ValueError("Duplicate key: {}".format(key,))
|
||||||
else:
|
else:
|
||||||
my_dict[key] = values
|
my_dict[key] = values
|
||||||
return my_dict
|
return my_dict
|
@ -13,6 +13,7 @@ For example:
|
|||||||
|
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
class ExecutionTime:
|
class ExecutionTime:
|
||||||
@ -25,9 +26,9 @@ class ExecutionTime:
|
|||||||
|
|
||||||
# ---- run code ---- #
|
# ---- run code ---- #
|
||||||
|
|
||||||
import random
|
|
||||||
|
|
||||||
timer = ExecutionTime()
|
timer = ExecutionTime()
|
||||||
sample_list = list()
|
sample_list = list()
|
||||||
my_list = [random.randint(1, 888898) for num in xrange(1, 1000000) if num % 2 == 0]
|
my_list = [random.randint(1, 888898) for num in
|
||||||
print 'Finished in {} seconds.'.format(timer.duration())
|
range(1, 1000000) if num % 2 == 0]
|
||||||
|
print('Finished in {} seconds.'.format(timer.duration()))
|
||||||
|
@ -14,8 +14,8 @@ def timeit(method):
|
|||||||
te = time.time()
|
te = time.time()
|
||||||
all_times.append(te - ts)
|
all_times.append(te - ts)
|
||||||
|
|
||||||
print all_times
|
print(all_times)
|
||||||
print numpy.mean(all_times)
|
print(numpy.mean(all_times))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
return timed
|
return timed
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
import urlparse
|
try:
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
except ImportError:
|
||||||
|
from urlparse import urljoin
|
||||||
|
|
||||||
# regex
|
# regex
|
||||||
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
|
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
|
||||||
@ -20,13 +23,13 @@ def crawl(url):
|
|||||||
# Find links
|
# Find links
|
||||||
links = link_re.findall(req.text)
|
links = link_re.findall(req.text)
|
||||||
|
|
||||||
print "\nFound {} links".format(len(links))
|
print("\nFound {} links".format(len(links)))
|
||||||
|
|
||||||
# Search links for emails
|
# Search links for emails
|
||||||
for link in links:
|
for link in links:
|
||||||
|
|
||||||
# Get an absolute URL for a link
|
# Get an absolute URL for a link
|
||||||
link = urlparse.urljoin(url, link)
|
link = urljoin(url, link)
|
||||||
|
|
||||||
# Find all emails on current page
|
# Find all emails on current page
|
||||||
result.update(email_re.findall(req.text))
|
result.update(email_re.findall(req.text))
|
||||||
@ -36,7 +39,7 @@ def crawl(url):
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
emails = crawl('http://www.realpython.com')
|
emails = crawl('http://www.realpython.com')
|
||||||
|
|
||||||
print "\nScrapped e-mail addresses:"
|
print("\nScrapped e-mail addresses:")
|
||||||
for email in emails:
|
for email in emails:
|
||||||
print email
|
print(email)
|
||||||
print "\n"
|
print("\n")
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
import urlparse
|
try:
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
except ImportError:
|
||||||
|
from urlparse import urljoin
|
||||||
|
|
||||||
# regex
|
# regex
|
||||||
link_re = re.compile(r'href="(.*?)"')
|
link_re = re.compile(r'href="(.*?)"')
|
||||||
@ -17,17 +20,15 @@ def crawl(url):
|
|||||||
# Find links
|
# Find links
|
||||||
links = link_re.findall(req.text)
|
links = link_re.findall(req.text)
|
||||||
|
|
||||||
print "\nFound {} links".format(len(links))
|
print("\nFound {} links".format(len(links)))
|
||||||
|
|
||||||
# Search links for emails
|
# Search links for emails
|
||||||
for link in links:
|
for link in links:
|
||||||
|
|
||||||
# Get an absolute URL for a link
|
# Get an absolute URL for a link
|
||||||
link = urlparse.urljoin(url, link)
|
link = urljoin(url, link)
|
||||||
|
|
||||||
print link
|
|
||||||
|
|
||||||
|
print(link)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
crawl('http://www.realpython.com')
|
crawl('http://www.realpython.com')
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ import fnmatch
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
# constants
|
# constants
|
||||||
PATH = '/../../../..'
|
PATH = './'
|
||||||
PATTERN = '*.py'
|
PATTERN = '*.py'
|
||||||
|
|
||||||
|
|
||||||
@ -14,17 +14,17 @@ def get_file_names(filepath, pattern):
|
|||||||
# matches.append(os.path.join(root, filename)) # full path
|
# matches.append(os.path.join(root, filename)) # full path
|
||||||
matches.append(os.path.join(filename)) # just file name
|
matches.append(os.path.join(filename)) # just file name
|
||||||
if matches:
|
if matches:
|
||||||
print "Found {} files:".format(len(matches))
|
print("Found {} files:".format(len(matches)))
|
||||||
output_files(matches)
|
output_files(matches)
|
||||||
else:
|
else:
|
||||||
print "No files found."
|
print("No files found.")
|
||||||
else:
|
else:
|
||||||
print "Sorry that path does not exist. Try again."
|
print("Sorry that path does not exist. Try again.")
|
||||||
|
|
||||||
|
|
||||||
def output_files(list_of_files):
|
def output_files(list_of_files):
|
||||||
for filename in list_of_files:
|
for filename in list_of_files:
|
||||||
print filename
|
print(filename)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
import fnmatch
|
import fnmatch
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# sudo pip install Wand
|
# pip install Wand
|
||||||
from wand.image import Image
|
from wand.image import Image
|
||||||
# sudo pip install http://pypi.python.org/packages/source/h/hurry.filesize/hurry.filesize-0.9.tar.gz
|
# pip install http://pypi.python.org/packages/source/h/hurry.filesize/hurry.filesize-0.9.tar.gz
|
||||||
from hurry.filesize import size
|
from hurry.filesize import size
|
||||||
|
|
||||||
|
|
||||||
@ -19,12 +19,13 @@ def get_image_file_names(filepath, pattern):
|
|||||||
for filename in fnmatch.filter(filenames, pattern):
|
for filename in fnmatch.filter(filenames, pattern):
|
||||||
matches.append(os.path.join(root, filename)) # full path
|
matches.append(os.path.join(root, filename)) # full path
|
||||||
if matches:
|
if matches:
|
||||||
print "Found {} files, with a total file size of {}.".format(len(matches), get_total_size(matches))
|
print("Found {} files, with a total file size of {}.".format(
|
||||||
|
len(matches), get_total_size(matches)))
|
||||||
return matches
|
return matches
|
||||||
else:
|
else:
|
||||||
print "No files found."
|
print("No files found.")
|
||||||
else:
|
else:
|
||||||
print "Sorry that path does not exist. Try again."
|
print("Sorry that path does not exist. Try again.")
|
||||||
|
|
||||||
|
|
||||||
def get_total_size(list_of_image_names):
|
def get_total_size(list_of_image_names):
|
||||||
@ -35,7 +36,7 @@ def get_total_size(list_of_image_names):
|
|||||||
|
|
||||||
|
|
||||||
def resize_images(list_of_image_names):
|
def resize_images(list_of_image_names):
|
||||||
print "Optimizing ... "
|
print("Optimizing ... ")
|
||||||
for index, image_name in enumerate(list_of_image_names):
|
for index, image_name in enumerate(list_of_image_names):
|
||||||
with open(image_name) as f:
|
with open(image_name) as f:
|
||||||
image_binary = f.read()
|
image_binary = f.read()
|
||||||
@ -43,7 +44,7 @@ def resize_images(list_of_image_names):
|
|||||||
if img.height >= 600:
|
if img.height >= 600:
|
||||||
img.transform(resize='x600')
|
img.transform(resize='x600')
|
||||||
img.save(filename=image_name)
|
img.save(filename=image_name)
|
||||||
print "Optimization complete."
|
print("Optimization complete.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -117,10 +117,10 @@ def parse_file(arguments):
|
|||||||
writer = writer.writerows(chunk)
|
writer = writer.writerows(chunk)
|
||||||
|
|
||||||
# Output info
|
# Output info
|
||||||
print ""
|
print("")
|
||||||
print "Chunk # {}:".format(current_chunk)
|
print("Chunk # {}:".format(current_chunk))
|
||||||
print "Filepath: {}".format(current_output)
|
print("Filepath: {}".format(current_output))
|
||||||
print "# of rows: {}".format(len(chunk))
|
print("# of rows: {}".format(len(chunk)))
|
||||||
|
|
||||||
# Create new chunk
|
# Create new chunk
|
||||||
current_chunk += 1
|
current_chunk += 1
|
||||||
|
@ -10,7 +10,7 @@ def random_name_generator(first, second, x):
|
|||||||
- number of random names
|
- number of random names
|
||||||
"""
|
"""
|
||||||
names = []
|
names = []
|
||||||
for i in xrange(0, int(x)):
|
for i in range(0, int(x)):
|
||||||
random_first = randint(0, len(first)-1)
|
random_first = randint(0, len(first)-1)
|
||||||
random_last = randint(0, len(second)-1)
|
random_last = randint(0, len(second)-1)
|
||||||
names.append("{0} {1}".format(
|
names.append("{0} {1}".format(
|
||||||
@ -23,4 +23,4 @@ def random_name_generator(first, second, x):
|
|||||||
first_names = ["Drew", "Mike", "Landon", "Jeremy", "Tyler", "Tom", "Avery"]
|
first_names = ["Drew", "Mike", "Landon", "Jeremy", "Tyler", "Tom", "Avery"]
|
||||||
last_names = ["Smith", "Jones", "Brighton", "Taylor"]
|
last_names = ["Smith", "Jones", "Brighton", "Taylor"]
|
||||||
names = random_name_generator(first_names, last_names, 5)
|
names = random_name_generator(first_names, last_names, 5)
|
||||||
print '\n'.join(names)
|
print('\n'.join(names))
|
||||||
|
@ -11,7 +11,7 @@ class Main:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def process(self):
|
def process(self):
|
||||||
print "ok"
|
print("ok")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
m = Main(some_script.CONFIGFILE)
|
m = Main(some_script.CONFIGFILE)
|
||||||
@ -39,7 +39,7 @@ CONFIGFILE = get_config_file()
|
|||||||
if CONFIGFILE is None:
|
if CONFIGFILE is None:
|
||||||
sys.exit("Configuration error! Unknown environment set. \
|
sys.exit("Configuration error! Unknown environment set. \
|
||||||
Edit config.py and set appropriate environment")
|
Edit config.py and set appropriate environment")
|
||||||
print "Config file: {}".format(CONFIGFILE)
|
print("Config file: {}".format(CONFIGFILE))
|
||||||
if not os.path.exists(CONFIGFILE):
|
if not os.path.exists(CONFIGFILE):
|
||||||
sys.exit("Configuration error! Config file does not exist")
|
sys.exit("Configuration error! Config file does not exist")
|
||||||
print "Config ok ...."
|
print("Config ok ....")
|
||||||
|
@ -3,7 +3,7 @@ from datetime import datetime
|
|||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
|
||||||
#set file name and time of creation
|
# set file name and time of creation
|
||||||
today = datetime.now()
|
today = datetime.now()
|
||||||
file_name = 'zipper_' + today.strftime('%Y.%m.%dh%H%M') + '.zip'
|
file_name = 'zipper_' + today.strftime('%Y.%m.%dh%H%M') + '.zip'
|
||||||
dir_name = 'tmp/' # update path
|
dir_name = 'tmp/' # update path
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
from subprocess import check_output, call
|
from subprocess import check_output, call
|
||||||
|
|
||||||
|
|
||||||
file_name = str(raw_input('Enter the file name: '))
|
file_name = str(input('Enter the file name: '))
|
||||||
commit = check_output(["git", "rev-list", "-n", "1", "HEAD", "--", file_name])
|
commit = check_output(["git", "rev-list", "-n", "1", "HEAD", "--", file_name])
|
||||||
print str(commit).rstrip()
|
print(str(commit).rstrip())
|
||||||
call(["git", "checkout", str(commit).rstrip()+"~1", file_name])
|
call(["git", "checkout", str(commit).rstrip()+"~1", file_name])
|
||||||
|
|
||||||
|
|
||||||
|
@ -10,5 +10,5 @@ if len(sys.argv) == 3:
|
|||||||
subprocess.call(command, shell=True)
|
subprocess.call(command, shell=True)
|
||||||
subprocess.call('git push --tags', shell=True)
|
subprocess.call('git push --tags', shell=True)
|
||||||
else:
|
else:
|
||||||
print 'usage: tag.py TAG_NAME COMMIT'
|
print('usage: tag.py TAG_NAME COMMIT')
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
@ -3,7 +3,7 @@ import csv
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
|
|
||||||
if len(sys.argv) < 3:
|
if len(sys.argv) < 3:
|
||||||
print "Use: {0} DATABASE_NAME TABLE_NAME".format(sys.argv[0])
|
print("Use: {0} DATABASE_NAME TABLE_NAME".format(sys.argv[0]))
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
conn = sqlite3.connect(sys.argv[1])
|
conn = sqlite3.connect(sys.argv[1])
|
||||||
|
@ -9,7 +9,7 @@ def get_addresses(filename):
|
|||||||
row info from the csv file.
|
row info from the csv file.
|
||||||
"""
|
"""
|
||||||
all_addresses = []
|
all_addresses = []
|
||||||
with open(filename, 'rb') as f:
|
with open(filename, 'rt') as f:
|
||||||
reader = csv.reader(f)
|
reader = csv.reader(f)
|
||||||
for row in reader:
|
for row in reader:
|
||||||
all_addresses.append(row)
|
all_addresses.append(row)
|
||||||
@ -29,7 +29,7 @@ def get_geolocation(all_the_ip_address):
|
|||||||
header_row.extend(['Country', 'City'])
|
header_row.extend(['Country', 'City'])
|
||||||
# get geolocation
|
# get geolocation
|
||||||
for line in all_the_ip_address:
|
for line in all_the_ip_address:
|
||||||
print "Grabbing geo info for row # {0}".format(counter)
|
print("Grabbing geo info for row # {0}".format(counter))
|
||||||
r = requests.get('https://freegeoip.net/json/{0}'.format(line[0]))
|
r = requests.get('https://freegeoip.net/json/{0}'.format(line[0]))
|
||||||
line.extend([str(r.json()['country_name']), str(r.json()['city'])])
|
line.extend([str(r.json()['country_name']), str(r.json()['city'])])
|
||||||
updated_addresses.append(line)
|
updated_addresses.append(line)
|
||||||
@ -43,10 +43,15 @@ def create_csv(updated_address_list):
|
|||||||
Given the updated lists of lists from `get_geolocation()`, this function
|
Given the updated lists of lists from `get_geolocation()`, this function
|
||||||
creates a new CSV.
|
creates a new CSV.
|
||||||
"""
|
"""
|
||||||
with open('output.csv', 'wb') as f:
|
import sys
|
||||||
|
if sys.version_info >= (3, 0, 0):
|
||||||
|
f = open('output.csv', 'w', newline='')
|
||||||
|
else:
|
||||||
|
f = open('output.csv', 'wb')
|
||||||
|
with f:
|
||||||
writer = csv.writer(f)
|
writer = csv.writer(f)
|
||||||
writer.writerows(updated_address_list)
|
writer.writerows(updated_address_list)
|
||||||
print "All done!"
|
print("All done!")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
32
26_stock_scraper.py
Normal file
32
26_stock_scraper.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
import requests
|
||||||
|
from lxml import html
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
|
def get_stocks(url):
    """Scrape a company table and group the tickers by industry.

    The page at ``url`` is fetched with ``requests`` and parsed with
    ``lxml``. The first table directly under the element whose id is
    ``mw-content-text`` is read; its first row is skipped as the header.
    For every remaining row, the text of the first child of the first
    cell is treated as the ticker and the text of the fourth cell as the
    industry.

    Args:
        url: Address of the page to scrape.

    Returns:
        A ``defaultdict(list)`` mapping each industry to the list of its
        tickers, in page order.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        IndexError: The table is missing, or a row has fewer than four
            cells (or an empty first cell).
    """
    # Make Request
    page = requests.get(url)
    # Fail loudly on an error response rather than parsing an error page
    # and dying later with an unrelated IndexError.
    page.raise_for_status()

    # Parse/Scrape
    tree = html.fromstring(page.text)
    table = tree.xpath('//*[@id="mw-content-text"]/table[1]')[0]

    industries = defaultdict(list)
    for row in table.findall("tr")[1:]:
        # lxml elements are indexable sequences of their children, so the
        # deprecated getchildren() call is not needed.
        ticker = row[0][0].text
        industry = row[3].text
        industries[industry].append(ticker)
    return industries
|
||||||
|
|
||||||
|
|
||||||
|
def output_data(data_dict):
    """Print every industry as an underlined heading followed by its tickers.

    Args:
        data_dict: Mapping of industry name to an iterable of tickers.
            A ``None`` key (e.g. scraped from a table cell without text)
            is printed under the heading ``Unknown`` instead of raising
            ``TypeError`` during string concatenation.
    """
    for industry, tickers in data_dict.items():
        # Normalise the key so both the heading and its underline are
        # built from the same string.
        heading = 'Unknown' if industry is None else str(industry)
        print('\n' + heading)
        print('-' * len(heading))
        for ticker in tickers:
            print(ticker)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: scrape the hard-coded page and print the
    # tickers grouped by industry.
    url = 'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    output_data(get_stocks(url))
|
@ -25,3 +25,4 @@
|
|||||||
1. **23_flask_session_test.py**: Just a simple app to see if the sessions are working
|
1. **23_flask_session_test.py**: Just a simple app to see if the sessions are working
|
||||||
1. **24_sql2csv.py**: SQL to CSV.
|
1. **24_sql2csv.py**: SQL to CSV.
|
||||||
1. **25_ip2geolocation.py**: Given a CSV file with an ip address (see sample - *25_sample_csv.csv*), return the geolocation based on the ip.
|
1. **25_ip2geolocation.py**: Given a CSV file with an ip address (see sample - *25_sample_csv.csv*), return the geolocation based on the ip.
|
||||||
|
1. **26_stock_scraper.py**: Scrape the S&P 500 Companies list from Wikipedia, then output the data.
|
||||||
|
10
requirements.txt
Normal file
10
requirements.txt
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
Flask==0.10.1
|
||||||
|
Jinja2==2.7.3
|
||||||
|
MarkupSafe==0.23
|
||||||
|
Wand==0.4.0
|
||||||
|
Werkzeug==0.10.4
|
||||||
|
hurry.filesize==0.9
|
||||||
|
itsdangerous==0.24
|
||||||
|
lxml==3.4.4
|
||||||
|
numpy==1.9.2
|
||||||
|
requests==2.7.0
|
Loading…
Reference in New Issue
Block a user