# python-scripts/scripts/26_stock_scraper.py

import requests
from lxml import html
from collections import defaultdict


def get_stocks(url, timeout=10):
    """Scrape ticker symbols from a wiki table and group them by industry.

    Downloads *url*, takes the first table matched under the element whose
    id is ``mw-content-text``, skips its first (header) row, and reads the
    ticker from the first cell and the industry from the fourth cell of
    every remaining row.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``defaultdict(list)`` mapping each industry name to the list of
        tickers found for it, in table order. Names and tickers are
        stripped of surrounding whitespace.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If no table matches the XPath expression.
    """
    # Make Request -- bounded by a timeout so a stalled server cannot hang
    # the script, and checked so an error page is never parsed as data.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()

    # Parse/Scrape
    tree = html.fromstring(page.text)
    # NOTE(review): this expects the table to be a direct child of
    # #mw-content-text -- confirm against the page's current markup.
    xpath = '//*[@id="mw-content-text"]/table[1]'
    tables = tree.xpath(xpath)
    if not tables:
        raise IndexError('no table matched {0} in {1}'.format(xpath, url))

    # Accept rows placed directly inside <table> as well as rows wrapped in
    # a <tbody>; the union is returned in document order.
    table_rows = tables[0].xpath('./tr | ./tbody/tr')

    industries = defaultdict(list)
    for row in table_rows[1:]:  # first row is the header
        if len(row) < 4:
            # Too few cells to hold both a ticker and an industry.
            continue
        ticker_cell, industry_cell = row[0], row[3]

        # Prefer the text of the cell's first child (the ticker link); fall
        # back to the cell's own text when that is missing or empty.
        link_text = ticker_cell[0].text if len(ticker_cell) else None
        ticker = (link_text or ticker_cell.text_content()).strip()
        # text_content() also covers cells whose text sits inside markup.
        industry = industry_cell.text_content().strip()

        if ticker:
            industries[industry].append(ticker)
    return industries


def output_data(data_dict):
    """Print every industry as an underlined heading followed by its tickers.

    Args:
        data_dict: Mapping of industry name (str) to an iterable of tickers.

    Each section starts with a blank line, then the industry name, then a
    row of dashes as long as the name, then one ticker per line.
    """
    for industry, tickers in data_dict.items():
        heading = '\n' + industry
        print(heading)
        underline = '-' * len(industry)
        print(underline)
        for symbol in tickers:
            print(symbol)


if __name__ == '__main__':
    # Script entry point: scrape the Wikipedia S&P 500 company list and
    # print the tickers grouped by industry.
    sp500_page = 'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    output_data(get_stocks(sp500_page))