33 lines
924 B
Python
33 lines
924 B
Python
import requests
|
|
from lxml import html
|
|
from collections import defaultdict
|
|
|
|
|
|
def get_stocks(url, timeout=10):
    """Fetch an HTML page and group first-column entries by fourth-column text.

    The page is downloaded with ``requests``, parsed with ``lxml.html``, and
    the first ``<table>`` that is a direct child of the element with
    ``id="mw-content-text"`` is read row by row. The first row is skipped
    (presumably the header row -- confirm against the page layout).

    For every remaining row:
      * the value is the ``.text`` of the first child element of the first
        cell (presumably the ticker link -- confirm against the page);
      * the key is the ``.text`` of the fourth cell.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block forever.

    Returns:
        A ``defaultdict(list)`` mapping each fourth-cell text to the list of
        first-cell values, in row order. Texts are returned exactly as lxml
        reports them (not stripped; ``None`` when an element has no leading
        text).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
        IndexError: No matching table was found, or a row has fewer than
            four cells / an empty first cell.
    """
    # Make Request
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of scraping the error page.
    page.raise_for_status()

    # Parse/Scrape
    tree = html.fromstring(page.text)
    xpath = '//*[@id="mw-content-text"]/table[1]'
    table = tree.xpath(xpath)[0]

    industries = defaultdict(list)
    # NOTE: findall("tr") only matches rows that are direct children of the
    # table; rows wrapped in <tbody> would not be found.
    for row in table.findall("tr")[1:]:
        # Elements are indexed directly; Element.getchildren() is deprecated.
        ticker = row[0][0].text
        industry = row[3].text
        industries[industry].append(ticker)
    return industries
|
|
|
|
|
|
def output_data(data_dict):
    """Print every group as an underlined heading followed by its members.

    For each key in ``data_dict`` (in the mapping's iteration order) this
    writes to standard output:
      * a blank line, then the key as a heading;
      * a row of ``-`` characters as long as the heading;
      * each item of the key's value, one per line.

    Args:
        data_dict: Mapping from a heading to an iterable of items to list
            beneath it.

    Returns:
        None. The function only prints.
    """
    for industry, tickers in data_dict.items():
        # Coerce the key to text: a non-string key (e.g. None) would raise
        # TypeError on concatenation, whereas the items below already go
        # through print(), which accepts any object.
        heading = str(industry)
        print('\n' + heading)
        print('-' * len(heading))
        for ticker in tickers:
            print(ticker)
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: scrape the hard-coded Wikipedia page and print
    # the grouped result to standard output.
    source_page = 'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    output_data(get_stocks(source_page))
|