billionaries-wizualizacja/projekt2.ipynb

358 KiB

import folium
import warnings
import pandas as pd 
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from difflib import get_close_matches
from plotly.subplots import make_subplots

# Apply matplotlib's 'ggplot' style sheet to any matplotlib figures drawn later.
plt.style.use('ggplot')
# NOTE(review): with no category or module filter this hides *every* warning for
# the rest of the session (deprecations and pandas warnings included) — consider
# narrowing it to the specific warnings that are known to be harmless.
warnings.filterwarnings("ignore")

Read and clean up data

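Replace NaN values in the columns latitude_country, longitude_country, country, city and gdp_country with default values (0.0 for the numeric columns, 'Unknown' for the text columns). finalWorth is also divided by 1000 so that the plots below can label it in billions.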

# Load the raw dataset from the working directory.
df = pd.read_csv('data.csv')

# Placeholder used for each column whose missing values must not stay NaN:
# 0.0 for the numeric columns, 'Unknown' for the text columns.
nan_defaults = {
    'latitude_country': 0.0,
    'longitude_country': 0.0,
    'country': 'Unknown',
    'city': 'Unknown',
    'gdp_country': 0.0,
}
for column_name, placeholder in nan_defaults.items():
    df[column_name] = df[column_name].fillna(placeholder)

# Scale net worth down by a factor of 1000 (the plots label the result in billions).
df['finalWorth'] = df['finalWorth'].div(1000)

Interactive plots using plotly

Plot made using plotly that displays the age, net worth and name of each billionaire on hover

# Scatter of age against net worth for the first 20 rows of the frame.
# Marker colour follows age, marker size follows net worth, and the person's
# name is added to the hover box.
top_rows = df.head(20)
fig = px.scatter(
    top_rows,
    x='age',
    y='finalWorth',
    color='age',
    size='finalWorth',
    hover_data=['personName'],
    title='Age vs Net Worth',
)

# Font shared by every text element of the figure.
label_font = {
    'family': 'Courier New, monospace',
    'size': 18,
    'color': 'RebeccaPurple',
}
fig.update_layout(
    xaxis_title='Age',
    yaxis_title='Net Worth (Billions)',
    yaxis={'tickprefix': '$', 'ticksuffix': 'b'},  # ticks read like "$100b"
    xaxis={'type': 'log'},  # the age axis is drawn on a logarithmic scale
    legend_title='Age',
    font=label_font,
)
fig.show()

Plot made using plotly that displays on hover: the industry, the number of billionaires in it, and the percentage of all billionaires that come from that industry

# Count billionaires per industry. The 'industries' value is split on commas
# and exploded first, so a row listing several industries counts once for each.
industry_counts = (
    df['industries']
    .str.split(',')
    .explode()
    .value_counts()
)

# Donut chart (hole=0.3) with a text label placed in the middle of the hole.
industry_pie = go.Pie(labels=industry_counts.index, values=industry_counts, hole=0.3)
centre_label = dict(text='Industry', x=0.5, y=0.5, font_size=20, showarrow=False)

fig = go.Figure(data=[industry_pie])
fig.update_layout(title='Billionaires by Industry', annotations=[centre_label])
fig.show()

Pie plots made with plotly that show the gender distribution and the self-made share by gender

# Row counts per gender, and per selfMade value within each gender.
gender_counts = df['gender'].str.split(',').explode().value_counts()
self_made_men = df.loc[df['gender'] == 'M', 'selfMade'].value_counts()
self_made_women = df.loc[df['gender'] == 'F', 'selfMade'].value_counts()

# One donut trace per panel, left to right: gender split, self-made (M), self-made (F).
pie_traces = [
    go.Pie(labels=counts.index, values=counts, hole=0.3, textinfo='percent+label')
    for counts in (gender_counts, self_made_men, self_made_women)
]

# Three side-by-side panels; pies need non-cartesian subplot types.
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'sunburst'}]],
)
for panel_col, pie_trace in enumerate(pie_traces, start=1):
    fig.add_trace(pie_trace, row=1, col=panel_col)

fig.update_layout(
    height=400,
    showlegend=False,
    title_text="Distribution of Gender and SelfMade by Gender",
)

# Caption under each panel; ax=0 / ay=0 give the annotation arrow zero length.
panel_captions = (
    (0.14, "% Gender Distribution"),
    (0.50, "% SelfMade (M)"),
    (0.85, "% SelfMade (F)"),
)
for caption_x, caption_text in panel_captions:
    fig.add_annotation(dict(x=caption_x, y=-0.1, ax=0, ay=0, text=caption_text))

fig.show()

Maps using plotly and folium

Map made using folium; clicking a marker shows the name of the country and the number of billionaires from that country

# Base map; the initial view is centred on latitude 40, longitude -100.
m = folium.Map(location=[40, -100], zoom_start=4)

# One marker per country, placed at the mean of that country's coordinates and
# labelled with the number of rows (billionaires) for the country.
# A single groupby pass replaces three full-frame boolean scans per country;
# sort=False keeps the markers in order of first appearance, exactly as
# iterating over df['country'].unique() did.
for country, country_rows in df.groupby('country', sort=False):
    lat = country_rows['latitude_country'].mean()
    lon = country_rows['longitude_country'].mean()
    num_billionaires = len(country_rows)
    # NOTE(review): rows whose country/coordinates were filled with
    # 'Unknown' / 0.0 during cleaning produce a marker at (0, 0) — confirm
    # that this placeholder marker is wanted on the map.
    folium.Marker(
        [lat, lon],
        popup=f"{country} Num of billionaires {num_billionaires}",
    ).add_to(m)

# Last expression of the cell: renders the map in the notebook.
m
Make this Notebook Trusted to load map: File -> Trust Notebook

Normalize state names to two-letter abbreviations for display

# Keep only the rows that have a state value. The explicit .copy() makes `dd`
# an independent frame, so assigning new columns to it later is unambiguous and
# cannot trigger pandas' SettingWithCopy handling or write through to `df`.
dd = df.dropna(subset=["state"]).copy()

# Two-letter abbreviation -> full name, for US states and territories.
# NOTE(review): 'NA': 'National' does not look like a state or territory; it is
# kept so that existing lookups keep working — confirm whether it is needed.
states = {
    'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AS': 'American Samoa',
    'AZ': 'Arizona', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut',
    'DC': 'District of Columbia', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
    'GU': 'Guam', 'HI': 'Hawaii', 'IA': 'Iowa', 'ID': 'Idaho', 'IL': 'Illinois',
    'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'MA': 'Massachusetts',
    'MD': 'Maryland', 'ME': 'Maine', 'MI': 'Michigan', 'MN': 'Minnesota', 'MO': 'Missouri',
    'MP': 'Northern Mariana Islands', 'MS': 'Mississippi', 'MT': 'Montana', 'NA': 'National',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'NE': 'Nebraska', 'NH': 'New Hampshire',
    'NJ': 'New Jersey', 'NM': 'New Mexico', 'NV': 'Nevada', 'NY': 'New York', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'PR': 'Puerto Rico', 'RI': 'Rhode Island',
    'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VA': 'Virginia', 'VI': 'Virgin Islands', 'VT': 'Vermont', 'WA': 'Washington', 'WI': 'Wisconsin',
    'WV': 'West Virginia', 'WY': 'Wyoming'
}

# Reverse lookup, built once: case-folded full name -> abbreviation.
# setdefault keeps the first abbreviation if two entries ever share a name.
_ABBR_BY_FOLDED_NAME = {}
for _abbr, _full_name in states.items():
    _ABBR_BY_FOLDED_NAME.setdefault(_full_name.casefold(), _abbr)


def best_match(x):
    """Return the two-letter abbreviation that best matches *x*, or None.

    A two-character value is treated as an abbreviation and accepted only if,
    upper-cased, it is a key of ``states``. Any other value is fuzzy-matched
    against the full state names (similarity cutoff 0.8), ignoring letter case.

    Args:
        x: Raw state value. Surrounding whitespace is ignored. Non-string
            values (for example ``None`` or a float NaN) yield ``None``
            instead of raising.

    Returns:
        The matching key of ``states``, or ``None`` when nothing matches.
    """
    if not isinstance(x, str):
        return None

    candidate = x.strip()
    if len(candidate) == 2:
        abbr = candidate.upper()
        # Unknown two-letter codes are rejected outright, never fuzzy-matched.
        return abbr if abbr in states else None

    # Compare case-folded text so that e.g. "NEW YORK" still finds "New York".
    matches = get_close_matches(
        candidate.casefold(), _ABBR_BY_FOLDED_NAME, n=1, cutoff=0.8
    )
    return _ABBR_BY_FOLDED_NAME[matches[0]] if matches else None

# Normalize every raw state value to a two-letter abbreviation
# (None when best_match finds no match).
dd['state_corrected'] = dd['state'].apply(best_match)

# Show the distinct normalized values as a quick sanity check.
dd["state_corrected"].unique()
array(['TX', 'WA', 'HI', 'NE', 'NY', 'CA', 'KS', 'AR', 'OR', 'VA', 'WY',
       'NV', 'FL', 'PA', 'MA', 'IL', 'TN', 'CT', 'OK', 'MI', 'WI', 'CO',
       'IN', 'NH', 'GA', 'MO', 'NJ', 'NC', 'IA', 'KY', 'MD', 'MT', 'OH',
       'SC', 'AZ', 'LA', 'UT', 'RI', 'ID', 'VI', 'MN', 'ME', 'MS', 'SD',
       'AL'], dtype=object)

Plot made with plotly displaying the states of the USA; on hover it shows the name of the state and the number of billionaires in that state

# Count billionaires per US state.
# Fix: the grouping now runs on the United States subset. Previously the
# filtered frame was immediately overwritten by a groupby on the unfiltered
# `dd`, so rows from other countries were counted as well.
usa_rows = dd[dd['country'] == 'United States']
d = (
    usa_rows.groupby('state_corrected')['personName']
    .count()
    .reset_index(name='rich_pop_usa')
)

fig = px.choropleth(d, locations="state_corrected", locationmode='USA-states',
                    scope="usa", color="rich_pop_usa",
                    color_continuous_scale="Viridis")

# Custom hover box: "<Full name> (<abbr>)" in bold, then the count, which is
# passed through customdata.
fig.update_traces(customdata=d['rich_pop_usa'].values,
                  hovertemplate='<b>%{text}</b><br>Billionaires Population: %{customdata}',
                  text=[f"{states[abbr]} ({abbr})" for abbr in d['state_corrected']],
                  marker_line_color='white', marker_line_width=0.5)

fig.update_layout(
    title_text='Rich Population by U.S. State',
    geo_scope='usa',
)

fig.show()

Plot made using plotly to display life expectancy in each country; on hover: name of the country and its life expectancy

# World choropleth coloured by life expectancy. Countries are matched by their
# names, and the country name is used as the hover title.
life_expectancy_options = {
    'locations': "country",
    'locationmode': 'country names',
    'color': "life_expectancy_country",
    'hover_name': "country",
    'color_continuous_scale': "Viridis",
}
fig = px.choropleth(df, **life_expectancy_options)
fig.show()