#!/usr/bin/env python
# coding: utf-8
"""Load, de-duplicate, split and normalise the body-performance data set.

This script was exported from a Jupyter notebook; the ``# In[ ]:`` markers
are the original cell boundaries.  It runs top to bottom and leaves these
module-level names behind:

* ``df``          - de-duplicated data with the derived ``BMI`` column,
                    rescaled with the scalers fitted on the training split.
* ``df_copy``     - snapshot of ``df`` taken before any rescaling.
* ``body_train``, ``body_test``, ``body_valid`` - the three splits,
                    rescaled with the scalers fitted on ``body_train``.
"""

# In[1]:

import os

import pandas as pd
import plotly.express as px  # noqa: F401 - only used by the disabled plotting cells below
import seaborn as sns  # noqa: F401 - only used by the disabled plotting cells below
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Columns rescaled to the default MinMaxScaler range of [0, 1].
UNIT_RANGE_COLUMNS = [
    'age',
    'height_cm',
    'weight_kg',
    'body fat_%',
    'diastolic',
    'systolic',
    'gripForce',
    'sit-ups counts',
    'broad jump_cm',
    'BMI',
]

# Rescaled to [-1, 1] instead: the original author noted (after looking at
# the describe() output) that this measurement goes below zero.
SIGNED_RANGE_COLUMNS = ['sit and bend forward_cm']

df = pd.read_csv(os.path.join('.', 'body_performance.csv'))

# BMI = weight [kg] / height [m]^2; the 0.0001 factor converts cm^2 to m^2.
df['BMI'] = df['weight_kg'] / (0.0001 * df['height_cm'] * df['height_cm'])
print(df.head())


# In[ ]:

# A bare `df.duplicated().sum()` only shows its value inside a notebook;
# print it so the script reports the number as well.
print(f'duplicated rows:{df.duplicated().sum()}')
print(f'with duplicates:{df.shape}')
df = df.drop_duplicates()
print(f'without duplicates:{df.shape}')

# Unscaled snapshot, taken before the normalisation further down.
df_copy = df.copy()


# In[ ]:

# 80 % train, then the remaining 20 % is halved into test and validation.
# The sizes are passed as absolute row counts, exactly as before.
body_train, body_holdout = train_test_split(
    df,
    test_size=int(df["age"].count() * 0.2),
    random_state=1,
)
body_test, body_valid = train_test_split(
    body_holdout,
    test_size=int(body_holdout["age"].count() * 0.5),
    random_state=1,
)

print("number of elements in data frame: {}".format(df['age'].count()))
print("train: {}".format(body_train["age"].count()))
print("test: {}".format(body_test["age"].count()))
print("valid: {}".format(body_valid["age"].count()))


# In[ ]:

print(df.describe(include='all'))


# In[ ]:

# Fit the scalers on the training split only, then apply the same
# parameters everywhere.  Previously the scalers were fitted on the whole
# frame *after* the split, which (a) left body_train / body_test /
# body_valid unscaled and (b) let hold-out rows influence the scaling.
unit_scaler = MinMaxScaler().fit(body_train[UNIT_RANGE_COLUMNS])
signed_scaler = MinMaxScaler(feature_range=(-1, 1)).fit(
    body_train[SIGNED_RANGE_COLUMNS]
)

# Work on explicit copies so the column assignments below never write
# through to (or warn about) the frame the splits were taken from.
body_train = body_train.copy()
body_test = body_test.copy()
body_valid = body_valid.copy()

# Rows outside the training split may land slightly outside the nominal
# range, because the min/max come from body_train alone.
for frame in (df, body_train, body_test, body_valid):
    frame[UNIT_RANGE_COLUMNS] = unit_scaler.transform(
        frame[UNIT_RANGE_COLUMNS]
    )
    frame[SIGNED_RANGE_COLUMNS] = signed_scaler.transform(
        frame[SIGNED_RANGE_COLUMNS]
    )

# Printed explicitly: as a bare expression this produced no output when
# the file was run as a script.
print(df.describe(include='all'))


# In[ ]:

df.info()


# In[ ]:

# Class distribution of the full frame and of every split.
print('Each class in data frame: \n{}'.format(df['class'].value_counts()))
print('Each class in train data: \n{}'.format(body_train['class'].value_counts()))
print('Each class in test data: \n{}'.format(body_test['class'].value_counts()))
print('Each class in valid data: \n{}'.format(body_valid['class'].value_counts()))


# --- Exploratory plots (disabled cells kept from the notebook) -------------

# In[ ]:

#df["class"].value_counts().plot(kind="bar")


# In[ ]:

#df[["class","body fat_%"]].groupby("class").mean().plot(kind="bar")


# In[ ]:

#sns.set_theme()
#sns.relplot(data = df.head(200), x = 'broad jump_cm', y = 'sit-ups counts', hue = 'class')


# In[ ]:

#sns.relplot(data = df[df['gender'] == 'M'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')


# In[ ]:

#sns.relplot(data = df[df['gender'] == 'F'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')


# In[ ]:

#px.box(df, y=['height_cm',
#              'weight_kg',
#              'body fat_%',
#              'diastolic',
#              'systolic',
#              'gripForce',
#              'sit and bend forward_cm',
#              'sit-ups counts',
#              'broad jump_cm',
#              'BMI'])


# In[ ]:

# This takes too long to run.
#sns.pairplot(data=df.drop(columns=["gender"]).head(500), hue="class")