22 KiB
22 KiB
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Load the body-performance dataset from the current working directory.
# A bare relative filename is used instead of the Windows-only r'.\...' form:
# on POSIX systems '\' is not a path separator, so the old path was treated
# as a literal filename and the read failed. On Windows both spellings
# resolve to the same file.
df = pd.read_csv('body_performance.csv')

# Derived feature: BMI = weight / height^2. The 0.0001 factor converts
# height_cm * height_cm from cm^2 to m^2.
df['BMI'] = df['weight_kg'] / (0.0001 * df['height_cm'] * df['height_cm'])

# Preview the first rows, including the new BMI column.
print(df.head())
age gender height_cm weight_kg body fat_% diastolic systolic \ 0 27.0 M 172.3 75.24 21.3 80.0 130.0 1 25.0 M 165.0 55.80 15.7 77.0 126.0 2 31.0 M 179.6 78.00 20.1 92.0 152.0 3 32.0 M 174.5 71.10 18.4 76.0 147.0 4 28.0 M 173.8 67.70 17.1 70.0 127.0 gripForce sit and bend forward_cm sit-ups counts broad jump_cm class \ 0 54.9 18.4 60.0 217.0 C 1 36.4 16.3 53.0 229.0 A 2 44.8 12.0 49.0 181.0 C 3 41.4 15.2 53.0 219.0 B 4 43.5 27.1 45.0 217.0 B BMI 0 25.344179 1 20.495868 2 24.181428 3 23.349562 4 22.412439
# Report and drop exact duplicate rows before splitting.
# The duplicate count used to be computed and then discarded (a bare
# expression that is not the last line of a cell is never displayed), so it
# is printed explicitly here.
n_duplicates = int(df.duplicated().sum())
print(f'duplicate rows:{n_duplicates}')
print(f'with duplicates:{df.shape}')
df.drop_duplicates(inplace=True)
print(f'without duplicates:{df.shape}')

# Snapshot of the de-duplicated frame, taken before any later in-place edits.
df_copy = df.copy()

# Two-stage split with a fixed seed: first hold out 20% of the rows, then cut
# that hold-out in half, giving roughly 80% train / 10% test / 10% valid.
# Sizes are derived from len(...) rather than Series.count(): count() skips
# NaN, so a missing 'age' value would silently shrink the requested sizes
# and the totals printed below.
n_rows = len(df)
body_train, body_test = train_test_split(
    df, test_size=int(n_rows * 0.2), random_state=1
)
body_test, body_valid = train_test_split(
    body_test, test_size=int(len(body_test) * 0.5), random_state=1
)

print("number of elements in data frame: {}".format(n_rows))
print("train: {}".format(len(body_train)))
print("test: {}".format(len(body_test)))
print("valid: {}".format(len(body_valid)))
number of elements in data frame: 13393 train: 10715 test: 1339 valid: 1339
# Summary statistics for every column, numeric and categorical alike.
# Note: 'sit and bend forward_cm' goes below zero!
summary_stats = df.describe(include='all')
print(summary_stats)
age gender height_cm weight_kg body fat_% \ count 13393.000000 13393 13393.000000 13393.000000 13393.000000 unique NaN 2 NaN NaN NaN top NaN M NaN NaN NaN freq NaN 8467 NaN NaN NaN mean 36.775106 NaN 168.559807 67.447316 23.240165 std 13.625639 NaN 8.426583 11.949666 7.256844 min 21.000000 NaN 125.000000 26.300000 3.000000 25% 25.000000 NaN 162.400000 58.200000 18.000000 50% 32.000000 NaN 169.200000 67.400000 22.800000 75% 48.000000 NaN 174.800000 75.300000 28.000000 max 64.000000 NaN 193.800000 138.100000 78.400000 diastolic systolic gripForce sit and bend forward_cm \ count 13393.000000 13393.000000 13393.000000 13393.000000 unique NaN NaN NaN NaN top NaN NaN NaN NaN freq NaN NaN NaN NaN mean 78.796842 130.234817 36.963877 15.209268 std 10.742033 14.713954 10.624864 8.456677 min 0.000000 0.000000 0.000000 -25.000000 25% 71.000000 120.000000 27.500000 10.900000 50% 79.000000 130.000000 37.900000 16.200000 75% 86.000000 141.000000 45.200000 20.700000 max 156.200000 201.000000 70.500000 213.000000 sit-ups counts broad jump_cm class BMI count 13393.000000 13393.000000 13393 13393.000000 unique NaN NaN 4 NaN top NaN NaN C NaN freq NaN NaN 3349 NaN mean 39.771224 190.129627 NaN 23.606014 std 14.276698 39.868000 NaN 2.940936 min 0.000000 0.000000 NaN 11.103976 25% 30.000000 162.000000 NaN 21.612812 50% 41.000000 193.000000 NaN 23.463513 75% 50.000000 221.000000 NaN 25.341367 max 80.000000 303.000000 NaN 42.906509
# Columns rescaled to the default MinMaxScaler range [0, 1]. The list is
# written once so the columns being read and the columns being assigned
# cannot drift apart.
unit_range_columns = [
    'age', 'height_cm', 'weight_kg', 'body fat_%',
    'diastolic', 'systolic', 'gripForce', 'sit-ups counts',
    'broad jump_cm', 'BMI',
]

# NOTE(review): body_train / body_test / body_valid are created earlier in
# this file, before this scaling, so they still hold the raw values; only
# `df` is rescaled here. The scalers are also fit on all rows, including the
# ones that ended up in the test/valid splits. Confirm whether the scaler
# should instead be fit on the training split and applied to the others.
scaler = MinMaxScaler()
df[unit_range_columns] = scaler.fit_transform(df[unit_range_columns])

# 'sit and bend forward_cm' is mapped to [-1, 1] instead -- presumably
# because the raw column contains negative values (TODO confirm intent).
# `scaler` is rebound on purpose: after this block it refers to this
# (-1, 1) scaler, exactly as before.
scaler = MinMaxScaler(feature_range=(-1, 1))
df['sit and bend forward_cm'] = scaler.fit_transform(df[['sit and bend forward_cm']])

# Bare expression: shown as the cell output in a notebook, no visible effect
# when the file is run as a plain script.
df.describe(include='all')
age | gender | height_cm | weight_kg | body fat_% | diastolic | systolic | gripForce | sit and bend forward_cm | sit-ups counts | broad jump_cm | class | BMI | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 13393.000000 | 13393 | 13393.000000 | 13393.000000 | 13393.000000 | 13393.000000 | 13393.000000 | 13393.000000 | 13393.000000 | 13393.000000 | 13393.000000 | 13393 | 13393.000000 |
unique | NaN | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4 | NaN |
top | NaN | M | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | C | NaN |
freq | NaN | 8467 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3349 | NaN |
mean | 0.366863 | NaN | 0.633137 | 0.368044 | 0.268437 | 0.504461 | 0.647934 | 0.524310 | -0.662107 | 0.497140 | 0.627491 | NaN | 0.393115 |
std | 0.316875 | NaN | 0.122479 | 0.106884 | 0.096245 | 0.068771 | 0.073204 | 0.150707 | 0.071065 | 0.178459 | 0.131578 | NaN | 0.092475 |
min | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1.000000 | 0.000000 | 0.000000 | NaN | 0.000000 |
25% | 0.093023 | NaN | 0.543605 | 0.285331 | 0.198939 | 0.454545 | 0.597015 | 0.390071 | -0.698319 | 0.375000 | 0.534653 | NaN | 0.330440 |
50% | 0.255814 | NaN | 0.642442 | 0.367621 | 0.262599 | 0.505762 | 0.646766 | 0.537589 | -0.653782 | 0.512500 | 0.636964 | NaN | 0.388634 |
75% | 0.627907 | NaN | 0.723837 | 0.438283 | 0.331565 | 0.550576 | 0.701493 | 0.641135 | -0.615966 | 0.625000 | 0.729373 | NaN | 0.447681 |
max | 1.000000 | NaN | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | NaN | 1.000000 |
# Print a concise structural summary of df: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 13393 entries, 0 to 13392 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 13393 non-null float64 1 gender 13393 non-null object 2 height_cm 13393 non-null float64 3 weight_kg 13393 non-null float64 4 body fat_% 13393 non-null float64 5 diastolic 13393 non-null float64 6 systolic 13393 non-null float64 7 gripForce 13393 non-null float64 8 sit and bend forward_cm 13393 non-null float64 9 sit-ups counts 13393 non-null float64 10 broad jump_cm 13393 non-null float64 11 class 13393 non-null object 12 BMI 13393 non-null float64 dtypes: float64(11), object(2) memory usage: 1.3+ MB
# Show how many rows of each 'class' label are in the full frame and in each
# of the train / test / valid splits.
for split_label, split_frame in (
    ('data frame', df),
    ('train data', body_train),
    ('test data', body_test),
    ('valid data', body_valid),
):
    class_counts = split_frame['class'].value_counts()
    print(f'Each class in {split_label}: \n{class_counts}')
Each class in data frame: C 3349 D 3349 A 3348 B 3347 Name: class, dtype: int64 Each class in train data: A 2703 B 2681 C 2671 D 2660 Name: class, dtype: int64 Each class in test data: D 353 C 332 B 328 A 326 Name: class, dtype: int64 Each class in valid data: C 346 B 338 D 336 A 319 Name: class, dtype: int64
#df["class"].value_counts().plot(kind="bar")
#df[["class","body fat_%"]].groupby("class").mean().plot(kind="bar")
#sns.set_theme()
#sns.relplot(data = df.head(200), x = 'broad jump_cm', y = 'sit-ups counts', hue = 'class')
#sns.relplot(data = df[df['gender'] == 'M'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')
#sns.relplot(data = df[df['gender'] == 'F'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')
#px.box(df, y=['height_cm',
# 'weight_kg',
# 'body fat_%',
# 'diastolic',
# 'systolic',
# 'gripForce',
# 'sit and bend forward_cm',
# 'sit-ups counts',
# 'broad jump_cm',
# 'BMI'])
# this takes too long to run
#sns.pairplot(data=df.drop(columns=["gender"]).head(500), hue="class")