ium_s487178/.ipynb_checkpoints/body_performance-checkpoint.ipynb
2023-04-05 14:40:41 +02:00

22 KiB

import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the body-performance dataset from the current working directory.
# A plain relative filename is used instead of the Windows-only '.\' prefix:
# on POSIX systems a backslash is not a path separator, so the old path was
# treated as a literal filename and the load failed.
df = pd.read_csv('body_performance.csv')

# Derive BMI = weight [kg] / height [m]^2; the 0.0001 factor converts the
# squared height from cm^2 to m^2.
df['BMI'] = df['weight_kg'] / (0.0001 * df['height_cm'] * df['height_cm'])
print(df.head())
    age gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0      M      172.3      75.24        21.3       80.0     130.0   
1  25.0      M      165.0      55.80        15.7       77.0     126.0   
2  31.0      M      179.6      78.00        20.1       92.0     152.0   
3  32.0      M      174.5      71.10        18.4       76.0     147.0   
4  28.0      M      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit and bend forward_cm  sit-ups counts  broad jump_cm class  \
0       54.9                     18.4            60.0          217.0     C   
1       36.4                     16.3            53.0          229.0     A   
2       44.8                     12.0            49.0          181.0     C   
3       41.4                     15.2            53.0          219.0     B   
4       43.5                     27.1            45.0          217.0     B   

         BMI  
0  25.344179  
1  20.495868  
2  24.181428  
3  23.349562  
4  22.412439  
# Report how many fully duplicated rows exist before removing them. The bare
# `df.duplicated().sum()` expression was evaluated and then discarded, so the
# count was never actually shown.
print(f'duplicated rows:{df.duplicated().sum()}')
print(f'with duplicates:{df.shape}')
# Remove duplicated rows in place (pandas keeps the first occurrence by
# default); the shape is printed before and after to show the effect.
df.drop_duplicates(inplace=True)
print(f'without duplicates:{df.shape}')
# Keep an independent copy of the frame as it is at this point.
df_copy = df.copy()

# Hold out 20% of the rows (counted via the non-null 'age' entries), then
# split that hold-out in half into the test and validation sets. Both splits
# use random_state=1.
holdout_size = int(df["age"].count() * 0.2)
body_train, holdout = train_test_split(
    df, test_size=holdout_size, random_state=1
)
valid_size = int(holdout["age"].count() * 0.5)
body_test, body_valid = train_test_split(
    holdout, test_size=valid_size, random_state=1
)

# Report the size of the full frame and of every split.
total_rows = df['age'].count()
train_rows = body_train["age"].count()
test_rows = body_test["age"].count()
valid_rows = body_valid["age"].count()
print("number of elements in data frame: {}".format(total_rows))
print("train: {}".format(train_rows))
print("test: {}".format(test_rows))
print("valid: {}".format(valid_rows))
number of elements in data frame: 13393
train: 10715
test: 1339
valid: 1339
# Summary statistics for every column (numeric and categorical alike).
# NOTE: 'sit and bend forward_cm' contains negative values!
summary_stats = df.describe(include='all')
print(summary_stats)
                 age gender     height_cm     weight_kg    body fat_%  \
count   13393.000000  13393  13393.000000  13393.000000  13393.000000   
unique           NaN      2           NaN           NaN           NaN   
top              NaN      M           NaN           NaN           NaN   
freq             NaN   8467           NaN           NaN           NaN   
mean       36.775106    NaN    168.559807     67.447316     23.240165   
std        13.625639    NaN      8.426583     11.949666      7.256844   
min        21.000000    NaN    125.000000     26.300000      3.000000   
25%        25.000000    NaN    162.400000     58.200000     18.000000   
50%        32.000000    NaN    169.200000     67.400000     22.800000   
75%        48.000000    NaN    174.800000     75.300000     28.000000   
max        64.000000    NaN    193.800000    138.100000     78.400000   

           diastolic      systolic     gripForce  sit and bend forward_cm  \
count   13393.000000  13393.000000  13393.000000             13393.000000   
unique           NaN           NaN           NaN                      NaN   
top              NaN           NaN           NaN                      NaN   
freq             NaN           NaN           NaN                      NaN   
mean       78.796842    130.234817     36.963877                15.209268   
std        10.742033     14.713954     10.624864                 8.456677   
min         0.000000      0.000000      0.000000               -25.000000   
25%        71.000000    120.000000     27.500000                10.900000   
50%        79.000000    130.000000     37.900000                16.200000   
75%        86.000000    141.000000     45.200000                20.700000   
max       156.200000    201.000000     70.500000               213.000000   

        sit-ups counts  broad jump_cm  class           BMI  
count     13393.000000   13393.000000  13393  13393.000000  
unique             NaN            NaN      4           NaN  
top                NaN            NaN      C           NaN  
freq               NaN            NaN   3349           NaN  
mean         39.771224     190.129627    NaN     23.606014  
std          14.276698      39.868000    NaN      2.940936  
min           0.000000       0.000000    NaN     11.103976  
25%          30.000000     162.000000    NaN     21.612812  
50%          41.000000     193.000000    NaN     23.463513  
75%          50.000000     221.000000    NaN     25.341367  
max          80.000000     303.000000    NaN     42.906509  
# Columns rescaled with a default-configured MinMaxScaler.
unit_range_columns = [
    'age', 'height_cm', 'weight_kg', 'body fat_%',
    'diastolic', 'systolic', 'gripForce', 'sit-ups counts',
    'broad jump_cm', 'BMI',
]
scaler = MinMaxScaler()
df[unit_range_columns] = scaler.fit_transform(df[unit_range_columns])

# This column is rescaled separately to the range [-1, 1]. `scaler` is
# rebound here, so the first fitted scaler is no longer reachable by name.
signed_range_column = 'sit and bend forward_cm'
scaler = MinMaxScaler(feature_range=(-1, 1))
df[signed_range_column] = scaler.fit_transform(df[[signed_range_column]])

# NOTE(review): body_train / body_test / body_valid were split off `df`
# before this cell, so they presumably remain unscaled, and both scalers are
# fitted on the full frame - confirm whether scaling should instead be fitted
# on the training split and applied to each split.
df.describe(include='all')

age gender height_cm weight_kg body fat_% diastolic systolic gripForce sit and bend forward_cm sit-ups counts broad jump_cm class BMI
count 13393.000000 13393 13393.000000 13393.000000 13393.000000 13393.000000 13393.000000 13393.000000 13393.000000 13393.000000 13393.000000 13393 13393.000000
unique NaN 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN 4 NaN
top NaN M NaN NaN NaN NaN NaN NaN NaN NaN NaN C NaN
freq NaN 8467 NaN NaN NaN NaN NaN NaN NaN NaN NaN 3349 NaN
mean 0.366863 NaN 0.633137 0.368044 0.268437 0.504461 0.647934 0.524310 -0.662107 0.497140 0.627491 NaN 0.393115
std 0.316875 NaN 0.122479 0.106884 0.096245 0.068771 0.073204 0.150707 0.071065 0.178459 0.131578 NaN 0.092475
min 0.000000 NaN 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -1.000000 0.000000 0.000000 NaN 0.000000
25% 0.093023 NaN 0.543605 0.285331 0.198939 0.454545 0.597015 0.390071 -0.698319 0.375000 0.534653 NaN 0.330440
50% 0.255814 NaN 0.642442 0.367621 0.262599 0.505762 0.646766 0.537589 -0.653782 0.512500 0.636964 NaN 0.388634
75% 0.627907 NaN 0.723837 0.438283 0.331565 0.550576 0.701493 0.641135 -0.615966 0.625000 0.729373 NaN 0.447681
max 1.000000 NaN 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 NaN 1.000000
# Print a structural overview of the frame: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13393 entries, 0 to 13392
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      13393 non-null  float64
 1   gender                   13393 non-null  object 
 2   height_cm                13393 non-null  float64
 3   weight_kg                13393 non-null  float64
 4   body fat_%               13393 non-null  float64
 5   diastolic                13393 non-null  float64
 6   systolic                 13393 non-null  float64
 7   gripForce                13393 non-null  float64
 8   sit and bend forward_cm  13393 non-null  float64
 9   sit-ups counts           13393 non-null  float64
 10  broad jump_cm            13393 non-null  float64
 11  class                    13393 non-null  object 
 12  BMI                      13393 non-null  float64
dtypes: float64(11), object(2)
memory usage: 1.3+ MB
# Count the rows per 'class' label in the full frame and in every split, so
# the class balance of the splits can be compared against the whole dataset.
overall_counts = df['class'].value_counts()
train_counts = body_train['class'].value_counts()
test_counts = body_test['class'].value_counts()
valid_counts = body_valid['class'].value_counts()

print('Each class in data frame: \n{}'.format(overall_counts))
print('Each class in train data: \n{}'.format(train_counts))
print('Each class in test data: \n{}'.format(test_counts))
print('Each class in valid data: \n{}'.format(valid_counts))
Each class in data frame: 
C    3349
D    3349
A    3348
B    3347
Name: class, dtype: int64
Each class in train data: 
A    2703
B    2681
C    2671
D    2660
Name: class, dtype: int64
Each class in test data: 
D    353
C    332
B    328
A    326
Name: class, dtype: int64
Each class in valid data: 
C    346
B    338
D    336
A    319
Name: class, dtype: int64


#df["class"].value_counts().plot(kind="bar")
#df[["class","body fat_%"]].groupby("class").mean().plot(kind="bar")
#sns.set_theme()

#sns.relplot(data = df.head(200), x = 'broad jump_cm', y = 'sit-ups counts', hue = 'class')
#sns.relplot(data = df[df['gender'] == 'M'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')
#sns.relplot(data = df[df['gender'] == 'F'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')
#px.box(df, y=['height_cm',
#               'weight_kg',
#               'body fat_%',
#               'diastolic',
#               'systolic',
#               'gripForce',
#               'sit and bend forward_cm',
#               'sit-ups counts',
#               'broad jump_cm',
#               'BMI'])
# this takes too long to run
#sns.pairplot(data=df.drop(columns=["gender"]).head(500), hue="class")