ium_s487178/.ipynb_checkpoints/body_performance-checkpoint.ipynb
2023-04-05 20:23:36 +02:00

7.3 KiB

import pandas as pd
import plotly.express as px
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset from a CSV file located in the current working directory.
csv_path = os.path.join('.', 'body_performance.csv')
df = pd.read_csv(csv_path)

# Derive BMI as weight / height^2; the 0.0001 factor rescales the squared
# 'height_cm' column (cm^2 -> m^2, going by the column name).
height_cm = df['height_cm']
df['BMI'] = df['weight_kg'] / (0.0001 * height_cm * height_cm)
print(df.head())
    age gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0      M      172.3      75.24        21.3       80.0     130.0   
1  25.0      M      165.0      55.80        15.7       77.0     126.0   
2  31.0      M      179.6      78.00        20.1       92.0     152.0   
3  32.0      M      174.5      71.10        18.4       76.0     147.0   
4  28.0      M      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit and bend forward_cm  sit-ups counts  broad jump_cm class  \
0       54.9                     18.4            60.0          217.0     C   
1       36.4                     16.3            53.0          229.0     A   
2       44.8                     12.0            49.0          181.0     C   
3       41.4                     15.2            53.0          219.0     B   
4       43.5                     27.1            45.0          217.0     B   

         BMI  
0  25.344179  
1  20.495868  
2  24.181428  
3  23.349562  
4  22.412439  
# Count fully duplicated rows and actually report the number: a bare
# `df.duplicated().sum()` expression is silently discarded unless it happens
# to be the last statement of a notebook cell.
duplicate_count = df.duplicated().sum()
print(f'duplicated rows:{duplicate_count}')

# Show the frame shape before and after removing the duplicated rows in place.
print(f'with duplicates:{df.shape}')
df.drop_duplicates(inplace=True)
print(f'without duplicates:{df.shape}')
# Independent copy of df as it stands at this point.
df_copy = df.copy()

# First split: hold out int(20%) of the rows (sized by the non-null 'age' count).
holdout_size = int(df["age"].count() * 0.2)
body_train, body_test = train_test_split(df, test_size=holdout_size, random_state=1)

# Second split: move int(50%) of the held-out rows into the validation set;
# the remainder stays as the test set.
valid_size = int(body_test["age"].count() * 0.5)
body_test, body_valid = train_test_split(body_test, test_size=valid_size, random_state=1)

# Report the size of the full frame and of each split.
size_reports = (
    ("number of elements in data frame: {}", df),
    ("train: {}", body_train),
    ("test: {}", body_test),
    ("valid: {}", body_valid),
)
for template, frame in size_reports:
    print(template.format(frame["age"].count()))
# Summary statistics before any scaling is applied.
print(df.describe(include='all'))

# NOTE(review): body_train / body_test / body_valid are created by
# train_test_split before this point, so the scaling below appears to affect
# only `df`, not the splits -- confirm that this is intended.

# Columns rescaled with MinMaxScaler's default feature range.
unit_range_columns = [
    'age', 'height_cm', 'weight_kg', 'body fat_%',
    'diastolic', 'systolic', 'gripForce', 'sit-ups counts',
    'broad jump_cm', 'BMI',
]
df[unit_range_columns] = MinMaxScaler().fit_transform(df[unit_range_columns])

# Original author's note: 'sit and bend forward_cm' goes below zero!
# Presumably that is why it is scaled separately to the (-1, 1) range.
bend_column = 'sit and bend forward_cm'
scaler = MinMaxScaler(feature_range=(-1, 1))
df[bend_column] = scaler.fit_transform(df[[bend_column]])
df.describe(include='all')

# Column dtypes and non-null counts of the full frame.
df.info()

# Print the per-class row counts of the full frame and of each split.
class_reports = (
    ('Each class in data frame: \n{}', df),
    ('Each class in train data: \n{}', body_train),
    ('Each class in test data: \n{}', body_test),
    ('Each class in valid data: \n{}', body_valid),
)
for template, frame in class_reports:
    print(template.format(frame['class'].value_counts()))


#df["class"].value_counts().plot(kind="bar")
#df[["class","body fat_%"]].groupby("class").mean().plot(kind="bar")
#sns.set_theme()

#sns.relplot(data = df.head(200), x = 'broad jump_cm', y = 'sit-ups counts', hue = 'class')
#sns.relplot(data = df[df['gender'] == 'M'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')
#sns.relplot(data = df[df['gender'] == 'F'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')
#px.box(df, y=['height_cm',
#               'weight_kg',
#               'body fat_%',
#               'diastolic',
#               'systolic',
#               'gripForce',
#               'sit and bend forward_cm',
#               'sit-ups counts',
#               'broad jump_cm',
#               'BMI'])
# this takes too long to run
#sns.pairplot(data=df.drop(columns=["gender"]).head(500), hue="class")