ium_s487178/body_performance-checkpoint.ipynb at 6ffa08f74934fb5bb39e572682f06cf15d428f53

import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the body-performance dataset from the current working directory.
# A plain relative file name works on every OS; the former '.\...' spelling
# only resolved on Windows, because '\' is not a path separator elsewhere.
df = pd.read_csv('body_performance.csv')

# Derive BMI as weight_kg / height^2 with the height converted to metres:
# multiplying height_cm squared by 0.0001 turns cm^2 into m^2.
df['BMI'] = df['weight_kg']/(0.0001*df['height_cm']*df['height_cm'])
print(df.head())

    age gender  height_cm  weight_kg  body fat_%  diastolic  systolic  \
0  27.0      M      172.3      75.24        21.3       80.0     130.0   
1  25.0      M      165.0      55.80        15.7       77.0     126.0   
2  31.0      M      179.6      78.00        20.1       92.0     152.0   
3  32.0      M      174.5      71.10        18.4       76.0     147.0   
4  28.0      M      173.8      67.70        17.1       70.0     127.0   

   gripForce  sit and bend forward_cm  sit-ups counts  broad jump_cm class  \
0       54.9                     18.4            60.0          217.0     C   
1       36.4                     16.3            53.0          229.0     A   
2       44.8                     12.0            49.0          181.0     C   
3       41.4                     15.2            53.0          219.0     B   
4       43.5                     27.1            45.0          217.0     B   

         BMI  
0  25.344179  
1  20.495868  
2  24.181428  
3  23.349562  
4  22.412439

# Count fully duplicated rows and actually report the number: a bare
# expression in the middle of a cell is evaluated but never displayed.
duplicate_count = df.duplicated().sum()
print(f'duplicated rows:{duplicate_count}')
print(f'with duplicates:{df.shape}')
# Remove the duplicated rows in place, keeping the first occurrence of each.
df.drop_duplicates(inplace=True)
print(f'without duplicates:{df.shape}')
# Independent copy of the de-duplicated frame.
df_copy = df.copy()

# Hold out 20% of the rows (counted on the 'age' column), then cut that
# hold-out part in half to obtain the test and validation sets.
holdout_rows = int(df["age"].count()*0.2)
body_train, holdout = train_test_split(df, test_size=holdout_rows, random_state=1)
valid_rows = int(holdout["age"].count()*0.5)
body_test, body_valid = train_test_split(holdout, test_size=valid_rows, random_state=1)

# Print the size of the full frame followed by the size of every split.
size_reports = (
    ("number of elements in data frame: {}", df),
    ("train: {}", body_train),
    ("test: {}", body_test),
    ("valid: {}", body_valid),
)
for size_template, subset in size_reports:
    print(size_template.format(subset["age"].count()))

number of elements in data frame: 13393
train: 10715
test: 1339
valid: 1339

# Summary statistics for every column, numeric and categorical alike.
raw_summary = df.describe(include='all')
print(raw_summary)
# 'sit and bend forward_cm' contains negative values!

                 age gender     height_cm     weight_kg    body fat_%  \
count   13393.000000  13393  13393.000000  13393.000000  13393.000000   
unique           NaN      2           NaN           NaN           NaN   
top              NaN      M           NaN           NaN           NaN   
freq             NaN   8467           NaN           NaN           NaN   
mean       36.775106    NaN    168.559807     67.447316     23.240165   
std        13.625639    NaN      8.426583     11.949666      7.256844   
min        21.000000    NaN    125.000000     26.300000      3.000000   
25%        25.000000    NaN    162.400000     58.200000     18.000000   
50%        32.000000    NaN    169.200000     67.400000     22.800000   
75%        48.000000    NaN    174.800000     75.300000     28.000000   
max        64.000000    NaN    193.800000    138.100000     78.400000   

           diastolic      systolic     gripForce  sit and bend forward_cm  \
count   13393.000000  13393.000000  13393.000000             13393.000000   
unique           NaN           NaN           NaN                      NaN   
top              NaN           NaN           NaN                      NaN   
freq             NaN           NaN           NaN                      NaN   
mean       78.796842    130.234817     36.963877                15.209268   
std        10.742033     14.713954     10.624864                 8.456677   
min         0.000000      0.000000      0.000000               -25.000000   
25%        71.000000    120.000000     27.500000                10.900000   
50%        79.000000    130.000000     37.900000                16.200000   
75%        86.000000    141.000000     45.200000                20.700000   
max       156.200000    201.000000     70.500000               213.000000   

        sit-ups counts  broad jump_cm  class           BMI  
count     13393.000000   13393.000000  13393  13393.000000  
unique             NaN            NaN      4           NaN  
top                NaN            NaN      C           NaN  
freq               NaN            NaN   3349           NaN  
mean         39.771224     190.129627    NaN     23.606014  
std          14.276698      39.868000    NaN      2.940936  
min           0.000000       0.000000    NaN     11.103976  
25%          30.000000     162.000000    NaN     21.612812  
50%          41.000000     193.000000    NaN     23.463513  
75%          50.000000     221.000000    NaN     25.341367  
max          80.000000     303.000000    NaN     42.906509

# Columns rescaled to the [0, 1] range. The list is defined once so the
# selection and the assignment target cannot drift apart.
unit_range_columns = [
    'age', 'height_cm', 'weight_kg', 'body fat_%',
    'diastolic', 'systolic', 'gripForce', 'sit-ups counts',
    'broad jump_cm', 'BMI',
]

# Keep the fitted [0, 1] scaler under its own name; previously it was lost
# as soon as `scaler` was rebound below, so the transform could not be
# reused on other data or inverted.
unit_range_scaler = MinMaxScaler()
df[unit_range_columns] = unit_range_scaler.fit_transform(df[unit_range_columns])

# 'sit and bend forward_cm' has negative values, so it is mapped to [-1, 1].
# `scaler` still ends up bound to this last scaler, as before.
signed_range_scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = signed_range_scaler
df['sit and bend forward_cm'] = signed_range_scaler.fit_transform(df[['sit and bend forward_cm']])

# NOTE(review): body_train / body_test / body_valid were split off before
# this cell and are not rescaled here; both scalers are also fitted on the
# whole frame rather than on the training part only - confirm this is intended.
df.describe(include='all')

	age	gender	height_cm	weight_kg	body fat_%	diastolic	systolic	gripForce	sit and bend forward_cm	sit-ups counts	broad jump_cm	class	BMI
count	13393.000000	13393	13393.000000	13393.000000	13393.000000	13393.000000	13393.000000	13393.000000	13393.000000	13393.000000	13393.000000	13393	13393.000000
unique	NaN	2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	4	NaN
top	NaN	M	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	C	NaN
freq	NaN	8467	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3349	NaN
mean	0.366863	NaN	0.633137	0.368044	0.268437	0.504461	0.647934	0.524310	-0.662107	0.497140	0.627491	NaN	0.393115
std	0.316875	NaN	0.122479	0.106884	0.096245	0.068771	0.073204	0.150707	0.071065	0.178459	0.131578	NaN	0.092475
min	0.000000	NaN	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-1.000000	0.000000	0.000000	NaN	0.000000
25%	0.093023	NaN	0.543605	0.285331	0.198939	0.454545	0.597015	0.390071	-0.698319	0.375000	0.534653	NaN	0.330440
50%	0.255814	NaN	0.642442	0.367621	0.262599	0.505762	0.646766	0.537589	-0.653782	0.512500	0.636964	NaN	0.388634
75%	0.627907	NaN	0.723837	0.438283	0.331565	0.550576	0.701493	0.641135	-0.615966	0.625000	0.729373	NaN	0.447681
max	1.000000	NaN	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	NaN	1.000000

# Print a concise summary of the frame: index range, per-column non-null
# counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13393 entries, 0 to 13392
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      13393 non-null  float64
 1   gender                   13393 non-null  object 
 2   height_cm                13393 non-null  float64
 3   weight_kg                13393 non-null  float64
 4   body fat_%               13393 non-null  float64
 5   diastolic                13393 non-null  float64
 6   systolic                 13393 non-null  float64
 7   gripForce                13393 non-null  float64
 8   sit and bend forward_cm  13393 non-null  float64
 9   sit-ups counts           13393 non-null  float64
 10  broad jump_cm            13393 non-null  float64
 11  class                    13393 non-null  object 
 12  BMI                      13393 non-null  float64
dtypes: float64(11), object(2)
memory usage: 1.3+ MB

# Report how many samples of each class are in the full frame and in
# every split.
class_reports = (
    ('Each class in data frame: \n{}', df),
    ('Each class in train data: \n{}', body_train),
    ('Each class in test data: \n{}', body_test),
    ('Each class in valid data: \n{}', body_valid),
)
for report_template, subset in class_reports:
    print(report_template.format(subset['class'].value_counts()))

Each class in data frame: 
C    3349
D    3349
A    3348
B    3347
Name: class, dtype: int64
Each class in train data: 
A    2703
B    2681
C    2671
D    2660
Name: class, dtype: int64
Each class in test data: 
D    353
C    332
B    328
A    326
Name: class, dtype: int64
Each class in valid data: 
C    346
B    338
D    336
A    319
Name: class, dtype: int64

#df["class"].value_counts().plot(kind="bar")

#df[["class","body fat_%"]].groupby("class").mean().plot(kind="bar")

#sns.set_theme()

#sns.relplot(data = df.head(200), x = 'broad jump_cm', y = 'sit-ups counts', hue = 'class')

#sns.relplot(data = df[df['gender'] == 'M'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')

#sns.relplot(data = df[df['gender'] == 'F'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')

#px.box(df, y=['height_cm',
#               'weight_kg',
#               'body fat_%',
#               'diastolic',
#               'systolic',
#               'gripForce',
#               'sit and bend forward_cm',
#               'sit-ups counts',
#               'broad jump_cm',
#               'BMI'])

# NOTE: the pairplot below takes too long to run
#sns.pairplot(data=df.drop(columns=["gender"]).head(500), hue="class")

22 KiB Raw Blame History

22 KiB

Raw Blame History