2023-04-04 12:19:34 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# coding: utf-8
|
|
|
|
|
2023-04-05 20:23:36 +02:00
|
|
|
# In[1]:
|
2023-04-04 12:19:34 +02:00
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
import plotly.express as px
|
|
|
|
import seaborn as sns
|
2023-04-05 20:23:36 +02:00
|
|
|
import os
|
2023-04-05 19:40:37 +02:00
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
from sklearn.preprocessing import MinMaxScaler
|
2023-04-04 12:19:34 +02:00
|
|
|
|
2023-04-05 20:23:36 +02:00
|
|
|
# Read the body-performance measurements from the current working directory.
dataset_path = os.path.join('.', 'body_performance.csv')
df = pd.read_csv(dataset_path)

# Derive BMI as weight / height^2. The 0.0001 factor converts the squared
# height from cm^2 to m^2 (assumes the units the column names suggest).
height_cm = df['height_cm']
df['BMI'] = df['weight_kg'] / (0.0001 * height_cm * height_cm)

# Preview the first rows, including the freshly added BMI column.
print(df.head())
|
2023-04-04 12:19:34 +02:00
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
|
|
|
# Count fully duplicated rows and report the number. As a bare expression the
# count was silently discarded when this notebook cell runs as a script.
duplicate_count = df.duplicated().sum()
print(f'duplicate rows:{duplicate_count}')

# Show the frame's shape before and after removing the duplicated rows.
print(f'with duplicates:{df.shape}')
df.drop_duplicates(inplace=True)
print(f'without duplicates:{df.shape}')

# Independent snapshot of the de-duplicated frame, taken before `df` is
# rescaled further down the script.
df_copy = df.copy()
|
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
2023-04-05 19:40:37 +02:00
|
|
|
# Hold out 20% of the rows (measured by the non-null `age` count); the rest
# becomes the training split.
holdout_rows = int(df["age"].count() * 0.2)
body_train, body_test = train_test_split(
    df,
    test_size=holdout_rows,
    random_state=1,
)

# Cut the hold-out in half: `body_valid` receives `valid_rows` rows and
# `body_test` keeps the remainder.
valid_rows = int(body_test["age"].count() * 0.5)
body_test, body_valid = train_test_split(
    body_test,
    test_size=valid_rows,
    random_state=1,
)

# Report how many rows (non-null `age` values) ended up in each frame.
split_reports = (
    ("number of elements in data frame: {}", df),
    ("train: {}", body_train),
    ("test: {}", body_test),
    ("valid: {}", body_valid),
)
for template, frame in split_reports:
    print(template.format(frame["age"].count()))
|
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
|
|
|
# Summary statistics for every column, taken before the columns are rescaled.
summary_before_scaling = df.describe(include='all')
print(summary_before_scaling)
# NOTE: 'sit and bend forward_cm' contains negative values!
|
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
|
|
|
# Columns rescaled to MinMaxScaler's default range. The list is defined once
# so the selection and the assignment target cannot drift apart.
unit_range_columns = [
    'age', 'height_cm', 'weight_kg', 'body fat_%',
    'diastolic', 'systolic', 'gripForce', 'sit-ups counts',
    'broad jump_cm', 'BMI',
]

# NOTE(review): only `df` is rescaled here. The body_train / body_test /
# body_valid frames were split off earlier and are not touched by this block,
# and the scalers are fit on the whole frame. Confirm that this is intended
# before using those splits for modelling.
scaler = MinMaxScaler()
df[unit_range_columns] = scaler.fit_transform(df[unit_range_columns])

# 'sit and bend forward_cm' gets its own scaler with a symmetric (-1, 1)
# target range (presumably because the raw measurements can be negative).
scaler = MinMaxScaler(feature_range=(-1, 1))
df['sit and bend forward_cm'] = scaler.fit_transform(df[['sit and bend forward_cm']])

# Print the post-scaling summary. As a bare expression its result was
# discarded when this notebook cell runs as a script.
print(df.describe(include='all'))
|
|
|
|
|
|
|
|
|
2023-04-05 19:40:37 +02:00
|
|
|
|
2023-04-04 12:19:34 +02:00
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
|
|
|
# Print pandas' concise summary of `df` (column dtypes and non-null counts).
df.info()
|
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
2023-04-05 19:40:37 +02:00
|
|
|
# Show how the `class` labels are distributed in the full frame and in each
# of the three splits, to check that the splits are reasonably balanced.
class_reports = (
    ('Each class in data frame: \n{}', df),
    ('Each class in train data: \n{}', body_train),
    ('Each class in test data: \n{}', body_test),
    ('Each class in valid data: \n{}', body_valid),
)
for template, frame in class_reports:
    print(template.format(frame['class'].value_counts()))
|
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
|
|
|
#df["class"].value_counts().plot(kind="bar")
|
2023-04-04 12:19:34 +02:00
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
2023-04-05 19:40:37 +02:00
|
|
|
#df[["class","body fat_%"]].groupby("class").mean().plot(kind="bar")
|
2023-04-04 12:19:34 +02:00
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
2023-04-05 19:40:37 +02:00
|
|
|
#sns.set_theme()
|
2023-04-04 12:19:34 +02:00
|
|
|
|
2023-04-05 19:40:37 +02:00
|
|
|
#sns.relplot(data = df.head(200), x = 'broad jump_cm', y = 'sit-ups counts', hue = 'class')
|
2023-04-04 12:19:34 +02:00
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
2023-04-05 19:40:37 +02:00
|
|
|
#sns.relplot(data = df[df['gender'] == 'M'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')
|
2023-04-04 12:19:34 +02:00
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
2023-04-05 19:40:37 +02:00
|
|
|
#sns.relplot(data = df[df['gender'] == 'F'].head(200), x = 'body fat_%', y = 'BMI', hue = 'class')
|
2023-04-04 12:19:34 +02:00
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
2023-04-05 19:40:37 +02:00
|
|
|
#px.box(df, y=['height_cm',
|
|
|
|
# 'weight_kg',
|
|
|
|
# 'body fat_%',
|
|
|
|
# 'diastolic',
|
|
|
|
# 'systolic',
|
|
|
|
# 'gripForce',
|
|
|
|
# 'sit and bend forward_cm',
|
|
|
|
# 'sit-ups counts',
|
|
|
|
# 'broad jump_cm',
|
|
|
|
# 'BMI'])
|
2023-04-04 12:19:34 +02:00
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
|
|
|
# This takes too long to run, so it is left disabled.
|
|
|
|
#sns.pairplot(data=df.drop(columns=["gender"]).head(500), hue="class")
|
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
2023-04-05 22:33:54 +02:00
|
|
|
# Informal message (in Polish) printed once the script has run to the end.
farewell_message = "no elo mordeczko, która tu dotarła"
print(farewell_message)
|
2023-04-04 12:19:34 +02:00
|
|
|
|