2022-04-28 20:13:22 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# coding: utf-8
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|
2022-05-29 13:46:40 +02:00
|
|
|
# get_ipython().system('unzip -o body-performance-data.zip')
|
2022-04-28 20:13:22 +02:00
|
|
|
|
|
|
|
|
|
|
|
# In[4]:
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
|
|
|
|
|
# In[21]:
|
|
|
|
|
|
|
|
|
2022-05-29 13:46:40 +02:00
|
|
|
# Read the body-performance CSV from the data/ directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/bodyPerformance.csv')
|
2022-04-28 20:13:22 +02:00
|
|
|
|
|
|
|
|
|
|
|
# In[22]:
|
|
|
|
|
|
|
|
|
|
|
|
# Restrict the frame to the label column ('gender') plus the five
# measurement columns this script works with.
cols = ['gender', 'height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']
# Take an explicit copy so the column assignment below operates on an
# independent frame rather than on a selection of the loaded data.
df = df[cols].copy()

# Encode the label: male - 0, female - 1.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['gender']`: the in-place form acts on
# a temporary Series and, with pandas Copy-on-Write enabled, never updates
# `df` itself.
df['gender'] = df['gender'].replace({'M': 0, 'F': 1})

# Drop every row that has a missing value in any of the kept columns.
df = df.dropna(how='any')
|
|
|
|
|
|
|
|
|
|
|
|
# In[23]:
|
|
|
|
|
|
|
|
|
|
|
|
# Feature matrix: the five measurement columns.
X = df.loc[:, ['height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']]
# Target: 'gender', kept as a one-column DataFrame rather than a Series.
y = df.loc[:, ['gender']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
|
|
|
|
|
|
|
|
|
|
|
|
# In[24]:
|
|
|
|
|
|
|
|
|
|
|
|
# Write each split to its own CSV file in the working directory.
# index=False leaves the DataFrame index out of the written files.
for _frame, _path in (
    (X_train, r'X_train.csv'),
    (X_test, r'X_test.csv'),
    (y_train, r'y_train.csv'),
    (y_test, r'y_test.csv'),
):
    _frame.to_csv(_path, index=False)
|
|
|
|
|