ium_444421/prepare_datasets.py

51 lines
877 B
Python
Raw Normal View History

2022-04-28 20:13:22 +02:00
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
2022-05-29 13:46:40 +02:00
# get_ipython().system('unzip -o body-performance-data.zip')
2022-04-28 20:13:22 +02:00
# In[4]:
import pandas as pd
from sklearn.model_selection import train_test_split
# In[21]:
2022-05-29 13:46:40 +02:00
# Load the raw body-performance table from the data/ directory.
df = pd.read_csv(filepath_or_buffer='data/bodyPerformance.csv')
2022-04-28 20:13:22 +02:00
# In[22]:
# Keep only the label column ('gender') and the five feature columns.
cols = ['gender', 'height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']
# .copy() makes the selection an independent frame, so the column assignment
# below is never treated as a write into a slice of the loaded frame.
df = df[cols].copy()
# male - 0, female - 1
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form mutates a temporary Series and leaves df
# unchanged when pandas Copy-on-Write is active.
df['gender'] = df['gender'].replace({'M': 0, 'F': 1})
# Drop every row that has a missing value in any of the kept columns.
df = df.dropna(how='any')
# In[23]:
# Features: the five measurement columns; label: 'gender', kept as a
# one-column DataFrame so the column name survives into the CSV header.
X = df.loc[:, ['height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']]
y = df.loc[:, ['gender']]
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# In[24]:
# Write each split to its own CSV file in the working directory, without
# the index column.
for split_frame, split_path in (
    (X_train, r'X_train.csv'),
    (X_test, r'X_test.csv'),
    (y_train, r'y_train.csv'),
    (y_test, r'y_test.csv'),
):
    split_frame.to_csv(split_path, index=False)