Prepare train and test sets

This commit is contained in:
Agata 2022-04-28 20:13:22 +02:00
parent 9eb59b3cee
commit 60dbcba650
2 changed files with 53 additions and 2 deletions

View File

@ -1,6 +1,6 @@
pipeline {
agent {
docker {image 'agakul/ium:3.0'}
docker {image 'agakul/ium:4.0'}
}
stages {
stage('Check out from version control') {
@ -10,7 +10,8 @@ pipeline {
}
stage('Shell Script') {
steps {
sh 'ipython ./preparation.py'
sh 'ipython ./prepare_datasets.py'
archiveArtifacts artifacts: 'X_train.csv, X_test.csv, y_train.csv, y_test.csv '
}
}
}

50
prepare_datasets.py Normal file
View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
# Extract the raw dataset with the standard library so this step needs neither
# an IPython shell (get_ipython) nor an external `unzip` binary.
import zipfile

with zipfile.ZipFile('body-performance-data.zip') as archive:
    # Same effect as `unzip -o`: members are written into the current
    # directory and any existing files are overwritten. Unlike the shell call,
    # a missing or corrupt archive raises here instead of being ignored.
    archive.extractall()
# In[4]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Columns used as model inputs, in the order they are written to the CSV files.
FEATURE_COLUMNS = ['height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']
# Column used as the prediction target.
TARGET_COLUMN = 'gender'
# male - 0, female - 1
GENDER_CODES = {'M': 0, 'F': 1}


def select_and_encode(raw):
    """Return the modelling columns of *raw* with gender encoded as 0/1.

    Keeps only the target and feature columns, replaces 'M'/'F' in the target
    column with 0/1 and drops every row that contains a missing value.
    The input frame is left untouched.
    """
    # Work on an explicit copy: an in-place replace on a column of a sliced
    # frame is a chained assignment, which pandas warns about and which does
    # not modify the frame at all once Copy-on-Write is enabled.
    frame = raw[[TARGET_COLUMN] + FEATURE_COLUMNS].copy()
    frame[TARGET_COLUMN] = frame[TARGET_COLUMN].replace(GENDER_CODES)
    return frame.dropna(how='any')


def split_features_target(frame):
    """Return ``(X, y)``: the feature columns and the one-column target frame."""
    return frame[FEATURE_COLUMNS], frame[[TARGET_COLUMN]]


def main():
    """Build the train/test CSV files from 'bodyPerformance.csv'.

    Writes X_train.csv, X_test.csv, y_train.csv and y_test.csv into the
    current directory using an 80/20 split with a fixed random seed.
    """
    frame = select_and_encode(pd.read_csv('bodyPerformance.csv'))
    X, y = split_features_target(frame)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    outputs = {
        'X_train.csv': X_train,
        'X_test.csv': X_test,
        'y_train.csv': y_train,
        'y_test.csv': y_test,
    }
    for filename, part in outputs.items():
        part.to_csv(filename, index=False)


if __name__ == '__main__':
    main()