Prepare train and test sets

2022-04-28 20:13:22 +02:00 · 2022-04-28 20:13:22 +02:00 · 60dbcba650
commit 60dbcba650
parent 9eb59b3cee
2 changed files with 53 additions and 2 deletions
--- a/5
+++ b/5
@ -1,6 +1,6 @@
 pipeline {
    agent {
-    	docker {image 'agakul/ium:3.0'}
+    	docker {image 'agakul/ium:4.0'}
    }
    stages {
 		stage('Check out from version control') {
@ -10,7 +10,8 @@ pipeline {
 		}
        stage('Shell Script') {
            steps {
-				sh 'ipython ./preparation.py'
+				sh 'ipython ./prepare_datasets.py'
+				archiveArtifacts artifacts: 'X_train.csv, X_test.csv, y_train.csv, y_test.csv '
            }
        }
    }
--- a/prepare_datasets.py
+++ b/prepare_datasets.py
@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[ ]:
+# Extract the dataset archive; -o overwrites already-extracted files without prompting.
+# get_ipython() is only defined when the script runs under IPython, not plain python.
+get_ipython().system('unzip -o body-performance-data.zip')
+
+
+# In[4]:
+
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+# In[21]:
+
+# Load the raw table; presumably bodyPerformance.csv comes from the archive unzipped above (confirm).
+df = pd.read_csv('bodyPerformance.csv')
+
+
+# In[22]:
+
+# Keep the target column (gender) plus the five measurement columns used as features.
+cols = ['gender', 'height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']
+df = df[cols].copy()  # explicit copy: the column assignment below targets an independent frame
+# male - 0, female - 1. Assign the result back: an inplace replace on df['gender'] acts on a
+# temporary Series and leaves df unchanged under pandas copy-on-write.
+df['gender'] = df['gender'].replace({'M': 0, 'F': 1})
+df = df.dropna(how='any')  # drop every row that has at least one missing value
+
+
+# In[23]:
+
+# Features are the five measurement columns; the target stays a one-column DataFrame.
+feature_cols = ['height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']
+X, y = df[feature_cols], df[['gender']]
+# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
+
+
+# In[24]:
+
+# Write each split to its own CSV file, omitting the index column.
+outputs = {'X_train.csv': X_train, 'X_test.csv': X_test, 'y_train.csv': y_train, 'y_test.csv': y_test}
+for filename, frame in outputs.items():
+    frame.to_csv(filename, index=False)
+
+