Preprocessing python
All checks were successful
s434784-training/pipeline/head This commit looks good

This commit is contained in:
Maciej Sobkowiak 2021-05-16 21:06:46 +02:00
parent 6bba24a4b9
commit 19bf9c3fe0
3 changed files with 42 additions and 6 deletions

View File

@ -15,7 +15,7 @@ RUN pip3 install sklearn
WORKDIR /app
COPY ./preparations.sh ./
COPY ./preprocesing_python.py ./
COPY ./preprocesing.py ./
COPY ./training.py ./
# CMD ./preparations.sh

10
Jenkinsfile vendored
View File

@ -21,18 +21,18 @@ pipeline {
script {
def image = docker.build('dock')
image.inside{
sh 'chmod +x preparations.sh'
sh 'chmod +x preprocesing.py'
sh 'echo ${CUTOFF}'
sh './preparations.sh ${CUTOFF}'
sh 'python3 preprocessing.py ${CUTOFF}'
}
}
}
}
stage('archiveArtifacts'){
steps{
archiveArtifacts 'data.dev'
archiveArtifacts 'data.train'
archiveArtifacts 'data.test'
archiveArtifacts 'test.csv'
archiveArtifacts 'validate.csv'
archiveArtifacts 'train.csv'
}
}
}

36
preprocesing.py Normal file
View File

@ -0,0 +1,36 @@
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Fractions of the shuffled data at which the train/validate and
# validate/test boundaries fall (60% / 20% / 20%).
TRAIN_END = .6
VALIDATE_END = .8


def preprocess(sc, cutoff):
    """Clean and one-hot encode the raw frame, then keep the first rows.

    Args:
        sc: DataFrame containing at least the 'age', 'sex' and 'country'
            columns.
        cutoff: Number of leading rows to keep (passed to ``head``).

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Drop incomplete rows.
    sc = sc.dropna()
    # Categorisation: one-hot encode the categorical columns. The empty
    # prefix makes each category value itself the new column name.
    sc = pd.get_dummies(
        sc, columns=['age', 'sex', 'country'], prefix='', prefix_sep='')
    # The cutoff is applied after encoding, so the set of dummy columns
    # does not depend on which rows survive the cutoff.
    return sc.head(cutoff)


def split_dataset(sc, random_state=42):
    """Shuffle ``sc`` and split it into train, validate and test frames.

    Args:
        sc: DataFrame to split.
        random_state: Seed for the shuffle, so the split is reproducible.

    Returns:
        A ``(train, validate, test)`` tuple of DataFrames.
    """
    shuffled = sc.sample(frac=1, random_state=random_state)
    train_end = int(TRAIN_END * len(sc))
    validate_end = int(VALIDATE_END * len(sc))
    # Positional slicing yields the same pieces as
    # np.split(shuffled, [train_end, validate_end]) without passing a
    # DataFrame through NumPy, which recent pandas versions deprecate.
    return (shuffled.iloc[:train_end],
            shuffled.iloc[train_end:validate_end],
            shuffled.iloc[validate_end:])


def main(argv=None):
    """Run the preprocessing pipeline.

    Reads 'who_suicide_statistics.csv', writes 'train.csv',
    'validate.csv' and 'test.csv' to the working directory and prints
    the three frames.

    Args:
        argv: Command-line arguments without the program name; defaults
            to ``sys.argv[1:]``. The first item is the row cutoff.

    Raises:
        SystemExit: If the cutoff argument is missing or not an integer.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        raise SystemExit('usage: preprocesing.py CUTOFF')
    try:
        cutoff = int(args[0])
    except ValueError:
        raise SystemExit(
            'CUTOFF must be an integer, got %r' % (args[0],)) from None

    sc = preprocess(pd.read_csv('who_suicide_statistics.csv'), cutoff)
    # Split into train, validate and test.
    train, validate, test = split_dataset(sc)
    # Write the splits to files.
    train.to_csv('train.csv')
    validate.to_csv('validate.csv')
    test.to_csv('test.csv')
    print(train)
    print(validate)
    print(test)


if __name__ == '__main__':
    main()