fix jenkinsfile-create-dataset

This commit is contained in:
jakubknczny 2021-04-11 11:04:32 +02:00
parent 5370f573fa
commit 75da800223
5 changed files with 106 additions and 11 deletions

View File

@ -1,28 +1,58 @@
pipeline {
agent any
parameters {
agent none
/* parameters {
string(defaultValue: '6000',
description: 'numbers of data entries to keep in train.csv',
name: 'CUTOFF',
trim: true)
}
*/
stages {
stage('sh: Shell Script') {
stage('copy files') {
agent any
steps {
sh '''
cp ./lab1/script.sh .
cp ./lab1/python_script.py .
cp ./lab3/Dockerfile .
cp ./lab3/requirements.txt .
'''
}
}
/* stage('sh: Shell Script') {
steps {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
"KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
sh 'chmod +x ./lab2/script-zadanie-2-4.sh'
sh './lab2/script-zadanie-2-4.sh'
sh 'chmod +x ./lab2/script-zadanie-2-4-cutoff.sh'
sh ''' chmod +x ./lab2/script-zadanie-2-4.sh
./lab2/script-zadanie-2-4.sh
chmod +x ./lab2/script-zadanie-2-4-cutoff.sh'''
sh "./lab2/script-zadanie-2-4-cutoff.sh ${params.CUTOFF}"
}
}
}
stage('archive artifacts') {
steps {
archiveArtifacts 'train.csv'
archiveArtifacts 'test.csv'
archiveArtifacts 'valid.csv'
*/
stage('docker') {
agent {
dockerfile true
}
stages {
stage('test') {
steps {
sh 'cat /etc/issue'
}
}
stage('actual') {
steps {
sh './script.sh'
}
}
stage('archive artifacts') {
steps {
archiveArtifacts 'train.csv'
archiveArtifacts 'test.csv'
archiveArtifacts 'valid.csv'
}
}
}
}
}

19
lab3/Dockerfile Normal file
View File

@ -0,0 +1,19 @@
FROM ubuntu:latest
# Never block the build on an apt prompt.
ENV DEBIAN_FRONTEND=noninteractive
# apt-get (not apt) is the stable CLI for scripts; one layer for
# update+install+cleanup keeps the index fresh and the image small.
RUN apt-get update >>/dev/null && \
    apt-get install -y --no-install-recommends \
        apt-utils \
        python3.8 \
        python3-pip \
        unzip >>/dev/null && \
    rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy requirements first so editing the scripts does not invalidate
# the (slow) pip-install layer.
COPY ./requirements.txt ./
RUN pip3 install -r requirements.txt >>/dev/null
COPY ./python_script.py ./script.sh ./
RUN chmod +x script.sh
CMD ./script.sh

37
lab3/python_script.py Normal file
View File

@ -0,0 +1,37 @@
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Load the raw dataset; the final column 'stabf' is the class label,
# everything before it is treated as a numeric feature.
df = pd.read_csv('smart_grid_stability_augmented.csv')

# Standardise the feature columns (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the FULL dataset before the
# train/test split, so test statistics leak into the scaling —
# confirm this is intended before using the splits for evaluation.
feature_scaler = preprocessing.StandardScaler().fit(df.iloc[:, 0:-1])
scaled_features = feature_scaler.transform(df.iloc[:, 0:-1])
df_norm = pd.DataFrame(data=scaled_features, columns=df.columns[:-1])
df_norm['stabf'] = df['stabf']

# Stratified 80/10/10 split: carve off 20%, then halve that holdout
# into test and validation. Fixed random_state keeps it reproducible.
train, holdout = train_test_split(
    df_norm,
    test_size=0.2,
    random_state=42,
    stratify=df_norm['stabf'])
test, valid = train_test_split(
    holdout,
    test_size=0.5,
    random_state=42,
    stratify=holdout['stabf'])
def namestr(obj, namespace):
    """Return every key in ``namespace`` whose value IS ``obj`` (identity)."""
    return [name for name, value in namespace.items() if value is obj]
# Alias so the full frame is reported under the name 'dataset' below.
dataset = df_norm
for x in [dataset, train, test, valid]:
    # Recover the frame's variable name by identity-scanning globals().
    # Several names can alias one frame (dataset/df_norm, and the loop
    # variable 'x' itself), so keep only the longest names and print the
    # last of them. The loop variable must stay the one-letter 'x' so it
    # never wins this longest-name filter.
    matches = namestr(x, globals())
    longest = max(len(name) for name in matches)
    print([name for name in matches if len(name) == longest][-1])
    print("size:", len(x))
    print(x.describe(include='all'))
    print("class distribution", x.value_counts('stabf'))
    print('===============================================================')

3
lab3/requirements.txt Normal file
View File

@ -0,0 +1,3 @@
kaggle==1.5.12
pandas==1.1.2
scikit-learn==0.23.2

6
lab3/script.sh Normal file
View File

@ -0,0 +1,6 @@
#!/bin/bash
# Download the Kaggle dataset, unpack it and run the analysis script.
# Requires KAGGLE_USERNAME / KAGGLE_KEY in the environment.
kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1
# -o: overwrite without asking — without it a re-run hangs on unzip's
# interactive overwrite prompt (invisible behind the 2>&1 redirect).
unzip -o smart-grid-stability.zip >>/dev/null 2>&1
python3 python_script.py