This commit is contained in:
piotrwrzodak 2023-05-12 00:08:16 +02:00
parent f7468f52b7
commit f1351c0b16
3 changed files with 82 additions and 2 deletions

View File

@ -0,0 +1,39 @@
// Declarative pipeline: fetch the dataset artifacts produced by the upstream
// job, compute summary statistics inside a Docker container and archive the
// resulting stats.csv.
pipeline {
    agent any

    parameters {
        // Selector expression consumed by the 'copy_artifacts' stage below.
        choice(
            name: 'BUILD_SELECTOR',
            description: 'Which build to use for copying artifacts',
            choices: ['lastSuccessful()', 'lastCompleted()', 'latestSavedBuild()']
        )
    }

    stages {
        stage('clear') {
            steps {
                // Wipe leftovers from previous runs; the shell glob does not
                // match dotfiles, so hidden entries stay in the workspace.
                sh 'rm -rf *'
            }
        }

        stage('checkout') {
            steps {
                checkout scm
            }
        }

        stage('copy_artifacts') {
            steps {
                // Pull artifacts from the dataset-creation job, using the build
                // picked through the BUILD_SELECTOR parameter.
                copyArtifacts(
                    projectName: 'z-s444510-create-dataset',
                    selector: buildParameter('BUILD_SELECTOR'),
                    fingerprintArtifacts: true
                )
            }
        }

        stage('Docker') {
            agent {
                docker {
                    image 'piotrwrzodak/ium:2'
                    // Run the container on the node already allocated by the
                    // top-level agent, so the files copied above are visible.
                    reuseNode true
                }
            }
            steps {
                // List the workspace contents in the build log before running.
                sh 'ls -a'
                sh 'python3 dataset-stats-new.py'
                archiveArtifacts(artifacts: 'stats.csv', fingerprint: true)
            }
        }
    }
}

View File

@ -3,10 +3,10 @@ import os
import numpy as np
cutoff = 10
cutoff = int(os.environ['CUTOFF'])
data = pd.read_csv('./barcelona_weekends.csv')
data = data.sample(cutoff)
data = data[:cutoff]
data = data.iloc[:, 1:]
train_set, dev_set, test_set = np.split(data.sample(frac=1, random_state=42),

41
dataset-stats-new.py Normal file
View File

@ -0,0 +1,41 @@
import pandas as pd
import numpy as np
def calculate_stats(data, col_name):
    """Summarise one column of *data*.

    Args:
        data: Table-like object supporting ``len()`` and ``data[col_name]``
            (the script passes pandas DataFrames read from CSV).
        col_name: Name of the column to summarise.

    Returns:
        A five-element list: ``len(data)`` followed by the column's minimum,
        maximum, standard deviation and median, each computed with the
        corresponding NumPy function.
    """
    column = data[col_name]
    row_count = len(data)
    # Same NumPy reducers, applied in the order the result list expects.
    reducers = (np.min, np.max, np.std, np.median)
    return [row_count, *(reduce(column) for reduce in reducers)]
def calculate_value_counts(data, col_name):
    """Return the result of ``value_counts()`` on column *col_name* of *data*.

    Args:
        data: Object whose ``data[col_name]`` exposes ``value_counts()``
            (the script passes pandas DataFrames read from CSV).
        col_name: Name of the column to tally.
    """
    column = data[col_name]
    return column.value_counts()
if __name__ == '__main__':
    # Load the three dataset splits from the current working directory.
    frames = {
        'train': pd.read_csv('barcelona_weekends.train.csv'),
        'dev': pd.read_csv('barcelona_weekends.dev.csv'),
        'test': pd.read_csv('barcelona_weekends.test.csv'),
    }

    # One row of 'realSum' statistics per split, in train/dev/test order.
    stat_rows = [calculate_stats(frame, 'realSum') for frame in frames.values()]
    summary = pd.DataFrame(
        data=np.array(stat_rows),
        index=list(frames),
        columns=['size', 'minimum', 'maximum', 'standard deviation', 'median'],
    )
    print(summary)

    # Per-split value counts of 'person_capacity', each followed by a blank line.
    for heading, key in (('Train', 'train'), ('Dev', 'dev'), ('Test', 'test')):
        print(heading, calculate_value_counts(frames[key], 'person_capacity'), end='\n\n')

    # The row labels (index) are not written to the CSV.
    summary.to_csv('stats.csv', index=False)