4.3.2 v1
This commit is contained in:
parent
f7468f52b7
commit
f1351c0b16
39
Jenkinsfile-dataset-stats-docker-hub
Normal file
39
Jenkinsfile-dataset-stats-docker-hub
Normal file
@ -0,0 +1,39 @@
|
||||
// Declarative pipeline: copies the dataset artifacts from another job and
// runs the statistics script on them inside a Docker container.
pipeline {
    agent any

    parameters {
        // Build selector handed to copyArtifacts in the 'copy_artifacts' stage.
        choice(
            name: 'BUILD_SELECTOR',
            description: 'Which build to use for copying artifacts',
            choices: ['lastSuccessful()', 'lastCompleted()', 'latestSavedBuild()']
        )
    }

    stages {
        stage('clear') {
            steps {
                // Remove non-hidden files from the current directory
                // (the shell glob `*` does not match dotfiles).
                sh 'rm -rf *'
            }
        }

        stage('checkout') {
            steps {
                checkout scm
            }
        }

        stage('copy_artifacts') {
            steps {
                copyArtifacts(
                    projectName: 'z-s444510-create-dataset',
                    selector: buildParameter('BUILD_SELECTOR'),
                    fingerprintArtifacts: true
                )
            }
        }

        stage('Docker') {
            agent {
                docker {
                    image 'piotrwrzodak/ium:2'
                    // Reuse the top-level agent's node and workspace so the
                    // files fetched by the earlier stages are available here.
                    reuseNode true
                }
            }
            steps {
                sh 'ls -a'
                sh 'python3 dataset-stats-new.py'
                archiveArtifacts artifacts: 'stats.csv', fingerprint: true
            }
        }
    }
}
|
@ -3,10 +3,10 @@ import os
|
||||
import numpy as np
|
||||
|
||||
|
||||
cutoff = 10
|
||||
cutoff = int(os.environ['CUTOFF'])
|
||||
|
||||
data = pd.read_csv('./barcelona_weekends.csv')
|
||||
data = data.sample(cutoff)
|
||||
data = data[:cutoff]
|
||||
data = data.iloc[:, 1:]
|
||||
|
||||
train_set, dev_set, test_set = np.split(data.sample(frac=1, random_state=42),
|
||||
|
41
dataset-stats-new.py
Normal file
41
dataset-stats-new.py
Normal file
@ -0,0 +1,41 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
|
||||
def calculate_stats(data, col_name):
    """Summarise one column of *data*.

    Returns a list ``[row count, minimum, maximum, standard deviation,
    median]``, where the row count is ``len(data)`` and the remaining
    entries come from the NumPy reducers applied to ``data[col_name]``.
    """
    column = data[col_name]
    # Order matters: callers label the entries positionally.
    reducers = (np.min, np.max, np.std, np.median)
    return [len(data), *(reducer(column) for reducer in reducers)]
|
||||
|
||||
|
||||
def calculate_value_counts(data, col_name):
    """Return the occurrence count of each distinct value in *col_name*."""
    column = data[col_name]
    counts = column.value_counts()
    return counts
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Load the train/dev/test splits from the working directory.
    train = pd.read_csv('barcelona_weekends.train.csv')
    dev = pd.read_csv('barcelona_weekends.dev.csv')
    test = pd.read_csv('barcelona_weekends.test.csv')

    # Summary statistics of the 'realSum' column for each split.
    train_set_stats = calculate_stats(train, 'realSum')
    dev_set_stats = calculate_stats(dev, 'realSum')
    test_set_stats = calculate_stats(test, 'realSum')

    # One row per split, one column per statistic (same order as the
    # list returned by calculate_stats).
    columns = ['size', 'minimum', 'maximum', 'standard deviation', 'median']
    rows = ['train', 'dev', 'test']
    df = pd.DataFrame(
        data=np.array([train_set_stats, dev_set_stats, test_set_stats]),
        index=rows,
        columns=columns)
    print(df)

    # Distribution of 'person_capacity' values within each split.
    print('Train', calculate_value_counts(train, 'person_capacity'), end='\n\n')
    print('Dev', calculate_value_counts(dev, 'person_capacity'), end='\n\n')
    print('Test', calculate_value_counts(test, 'person_capacity'), end='\n\n')

    # Write the row labels too: with index=False the file contained three
    # anonymous rows and nothing said which split each one described.
    df.to_csv('stats.csv', index_label='split')
|
Loading…
Reference in New Issue
Block a user