docker-v3
This commit is contained in:
parent
a63041cfc1
commit
9cf9fc7d6c
@ -8,8 +8,7 @@ pipeline {
|
|||||||
steps {
|
steps {
|
||||||
copyArtifacts fingerprintArtifacts: true, projectName: 's434732-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
copyArtifacts fingerprintArtifacts: true, projectName: 's434732-create-dataset', selector: buildParameter('BUILD_SELECTOR')
|
||||||
|
|
||||||
sh "chmod 777 ./skrypt_zad2_stats.sh"
|
sh 'python3 "./skrypt_stat.py" > stats.txt'
|
||||||
sh "./skrypt_zad2_stats.sh"
|
|
||||||
|
|
||||||
archiveArtifacts "stats.txt"
|
archiveArtifacts "stats.txt"
|
||||||
}
|
}
|
||||||
|
@ -1,31 +1,12 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
from sklearn import preprocessing
|
|
||||||
import kaggle
|
|
||||||
|
|
||||||
kaggle.api.authenticate()
|
import pandas as pd
|
||||||
|
|
||||||
kaggle.api.dataset_download_files('martj42/international-football-results-from-1872-to-2017', path='.', unzip=True)
|
train = pd.read_csv('train.csv')
|
||||||
|
test = pd.read_csv('test.csv')
|
||||||
|
valid = pd.read_csv('valid.csv')
|
||||||
|
|
||||||
results = pd.read_csv('results.csv')
|
|
||||||
|
|
||||||
#brak wierszy z NaN
|
|
||||||
results.dropna()
|
|
||||||
|
|
||||||
#normalizacja itp
|
|
||||||
for collumn in ['home_team', 'away_team', 'tournament', 'city', 'country']:
|
|
||||||
results[collumn] = results[collumn].str.lower()
|
|
||||||
|
|
||||||
# Podział zbioru 6:1:1
|
|
||||||
train, test = train_test_split(results, test_size= 1 - 0.6)
|
|
||||||
|
|
||||||
valid, test = train_test_split(test, test_size=0.5)
|
|
||||||
|
|
||||||
print("All data: ", results.size)
|
|
||||||
print("Train size: ", train.size)
|
print("Train size: ", train.size)
|
||||||
print("Test size: ", test.size)
|
print("Test size: ", test.size)
|
||||||
print("Validate size: ", valid.size)
|
print("Validate size: ", valid.size)
|
||||||
print(results.describe(include='all'))
|
|
||||||
|
|
||||||
# sprawdzenie czy cały dataset oraz podział na podzbiory jest równy
|
|
||||||
print(train.size+test.size+valid.size)
|
|
||||||
|
Loading…
Reference in New Issue
Block a user