diff --git a/Jenkinsfile2 b/Jenkinsfile2 index 899818e..6c73e9e 100644 --- a/Jenkinsfile2 +++ b/Jenkinsfile2 @@ -8,8 +8,7 @@ pipeline { steps { copyArtifacts fingerprintArtifacts: true, projectName: 's434732-create-dataset', selector: buildParameter('BUILD_SELECTOR') - sh "chmod 777 ./skrypt_zad2_stats.sh" - sh "./skrypt_zad2_stats.sh" + sh 'python3 "./skrypt_stat.py" > stats.txt' archiveArtifacts "stats.txt" } diff --git a/skrypt_stat.py b/skrypt_stat.py index 6e8715d..6e10c38 100644 --- a/skrypt_stat.py +++ b/skrypt_stat.py @@ -1,31 +1,12 @@ import pandas as pd -from sklearn.model_selection import train_test_split -from sklearn import preprocessing -import kaggle -kaggle.api.authenticate() +import pandas as pd -kaggle.api.dataset_download_files('martj42/international-football-results-from-1872-to-2017', path='.', unzip=True) +train = pd.read_csv('train.csv') +test = pd.read_csv('test.csv') +valid = pd.read_csv('valid.csv') -results = pd.read_csv('results.csv') - -#brak wierszy z NaN -results.dropna() - -#normalizacja itp -for collumn in ['home_team', 'away_team', 'tournament', 'city', 'country']: - results[collumn] = results[collumn].str.lower() - -# Podział zbioru 6:1:1 -train, test = train_test_split(results, test_size= 1 - 0.6) - -valid, test = train_test_split(test, test_size=0.5) - -print("All data: ", results.size) print("Train size: ", train.size) print("Test size: ", test.size) print("Validate size: ", valid.size) -print(results.describe(include='all')) -# sprawdzenie czy cały dataset oraz podział na podzbiory jest równy -print(train.size+test.size+valid.size)