test

2022-04-03 00:21:21 +02:00 · 2022-04-03 00:21:21 +02:00 · a10a7feed9
commit a10a7feed9
parent a50ac85994
2 changed files with 63 additions and 7 deletions
--- a/9
+++ b/9
@ -22,14 +22,9 @@ pipeline {
            steps {
 				copyArtifacts fingerprintArtifacts: true, projectName: 's444501-create-dataset', selector: buildParameter('BUILD_SELECTOR')
 				
-				withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}","KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) {
-					sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
-					sh './download.sh'
-					}
-				
-				sh 'chmod u+x ./stats.sh'
+				//sh 'chmod u+x ./stats.sh'
 				//sh './stats.sh'
-				sh "python ium-data.py"
+				sh "python stats.py"
                archiveArtifacts 'stats.txt'
 			}
 		}
--- a/stats.py
+++ b/stats.py
@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+
+#from kaggle.api.kaggle_api_extended import KaggleApi
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+
+pd.set_option("display.max_rows", None)  # None lifts pandas' cap on the number of rows shown when a frame/series is printed
+
+
+def column_stat(analyzed_set, column_name):  # Build a text summary of basic statistics for one column.
+    rating_min = analyzed_set[column_name].min()  # assumes analyzed_set is a pandas DataFrame - TODO confirm
+    rating_max = analyzed_set[column_name].max()
+    rating_mean = round(analyzed_set[column_name].mean(), 3)  # rounded to 3 decimal places
+    rating_median = analyzed_set[column_name].median()  # not rounded
+    rating_std = round(analyzed_set[column_name].std(), 3)  # rounded to 3 decimal places
+    # Locals are named rating_*, but they describe whichever column column_name selects.
+    output = ''
+    # The report labels are Polish and are emitted as-is; English glosses follow each line.
+    output += f"Dla kolumny '{column_name}':\n"  # header: "For column '<name>':"
+    output += f"Minimum: {rating_min}\n"  # minimum
+    output += f"Maximum: {rating_max}\n"  # maximum
+    output += f"Średnia: {rating_mean}\n"  # mean
+    output += f"Mediana: {rating_median}\n"  # median
+    output += f"Odchylenie standardowe: {rating_std}\n"  # standard deviation
+    # Every line appended above ends with "\n", so the returned string ends with a newline.
+    return output
+
+
+# Load the three dataset splits; every file is read as latin-1.
+d_train = pd.read_csv('d_train.csv', encoding='latin-1')
+d_test = pd.read_csv('d_test.csv', encoding='latin-1')
+d_dev = pd.read_csv('d_dev.csv', encoding='latin-1')  # validation split; the report below reads it as `d_dev`
+
+# Statistics
+# `temp` accumulates the report text; it must be initialised before the first
+# `+=` below, otherwise the script stops with a NameError.
+temp = ''
+# The whole-dataset section is left out: this script loads only the three
+# splits, so there is no combined `disney` frame to report on.
+
+temp += f"Wielkość zbioru trenującego: {d_train.shape[0]}\n"  # training split: row count
+temp += f"Inne statystyki:\n"
+temp += column_stat(d_train, 'Rating')
+temp += '\n'
+
+temp += f"Wielkość zbioru walidującego: {d_dev.shape[0]}\n"  # validation split: row count
+temp += f"Inne statystyki:\n"
+temp += column_stat(d_dev, 'Rating')
+temp += '\n'
+
+temp += f"Wielkość zbioru testowego: {d_test.shape[0]}\n"  # test split: row count
+temp += f"Inne statystyki:\n"
+temp += column_stat(d_test, 'Rating')
+temp += '\n'
+
+with open('stats.txt', 'w+', encoding="utf-8") as f:  # utf-8: the report contains Polish diacritics
+    print(temp)  # echo the report to stdout as well as writing it to the file
+    f.write(temp)
+
+