From 78636092b9ab7c625cb9679e81e5b811f2728aee Mon Sep 17 00:00:00 2001
From: Norbert Walkowiak <norwal2@st.amu.edu.pl>
Date: Tue, 11 Apr 2023 22:01:05 +0200
Subject: [PATCH] update task: s487175-create-dataset

---
 Jenkinsfile                      |  52 ++++++++-----
 s487175-create-dataset-script.py | 125 +++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+), 20 deletions(-)
 create mode 100644 s487175-create-dataset-script.py

diff --git a/Jenkinsfile b/Jenkinsfile
index 0352999..8af4fa6 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,29 +1,41 @@
 pipeline {
    agent any
-   //Definijuemy parametry, które będzie można podać podczas wywoływania zadania
-   parameters {
-     string (
-         defaultValue: 'Hello World!',
-         description: 'Tekst, którym chcesz przywitać świat',
-         name: 'INPUT_TEXT',
-         trim: false
-        )
-   }
    stages {
-      stage('Hello') {
-         steps {
-            //Wypisz wartość parametru w konsoli (To nie jest polecenie bash, tylko groovy!)
-            echo "INPUT_TEXT: ${INPUT_TEXT}"
-            //Wywołaj w konsoli komendę "figlet", która generuje ASCI-art
-            sh "figlet \"${INPUT_TEXT}\" | tee output.txt"
-         }
+      stage('Preparation') {
+         // A declarative stage may only contain 'steps' (or nested stages);
+         // a bare 'properties' call here is rejected when the pipeline is
+         // validated, so it is wrapped in steps { script { ... } }.
+         steps {
+            script {
+               // Register the Kaggle API credentials as build parameters.
+               properties([parameters([
+                  string(defaultValue: 'nbrt10', description: 'Kaggle username', name: 'KAGGLE_USERNAME', trim: false),
+                  password(
+                     defaultValue: '',
+                     description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
+                     name: 'KAGGLE_KEY'
+                  )
+               ])])
+            }
+         }
       }
-      stage('Goodbye!') {
+      stage('checkout') {
          steps {
-            echo 'Goodbye!'
-            //Zarchiwizuj wynik
-            archiveArtifacts 'output.txt'
+                // Clone the master branch of the repository using the stored 'git-creds' credentials
+                checkout([$class: 'GitSCM', branches: [[name: '*/master']], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[credentialsId: 'git-creds', url: 'https://git.wmi.amu.edu.pl/s487175/ium_z487175.git']]])
+            }
+      }
+      stage('script') {
+         steps {
+               // Run the dataset script with an explicit interpreter (the file is not executable); the redirect must be inside the shell string so stdout lands in output.txt
+               sh 'python3 ./s487175-create-dataset-script.py > output.txt'
          }
       }
    }
+   post {
+    always {
+      // Archive output.txt as a build artifact; onlyIfSuccessful limits archiving to successful builds
+      archiveArtifacts artifacts: 'output.txt', onlyIfSuccessful: true
+    }
+  }
 }
\ No newline at end of file
diff --git a/s487175-create-dataset-script.py b/s487175-create-dataset-script.py
new file mode 100644
index 0000000..2fa2fe8
--- /dev/null
+++ b/s487175-create-dataset-script.py
@@ -0,0 +1,125 @@
+
+import pandas as pd
+diamonds = pd.read_csv('diamonds.csv')
+# Display the dataset (a bare expression is only rendered in a notebook/REPL cell, not when run as a script)
+diamonds
+
+# %%
+# Rename the first column to 'id'
+diamonds = diamonds.rename(columns={diamonds.columns[0]: 'id'})
+diamonds
+
+# %%
+# Convert the 'cut' labels to lower case
+
+diamonds['cut'] = diamonds['cut'].str.lower()
+diamonds
+
+# %%
+import sklearn
+from sklearn.model_selection import train_test_split
+
+# %%
+# Split the data into train/test/dev subsets in an 8:1:1 ratio (80% / 10% / 10%)
+# The random seed is fixed at 10 so the split is reproducible
+
+# 1. Split off the training set (80%) from the remaining 20% of the data
+diamonds_train, diamonds_test_dev = sklearn.model_selection.train_test_split(diamonds, test_size=0.2, random_state=10)
+
+# 2. Halve the remainder into a test set (10%) and a dev/validation set (10%)
+diamonds_test, diamonds_dev = train_test_split(diamonds_test_dev, test_size=0.5, random_state=10)
+
+
+# %%
+# Print the sizes (rows, columns) of the full dataset and of the train/test/dev subsets
+print("Rozmiar diamonds: ", diamonds.shape)
+print("Rozmiar diamonds_train: ", diamonds_train.shape)
+print("Rozmiar diamonds_test: ", diamonds_test.shape)
+print("Rozmiar diamonds_dev: ", diamonds_dev.shape)
+
+# %%
+# Summary statistics (mean, minimum, maximum, standard deviation, median) of the individual columns of the full dataset
+print(diamonds.describe())
+
+# %% Summary statistics of the training subset
+print(diamonds_train.describe())
+
+# %% Summary statistics of the test subset
+print(diamonds_test.describe())
+
+# %% Summary statistics of the dev (validation) subset
+print(diamonds_dev.describe())
+
+# %%
+# Print the number of examples per diamond 'cut' class; print() is needed because a bare expression emits nothing when run as a script
+print(diamonds_train["cut"].value_counts())
+
+# %% Class frequencies in the test subset
+print(diamonds_test["cut"].value_counts())
+
+# %% Class frequencies in the dev (validation) subset
+print(diamonds_dev["cut"].value_counts())
+
+# %% Bar chart of 'cut' frequencies in the full dataset
+import matplotlib.pyplot as plt
+# NOTE(review): plt.show() needs an interactive backend; on a headless CI agent the figures are presumably not displayed - consider plt.savefig
+plt.figure(figsize=(8, 6))
+diamonds['cut'].value_counts().plot(kind='bar')
+plt.title('Rozkład częstości dla szlifów diamentów dla zbioru diamonds')
+plt.xlabel('Szlif')
+plt.ylabel('Liczba wystąpień')
+plt.show()
+
+# %% Bar chart of 'cut' frequencies in the training subset
+import matplotlib.pyplot as plt
+
+plt.figure(figsize=(8, 6))
+diamonds_train['cut'].value_counts().plot(kind='bar')
+plt.title('Rozkład częstości dla szlifów diamentów dla zbioru diamonds tranującego')
+plt.xlabel('Szlif')
+plt.ylabel('Liczba wystąpień')
+plt.show()
+
+# %% Bar chart of 'cut' frequencies in the test subset
+import matplotlib.pyplot as plt
+
+plt.figure(figsize=(8, 6))
+diamonds_test['cut'].value_counts().plot(kind='bar')
+plt.title('Rozkład częstości dla szlifów diamentów dla zbioru diamonds testowego')
+plt.xlabel('Szlif')
+plt.ylabel('Liczba wystąpień')
+plt.show()
+
+# %% Bar chart of 'cut' frequencies in the dev (validation) subset
+import matplotlib.pyplot as plt
+
+plt.figure(figsize=(8, 6))
+diamonds_dev['cut'].value_counts().plot(kind='bar')
+plt.title('Rozkład częstości dla szlifów diamentów dla zbioru diamonds walidacyjnego')
+plt.xlabel('Szlif')
+plt.ylabel('Liczba wystąpień')
+plt.show()
+
+# %% Standard deviation of carat per cut (printed so it reaches stdout when run as a script)
+print(diamonds[["cut","carat"]].groupby("cut").std())
+
+# %% Bar chart of the mean carat per cut
+diamonds[["cut","carat"]].groupby("cut").mean().plot(kind="bar")
+
+# %%
+# Normalize the listed numeric columns to the range 0.0 - 1.0
+# (string data was already converted to lower case earlier in this script)
+# NOTE(review): only 'diamonds' is rescaled here; the train/test/dev frames were split off earlier and presumably stay unscaled - confirm intent
+from sklearn.preprocessing import MinMaxScaler
+scaler = MinMaxScaler()
+diamonds[['carat', 'depth', 'table', 'price', 'x', 'y', 'z']] = scaler.fit_transform(diamonds[['carat', 'depth', 'table', 'price', 'x', 'y', 'z']])
+
+# Display the dataset (only rendered in a notebook/REPL cell)
+diamonds
+
+# %%
+# Remove artifacts
+diamonds = diamonds.dropna() ## drop rows that contain at least one NULL or NaN value
+diamonds
+
+