Jenkinsfiles, Dockerfiles, Python scripts, full ML algorithm in .py and .ipynb (to split later)
This commit is contained in:
parent: da35f02aaf
commit: f117d780c3
Jenkinsfile (vendored, 29 lines removed)
@@ -1,29 +0,0 @@
pipeline {
    agent any
    // Define the parameters that can be supplied when the job is triggered
    parameters {
        string(
            defaultValue: 'Hello World!',
            description: 'The text you want to greet the world with',
            name: 'INPUT_TEXT',
            trim: false
        )
    }
    stages {
        stage('Hello') {
            steps {
                // Print the parameter value to the console (this is Groovy, not a bash command!)
                echo "INPUT_TEXT: ${INPUT_TEXT}"
                // Run the "figlet" command in the shell, which generates ASCII art
                sh "figlet \"${INPUT_TEXT}\" | tee output.txt"
            }
        }
        stage('Goodbye!') {
            steps {
                echo 'Goodbye!!'
                // Archive the result
                archiveArtifacts 'output.txt'
            }
        }
    }
}
Jenkinsfile_create_dataset (new file, 62 lines)
@@ -0,0 +1,62 @@
pipeline {
    agent any

    parameters {
        string(
            defaultValue: 'Zalbidegoitia',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from the kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
        string(
            defaultValue: '1000',
            description: 'CUTOFF',
            name: 'CUTOFF',
            trim: false
        )
    }
    stages {
        stage('clear_all') {
            steps {
                sh 'rm -rf *'
            }
        }
        stage('Build') {
            steps {
                sh 'git clone https://git.wmi.amu.edu.pl/s487173/ium_z487173.git'
                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                         "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
                    sh 'kaggle datasets download -d vivovinco/20222023-football-player-stats'
                    sh 'unzip 20222023-football-player-stats.zip -d ./ium_z487173'
                    sh 'rm 20222023-football-player-stats.zip'
                    sh 'ls -a'
                    sh 'ls -a ./ium_z487173'
                }
            }
        }
        stage('Docker') {
            agent {
                dockerfile {
                    filename 'create_dataset.dockerfile'
                    dir 'ium_z487173'
                    reuseNode true
                }
            }
            steps {
                sh 'ls -a'
                sh 'python ./ium_z487173/create-dataset.py'
                archiveArtifacts 'X_test.csv'
                archiveArtifacts 'X_val.csv'
                archiveArtifacts 'X_train.csv'
                archiveArtifacts 'Y_test.csv'
                archiveArtifacts 'Y_val.csv'
                archiveArtifacts 'Y_train.csv'
            }
        }
    }
}
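For reproducing the Build stage locally without Jenkins, a minimal Python sketch (placeholder credentials, not a real account) of how the Kaggle client reads the same KAGGLE_USERNAME / KAGGLE_KEY environment variables that the withEnv block above provides:

import os

# Credentials must be in the environment before the kaggle package is imported,
# because the package authenticates at import time
os.environ['KAGGLE_USERNAME'] = 'your-username'   # placeholder
os.environ['KAGGLE_KEY'] = 'your-api-token'       # value from kaggle.json

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
# download and unzip in one step, mirroring the pipeline's download + unzip shell commands
api.dataset_download_files('vivovinco/20222023-football-player-stats', path='.', unzip=True)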
Jenkinsfile_dataset_stats (new file, 45 lines)
@@ -0,0 +1,45 @@
pipeline {
    agent any
    parameters {
        choice(
            choices: ['lastSuccessful()', 'lastCompleted()', 'latestSavedBuild()'],
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR'
        )
    }
    stages {
        stage('clear_all') {
            steps {
                sh 'rm -rf ium_z487173'
            }
        }
        stage('checkout') {
            steps {
                sh 'git clone https://git.wmi.amu.edu.pl/s487173/ium_z487173.git'
            }
        }
        stage('copy_artifacts') {
            steps {
                // NOTE: the BUILD_SELECTOR parameter declared above is not used here; the workspace() selector is hard-coded
                copyArtifacts filter: 'X_test.csv,X_val.csv,X_train.csv,Y_test.csv,Y_val.csv,Y_train.csv', fingerprintArtifacts: true, projectName: 'z-s487173-create-dataset', selector: workspace()
            }
        }
        stage('Docker') {
            agent {
                dockerfile {
                    filename 'dataset_stats.dockerfile'
                    dir 'ium_z487173'
                    reuseNode true
                }
            }
            steps {
                sh 'ls -a'
                sh 'python ./ium_z487173/dataset-stats.py'
                archiveArtifacts 'data_stats.txt'
            }
        }
        stage('Goodbye!') {
            steps {
                echo 'Goodbye!'
            }
        }
    }
}
create-dataset.py (new file, 54 lines)
@@ -0,0 +1,54 @@
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def get_simplified_position(value):
    # Map the detailed position string to a class index; unknown values pass through unchanged
    if value.startswith('MF'):
        return 0
    elif value.startswith('FW'):
        return 1
    elif value.startswith('DF'):
        return 2
    elif value.startswith('GK'):
        return 3
    else:
        return value


cutoff = int(os.environ['CUTOFF'])

# READ DATA
players_stats = pd.read_csv('./ium_z487173/2022-2023 Football Player Stats.csv',
                            engine='python',
                            encoding='ISO-8859-1',
                            sep=';')

# CUT OFF DATASET TO CUTOFF LINES (random sample)
players_stats = players_stats.sample(cutoff)

# Drop identifier and metadata columns, keeping the numeric stats and 'Pos'
players_stats = players_stats.drop(players_stats.columns[[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11]], axis=1)

players_stats['Pos'] = players_stats['Pos'].apply(get_simplified_position)
player_stats_subset = players_stats.iloc[:, 0:13]  # NOTE: computed but not used below

X = players_stats
y = pd.DataFrame(X.pop('Pos'))

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Two-stage split: 70% train, then the remaining 30% halved into validation and test
X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)

X_train.to_csv('X_train.csv', index=False)
X_val.to_csv('X_val.csv', index=False)
X_test.to_csv('X_test.csv', index=False)

Y_train.to_csv('Y_train.csv', index=False)
Y_val.to_csv('Y_val.csv', index=False)
Y_test.to_csv('Y_test.csv', index=False)
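A quick sanity-check sketch (assumes create-dataset.py has already been run in the current directory, so the CSVs exist) for verifying that the two-stage split above yields roughly 70/15/15 proportions:

import pandas as pd

# Read back the feature splits written by create-dataset.py and report their shares
sizes = {name: len(pd.read_csv(f'{name}.csv')) for name in ('X_train', 'X_val', 'X_test')}
total = sum(sizes.values())
for name, n in sizes.items():
    print(f'{name}: {n} rows ({n / total:.0%})')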
create_dataset.dockerfile (new file, 5 lines)
@@ -0,0 +1,5 @@
FROM continuumio/anaconda3:latest

RUN apt-get update && apt-get install -y

RUN pip install pandas
(File diff suppressed because it is too large.)
(Binary file not shown.)
dataset-stats.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import pandas as pd

# Read back the six CSV splits produced by create-dataset.py
X_train = pd.read_csv('./ium_z487173/X_train.csv',
                      engine='python',
                      encoding='ISO-8859-1',
                      sep=',')

X_val = pd.read_csv('./ium_z487173/X_val.csv',
                    engine='python',
                    encoding='ISO-8859-1',
                    sep=',')

X_test = pd.read_csv('./ium_z487173/X_test.csv',
                     engine='python',
                     encoding='ISO-8859-1',
                     sep=',')

Y_train = pd.read_csv('./ium_z487173/Y_train.csv',
                      engine='python',
                      encoding='ISO-8859-1',
                      sep=',')

Y_val = pd.read_csv('./ium_z487173/Y_val.csv',
                    engine='python',
                    encoding='ISO-8859-1',
                    sep=',')

Y_test = pd.read_csv('./ium_z487173/Y_test.csv',
                     engine='python',
                     encoding='ISO-8859-1',
                     sep=',')

# Write class counts and descriptive statistics for each split to data_stats.txt
with open("data_stats.txt", "w") as plik:
    plik.write("Y_test value counts:\n")
    plik.write(str(Y_test["Pos"].value_counts()) + "\n\n")
    plik.write("Y_train value counts:\n")
    plik.write(str(Y_train["Pos"].value_counts()) + "\n\n")
    plik.write("Y_val value counts:\n")
    plik.write(str(Y_val["Pos"].value_counts()) + "\n\n")
    plik.write("X_train stats:\n")
    plik.write(str(X_train.describe(include='all')) + "\n\n")
    plik.write("X_test stats:\n")
    plik.write(str(X_test.describe(include='all')) + "\n\n")
    plik.write("X_val stats:\n")
    plik.write(str(X_val.describe(include='all')) + "\n\n")
dataset_stats.dockerfile (new file, 6 lines)
@@ -0,0 +1,6 @@
FROM continuumio/anaconda3:latest

RUN apt-get update && apt-get install -y

RUN pip install pandas
RUN pip install scikit-learn
iumz_487173.ipynb (3114 lines)
(File diff suppressed because one or more lines are too long.)
iumz_487173.py (new file, 105 lines)
@@ -0,0 +1,105 @@
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
from keras import utils


# Step 2: Download your Kaggle credentials JSON file and save it to a secure location on your system

# Step 3: Authenticate the Kaggle API client using your credentials JSON file
api = KaggleApi()
api.authenticate()

# Step 4: Download the dataset files using the Kaggle API client
api.dataset_download_files('vivovinco/20222023-football-player-stats', path='./data')

# Step 5: Extract the dataset files if they are compressed (e.g., in ZIP format)
with zipfile.ZipFile('./data/20222023-football-player-stats.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')


def plot_loss_tf(history):
    # Plot the training loss curve over epochs
    fig, ax = plt.subplots(1, 1, figsize=(4, 3))
    fig.canvas.toolbar_visible = False
    fig.canvas.header_visible = False
    fig.canvas.footer_visible = False
    ax.plot(history.history['loss'], label='loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('loss (cost)')
    ax.legend()
    ax.grid(True)
    plt.show()


def get_simplified_position(value):
    # Map the detailed position string to a class index; unknown values pass through unchanged
    if value.startswith('MF'):
        return 0
    elif value.startswith('FW'):
        return 1
    elif value.startswith('DF'):
        return 2
    elif value.startswith('GK'):
        return 3
    else:
        return value


# Step 6: Access the dataset files in Python and start working with the data
players_stats = pd.read_csv('data/2022-2023 Football Player Stats.csv', engine='python', encoding='ISO-8859-1', sep=';')
players_stats = players_stats.drop(players_stats.columns[[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11]], axis=1)

players_stats['Pos'] = players_stats['Pos'].apply(get_simplified_position)
player_stats_subset = players_stats.iloc[:, 0:13]
player_stats_subset.describe(include='all')

X = players_stats
y = pd.DataFrame(X.pop('Pos'))

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Two-stage split: 70% train, then the remaining 30% halved into validation and test
X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)
Y_train["Pos"].value_counts()

print(Y_test["Pos"].value_counts())
print(Y_val["Pos"].value_counts())
print(X_train.describe(include='all'))
print(X_test.describe(include='all'))
print(X_val.describe(include='all'))

# One-hot encode the four position classes for the softmax output layer
Y_test = utils.to_categorical(Y_test)
Y_train = utils.to_categorical(Y_train)
Y_val = utils.to_categorical(Y_val)

model = Sequential(
    [
        Dense(100, input_dim=X_train.shape[1], activation='relu'),
        Dense(70, activation='relu'),
        Dense(50, activation='relu'),
        Dense(4, activation='softmax')
    ], name="Players_model"
)

model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'])

history = model.fit(
    X_train, Y_train,
    epochs=500,
    validation_data=(X_val, Y_val)
)

plot_loss_tf(history)
print('Evaluating...')
accuracy = model.evaluate(X_test, Y_test)[1]
print(f"accuracy: {accuracy}")
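A small follow-up sketch for turning the model's softmax output back into readable position labels; the 0-3 indices follow get_simplified_position above, and the label names are assumed from the dataset's position abbreviations:

import numpy as np

# Inverse of the class indices assigned by get_simplified_position
POSITIONS = {0: 'MF', 1: 'FW', 2: 'DF', 3: 'GK'}

def decode_predictions(probs):
    # probs: array of shape (n_samples, 4), e.g. from model.predict(X_test)
    return [POSITIONS[i] for i in np.argmax(probs, axis=1)]

# e.g. decode_predictions(model.predict(X_test))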