jenkinsfiles, dockerfiles, Python scripts, full ML algorithm in .py and .ipynb (to split later)

Kamil Borowiec 2023-04-19 18:29:49 +02:00
parent da35f02aaf
commit f117d780c3
11 changed files with 2476 additions and 4082 deletions

Jenkinsfile (deleted)

@@ -1,29 +0,0 @@
pipeline {
    agent any
    // Define the parameters that can be supplied when the job is triggered
    parameters {
        string (
            defaultValue: 'Hello World!',
            description: 'Text you want to greet the world with',
            name: 'INPUT_TEXT',
            trim: false
        )
    }
    stages {
        stage('Hello') {
            steps {
                // Print the parameter value to the console (this is Groovy, not a bash command!)
                echo "INPUT_TEXT: ${INPUT_TEXT}"
                // Run the "figlet" command, which generates ASCII art
                sh "figlet \"${INPUT_TEXT}\" | tee output.txt"
            }
        }
        stage('Goodbye!') {
            steps {
                echo 'Goodbye!!'
                // Archive the result
                archiveArtifacts 'output.txt'
            }
        }
    }
}

Jenkinsfile for the create-dataset job (file name not shown in the diff)

@@ -0,0 +1,62 @@
pipeline {
    agent any
    parameters {
        string(
            defaultValue: 'Zalbidegoitia',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
        string(
            defaultValue: '1000',
            description: 'CUTOFF',
            name: 'CUTOFF',
            trim: false
        )
    }
    stages {
        stage('clear_all') {
            steps {
                sh 'rm -rf *'
            }
        }
        stage('Build') {
            steps {
                sh 'git clone https://git.wmi.amu.edu.pl/s487173/ium_z487173.git'
                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                         "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
                    sh 'kaggle datasets download -d vivovinco/20222023-football-player-stats'
                    sh 'unzip 20222023-football-player-stats.zip -d ./ium_z487173'
                    sh 'rm 20222023-football-player-stats.zip'
                    sh 'ls -a'
                    sh 'ls -a ./ium_z487173'
                }
            }
        }
        stage('Docker') {
            agent {
                dockerfile {
                    filename 'create_dataset.dockerfile'
                    dir 'ium_z487173'
                    reuseNode true
                }
            }
            steps {
                sh 'ls -a'
                sh 'python ./ium_z487173/create-dataset.py'
                archiveArtifacts 'X_test.csv'
                archiveArtifacts 'X_val.csv'
                archiveArtifacts 'X_train.csv'
                archiveArtifacts 'Y_test.csv'
                archiveArtifacts 'Y_val.csv'
                archiveArtifacts 'Y_train.csv'
            }
        }
    }
}
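
Note that Jenkins exposes build parameters to sh steps as environment variables of the same name, which is how CUTOFF reaches create-dataset.py inside the Docker stage without an explicit withEnv. A minimal sketch of the consuming side (the fallback default is an assumption for local runs, not part of the committed script):

import os

# CUTOFF is injected by Jenkins as an environment variable;
# the '1000' fallback is hypothetical, for running outside Jenkins.
cutoff = int(os.environ.get('CUTOFF', '1000'))
print(f'Will sample {cutoff} rows from the dataset')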

Jenkinsfile_dataset_stats (new file)

@@ -0,0 +1,45 @@
pipeline {
    agent any
    parameters {
        choice(
            choices: ['lastSuccessful()', 'lastCompleted()', 'latestSavedBuild()'],
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR'
        )
    }
    stages {
        stage('clear_all') {
            steps {
                sh 'rm -rf ium_z487173'
            }
        }
        stage('checkout') {
            steps {
                sh 'git clone https://git.wmi.amu.edu.pl/s487173/ium_z487173.git'
            }
        }
        stage('copy_artifacts') {
            steps {
                copyArtifacts filter: 'X_test.csv,X_val.csv,X_train.csv,Y_test.csv,Y_val.csv,Y_train.csv', fingerprintArtifacts: true, projectName: 'z-s487173-create-dataset', selector: workspace()
            }
        }
        stage('Docker') {
            agent {
                dockerfile {
                    filename 'dataset_stats.dockerfile'
                    dir 'ium_z487173'
                    reuseNode true
                }
            }
            steps {
                sh 'ls -a'
                sh 'python ./ium_z487173/dataset-stats.py'
                archiveArtifacts 'data_stats.txt'
            }
        }
        stage('Goodbye!') {
            steps {
                echo 'Goodbye!'
            }
        }
    }
}

create-dataset.py (new file)

@@ -0,0 +1,54 @@
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def get_simplified_position(value):
    # Collapse detailed position strings (e.g. 'MFFW') into four classes
    if value.startswith('MF'):
        return 0
    elif value.startswith('FW'):
        return 1
    elif value.startswith('DF'):
        return 2
    elif value.startswith('GK'):
        return 3
    else:
        return value


cutoff = int(os.environ['CUTOFF'])

# READ DATA
players_stats = pd.read_csv('./ium_z487173/2022-2023 Football Player Stats.csv',
                            engine='python',
                            encoding='ISO-8859-1',
                            sep=';')

# RANDOMLY SAMPLE CUTOFF ROWS FROM THE DATASET
players_stats = players_stats.sample(cutoff)

players_stats = players_stats.drop(players_stats.columns[[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11]], axis=1)
players_stats['Pos'] = players_stats['Pos'].apply(get_simplified_position)
player_stats_subset = players_stats.iloc[:, 0:13]

X = players_stats
y = pd.DataFrame(X.pop('Pos'))

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# 70% train, 15% validation, 15% test
X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)

X_train.to_csv('X_train.csv', index=False)
X_val.to_csv('X_val.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
Y_train.to_csv('Y_train.csv', index=False)
Y_val.to_csv('Y_val.csv', index=False)
Y_test.to_csv('Y_test.csv', index=False)
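
Since test_size=0.3 followed by a 50/50 split of the remainder yields a 70/15/15 train/validation/test split, the proportions can be sanity-checked locally. A minimal sketch, assuming the script above has already been run in the current directory:

import pandas as pd

# Count the rows in each feature split written by create-dataset.py.
sizes = {name: len(pd.read_csv(f'{name}.csv'))
         for name in ('X_train', 'X_val', 'X_test')}
total = sum(sizes.values())
for name, n in sizes.items():
    print(f'{name}: {n} rows ({n / total:.0%})')  # expect roughly 70/15/15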

create_dataset.dockerfile (new file; name inferred from the Jenkinsfile's dockerfile reference)

@@ -0,0 +1,5 @@
FROM continuumio/anaconda3:latest
RUN apt-get update && apt-get install -y
RUN pip install pandas

File diff suppressed because it is too large.

dataset-stats.py (new file)

@@ -0,0 +1,46 @@
import pandas as pd

X_train = pd.read_csv('./ium_z487173/X_train.csv',
                      engine='python',
                      encoding='ISO-8859-1',
                      sep=',')
X_val = pd.read_csv('./ium_z487173/X_val.csv',
                    engine='python',
                    encoding='ISO-8859-1',
                    sep=',')
X_test = pd.read_csv('./ium_z487173/X_test.csv',
                     engine='python',
                     encoding='ISO-8859-1',
                     sep=',')
Y_train = pd.read_csv('./ium_z487173/Y_train.csv',
                      engine='python',
                      encoding='ISO-8859-1',
                      sep=',')
Y_val = pd.read_csv('./ium_z487173/Y_val.csv',
                    engine='python',
                    encoding='ISO-8859-1',
                    sep=',')
Y_test = pd.read_csv('./ium_z487173/Y_test.csv',
                     engine='python',
                     encoding='ISO-8859-1',
                     sep=',')

with open("data_stats.txt", "w") as plik:
    plik.write("Y_test value counts:\n")
    plik.write(str(Y_test["Pos"].value_counts()) + "\n\n")
    plik.write("Y_train value counts:\n")
    plik.write(str(Y_train["Pos"].value_counts()) + "\n\n")
    plik.write("Y_val value counts:\n")
    plik.write(str(Y_val["Pos"].value_counts()) + "\n\n")
    plik.write("X_train stats:\n")
    plik.write(str(X_train.describe(include='all')) + "\n\n")
    plik.write("X_test stats:\n")
    plik.write(str(X_test.describe(include='all')) + "\n\n")
    plik.write("X_val stats:\n")
    plik.write(str(X_val.describe(include='all')) + "\n\n")
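
The six near-identical read_csv calls invite a loop; a hypothetical equivalent (same paths and output order as above, not part of this commit) could look like:

import pandas as pd

# Load all six splits with the same options as the original script.
names = ['X_train', 'X_val', 'X_test', 'Y_train', 'Y_val', 'Y_test']
frames = {name: pd.read_csv(f'./ium_z487173/{name}.csv',
                            engine='python', encoding='ISO-8859-1', sep=',')
          for name in names}

with open('data_stats.txt', 'w') as plik:
    # Class distribution of each label split
    for name in ('Y_test', 'Y_train', 'Y_val'):
        plik.write(f'{name} value counts:\n')
        plik.write(str(frames[name]['Pos'].value_counts()) + '\n\n')
    # Descriptive statistics of each feature split
    for name in ('X_train', 'X_test', 'X_val'):
        plik.write(f'{name} stats:\n')
        plik.write(str(frames[name].describe(include='all')) + '\n\n')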

dataset_stats.dockerfile (new file)

@@ -0,0 +1,6 @@
FROM continuumio/anaconda3:latest
RUN apt-get update && apt-get install -y
RUN pip install pandas
RUN pip install scikit-learn

File diff suppressed because one or more lines are too long

iumz_487173.py (new file)

@@ -0,0 +1,105 @@
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
from keras import utils

# Step 2: Download your Kaggle credentials JSON file and save it to a secure location on your system
# Step 3: Authenticate the Kaggle API client using your credentials JSON file
api = KaggleApi()
api.authenticate()

# Step 4: Download the dataset files using the Kaggle API client
api.dataset_download_files('vivovinco/20222023-football-player-stats', path='./data')

# Step 5: Extract the dataset files if they are compressed (e.g., in ZIP format)
with zipfile.ZipFile('./data/20222023-football-player-stats.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')


def plot_loss_tf(history):
    fig, ax = plt.subplots(1, 1, figsize=(4, 3))
    fig.canvas.toolbar_visible = False
    fig.canvas.header_visible = False
    fig.canvas.footer_visible = False
    ax.plot(history.history['loss'], label='loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('loss (cost)')
    ax.legend()
    ax.grid(True)
    plt.show()


def get_simplified_position(value):
    if value.startswith('MF'):
        return 0
    elif value.startswith('FW'):
        return 1
    elif value.startswith('DF'):
        return 2
    elif value.startswith('GK'):
        return 3
    else:
        return value


# Step 6: Access the dataset files in Python and start working with the data
players_stats = pd.read_csv('data/2022-2023 Football Player Stats.csv', engine='python', encoding='ISO-8859-1', sep=';')
players_stats = players_stats.drop(players_stats.columns[[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11]], axis=1)
players_stats['Pos'] = players_stats['Pos'].apply(get_simplified_position)
player_stats_subset = players_stats.iloc[:, 0:13]
player_stats_subset.describe(include='all')

X = players_stats
y = pd.DataFrame(X.pop('Pos'))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)

Y_train["Pos"].value_counts()
print(Y_test["Pos"].value_counts())
print(Y_val["Pos"].value_counts())
print(X_train.describe(include='all'))
print(X_test.describe(include='all'))
print(X_val.describe(include='all'))

# One-hot encode the four position classes for the softmax output layer
Y_test = utils.to_categorical(Y_test)
Y_train = utils.to_categorical(Y_train)
Y_val = utils.to_categorical(Y_val)

model = Sequential(
    [
        Dense(100, input_dim=X_train.shape[1], activation='relu'),
        Dense(70, activation='relu'),
        Dense(50, activation='relu'),
        Dense(4, activation='softmax')
    ], name="Players_model"
)
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'])

history = model.fit(
    X_train, Y_train,
    epochs=500,
    validation_data=(X_val, Y_val)
)
plot_loss_tf(history)

print('Evaluating...')
accuracy = model.evaluate(X_test, Y_test)[1]
print(f"accuracy: {accuracy}")