Jenkinsfiles, Dockerfiles, Python scripts, full ML algorithm in .py and .ipynb (to split later)
This commit is contained in:
parent: da35f02aaf
commit: f117d780c3
Jenkinsfile (vendored, 29 lines removed)
@@ -1,29 +0,0 @@
pipeline {
    agent any
    // Define the parameters that can be supplied when the job is triggered
    parameters {
        string(
            defaultValue: 'Hello World!',
            description: 'The text you want to greet the world with',
            name: 'INPUT_TEXT',
            trim: false
        )
    }
    stages {
        stage('Hello') {
            steps {
                // Print the parameter value to the console (this is Groovy, not a bash command!)
                echo "INPUT_TEXT: ${INPUT_TEXT}"
                // Run the "figlet" command in the shell, which generates ASCII art
                sh "figlet \"${INPUT_TEXT}\" | tee output.txt"
            }
        }
        stage('Goodbye!') {
            steps {
                echo 'Goodbye!!'
                // Archive the result
                archiveArtifacts 'output.txt'
            }
        }
    }
}
Jenkinsfile_create_dataset (new file, 62 lines)
@@ -0,0 +1,62 @@
pipeline {
    agent any

    parameters {
        string(
            defaultValue: 'Zalbidegoitia',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from the kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
        string(
            defaultValue: '1000',
            description: 'CUTOFF',
            name: 'CUTOFF',
            trim: false
        )
    }
    stages {
        stage('clear_all') {
            steps {
                sh 'rm -rf *'
            }
        }
        stage('Build') {
            steps {
                sh 'git clone https://git.wmi.amu.edu.pl/s487173/ium_z487173.git'
                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                         "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
                    sh 'kaggle datasets download -d vivovinco/20222023-football-player-stats'
                    sh 'unzip 20222023-football-player-stats.zip -d ./ium_z487173'
                    sh 'rm 20222023-football-player-stats.zip'
                    sh 'ls -a'
                    sh 'ls -a ./ium_z487173'
                }
            }
        }
        stage('Docker') {
            agent {
                dockerfile {
                    filename 'create_dataset.dockerfile'
                    dir 'ium_z487173'
                    reuseNode true
                }
            }
            steps {
                sh 'ls -a'
                sh 'python ./ium_z487173/create-dataset.py'
                archiveArtifacts 'X_test.csv'
                archiveArtifacts 'X_val.csv'
                archiveArtifacts 'X_train.csv'
                archiveArtifacts 'Y_test.csv'
                archiveArtifacts 'Y_val.csv'
                archiveArtifacts 'Y_train.csv'
            }
        }
    }
}
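For reproducing the Build stage locally without Jenkins, a minimal Python sketch (placeholder credentials, not a real account) of how the Kaggle client reads the same KAGGLE_USERNAME / KAGGLE_KEY environment variables that the withEnv block above provides:

import os

# Credentials must be in the environment before the kaggle package is imported,
# because the package authenticates at import time
os.environ['KAGGLE_USERNAME'] = 'your-username'   # placeholder
os.environ['KAGGLE_KEY'] = 'your-api-token'       # value from kaggle.json

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
# download and unzip in one step, mirroring the pipeline's download + unzip shell commands
api.dataset_download_files('vivovinco/20222023-football-player-stats', path='.', unzip=True)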
Jenkinsfile_dataset_stats (new file, 45 lines)
@@ -0,0 +1,45 @@
pipeline {
    agent any
    parameters {
        choice(
            choices: ['lastSuccessful()', 'lastCompleted()', 'latestSavedBuild()'],
            description: 'Which build to use for copying artifacts',
            name: 'BUILD_SELECTOR'
        )
    }
    stages {
        stage('clear_all') {
            steps {
                sh 'rm -rf ium_z487173'
            }
        }
        stage('checkout') {
            steps {
                sh 'git clone https://git.wmi.amu.edu.pl/s487173/ium_z487173.git'
            }
        }
        stage('copy_artifacts') {
            steps {
                // NOTE: the BUILD_SELECTOR parameter declared above is not used here; the workspace() selector is hard-coded
                copyArtifacts filter: 'X_test.csv,X_val.csv,X_train.csv,Y_test.csv,Y_val.csv,Y_train.csv', fingerprintArtifacts: true, projectName: 'z-s487173-create-dataset', selector: workspace()
            }
        }
        stage('Docker') {
            agent {
                dockerfile {
                    filename 'dataset_stats.dockerfile'
                    dir 'ium_z487173'
                    reuseNode true
                }
            }
            steps {
                sh 'ls -a'
                sh 'python ./ium_z487173/dataset-stats.py'
                archiveArtifacts 'data_stats.txt'
            }
        }
        stage('Goodbye!') {
            steps {
                echo 'Goodbye!'
            }
        }
    }
}
create-dataset.py (new file, 54 lines)
@@ -0,0 +1,54 @@
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def get_simplified_position(value):
    # Map the detailed position string to a class index; unknown values pass through unchanged
    if value.startswith('MF'):
        return 0
    elif value.startswith('FW'):
        return 1
    elif value.startswith('DF'):
        return 2
    elif value.startswith('GK'):
        return 3
    else:
        return value


cutoff = int(os.environ['CUTOFF'])

# READ DATA
players_stats = pd.read_csv('./ium_z487173/2022-2023 Football Player Stats.csv',
                            engine='python',
                            encoding='ISO-8859-1',
                            sep=';')

# CUT OFF DATASET TO CUTOFF LINES (random sample)
players_stats = players_stats.sample(cutoff)

# Drop identifier and metadata columns, keeping the numeric stats and 'Pos'
players_stats = players_stats.drop(players_stats.columns[[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11]], axis=1)

players_stats['Pos'] = players_stats['Pos'].apply(get_simplified_position)
player_stats_subset = players_stats.iloc[:, 0:13]  # NOTE: computed but not used below

X = players_stats
y = pd.DataFrame(X.pop('Pos'))

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Two-stage split: 70% train, then the remaining 30% halved into validation and test
X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)

X_train.to_csv('X_train.csv', index=False)
X_val.to_csv('X_val.csv', index=False)
X_test.to_csv('X_test.csv', index=False)

Y_train.to_csv('Y_train.csv', index=False)
Y_val.to_csv('Y_val.csv', index=False)
Y_test.to_csv('Y_test.csv', index=False)
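A quick sanity-check sketch (assumes create-dataset.py has already been run in the current directory, so the CSVs exist) for verifying that the two-stage split above yields roughly 70/15/15 proportions:

import pandas as pd

# Read back the feature splits written by create-dataset.py and report their shares
sizes = {name: len(pd.read_csv(f'{name}.csv')) for name in ('X_train', 'X_val', 'X_test')}
total = sum(sizes.values())
for name, n in sizes.items():
    print(f'{name}: {n} rows ({n / total:.0%})')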
create_dataset.dockerfile (new file, 5 lines)
@@ -0,0 +1,5 @@
FROM continuumio/anaconda3:latest

RUN apt-get update && apt-get install -y

RUN pip install pandas
(File diff suppressed because it is too large.)
(Binary file not shown.)
dataset-stats.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import pandas as pd

# Read back the six CSV splits produced by create-dataset.py
X_train = pd.read_csv('./ium_z487173/X_train.csv',
                      engine='python',
                      encoding='ISO-8859-1',
                      sep=',')

X_val = pd.read_csv('./ium_z487173/X_val.csv',
                    engine='python',
                    encoding='ISO-8859-1',
                    sep=',')

X_test = pd.read_csv('./ium_z487173/X_test.csv',
                     engine='python',
                     encoding='ISO-8859-1',
                     sep=',')

Y_train = pd.read_csv('./ium_z487173/Y_train.csv',
                      engine='python',
                      encoding='ISO-8859-1',
                      sep=',')

Y_val = pd.read_csv('./ium_z487173/Y_val.csv',
                    engine='python',
                    encoding='ISO-8859-1',
                    sep=',')

Y_test = pd.read_csv('./ium_z487173/Y_test.csv',
                     engine='python',
                     encoding='ISO-8859-1',
                     sep=',')

# Write class counts and descriptive statistics for each split to data_stats.txt
with open("data_stats.txt", "w") as plik:
    plik.write("Y_test value counts:\n")
    plik.write(str(Y_test["Pos"].value_counts()) + "\n\n")
    plik.write("Y_train value counts:\n")
    plik.write(str(Y_train["Pos"].value_counts()) + "\n\n")
    plik.write("Y_val value counts:\n")
    plik.write(str(Y_val["Pos"].value_counts()) + "\n\n")
    plik.write("X_train stats:\n")
    plik.write(str(X_train.describe(include='all')) + "\n\n")
    plik.write("X_test stats:\n")
    plik.write(str(X_test.describe(include='all')) + "\n\n")
    plik.write("X_val stats:\n")
    plik.write(str(X_val.describe(include='all')) + "\n\n")
dataset_stats.dockerfile (new file, 6 lines)
@@ -0,0 +1,6 @@
FROM continuumio/anaconda3:latest

RUN apt-get update && apt-get install -y

RUN pip install pandas
RUN pip install scikit-learn
iumz_487173.ipynb (3114 lines)
(File diff suppressed because one or more lines are too long.)
iumz_487173.py (new file, 105 lines)
@@ -0,0 +1,105 @@
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
from keras import utils


# Step 2: Download your Kaggle credentials JSON file and save it to a secure location on your system

# Step 3: Authenticate the Kaggle API client using your credentials JSON file
api = KaggleApi()
api.authenticate()

# Step 4: Download the dataset files using the Kaggle API client
api.dataset_download_files('vivovinco/20222023-football-player-stats', path='./data')

# Step 5: Extract the dataset files if they are compressed (e.g., in ZIP format)
with zipfile.ZipFile('./data/20222023-football-player-stats.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')


def plot_loss_tf(history):
    # Plot the training loss curve over epochs
    fig, ax = plt.subplots(1, 1, figsize=(4, 3))
    fig.canvas.toolbar_visible = False
    fig.canvas.header_visible = False
    fig.canvas.footer_visible = False
    ax.plot(history.history['loss'], label='loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('loss (cost)')
    ax.legend()
    ax.grid(True)
    plt.show()


def get_simplified_position(value):
    # Map the detailed position string to a class index; unknown values pass through unchanged
    if value.startswith('MF'):
        return 0
    elif value.startswith('FW'):
        return 1
    elif value.startswith('DF'):
        return 2
    elif value.startswith('GK'):
        return 3
    else:
        return value


# Step 6: Access the dataset files in Python and start working with the data
players_stats = pd.read_csv('data/2022-2023 Football Player Stats.csv', engine='python', encoding='ISO-8859-1', sep=';')
players_stats = players_stats.drop(players_stats.columns[[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11]], axis=1)

players_stats['Pos'] = players_stats['Pos'].apply(get_simplified_position)
player_stats_subset = players_stats.iloc[:, 0:13]
player_stats_subset.describe(include='all')

X = players_stats
y = pd.DataFrame(X.pop('Pos'))

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Two-stage split: 70% train, then the remaining 30% halved into validation and test
X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)
Y_train["Pos"].value_counts()

print(Y_test["Pos"].value_counts())
print(Y_val["Pos"].value_counts())
print(X_train.describe(include='all'))
print(X_test.describe(include='all'))
print(X_val.describe(include='all'))

# One-hot encode the four position classes for the softmax output layer
Y_test = utils.to_categorical(Y_test)
Y_train = utils.to_categorical(Y_train)
Y_val = utils.to_categorical(Y_val)

model = Sequential(
    [
        Dense(100, input_dim=X_train.shape[1], activation='relu'),
        Dense(70, activation='relu'),
        Dense(50, activation='relu'),
        Dense(4, activation='softmax')
    ], name="Players_model"
)

model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'])

history = model.fit(
    X_train, Y_train,
    epochs=500,
    validation_data=(X_val, Y_val)
)

plot_loss_tf(history)
print('Evaluating...')
accuracy = model.evaluate(X_test, Y_test)[1]
print(f"accuracy: {accuracy}")
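A small follow-up sketch for turning the model's softmax output back into readable position labels; the 0-3 indices follow get_simplified_position above, and the label names are assumed from the dataset's position abbreviations:

import numpy as np

# Inverse of the class indices assigned by get_simplified_position
POSITIONS = {0: 'MF', 1: 'FW', 2: 'DF', 3: 'GK'}

def decode_predictions(probs):
    # probs: array of shape (n_samples, 4), e.g. from model.predict(X_test)
    return [POSITIONS[i] for i in np.argmax(probs, axis=1)]

# e.g. decode_predictions(model.predict(X_test))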