Compare commits

...

11 Commits

Author SHA1 Message Date
d2ae1e0b32 added model evaluation 2024-05-09 02:33:28 +02:00
db5d87f034 added model evaluation 2024-05-09 02:30:19 +02:00
79947a5811 added model evaluation 2024-05-09 02:28:21 +02:00
e3b48a0364 added model evaluation 2024-05-09 02:23:04 +02:00
63f9975668 added model evaluation 2024-05-09 02:20:22 +02:00
0886815c28 added model evaluation 2024-05-09 02:17:29 +02:00
a987608675 added model evaluation 2024-05-09 02:15:19 +02:00
fbd021ed51 added model evaluation 2024-05-09 02:13:12 +02:00
a0f4bcf55a added model evaluation 2024-05-09 02:11:37 +02:00
b79467e2bf added model evaluation 2024-05-09 02:06:04 +02:00
1fb8564e19 added model evaluation 2024-05-09 01:56:58 +02:00
23 changed files with 1488 additions and 1304404 deletions

View File

@ -0,0 +1,17 @@
# Image used to run use_model.py (model evaluation).
FROM ubuntu:latest

# Install Python and the download tools, then drop the apt package lists.
RUN apt-get update && \
    apt-get install -y \
        python3 \
        python3-pip \
        wget \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# matplotlib is required because use_model.py imports matplotlib.pyplot.
# NOTE(review): ubuntu:latest is unpinned; newer Ubuntu releases may reject a
# system-wide `pip3 install` (externally managed environment) - confirm.
RUN pip3 install pandas scikit-learn requests numpy matplotlib

WORKDIR /app

COPY use_model.py /app/

RUN chmod +x use_model.py

View File

@ -0,0 +1,43 @@
// Evaluation pipeline: runs in an agent built from the repository Dockerfile,
// copies artifacts from the training job, runs use_model.py and archives the
// workspace files.
pipeline {
    agent {
        dockerfile true
    }

    triggers {
        // Start an evaluation after every successful training build.
        upstream(upstreamProjects: 's464953-training/training', threshold: hudson.model.Result.SUCCESS)
    }

    parameters {
        buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')
        gitParameter branchFilter: 'origin/(.*)', defaultValue: 'training', name: 'BRANCH', type: 'PT_BRANCH'
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Copy Training Artifacts') {
            steps {
                copyArtifacts filter: 'artifacts/*', projectName: 's464953-training/' + params.BRANCH, selector: buildParameter('BUILD_SELECTOR')
            }
        }

        stage('Copy Evaluation Artifacts') {
            steps {
                // metrics_df.csv holds the metric history and may not exist yet;
                // use_model.py creates it when missing, so the copy is optional.
                // NOTE(review): the history is written by this evaluation job -
                // confirm that the training project is the right source.
                copyArtifacts filter: 'metrics_df.csv', projectName: 's464953-training/' + params.BRANCH, selector: buildParameter('BUILD_SELECTOR'), optional: true
            }
        }

        stage('Run Script') {
            steps {
                // The build number becomes the "Build number" column of the metrics file.
                sh "python3 /app/use_model.py ${currentBuild.number}"
            }
        }

        stage('Archive Artifacts') {
            steps {
                archiveArtifacts artifacts: '*', onlyIfSuccessful: true
            }
        }
    }
}

View File

@ -0,0 +1,77 @@
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
import sys
import os
import matplotlib.pyplot as plt
def calculate_metrics(result):
    """Compute RMSE, macro F1 and accuracy and append them to metrics_df.csv.

    Args:
        result: DataFrame with a "Real" column (true labels) and a
            "Predictions" column (model output).

    The build number is read from ``sys.argv[1]``. If ``metrics_df.csv``
    already exists in the working directory the new row is appended to it,
    otherwise the file is created with a single row.
    """
    rmse = np.sqrt(mean_squared_error(result["Real"], result["Predictions"]))
    f1 = f1_score(result["Real"], result["Predictions"], average='macro')
    accuracy = accuracy_score(result["Real"], result["Predictions"])

    filename = 'metrics_df.csv'
    new_row = pd.DataFrame({
        'Build number': [sys.argv[1]],
        'RMSE': [rmse],
        'F1 Score': [f1],
        'Accuracy': [accuracy],
    })

    if os.path.exists(filename):
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        metrics_df = pd.concat([pd.read_csv(filename), new_row], ignore_index=True)
    else:
        metrics_df = new_row

    metrics_df.to_csv(filename, index=False)
def create_plots():
    """Plot each metric from metrics_df.csv against the build number.

    Writes Accuracy_plot.png, F1_score_plot.png and RMSE_plot.png to the
    working directory. Every metric gets its own figure, which is closed
    after saving so that curves from one metric do not leak into the next
    image.
    """
    metrics_df = pd.read_csv("metrics_df.csv")
    builds = metrics_df["Build number"]
    # One tick per build number between the first and the last recorded build.
    ticks = range(min(builds), max(builds) + 1)

    plots = (
        ("Accuracy", "Accuracy_plot.png"),
        ("F1 Score", "F1_score_plot.png"),
        ("RMSE", "RMSE_plot.png"),
    )
    for metric, output_file in plots:
        plt.figure()
        plt.plot(builds, metrics_df[metric])
        plt.xlabel("Build Number")
        plt.ylabel(metric)
        plt.title(f"{metric} of the model over time")
        plt.xticks(ticks)
        # Save without plt.show(): showing first can leave an empty canvas to
        # save, and no display is needed to write the image.
        plt.savefig(output_file)
        plt.close()
# Script body: load the pickled model, predict genres for the test set,
# store the predictions and update the metric history and plots.
np.set_printoptions(threshold=20)

# NOTE: pickle.load executes code embedded in the file; only load model
# files that come from a trusted source.
file_path = 'model.pkl'
with open(file_path, 'rb') as model_file:
    model = pickle.load(model_file)
print("Model został wczytany z pliku:", file_path)

test_df = pd.read_csv("artifacts/docker_test_dataset.csv")

# Separate the target column (as a flat array) from the features.
Y_test = np.ravel(test_df[['playlist_genre']])
X_test = test_df.drop(columns='playlist_genre')

# NOTE(review): the scaler is fitted on the test data itself, not on the
# statistics of the training data - confirm this is intended.
numeric_columns = X_test.select_dtypes(include=['int', 'float']).columns
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test[numeric_columns])

Y_pred = model.predict(X_test_scaled)

result = pd.DataFrame({'Predictions': Y_pred, "Real": Y_test})
result.to_csv("spotify_genre_predictions.csv", index=False)

calculate_metrics(result)
create_plots()

File diff suppressed because it is too large Load Diff

View File

@ -1,26 +1,17 @@
FROM ubuntu:latest FROM ubuntu:latest
ENV KAGGLE_USERNAME=gulczas
ENV KAGGLE_KEY=default_key
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y \ apt-get install -y \
python3 \ python3 \
python3-pip \ python3-pip \
wget \ wget \
unzip \ unzip \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN pip3 install pandas scikit-learn requests kaggle numpy RUN pip3 install pandas scikit-learn requests numpy matplotlib
WORKDIR /app WORKDIR /app
COPY model_creator.py /app/
COPY use_model.py /app/ COPY use_model.py /app/
COPY run_py_scripts.sh /app/
RUN chmod +x model_creator.py
RUN chmod +x use_model.py RUN chmod +x use_model.py
CMD ["bash", "run_py_scripts.sh"]

42
Jenkinsfile vendored
View File

@ -1,11 +1,16 @@
pipeline { pipeline {
agent any agent {
dockerfile true
}
triggers {
upstream(upstreamProjects: 's464953-training/training', threshold: hudson.model.Result.SUCCESS)
}
parameters { parameters {
string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username') buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')
password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key') gitParameter branchFilter: 'origin/(.*)', defaultValue: 'training', name: 'BRANCH', type: 'PT_BRANCH'
string(name: 'CUTOFF', defaultValue: '90', description: 'Number of rows to cut') }
}
stages { stages {
stage('Clone Repository') { stage('Clone Repository') {
@ -13,28 +18,25 @@ pipeline {
git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git' git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
} }
} }
stage('Cleanup Artifacts') { stage('Copy Training Artifacts') {
steps { steps {
script { copyArtifacts filter: 'artifacts/*', projectName: 's464953-training/' + params.BRANCH, selector: buildParameter('BUILD_SELECTOR')
sh 'rm -rf artifacts' }
}
}
} }
stage('Copy Evaluation Artifacts') {
steps {
copyArtifacts filter: 'metrics_df.csv', projectName: '_s464953-evaluation/evaluation', selector: buildParameter('BUILD_SELECTOR'), optional: true
}
}
stage('Run Script') { stage('Run Script') {
steps { steps {
script { sh "python3 /app/use_model.py ${currentBuild.number}"
withEnv([
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"])
{
sh "bash ./download_dataset.sh ${params.CUTOFF}"
}
}
} }
} }
stage('Archive Artifacts') { stage('Archive Artifacts') {
steps { steps {
archiveArtifacts artifacts: 'artifacts/*', onlyIfSuccessful: true archiveArtifacts artifacts: 'metrics_df.csv, spotify_genre_predictions.csv, F1_score_plot.png, RMSE_plot.png, Accuracy_plot.png', onlyIfSuccessful: true
} }
} }
} }

View File

@ -1,57 +0,0 @@
// Builds the s464953 image from the repository, runs it with the workspace
// mounted at /app and archives stats.txt.
pipeline {
    agent any

    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Stop and remove existing container') {
            steps {
                script {
                    // "|| true": a missing container must not fail the build.
                    sh "docker stop s464953 || true"
                    sh "docker rm s464953 || true"
                }
            }
        }

        stage('Build Docker image') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // Single-quoted so Groovy does not interpolate the secret;
                        // "--build-arg NAME" without a value takes it from the environment.
                        sh 'docker build --build-arg KAGGLE_USERNAME --build-arg KAGGLE_KEY -t s464953 .'
                    }
                }
            }
        }

        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // "-e NAME" forwards the variable from the environment, so the
                        // key never appears on the command line.
                        sh 'docker run --name s464953 -e KAGGLE_USERNAME -e KAGGLE_KEY -v "$WORKSPACE":/app s464953'
                    }
                }
            }
        }

        stage('Archive stats.txt artifact') {
            steps {
                archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
            }
        }
    }
}

View File

@ -1,44 +0,0 @@
// Runs the published michalgulczynski/ium_s464953:1.0 image with the workspace
// mounted at /app and archives stats.txt.
pipeline {
    agent any

    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Stop and remove existing container') {
            steps {
                script {
                    // "|| true": a missing container must not fail the build.
                    sh "docker stop s464953 || true"
                    sh "docker rm s464953 || true"
                }
            }
        }

        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // Single-quoted so Groovy does not interpolate the secret;
                        // "-e NAME" forwards the variable from the environment.
                        sh 'docker run --name s464953 -e KAGGLE_USERNAME -e KAGGLE_KEY -v "$WORKSPACE":/app michalgulczynski/ium_s464953:1.0'
                    }
                }
            }
        }

        stage('Archive stats.txt artifact') {
            steps {
                archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
            }
        }
    }
}

View File

@ -1,57 +0,0 @@
// Builds the s464953 image from the repository, runs it with the workspace
// mounted at /app and archives model.pkl.
pipeline {
    agent any

    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Stop and remove existing container') {
            steps {
                script {
                    // "|| true": a missing container must not fail the build.
                    sh "docker stop s464953 || true"
                    sh "docker rm s464953 || true"
                }
            }
        }

        stage('Build Docker image') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // Single-quoted so Groovy does not interpolate the secret;
                        // "--build-arg NAME" without a value takes it from the environment.
                        sh 'docker build --build-arg KAGGLE_USERNAME --build-arg KAGGLE_KEY -t s464953 .'
                    }
                }
            }
        }

        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // "-e NAME" forwards the variable from the environment, so the
                        // key never appears on the command line.
                        sh 'docker run --name s464953 -e KAGGLE_USERNAME -e KAGGLE_KEY -v "$WORKSPACE":/app s464953'
                    }
                }
            }
        }

        // The stage label now matches the file that is actually archived.
        stage('Archive model.pkl artifact') {
            steps {
                archiveArtifacts artifacts: 'model.pkl', allowEmptyArchive: true
            }
        }
    }
}

View File

@ -1,42 +0,0 @@
// Copies the dataset artifacts from z-s464953-create-dataset, computes their
// statistics with dataset_stats.sh and archives the artifacts directory.
pipeline {
    agent any

    parameters {
        buildSelector(defaultSelector: lastSuccessful(), description: 'Build for copying artifacts', name: 'BUILD_SELECTOR')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Cleanup Artifacts') {
            steps {
                script {
                    sh 'rm -rf artifacts'
                }
            }
        }

        stage('Copy Artifact') {
            steps {
                // buildParameter takes the parameter name itself; no environment
                // indirection through withEnv is required.
                copyArtifacts fingerprintArtifacts: true, projectName: 'z-s464953-create-dataset', selector: buildParameter('BUILD_SELECTOR')
            }
        }

        stage('Execute Shell Script') {
            steps {
                script {
                    sh "bash ./dataset_stats.sh"
                }
            }
        }

        stage('Archive Results') {
            steps {
                archiveArtifacts artifacts: 'artifacts/*', onlyIfSuccessful: true
            }
        }
    }
}

0
a.txt Normal file
View File

View File

@ -1,113 +0,0 @@
#!/usr/bin/env python
# Import bibliotek
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
import requests
from sklearn.preprocessing import MinMaxScaler
from kaggle.api.kaggle_api_extended import KaggleApi
def download_file(url, filename, destination_folder):
    """Download the Kaggle Spotify dataset into ``destination_folder``.

    Args:
        url: Unused; kept so existing callers keep working (the Kaggle API
            identifies the dataset by its slug, not by URL).
        filename: Name of the CSV expected inside the downloaded archive.
        destination_folder: Directory the archive is downloaded and
            unzipped into.

    Returns:
        The path ``destination_folder/filename``. The function does not
        check that the file exists after the download.
    """
    # Kaggle dataset variant: authenticate using the configured credentials.
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files('gulczas/spotify-dataset', path=destination_folder, unzip=True)
    return os.path.join(destination_folder, filename)
def split_dataset(data, test_size=0.2, val_size=0.1, random_state=42):
    """Split ``data`` into training, validation and test subsets.

    ``test_size`` and ``val_size`` are fractions of the whole dataset.
    Returns the tuple ``(train_data, val_data, test_data)``.
    """
    # First carve out the test subset.
    remainder, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    # val_size refers to the full dataset, so rescale it to the remainder.
    val_fraction = val_size / (1 - test_size)
    train_data, val_data = train_test_split(
        remainder, test_size=val_fraction, random_state=random_state
    )
    return train_data, val_data, test_data
def print_dataset_stats(data, subset_name):
    """Append summary statistics of ``data`` to stats.txt.

    Writes the subset size, the ``describe()`` table and, for each column,
    its value counts. ``subset_name`` is inserted into the headings.
    """
    with open('stats.txt', 'a') as stats_file:

        def emit(text):
            # Same effect as print(..., file=stats_file) for one object.
            print(text, file=stats_file)

        emit(f"Statystyki dla zbioru {subset_name}:")
        emit(f"Wielkość zbioru {subset_name}: {len(data)}")
        emit("\nStatystyki wartości poszczególnych parametrów:")
        emit(data.describe())

        for column in data.columns:
            emit(f"Rozkład częstości dla kolumny '{column}':")
            emit(data[column].value_counts())
            emit("\n")
def normalize_data(data):
    """Return a copy of ``data`` with its numeric columns min-max scaled.

    The scaler is fitted on the numeric columns of ``data`` itself;
    non-numeric columns are copied unchanged.
    """
    numeric_columns = data.select_dtypes(include=['int', 'float']).columns
    fitted_scaler = MinMaxScaler().fit(data[numeric_columns])

    df_normalized = data.copy()
    scaled_values = fitted_scaler.transform(df_normalized[numeric_columns])
    df_normalized[numeric_columns] = scaled_values
    return df_normalized
def clean_dataset(data):
    """Drop rows with missing values, then duplicate rows, in place.

    The passed DataFrame is modified and the same object is returned.
    """
    for drop_rows in (data.dropna, data.drop_duplicates):
        drop_rows(inplace=True)
    return data
# Script body: download the dataset if needed, split it, save the splits,
# write their statistics and finally normalise and clean each split.
url = "https://www.kaggle.com/datasets/gulczas/spotify-dataset?select=Spotify_Dataset.csv"
filename = "Spotify_Dataset.csv"
destination_folder = "datasets"

# Create the download folder on the first run.
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    print(f"Utworzono folder: {destination_folder}")
else:
    print(f"Folder {destination_folder} już istnieje.")

# Download only when the CSV is not already present.
if 'Spotify_Dataset.csv' not in os.listdir(destination_folder):
    filepath = download_file(url, filename, destination_folder)

# Load the data from the CSV file.
data = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")

# Split into training, validation and test subsets.
train_data, val_data, test_data = split_dataset(data)

# Save each subset to its own CSV file.
train_data.to_csv("datasets/train.csv", index=False)
val_data.to_csv("datasets/val.csv", index=False)
test_data.to_csv("datasets/test.csv", index=False)

# Write the statistics of each subset.
print_dataset_stats(train_data, "treningowego")
print("\n")
print_dataset_stats(val_data, "walidacyjnego")
print("\n")
print_dataset_stats(test_data, "testowego")

# Normalise and clean every subset from its own data. The validation and
# test subsets were previously overwritten with the processed training set.
train_data = normalize_data(train_data)
train_data = clean_dataset(train_data)

val_data = normalize_data(val_data)
val_data = clean_dataset(val_data)

test_data = normalize_data(test_data)
test_data = clean_dataset(test_data)

View File

@ -1,13 +0,0 @@
#!/bin/bash
# Records the line count of every dataset split and moves the reports into
# artifacts/.

# Print a banner for one split and write `wc -l` of its CSV to stats_<split>.txt.
report_split() {
    local title="$1"
    local split="$2"
    echo "------------------ ${title} dataset stats ------------------"
    wc -l "artifacts/${split}.csv" > "stats_${split}.txt"
}

report_split "Train" train
report_split "Validation" validation
report_split "Test" test

mkdir -p data
mv stats_train.txt stats_validation.txt stats_test.txt artifacts/

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,26 +0,0 @@
#!/bin/bash
# Downloads the Spotify dataset from Kaggle, shuffles it, keeps the first N
# rows and splits them into train / test / validation CSVs under artifacts/.
#
# Usage: download_dataset.sh <number-of-rows-to-keep>

# Stop on the first failing command or unset variable. pipefail is left off
# on purpose: in the `tail | head` pipeline below head may close the pipe early.
set -eu

cutoff="${1:?Usage: $0 <number-of-rows-to-keep>}"

pip install kaggle --upgrade
kaggle datasets download -d gulczas/spotify-dataset
unzip -o spotify-dataset.zip

echo "------------------ Shufle ------------------"
# NOTE(review): shuf shuffles every line, so a CSV header row (if the file
# has one) ends up at a random position - confirm this is acceptable.
shuf Spotify_Dataset.csv -o shuffled_spotify.csv

echo "------------------ Cut off to top ${cutoff} rows ------------------"
head -n "$cutoff" shuffled_spotify.csv > cutoff_spotify.csv

echo "------------------ Split ------------------"
total_lines=$(wc -l < cutoff_spotify.csv)
# A tenth of the rows each for test and validation, the rest for training.
num_test=$((total_lines / 10))
num_train=$((total_lines - (num_test * 2)))
num_validation=$num_test

head -n "$num_train" cutoff_spotify.csv > train.csv
tail -n "$((num_test + num_validation))" cutoff_spotify.csv | head -n "$num_test" > test.csv
tail -n "$num_validation" cutoff_spotify.csv > validation.csv

mkdir -p artifacts
mv Spotify_Dataset.csv cutoff_spotify.csv train.csv validation.csv test.csv artifacts/

BIN
model.pkl

Binary file not shown.

View File

@ -1,127 +0,0 @@
import pandas as pd
import os
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn. preprocessing import LabelEncoder
import pickle
def download_dataset(dataset_address, destination_folder):
    """Download and unzip the Kaggle dataset ``dataset_address``.

    The files are placed in ``destination_folder``. Authentication uses
    whatever credentials the Kaggle client finds.
    """
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(
        dataset_address,
        path=destination_folder,
        unzip=True,
    )
def check_datasets_presence():
    """Make sure both source CSV files exist in the datasets folder.

    Creates the folder when it is missing and downloads each Kaggle dataset
    whose CSV is not found there.
    """
    destination_folder = "datasets"
    # (expected CSV file, Kaggle dataset that provides it)
    required_files = (
        ("Spotify_Dataset.csv", 'gulczas/spotify-dataset'),
        ("spotify_songs.csv", 'joebeachcapital/30000-spotify-songs'),
    )

    if os.path.exists(destination_folder):
        print(f"Folder {destination_folder} już istnieje.")
    else:
        os.makedirs(destination_folder)
        print(f"Utworzono folder: {destination_folder}")

    for csv_name, kaggle_dataset in required_files:
        # List the folder again for every file: an earlier download may
        # have changed its contents.
        if csv_name not in os.listdir(destination_folder):
            download_dataset(kaggle_dataset, destination_folder)
def datasets_preparation():
    """Build the training frame and write the hold-out test file.

    Reads both source CSVs from ``datasets/``, drops incomplete rows,
    unused columns and duplicate track names. Tracks that occur only in
    ``spotify_songs.csv`` (at most 10000 of them) are label-encoded and
    saved as ``datasets/docker_test_dataset.csv`` unless that file already
    exists. Tracks present in both sources are merged, label-encoded and
    returned.

    Returns:
        DataFrame of the merged tracks with ``track_artist``,
        ``playlist_name`` and ``playlist_genre`` encoded as integers.
    """
    df_1 = pd.read_csv("datasets/spotify_songs.csv").dropna()
    df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";").dropna()
    df_2 = df_2.rename(columns={'Title': 'track_name'})

    columns_to_remove_df_1 = ['track_id', 'track_album_id', 'track_album_name', 'track_album_release_date',
                              'playlist_id', 'playlist_subgenre']
    columns_to_remove_df_2 = ['Date', '# of Artist', 'Artist (Ind.)', '# of Nationality',
                              'Nationality', 'Continent', 'Points (Total)',
                              'Points (Ind for each Artist/Nat)', 'id', 'Song URL']

    df_1 = df_1.drop(columns=columns_to_remove_df_1)
    df_2 = df_2.drop(columns=columns_to_remove_df_2)

    df_1 = df_1.drop_duplicates(subset=['track_name'])
    df_2 = df_2.drop_duplicates(subset=['track_name'])

    # NOTE(review): the encoder is re-fitted for every column and frame, so
    # integer codes are only comparable between the hold-out and training
    # frames when both contain the same set of values - confirm.
    le = LabelEncoder()
    categorical_columns = ['track_artist', 'playlist_name', 'playlist_genre']

    # Hold-out set: tracks of df_1 that never appear in df_2.
    unique_names_df2 = df_2['track_name'].unique()
    diff_df = df_1[~df_1['track_name'].isin(unique_names_df2)]
    # .copy() makes the slice an independent frame, so the column
    # assignments below do not write into a view of df_1.
    diff_df = diff_df.iloc[:10000].copy()
    for column in categorical_columns:
        diff_df[column] = le.fit_transform(diff_df[column])

    if "docker_test_dataset.csv" not in os.listdir("datasets"):
        diff_df.to_csv("datasets/docker_test_dataset.csv", index=False)

    # Training set: tracks present in both sources.
    result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
    result_df = result_df.drop_duplicates(subset=['track_name'])

    columns_to_remove_result_df = ['Rank', 'Artists', 'Danceability', 'Energy', 'Loudness',
                                   'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']
    result_df = result_df.drop(columns=columns_to_remove_result_df)

    for column in categorical_columns:
        result_df[column] = le.fit_transform(result_df[column])

    return result_df
# Script body: prepare the data, train a logistic regression genre
# classifier, report its accuracy and pickle the model.
check_datasets_presence()
result_df = datasets_preparation()

Y = result_df[['playlist_genre']]
X = result_df.drop(columns='playlist_genre')

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

# Flatten the single-column target frames into 1-D arrays.
Y_train = np.ravel(Y_train)
Y_test = np.ravel(Y_test)

# Fit the scaler on the training features only and reuse it for the test set.
scaler = StandardScaler()
numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, Y_train)

Y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

file_path = 'model.pkl'
# Opening with 'wb' truncates an existing file, so the previous model is
# replaced without a separate remove / listdir check.
with open(file_path, 'wb') as file:
    pickle.dump(model, file)
print("Model został zapisany do pliku:", file_path)

View File

@ -1,3 +0,0 @@
Real:['edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm']
Predicted: ['pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop']
Accuracy:0.1521

View File

@ -1,3 +0,0 @@
#!/bin/bash
# Runs model_creator.py with python3 from the current working directory.
python3 model_creator.py

View File

@ -2,7 +2,57 @@ import pickle
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
import sys
import os
import matplotlib.pyplot as plt
def calculate_metrics(result):
rmse = np.sqrt(mean_squared_error(result["Real"], result["Predictions"]))
f1 = f1_score(result["Real"], result["Predictions"], average='macro')
accuracy = accuracy_score(result["Real"], result["Predictions"])
filename = 'metrics_df.csv'
if os.path.exists(filename):
metrics_df = pd.read_csv(filename)
new_row = pd.DataFrame({'Build number': sys.argv[1], 'RMSE': [rmse], 'F1 Score': [f1], 'Accuracy': [accuracy]})
metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)
else:
metrics_df = pd.DataFrame({'Build number': sys.argv[1], 'RMSE': [rmse], 'F1 Score': [f1], 'Accuracy': [accuracy]})
metrics_df.to_csv(filename, index=False)
def create_plots():
metrics_df = pd.read_csv("metrics_df.csv")
plt.plot(metrics_df["Build number"], metrics_df["Accuracy"])
plt.xlabel("Build Number")
plt.ylabel("Accuracy")
plt.title("Accuracy of the model over time")
plt.xticks(range(min(metrics_df["Build number"]), max(metrics_df["Build number"]) + 1))
plt.show()
plt.savefig("Accuracy_plot.png")
plt.close()
plt.plot(metrics_df["Build number"], metrics_df["F1 Score"])
plt.xlabel("Build Number")
plt.ylabel("F1 Score")
plt.title("F1 Score of the model over time")
plt.xticks(range(min(metrics_df["Build number"]), max(metrics_df["Build number"]) + 1))
plt.show()
plt.savefig("F1_score_plot.png")
plt.close()
plt.plot(metrics_df["Build number"], metrics_df["RMSE"])
plt.xlabel("Build Number")
plt.ylabel("RMSE")
plt.title("RMSE of the model over time")
plt.xticks(range(min(metrics_df["Build number"]), max(metrics_df["Build number"]) + 1))
plt.show()
plt.savefig("RMSE_plot.png")
plt.close()
np.set_printoptions(threshold=20) np.set_printoptions(threshold=20)
@ -11,7 +61,7 @@ with open(file_path, 'rb') as file:
model = pickle.load(file) model = pickle.load(file)
print("Model został wczytany z pliku:", file_path) print("Model został wczytany z pliku:", file_path)
test_df = pd.read_csv("datasets/docker_test_dataset.csv") test_df = pd.read_csv("artifacts/docker_test_dataset.csv")
Y_test = test_df[['playlist_genre']] Y_test = test_df[['playlist_genre']]
X_test = test_df.drop(columns='playlist_genre') X_test = test_df.drop(columns='playlist_genre')
@ -23,14 +73,8 @@ X_test_scaled = scaler.fit_transform(X_test[numeric_columns])
Y_pred = model.predict(X_test_scaled) Y_pred = model.predict(X_test_scaled)
with open('model_predictions.txt', 'w') as f: result = pd.DataFrame({'Predictions': Y_pred, "Real": Y_test})
pass result.to_csv("spotify_genre_predictions.csv", index=False)
with open('model_predictions.txt', 'a') as f:
labels_dict = {0: 'edm', 1 : 'latin', 2 : 'pop', 3 : 'r&b', 4 : 'rap', 5 :'rock'}
Y_test_labels = [labels_dict[number] for number in Y_test]
Y_pred_labels = [labels_dict[number] for number in Y_pred]
f.write("Real:" + str(Y_test_labels[:20])+ " \nPredicted: "+ str(Y_pred_labels[:20]))
accuracy = accuracy_score(Y_test, Y_pred)
f.write("\nAccuracy:" + str(accuracy))
calculate_metrics(result)
create_plots()