added model evaluation
This commit is contained in:
parent
b7b992cb8a
commit
1fb8564e19
17
.ipynb_checkpoints/Dockerfile-checkpoint
Normal file
17
.ipynb_checkpoints/Dockerfile-checkpoint
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
# Runtime image for the model-evaluation job: Python 3 plus the libraries
# imported by use_model.py.
FROM ubuntu:latest

# Install the interpreter and helper tools in one layer and remove the apt
# package lists in the same layer.
RUN apt-get update && \
    apt-get install -y \
    python3 \
    python3-pip \
    wget \
    unzip \
    && rm -rf /var/lib/apt/lists/*

# matplotlib is needed because use_model.py imports matplotlib.pyplot to draw
# the metric plots; without it the script fails on import.
RUN pip3 install pandas scikit-learn requests numpy matplotlib

WORKDIR /app

COPY use_model.py /app/

RUN chmod +x use_model.py
|
43
.ipynb_checkpoints/Jenkinsfile-checkpoint
Normal file
43
.ipynb_checkpoints/Jenkinsfile-checkpoint
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
// Evaluation pipeline: runs inside the image built from the repository's
// Dockerfile, pulls artifacts from the training job, runs use_model.py and
// archives everything the script produced.
pipeline {
    agent {
        dockerfile true
    }

    triggers {
        // Start automatically after a successful build of the training job.
        upstream(upstreamProjects: 's464953-training/training', threshold: hudson.model.Result.SUCCESS)
    }

    parameters {
        buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')
        gitParameter branchFilter: 'origin/(.*)', defaultValue: 'training', name: 'BRANCH', type: 'PT_BRANCH'
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Copy Training Artifacts') {
            steps {
                copyArtifacts filter: 'artifacts/*', projectName: 's464953-training/' + params.BRANCH, selector: buildParameter('BUILD_SELECTOR')
            }
        }

        stage('Copy Evaluation Artifacts') {
            steps {
                // optional: use_model.py creates metrics_df.csv itself when the
                // file is absent, so a missing history file must not fail the build.
                // NOTE(review): the history is fetched from the training project,
                // not from a previous evaluation build - confirm this is intended.
                copyArtifacts filter: 'metrics_df.csv', projectName: 's464953-training/' + params.BRANCH, selector: buildParameter('BUILD_SELECTOR'), optional: true
            }
        }

        stage('Run Script') {
            steps {
                // The build number becomes the x-axis value of the metric plots.
                sh "python3 /app/use_model.py ${currentBuild.number}"
            }
        }

        stage('Archive Artifacts') {
            steps {
                archiveArtifacts artifacts: '*', onlyIfSuccessful: true
            }
        }
    }
}
|
77
.ipynb_checkpoints/use_model-checkpoint.py
Normal file
77
.ipynb_checkpoints/use_model-checkpoint.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import pickle
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
def calculate_metrics(result):
    """Compute RMSE, macro F1 and accuracy and append them to ``metrics_df.csv``.

    The build number for the new row is taken from ``sys.argv[1]``. If
    ``metrics_df.csv`` already exists in the working directory the row is
    appended to it; otherwise the file is created with that single row.

    Args:
        result: DataFrame with a ``"Real"`` and a ``"Predictions"`` column.

    Raises:
        IndexError: If no build number was passed on the command line.
        ValueError: If the build number is not an integer.
    """
    real = result["Real"]
    predicted = result["Predictions"]

    # Stored as int so the in-memory frame matches what read_csv yields for
    # earlier rows; create_plots passes min/max of this column to range().
    new_row = pd.DataFrame({
        'Build number': [int(sys.argv[1])],
        'RMSE': [np.sqrt(mean_squared_error(real, predicted))],
        'F1 Score': [f1_score(real, predicted, average='macro')],
        'Accuracy': [accuracy_score(real, predicted)],
    })

    filename = 'metrics_df.csv'
    if os.path.exists(filename):
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add rows.
        metrics_df = pd.concat([pd.read_csv(filename), new_row], ignore_index=True)
    else:
        metrics_df = new_row

    metrics_df.to_csv(filename, index=False)
|
||||||
|
|
||||||
|
def create_plots():
    """Plot every metric stored in ``metrics_df.csv`` against the build number.

    Reads ``metrics_df.csv`` from the working directory and writes
    ``Accuracy_plot.png``, ``F1_score_plot.png`` and ``RMSE_plot.png`` there.
    """
    metrics_df = pd.read_csv("metrics_df.csv")
    builds = metrics_df["Build number"]
    # One tick per build number between the first and last recorded build.
    ticks = range(int(builds.min()), int(builds.max()) + 1)

    plots = (
        ("Accuracy", "Accuracy_plot.png"),
        ("F1 Score", "F1_score_plot.png"),
        ("RMSE", "RMSE_plot.png"),
    )
    for metric, output_file in plots:
        # A fresh figure per metric: drawing all three on the implicit current
        # figure would stack the earlier curves into the later images.
        plt.figure()
        plt.plot(builds, metrics_df[metric])
        plt.xlabel("Build Number")
        plt.ylabel(metric)
        plt.title(f"{metric} of the model over time")
        plt.xticks(ticks)
        # Saved without a preceding plt.show(): this runs as a batch job, and
        # showing first can leave savefig with an empty figure on interactive
        # backends.
        plt.savefig(output_file)
        plt.close()
|
||||||
|
|
||||||
|
# Script body: load the pickled model, predict genres for the Docker test
# set, save the predictions, then record metrics and redraw the plots.
np.set_printoptions(threshold=20)

file_path = 'model.pkl'

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load model files produced by a trusted build.
with open(file_path, 'rb') as file:
    model = pickle.load(file)
print("Model został wczytany z pliku:", file_path)

test_df = pd.read_csv("artifacts/docker_test_dataset.csv")

# Split the frame into features and the flattened target vector.
X_test = test_df.drop(columns='playlist_genre')
Y_test = np.ravel(test_df[['playlist_genre']])

numeric_columns = X_test.select_dtypes(include=['int', 'float']).columns
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on the test data here rather than reused
# from training - confirm this matches how the model was trained.
X_test_scaled = scaler.fit_transform(X_test[numeric_columns])

Y_pred = model.predict(X_test_scaled)

result = pd.DataFrame({'Predictions': Y_pred, "Real": Y_test})
result.to_csv("spotify_genre_predictions.csv", index=False)

calculate_metrics(result)
create_plots()
|
1271
.ipynb_checkpoints/zad1-checkpoint.ipynb
Normal file
1271
.ipynb_checkpoints/zad1-checkpoint.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
11
Dockerfile
11
Dockerfile
@ -1,8 +1,5 @@
|
|||||||
FROM ubuntu:latest
|
FROM ubuntu:latest
|
||||||
|
|
||||||
ENV KAGGLE_USERNAME=gulczas
|
|
||||||
ENV KAGGLE_KEY=default_key
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
python3 \
|
python3 \
|
||||||
@ -11,16 +8,10 @@ RUN apt-get update && \
|
|||||||
unzip \
|
unzip \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN pip3 install pandas scikit-learn requests kaggle numpy
|
RUN pip3 install pandas scikit-learn requests numpy
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
COPY model_creator.py /app/
|
|
||||||
COPY use_model.py /app/
|
COPY use_model.py /app/
|
||||||
COPY run_py_scripts.sh /app/
|
|
||||||
|
|
||||||
|
|
||||||
RUN chmod +x model_creator.py
|
|
||||||
RUN chmod +x use_model.py
|
RUN chmod +x use_model.py
|
||||||
|
|
||||||
CMD ["bash", "run_py_scripts.sh"]
|
|
34
Jenkinsfile
vendored
34
Jenkinsfile
vendored
@ -1,10 +1,15 @@
|
|||||||
pipeline {
|
pipeline {
|
||||||
agent any
|
agent {
|
||||||
|
dockerfile true
|
||||||
|
}
|
||||||
|
|
||||||
|
triggers {
|
||||||
|
upstream(upstreamProjects: 's464953-training/training', threshold: hudson.model.Result.SUCCESS)
|
||||||
|
}
|
||||||
|
|
||||||
parameters {
|
parameters {
|
||||||
string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
|
buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')
|
||||||
password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
|
gitParameter branchFilter: 'origin/(.*)', defaultValue: 'training', name: 'BRANCH', type: 'PT_BRANCH'
|
||||||
string(name: 'CUTOFF', defaultValue: '90', description: 'Number of rows to cut')
|
|
||||||
}
|
}
|
||||||
|
|
||||||
stages {
|
stages {
|
||||||
@ -13,28 +18,25 @@ pipeline {
|
|||||||
git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
|
git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stage('Cleanup Artifacts') {
|
stage('Copy Training Artifacts') {
|
||||||
steps {
|
steps {
|
||||||
script {
|
copyArtifacts filter: 'artifacts/*', projectName: 's464953-training/' + params.BRANCH, selector: buildParameter('BUILD_SELECTOR')
|
||||||
sh 'rm -rf artifacts'
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stage('Copy Evaluation Artifacts') {
|
||||||
|
steps {
|
||||||
|
copyArtifacts filter: 'metrics_df.csv', projectName: 's464953-training/' + params.BRANCH, selector: buildParameter('BUILD_SELECTOR')
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
stage('Run Script') {
|
stage('Run Script') {
|
||||||
steps {
|
steps {
|
||||||
script {
|
sh "python3 /app/use_model.py ${currentBuild.number}"
|
||||||
withEnv([
|
|
||||||
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
|
|
||||||
"KAGGLE_KEY=${env.KAGGLE_KEY}"])
|
|
||||||
{
|
|
||||||
sh "bash ./download_dataset.sh ${params.CUTOFF}"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stage('Archive Artifacts') {
|
stage('Archive Artifacts') {
|
||||||
steps {
|
steps {
|
||||||
archiveArtifacts artifacts: 'artifacts/*', onlyIfSuccessful: true
|
archiveArtifacts artifacts: '*', onlyIfSuccessful: true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,57 +0,0 @@
|
|||||||
// Builds the project's Docker image, runs it with the Kaggle credentials and
// archives the stats.txt file left in the workspace.
pipeline {
    agent any

    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Stop and remove existing container') {
            steps {
                script {
                    // "|| true": a container that does not exist must not fail the build.
                    sh "docker stop s464953 || true"
                    sh "docker rm s464953 || true"
                }
            }
        }

        stage('Build Docker image') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // Single-quoted so the shell, not Groovy, expands the
                        // variables: the API key is then not interpolated into
                        // the command string that Jenkins handles.
                        sh 'docker build --build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s464953 .'
                    }
                }
            }
        }

        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // The workspace is mounted at /app so files written by
                        // the container are visible to the archive stage.
                        sh 'docker run --name s464953 -e KAGGLE_USERNAME="$KAGGLE_USERNAME" -e KAGGLE_KEY="$KAGGLE_KEY" -v "${WORKSPACE}:/app" s464953'
                    }
                }
            }
        }

        stage('Archive stats.txt artifact') {
            steps {
                archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
            }
        }
    }
}
|
|
@ -1,44 +0,0 @@
|
|||||||
// Runs the published image michalgulczynski/ium_s464953:1.0 with the Kaggle
// credentials and archives the stats.txt file left in the workspace.
pipeline {
    agent any

    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Stop and remove existing container') {
            steps {
                script {
                    // "|| true": a container that does not exist must not fail the build.
                    sh "docker stop s464953 || true"
                    sh "docker rm s464953 || true"
                }
            }
        }

        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // Single-quoted so the shell, not Groovy, expands the
                        // variables: the API key is then not interpolated into
                        // the command string that Jenkins handles.
                        sh 'docker run --name s464953 -e KAGGLE_USERNAME="$KAGGLE_USERNAME" -e KAGGLE_KEY="$KAGGLE_KEY" -v "${WORKSPACE}:/app" michalgulczynski/ium_s464953:1.0'
                    }
                }
            }
        }

        stage('Archive stats.txt artifact') {
            steps {
                archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
            }
        }
    }
}
|
|
@ -1,57 +0,0 @@
|
|||||||
// Builds the project's Docker image, runs it with the Kaggle credentials and
// archives the model.pkl file left in the workspace.
pipeline {
    agent any

    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Stop and remove existing container') {
            steps {
                script {
                    // "|| true": a container that does not exist must not fail the build.
                    sh "docker stop s464953 || true"
                    sh "docker rm s464953 || true"
                }
            }
        }

        stage('Build Docker image') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // Single-quoted so the shell, not Groovy, expands the
                        // variables: the API key is then not interpolated into
                        // the command string that Jenkins handles.
                        sh 'docker build --build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s464953 .'
                    }
                }
            }
        }

        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // The workspace is mounted at /app so files written by
                        // the container are visible to the archive stage.
                        sh 'docker run --name s464953 -e KAGGLE_USERNAME="$KAGGLE_USERNAME" -e KAGGLE_KEY="$KAGGLE_KEY" -v "${WORKSPACE}:/app" s464953'
                    }
                }
            }
        }

        // Stage label corrected: this stage archives model.pkl, not stats.txt.
        stage('Archive model.pkl artifact') {
            steps {
                archiveArtifacts artifacts: 'model.pkl', allowEmptyArchive: true
            }
        }
    }
}
|
|
@ -1,42 +0,0 @@
|
|||||||
// Statistics pipeline: copies the artifacts of the dataset-creation job,
// runs dataset_stats.sh on them and archives the artifacts directory.
pipeline {
    agent any

    parameters {
        buildSelector( defaultSelector: lastSuccessful(), description: 'Build for copying artifacts', name: 'BUILD_SELECTOR')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Cleanup Artifacts') {
            steps {
                script {
                    // Remove artifacts left over from a previous build.
                    sh 'rm -rf artifacts'
                }
            }
        }

        stage('Copy Artifact') {
            steps {
                // buildParameter takes the parameter name directly, so the
                // withEnv wrapper and the '$BUILD_SELECTOR' indirection are
                // not needed.
                copyArtifacts fingerprintArtifacts: true, projectName: 'z-s464953-create-dataset', selector: buildParameter('BUILD_SELECTOR')
            }
        }

        stage('Execute Shell Script') {
            steps {
                script {
                    sh "bash ./dataset_stats.sh"
                }
            }
        }

        stage('Archive Results') {
            steps {
                archiveArtifacts artifacts: 'artifacts/*', onlyIfSuccessful: true
            }
        }
    }
}
|
|
@ -1,113 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
# Import bibliotek
|
|
||||||
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import pandas as pd
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
import requests
|
|
||||||
from sklearn.preprocessing import MinMaxScaler
|
|
||||||
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
||||||
|
|
||||||
|
|
||||||
#funkcja pobierająca plik
|
|
||||||
|
|
||||||
def download_file(url, filename, destination_folder):
    """Download and unzip the Kaggle Spotify dataset into *destination_folder*.

    Args:
        url: Unused; the dataset is addressed by its Kaggle slug. Kept so
            existing callers keep working.
        filename: Name of the CSV file the caller expects in the folder.
        destination_folder: Directory the archive is extracted into.

    Returns:
        ``destination_folder`` joined with ``filename``. The function
        previously returned ``None`` although the caller stores the result
        as a file path.
    """
    # Kaggle-dataset variant of the download.
    api = KaggleApi()
    api.authenticate()

    api.dataset_download_files('gulczas/spotify-dataset', path=destination_folder, unzip=True)

    return os.path.join(destination_folder, filename)
|
|
||||||
|
|
||||||
|
|
||||||
# funkcja dzieląca zbiór
|
|
||||||
|
|
||||||
def split_dataset(data, test_size=0.2, val_size=0.1, random_state=42):
    """Split *data* into training, validation and test subsets.

    Args:
        data: Dataset to split.
        test_size: Fraction of *data* assigned to the test subset.
        val_size: Fraction of *data* assigned to the validation subset.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_data, val_data, test_data)``.
    """
    # First cut: separate the test subset from everything else.
    remainder, test_data = train_test_split(data, test_size=test_size, random_state=random_state)

    # Second cut: val_size is expressed relative to the full dataset, so it is
    # rescaled to a fraction of what remains after removing the test subset.
    val_fraction = val_size/(1-test_size)
    train_data, val_data = train_test_split(remainder, test_size=val_fraction, random_state=random_state)

    return train_data, val_data, test_data
|
|
||||||
|
|
||||||
|
|
||||||
# Wyświetlanie statystyk zbioru
|
|
||||||
|
|
||||||
def print_dataset_stats(data, subset_name):
    """Append size, summary statistics and value counts of *data* to ``stats.txt``.

    Args:
        data: DataFrame to describe.
        subset_name: Name of the subset, inserted into the headings.
    """
    # Explicit UTF-8: the headings contain non-ASCII characters, so the output
    # must not depend on the locale's default encoding.
    with open('stats.txt', 'a', encoding='utf-8') as stats_file:
        print(f"Statystyki dla zbioru {subset_name}:", file=stats_file)
        print(f"Wielkość zbioru {subset_name}: {len(data)}", file=stats_file)

        print("\nStatystyki wartości poszczególnych parametrów:", file=stats_file)
        print(data.describe(), file=stats_file)

        # One frequency table per column.
        for column in data.columns:
            print(f"Rozkład częstości dla kolumny '{column}':", file=stats_file)
            print(data[column].value_counts(), file=stats_file)
            print("\n", file=stats_file)
|
|
||||||
|
|
||||||
# Normalizacja danych
|
|
||||||
|
|
||||||
def normalize_data(data):
    """Return a copy of *data* whose numeric columns are min-max scaled.

    The scaler is fitted on the numeric columns of *data* itself; the input
    frame is not modified.
    """
    numeric_columns = data.select_dtypes(include=['int', 'float']).columns

    df_normalized = data.copy()
    # Fit and transform in one step on the same columns.
    df_normalized[numeric_columns] = MinMaxScaler().fit_transform(data[numeric_columns])
    return df_normalized
|
|
||||||
|
|
||||||
#Czyszczenie danych
|
|
||||||
|
|
||||||
def clean_dataset(data):
    """Return *data* without rows that contain missing values or are duplicates.

    The input frame is left untouched; a new frame is returned, so callers
    must use the return value.
    """
    return data.dropna().drop_duplicates()
|
|
||||||
|
|
||||||
|
|
||||||
# main
|
|
||||||
|
|
||||||
url = "https://www.kaggle.com/datasets/gulczas/spotify-dataset?select=Spotify_Dataset.csv"
filename = "Spotify_Dataset.csv"
destination_folder = "datasets"

# Create the destination folder on first use.
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    print(f"Utworzono folder: {destination_folder}")
else:
    print(f"Folder {destination_folder} już istnieje.")

# Download only when the CSV is not already present.
if 'Spotify_Dataset.csv' not in os.listdir(destination_folder):
    filepath = download_file(url, filename, destination_folder)

# Load the data from the CSV file.
data = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")

# Split the dataset into training, validation and test subsets.
train_data, val_data, test_data = split_dataset(data)

# Save the subsets to separate CSV files.
train_data.to_csv("datasets/train.csv", index=False)
val_data.to_csv("datasets/val.csv", index=False)
test_data.to_csv("datasets/test.csv", index=False)

# Write statistics for each subset.
print_dataset_stats(train_data, "treningowego")
print("\n")
print_dataset_stats(val_data, "walidacyjnego")
print("\n")
print_dataset_stats(test_data, "testowego")

# Normalise and clean each subset. Every subset is processed from its own
# data; previously the validation and test subsets were overwritten with the
# processed training data.
train_data = normalize_data(train_data)
train_data = clean_dataset(train_data)
val_data = normalize_data(val_data)
val_data = clean_dataset(val_data)
test_data = normalize_data(test_data)
test_data = clean_dataset(test_data)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,13 +0,0 @@
|
|||||||
#!/bin/bash
# Record the line count of each dataset split and store the results next to
# the splits in artifacts/.

# Fail fast: without this a missing CSV only produced an empty stats file and
# the script still exited successfully.
set -euo pipefail

echo "------------------ Train dataset stats ------------------"
wc -l artifacts/train.csv > stats_train.txt

echo "------------------ Validation dataset stats ------------------"
wc -l artifacts/validation.csv > stats_validation.txt

echo "------------------ Test dataset stats ------------------"
wc -l artifacts/test.csv > stats_test.txt

# NOTE(review): data/ is created but nothing in this script writes to it -
# confirm whether it is still needed.
mkdir -p data
mv stats_train.txt stats_validation.txt stats_test.txt artifacts/
|
|
File diff suppressed because it is too large
Load Diff
130389
datasets/test.csv
130389
datasets/test.csv
File diff suppressed because it is too large
Load Diff
456355
datasets/train.csv
456355
datasets/train.csv
File diff suppressed because it is too large
Load Diff
65195
datasets/val.csv
65195
datasets/val.csv
File diff suppressed because it is too large
Load Diff
@ -1,26 +0,0 @@
|
|||||||
#!/bin/bash
# Download the Spotify dataset from Kaggle, shuffle it, keep the first N rows
# and split them into train / test / validation files under artifacts/.
#
# Usage: download_dataset.sh <row-count>

# Stop on the first failing command or unset variable. pipefail is left off
# on purpose: in "tail | head" below, head may exit before tail finishes.
set -eu

# Number of shuffled rows to keep; a missing argument is reported explicitly.
cutoff="${1:?usage: download_dataset.sh <row-count>}"

pip install kaggle --upgrade

kaggle datasets download -d gulczas/spotify-dataset

unzip -o spotify-dataset.zip

echo "------------------ Shufle ------------------"
# NOTE(review): if Spotify_Dataset.csv starts with a header row, shuf mixes it
# in with the data rows - confirm whether the header should be kept aside.
shuf Spotify_Dataset.csv -o shuffled_spotify.csv

echo "------------------ Cut off to top $cutoff rows ------------------"
head -n "$cutoff" shuffled_spotify.csv > cutoff_spotify.csv

echo "------------------ Split ------------------"
total_lines=$(wc -l < cutoff_spotify.csv)
# One tenth each for test and validation; the rest is training data.
num_test=$((total_lines / 10))
num_train=$((total_lines - (num_test * 2)))
num_validation=$num_test

head -n "$num_train" cutoff_spotify.csv > train.csv
# The test rows are the first num_test of the last (num_test + num_validation) rows.
tail -n "$((num_test + num_validation))" cutoff_spotify.csv | head -n "$num_test" > test.csv
tail -n "$num_validation" cutoff_spotify.csv > validation.csv

mkdir -p artifacts
mv Spotify_Dataset.csv cutoff_spotify.csv train.csv validation.csv test.csv artifacts/
|
|
127
model_creator.py
127
model_creator.py
@ -1,127 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
import os
|
|
||||||
import numpy as np
|
|
||||||
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
from sklearn.linear_model import LogisticRegression
|
|
||||||
from sklearn.metrics import accuracy_score
|
|
||||||
from sklearn.preprocessing import StandardScaler
|
|
||||||
from sklearn. preprocessing import LabelEncoder
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
|
|
||||||
def download_dataset(dataset_address, destination_folder):
    """Download a Kaggle dataset and unzip it into *destination_folder*.

    Args:
        dataset_address: Dataset identifier passed to the Kaggle API.
        destination_folder: Directory the files are extracted into.
    """
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(
        dataset_address,
        path=destination_folder,
        unzip=True,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def check_datasets_presence():
    """Make sure ``datasets/`` exists and contains both source CSV files.

    Creates the folder when it is missing and downloads each dataset whose
    CSV file is not yet listed in it.
    """
    destination_folder = "datasets"
    # (expected CSV file, Kaggle dataset that provides it)
    required = (
        ("Spotify_Dataset.csv", 'gulczas/spotify-dataset'),
        ("spotify_songs.csv", 'joebeachcapital/30000-spotify-songs'),
    )

    if os.path.exists(destination_folder):
        print(f"Folder {destination_folder} już istnieje.")
    else:
        os.makedirs(destination_folder)
        print(f"Utworzono folder: {destination_folder}")

    # The folder is listed again for every file, as in the original.
    for csv_name, kaggle_dataset in required:
        if csv_name not in os.listdir(destination_folder):
            download_dataset(kaggle_dataset, destination_folder)
|
|
||||||
|
|
||||||
|
|
||||||
def datasets_preparation():
    """Build the training frame and write the held-out Docker test set.

    Reads ``datasets/spotify_songs.csv`` and ``datasets/Spotify_Dataset.csv``,
    drops rows with missing values, unused columns and duplicate track names.
    Tracks that occur only in the first file (at most 10000) are label-encoded
    and written to ``datasets/docker_test_dataset.csv`` unless that file
    already exists.

    Returns:
        DataFrame of the tracks present in both files (inner merge on
        ``track_name``) with ``track_artist``, ``playlist_name`` and
        ``playlist_genre`` label-encoded.
    """
    df_1 = pd.read_csv("datasets/spotify_songs.csv").dropna()
    df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";").dropna()
    df_2 = df_2.rename(columns={'Title': 'track_name'})

    columns_to_remove_df_1 = ['track_id', 'track_album_id', 'track_album_name', 'track_album_release_date',
                              'playlist_id', 'playlist_subgenre']
    columns_to_remove_df_2 = ['Date','# of Artist', 'Artist (Ind.)', '# of Nationality',
                              'Nationality', 'Continent', 'Points (Total)',
                              'Points (Ind for each Artist/Nat)', 'id', 'Song URL']

    df_1 = df_1.drop(columns=columns_to_remove_df_1)
    df_2 = df_2.drop(columns=columns_to_remove_df_2)
    df_1 = df_1.drop_duplicates(subset=['track_name'])
    df_2 = df_2.drop_duplicates(subset=['track_name'])

    le = LabelEncoder()
    encoded_columns = ['track_artist', 'playlist_name', 'playlist_genre']

    unique_names_df2 = df_2['track_name'].unique()
    # .copy(): the columns below are assigned on this frame, which would
    # otherwise be a slice of df_1 (chained-assignment warning).
    diff_df = df_1[~df_1['track_name'].isin(unique_names_df2)].iloc[:10000].copy()

    # NOTE(review): the encoder is re-fitted separately for this frame and for
    # result_df, so the integer codes of the two frames need not agree -
    # confirm this is acceptable for evaluation.
    for column in encoded_columns:
        diff_df[column] = le.fit_transform(diff_df[column])

    if "docker_test_dataset.csv" not in os.listdir("datasets"):
        diff_df.to_csv("datasets/docker_test_dataset.csv", index=False)

    result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
    result_df = result_df.drop_duplicates(subset=['track_name'])
    columns_to_remove_result_df = ['Rank', 'Artists', 'Danceability', 'Energy', 'Loudness',
                                   'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']
    result_df = result_df.drop(columns=columns_to_remove_result_df)

    for column in encoded_columns:
        result_df[column] = le.fit_transform(result_df[column])

    return result_df
|
|
||||||
|
|
||||||
|
|
||||||
# Script body: prepare the data, train a logistic-regression genre
# classifier, report its accuracy and pickle the fitted model.
check_datasets_presence()
result_df = datasets_preparation()

X = result_df.drop(columns='playlist_genre')
Y = result_df[['playlist_genre']]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

# Flatten the single-column target frames into 1-D arrays.
Y_train = np.ravel(Y_train)
Y_test = np.ravel(Y_test)

# Fit the scaler on the training features and reuse it for the test features.
numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, Y_train)

Y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

file_path = 'model.pkl'

# Remove any model left from an earlier run before writing the new one.
# NOTE(review): only the model is pickled; the fitted scaler is not saved.
if os.path.exists(file_path):
    os.remove(file_path)

if file_path not in os.listdir("./"):
    with open(file_path, 'wb') as file:
        pickle.dump(model, file)

    print("Model został zapisany do pliku:", file_path)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
|||||||
Real:['edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm']
|
|
||||||
Predicted: ['pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop']
|
|
||||||
Accuracy:0.1521
|
|
@ -1,3 +0,0 @@
|
|||||||
#!/bin/bash
# Entry script: runs the model-training script.

# exec replaces the shell with the Python process, so signals reach the
# interpreter directly instead of stopping at an intermediate shell.
exec python3 model_creator.py
|
|
65
use_model.py
65
use_model.py
@ -2,7 +2,54 @@ import pickle
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.preprocessing import StandardScaler
|
from sklearn.preprocessing import StandardScaler
|
||||||
from sklearn.metrics import accuracy_score
|
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
def calculate_metrics(result):
|
||||||
|
rmse = np.sqrt(mean_squared_error(result["Real"], result["Predictions"]))
|
||||||
|
f1 = f1_score(result["Real"], result["Predictions"], average='macro')
|
||||||
|
accuracy = accuracy_score(result["Real"], result["Predictions"])
|
||||||
|
|
||||||
|
filename = 'metrics_df.csv'
|
||||||
|
if os.path.exists(filename):
|
||||||
|
metrics_df = pd.read_csv(filename)
|
||||||
|
new_row = pd.DataFrame({'Build number': sys.argv[1], 'RMSE': [rmse], 'F1 Score': [f1], 'Accuracy': [accuracy]})
|
||||||
|
metrics_df = metrics_df.append(new_row, ignore_index=True)
|
||||||
|
else:
|
||||||
|
metrics_df = pd.DataFrame({'Build number': sys.argv[1], 'RMSE': [rmse], 'F1 Score': [f1], 'Accuracy': [accuracy]})
|
||||||
|
|
||||||
|
|
||||||
|
metrics_df.to_csv(filename, index=False)
|
||||||
|
|
||||||
|
def create_plots():
|
||||||
|
|
||||||
|
metrics_df = pd.read_csv("metrics_df.csv")
|
||||||
|
|
||||||
|
plt.plot(metrics_df["Build number"], metrics_df["Accuracy"])
|
||||||
|
plt.xlabel("Build Number")
|
||||||
|
plt.ylabel("Accuracy")
|
||||||
|
plt.title("Accuracy of the model over time")
|
||||||
|
plt.xticks(range(min(metrics_df["Build number"]), max(metrics_df["Build number"]) + 1))
|
||||||
|
plt.show()
|
||||||
|
plt.savefig("Accuracy_plot.png")
|
||||||
|
|
||||||
|
plt.plot(metrics_df["Build number"], metrics_df["F1 Score"])
|
||||||
|
plt.xlabel("Build Number")
|
||||||
|
plt.ylabel("F1 Score")
|
||||||
|
plt.title("F1 Score of the model over time")
|
||||||
|
plt.xticks(range(min(metrics_df["Build number"]), max(metrics_df["Build number"]) + 1))
|
||||||
|
plt.show()
|
||||||
|
plt.savefig("F1_score_plot.png")
|
||||||
|
|
||||||
|
plt.plot(metrics_df["Build number"], metrics_df["RMSE"])
|
||||||
|
plt.xlabel("Build Number")
|
||||||
|
plt.ylabel("RMSE")
|
||||||
|
plt.title("RMSE of the model over time")
|
||||||
|
plt.xticks(range(min(metrics_df["Build number"]), max(metrics_df["Build number"]) + 1))
|
||||||
|
plt.show()
|
||||||
|
plt.savefig("RMSE_plot.png")
|
||||||
|
|
||||||
np.set_printoptions(threshold=20)
|
np.set_printoptions(threshold=20)
|
||||||
|
|
||||||
@ -11,7 +58,7 @@ with open(file_path, 'rb') as file:
|
|||||||
model = pickle.load(file)
|
model = pickle.load(file)
|
||||||
print("Model został wczytany z pliku:", file_path)
|
print("Model został wczytany z pliku:", file_path)
|
||||||
|
|
||||||
test_df = pd.read_csv("datasets/docker_test_dataset.csv")
|
test_df = pd.read_csv("artifacts/docker_test_dataset.csv")
|
||||||
|
|
||||||
Y_test = test_df[['playlist_genre']]
|
Y_test = test_df[['playlist_genre']]
|
||||||
X_test = test_df.drop(columns='playlist_genre')
|
X_test = test_df.drop(columns='playlist_genre')
|
||||||
@ -23,14 +70,8 @@ X_test_scaled = scaler.fit_transform(X_test[numeric_columns])
|
|||||||
|
|
||||||
Y_pred = model.predict(X_test_scaled)
|
Y_pred = model.predict(X_test_scaled)
|
||||||
|
|
||||||
with open('model_predictions.txt', 'w') as f:
|
result = pd.DataFrame({'Predictions': Y_pred, "Real": Y_test})
|
||||||
pass
|
result.to_csv("spotify_genre_predictions.csv", index=False)
|
||||||
|
|
||||||
with open('model_predictions.txt', 'a') as f:
|
|
||||||
labels_dict = {0: 'edm', 1 : 'latin', 2 : 'pop', 3 : 'r&b', 4 : 'rap', 5 :'rock'}
|
|
||||||
Y_test_labels = [labels_dict[number] for number in Y_test]
|
|
||||||
Y_pred_labels = [labels_dict[number] for number in Y_pred]
|
|
||||||
f.write("Real:" + str(Y_test_labels[:20])+ " \nPredicted: "+ str(Y_pred_labels[:20]))
|
|
||||||
accuracy = accuracy_score(Y_test, Y_pred)
|
|
||||||
f.write("\nAccuracy:" + str(accuracy))
|
|
||||||
|
|
||||||
|
calculate_metrics(result)
|
||||||
|
create_plots()
|
Loading…
Reference in New Issue
Block a user