Compare commits

...

8 Commits

Author SHA1 Message Date
70e774b2fa added model training 2024-05-09 00:47:34 +02:00
9614bea42a added model training 2024-05-09 00:30:13 +02:00
6e51e6f7d4 added model training 2024-05-09 00:24:27 +02:00
8834036711 added model training 2024-05-09 00:23:19 +02:00
8e9f37aaff added model training 2024-05-09 00:19:43 +02:00
cd670b95ea added model training 2024-05-09 00:17:55 +02:00
1f42dd374e added model training 2024-05-09 00:13:31 +02:00
b20f390d38 added model training 2024-05-09 00:06:02 +02:00
21 changed files with 216 additions and 1304332 deletions

View File

@ -0,0 +1,17 @@
# Image used to run model_creator.py (model training).
#
# The base image is pinned to a fixed release: `latest` silently moves between
# Ubuntu releases, which makes builds non-reproducible, and newer releases mark
# the system Python as externally managed (PEP 668), rejecting a plain
# system-wide `pip3 install`.
FROM ubuntu:22.04

# Python toolchain plus download/unpack helpers. The apt package lists are
# removed in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get install -y \
        python3 \
        python3-pip \
        wget \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir pandas scikit-learn requests numpy

WORKDIR /app
COPY model_creator.py /app/
RUN chmod +x model_creator.py

View File

@ -0,0 +1,39 @@
// Training pipeline: triggered after the dataset job succeeds, copies that
// job's artifacts, trains the model inside the image built from the project's
// Dockerfile and archives the pickled model.
pipeline {
    agent any
    triggers {
        upstream(upstreamProjects: 'z-s464953-create-dataset', threshold: hudson.model.Result.SUCCESS)
    }
    parameters {
        string(name: 'TEST_SIZE', defaultValue: '0.10', description: 'Size of test dataset')
        string(name: 'MAX_ITER', defaultValue: '1000', description: 'Max number of iterations')
        buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')
    }
    stages {
        stage('Clone Repository') {
            steps {
                git branch: 'training', url: 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }
        stage('Copy Artifacts') {
            steps {
                copyArtifacts filter: 'artifacts/*', projectName: 'z-s464953-create-dataset', selector: buildParameter('BUILD_SELECTOR')
            }
        }
        stage('Run Docker') {
            agent {
                dockerfile {
                    filename 'Dockerfile'
                    // Reuse the node (and workspace) of the top-level agent so
                    // the copied artifacts are visible to the container.
                    reuseNode true
                }
            }
            steps {
                sh "python3 /app/model_creator.py ${params.TEST_SIZE} ${params.MAX_ITER}"
                // archiveArtifacts patterns are resolved relative to the
                // workspace, and model_creator.py writes 'model.pkl' into its
                // working directory, so the absolute '/app/model.pkl' never
                // matched anything.
                archiveArtifacts artifacts: 'model.pkl', onlyIfSuccessful: true
            }
        }
    }
}

View File

@ -0,0 +1,118 @@
import pandas as pd
import os
import numpy as np
import shutil
import sys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn. preprocessing import LabelEncoder
import pickle
def check_datasets_presence():
    """Make sure both Spotify CSV files are available in ``artifacts/``.

    A dataset found in the current working directory is moved into
    ``artifacts/``; otherwise it must already be present there.

    Raises:
        FileNotFoundError: If ``artifacts/`` does not exist, or if a dataset
            is neither in the working directory nor in ``artifacts/``.
    """
    destination_folder = "artifacts"
    if not os.path.exists(destination_folder):
        raise FileNotFoundError(destination_folder + " folder not found")

    for dataset in ("Spotify_Dataset.csv", "spotify_songs.csv"):
        # List the working directory ("."), not the filesystem root ("/."):
        # shutil.move below resolves the bare file name against the working
        # directory, so that is where the file has to be looked for.
        if dataset in os.listdir("."):
            shutil.move(dataset, destination_folder)
        elif dataset not in os.listdir(destination_folder):
            raise FileNotFoundError(dataset + " not found")
def datasets_preparation():
    """Build the training frame from the two Spotify CSV files in ``artifacts/``.

    Reads ``artifacts/spotify_songs.csv`` and ``artifacts/Spotify_Dataset.csv``
    (semicolon separated), drops rows with missing values, unused columns and
    duplicate track names, and inner-joins the two frames on ``track_name``.

    Side effect: tracks of the first file that do not occur in the second one
    (at most the first 10000) are written to
    ``artifacts/docker_test_dataset.csv`` unless that file already exists.

    Returns:
        pandas.DataFrame: The merged frame with ``track_artist``,
        ``playlist_name`` and ``playlist_genre`` label-encoded.
    """
    df_1 = pd.read_csv("artifacts/spotify_songs.csv").dropna()
    df_2 = pd.read_csv("artifacts/Spotify_Dataset.csv", sep=";").dropna()
    df_2 = df_2.rename(columns={'Title': 'track_name'})

    columns_to_remove_df_1 = ['track_id', 'track_album_id', 'track_album_name', 'track_album_release_date',
                              'playlist_id', 'playlist_subgenre']
    columns_to_remove_df_2 = ['Date', '# of Artist', 'Artist (Ind.)', '# of Nationality',
                              'Nationality', 'Continent', 'Points (Total)',
                              'Points (Ind for each Artist/Nat)', 'id', 'Song URL']
    df_1 = df_1.drop(columns=columns_to_remove_df_1).drop_duplicates(subset=['track_name'])
    df_2 = df_2.drop(columns=columns_to_remove_df_2).drop_duplicates(subset=['track_name'])

    categorical_columns = ['track_artist', 'playlist_name', 'playlist_genre']

    # Hold-out set: tracks that exist only in the first file. The explicit
    # .copy() turns the selection into an independent frame, so the column
    # assignments below do not write into a slice of df_1.
    unique_names_df2 = df_2['track_name'].unique()
    diff_df = df_1[~df_1['track_name'].isin(unique_names_df2)].iloc[:10000].copy()
    # NOTE(review): the encoders for the hold-out set and for the training
    # frame are fitted independently, so their integer codes only agree when
    # both frames contain the same set of values -- confirm this is intended.
    for column in categorical_columns:
        diff_df[column] = LabelEncoder().fit_transform(diff_df[column])
    if "docker_test_dataset.csv" not in os.listdir("artifacts"):
        diff_df.to_csv("artifacts/docker_test_dataset.csv", index=False)

    result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
    result_df = result_df.drop_duplicates(subset=['track_name'])
    columns_to_remove_result_df = ['Rank', 'Artists', 'Danceability', 'Energy', 'Loudness',
                                   'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']
    result_df = result_df.drop(columns=columns_to_remove_result_df)
    for column in categorical_columns:
        result_df[column] = LabelEncoder().fit_transform(result_df[column])
    return result_df
# Train a logistic-regression classifier for ``playlist_genre`` and pickle it.
#
# Usage: model_creator.py [TEST_SIZE] [MAX_ITER]
# Both arguments are optional; without them the script falls back to a 10%
# test split and 1000 iterations instead of failing with an IndexError.
test_size = float(sys.argv[1]) if len(sys.argv) > 1 else 0.10
max_iter = int(sys.argv[2]) if len(sys.argv) > 2 else 1000

check_datasets_presence()
result_df = datasets_preparation()

Y = result_df[['playlist_genre']]
X = result_df.drop(columns='playlist_genre')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
# The estimator expects 1-D target arrays, not single-column frames.
Y_train = np.ravel(Y_train)
Y_test = np.ravel(Y_test)

# Only the numeric columns are used as features; the scaler is fitted on the
# training part and then applied unchanged to the test part.
scaler = StandardScaler()
numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])

model = LogisticRegression(max_iter=max_iter)
model.fit(X_train_scaled, Y_train)

Y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

file_path = 'model.pkl'
# Mode 'wb' truncates an existing file, so no separate remove/existence check
# is needed before writing the new model.
with open(file_path, 'wb') as file:
    pickle.dump(model, file)
print("Model został zapisany do pliku:", file_path)

View File

@ -1,26 +1,17 @@
FROM ubuntu:latest FROM ubuntu:latest
ENV KAGGLE_USERNAME=gulczas
ENV KAGGLE_KEY=default_key
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y \ apt-get install -y \
python3 \ python3 \
python3-pip \ python3-pip \
wget \ wget \
unzip \ unzip \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN pip3 install pandas scikit-learn requests kaggle numpy RUN pip3 install pandas scikit-learn requests numpy
WORKDIR /app WORKDIR /app
COPY model_creator.py /app/ COPY model_creator.py /app/
COPY use_model.py /app/
COPY run_py_scripts.sh /app/
RUN chmod +x model_creator.py
RUN chmod +x model_creator.py
RUN chmod +x use_model.py
CMD ["bash", "run_py_scripts.sh"]

42
Jenkinsfile vendored
View File

@ -1,40 +1,38 @@
pipeline { pipeline {
agent any agent any
triggers {
upstream(upstreamProjects: 'z-s464953-create-dataset', threshold: hudson.model.Result.SUCCESS)
}
parameters { parameters {
string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username') string(name: 'TEST_SIZE', defaultValue: '0.10', description: 'Size of test dataset')
password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key') string(name: 'MAX_ITER', defaultValue: '1000', description: 'Max number of iterations')
string(name: 'CUTOFF', defaultValue: '90', description: 'Number of rows to cut') buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts', name: 'BUILD_SELECTOR')
} }
stages { stages {
stage('Clone Repository') { stage('Clone Repository') {
steps { steps {
git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git' git branch: 'training', url: 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
} }
} }
stage('Cleanup Artifacts') { stage('Copy Artifacts') {
steps { steps {
script { copyArtifacts filter: 'artifacts/*', projectName: 'z-s464953-create-dataset', selector: buildParameter('BUILD_SELECTOR')
sh 'rm -rf artifacts' }
}
stage("Run Docker") {
agent {
dockerfile {
filename 'Dockerfile'
reuseNode true
} }
} }
}
stage('Run Script') {
steps { steps {
script {
withEnv([ sh "python3 /app/model_creator.py ${params.TEST_SIZE} ${params.MAX_ITER}"
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}", archiveArtifacts artifacts: 'model.pkl, artifacts/docker_test_dataset.csv', onlyIfSuccessful: true
"KAGGLE_KEY=${env.KAGGLE_KEY}"])
{
sh "bash ./download_dataset.sh ${params.CUTOFF}"
}
}
}
}
stage('Archive Artifacts') {
steps {
archiveArtifacts artifacts: 'artifacts/*', onlyIfSuccessful: true
} }
} }
} }

View File

@ -1,57 +0,0 @@
// Builds the project's Docker image, runs it with the Kaggle credentials
// passed in as build parameters and archives the resulting stats.txt.
pipeline {
    agent any
    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }
    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }
        stage('Stop and remove existing container') {
            steps {
                script {
                    // "|| true": a missing container must not fail the build.
                    sh 'docker stop s464953 || true'
                    sh 'docker rm s464953 || true'
                }
            }
        }
        stage('Build Docker image') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // Single-quoted Groovy string: the shell expands the
                        // variables from the environment, so the API key is
                        // not interpolated into the command text by Groovy,
                        // and the double quotes keep values with special
                        // characters intact.
                        // NOTE(review): values passed via --build-arg can end
                        // up in the image metadata -- confirm the key is
                        // really needed at build time.
                        sh 'docker build --build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s464953 .'
                    }
                }
            }
        }
        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // The workspace is mounted at /app inside the container.
                        sh 'docker run --name s464953 -e KAGGLE_USERNAME="$KAGGLE_USERNAME" -e KAGGLE_KEY="$KAGGLE_KEY" -v "$WORKSPACE":/app s464953'
                    }
                }
            }
        }
        stage('Archive stats.txt artifact') {
            steps {
                archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
            }
        }
    }
}

View File

@ -1,44 +0,0 @@
// Runs the published image michalgulczynski/ium_s464953:1.0 with the Kaggle
// credentials passed in as build parameters and archives stats.txt.
pipeline {
    agent any
    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }
    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }
        stage('Stop and remove existing container') {
            steps {
                script {
                    // "|| true": a missing container must not fail the build.
                    sh 'docker stop s464953 || true'
                    sh 'docker rm s464953 || true'
                }
            }
        }
        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // Single-quoted Groovy string: the shell expands the
                        // variables from the environment, so the API key is
                        // not interpolated into the command text by Groovy,
                        // and the double quotes keep values with special
                        // characters intact.
                        sh 'docker run --name s464953 -e KAGGLE_USERNAME="$KAGGLE_USERNAME" -e KAGGLE_KEY="$KAGGLE_KEY" -v "$WORKSPACE":/app michalgulczynski/ium_s464953:1.0'
                    }
                }
            }
        }
        stage('Archive stats.txt artifact') {
            steps {
                archiveArtifacts artifacts: 'stats.txt', allowEmptyArchive: true
            }
        }
    }
}

View File

@ -1,57 +0,0 @@
// Builds the project's Docker image, runs it with the Kaggle credentials
// passed in as build parameters and archives the resulting model.pkl.
pipeline {
    agent any
    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }
    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }
        stage('Stop and remove existing container') {
            steps {
                script {
                    // "|| true": a missing container must not fail the build.
                    sh 'docker stop s464953 || true'
                    sh 'docker rm s464953 || true'
                }
            }
        }
        stage('Build Docker image') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // Single-quoted Groovy string: the shell expands the
                        // variables from the environment, so the API key is
                        // not interpolated into the command text by Groovy,
                        // and the double quotes keep values with special
                        // characters intact.
                        // NOTE(review): values passed via --build-arg can end
                        // up in the image metadata -- confirm the key is
                        // really needed at build time.
                        sh 'docker build --build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s464953 .'
                    }
                }
            }
        }
        stage('Run Docker container') {
            steps {
                script {
                    withEnv([
                        "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                        "KAGGLE_KEY=${env.KAGGLE_KEY}"
                    ]) {
                        // The workspace is mounted at /app inside the container.
                        sh 'docker run --name s464953 -e KAGGLE_USERNAME="$KAGGLE_USERNAME" -e KAGGLE_KEY="$KAGGLE_KEY" -v "$WORKSPACE":/app s464953'
                    }
                }
            }
        }
        // The stage label now names the file that is actually archived
        // (it previously still said stats.txt).
        stage('Archive model.pkl artifact') {
            steps {
                archiveArtifacts artifacts: 'model.pkl', allowEmptyArchive: true
            }
        }
    }
}

View File

@ -1,42 +0,0 @@
// Dataset statistics pipeline: copies the artifacts of the dataset job, runs
// dataset_stats.sh on them and archives everything under artifacts/.
pipeline {
    agent any
    parameters {
        buildSelector(defaultSelector: lastSuccessful(), description: 'Build for copying artifacts', name: 'BUILD_SELECTOR')
    }
    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }
        stage('Cleanup Artifacts') {
            steps {
                // Start from a clean directory so stale files from an earlier
                // build are not archived again.
                sh 'rm -rf artifacts'
            }
        }
        stage('Copy Artifact') {
            steps {
                // buildParameter takes the *name* of the build-selector
                // parameter; no environment indirection is needed.
                copyArtifacts fingerprintArtifacts: true, projectName: 'z-s464953-create-dataset', selector: buildParameter('BUILD_SELECTOR')
            }
        }
        stage('Execute Shell Script') {
            steps {
                sh 'bash ./dataset_stats.sh'
            }
        }
        stage('Archive Results') {
            steps {
                archiveArtifacts artifacts: 'artifacts/*', onlyIfSuccessful: true
            }
        }
    }
}

View File

@ -1,113 +0,0 @@
#!/usr/bin/env python
# Library imports
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
import requests
from sklearn.preprocessing import MinMaxScaler
from kaggle.api.kaggle_api_extended import KaggleApi
# Download helper (Kaggle-hosted dataset)
def download_file(url, filename, destination_folder):
    """Download and unpack the ``gulczas/spotify-dataset`` Kaggle dataset.

    Args:
        url: Not used by the Kaggle client; kept so existing callers keep
            working.
        filename: Name of the file the caller expects inside the archive.
        destination_folder: Directory the archive is unpacked into.

    Returns:
        str: ``destination_folder`` joined with ``filename``. The function
        previously returned ``None`` although the caller stores the result
        as a file path.
    """
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files('gulczas/spotify-dataset', path=destination_folder, unzip=True)
    return os.path.join(destination_folder, filename)
# Dataset splitting helper
def split_dataset(data, test_size=0.2, val_size=0.1, random_state=42):
    """Split *data* into train, validation and test parts.

    Args:
        data: The dataset to split.
        test_size: Fraction of the whole dataset that goes to the test part.
        val_size: Fraction of the whole dataset that goes to the validation
            part.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        tuple: ``(train_data, val_data, test_data)``.
    """
    # First cut: separate the test part from everything else.
    remainder, test_part = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    # val_size is expressed relative to the whole dataset, so rescale it to a
    # fraction of what is left after the test part has been removed.
    relative_val_size = val_size / (1 - test_size)
    train_part, val_part = train_test_split(
        remainder, test_size=relative_val_size, random_state=random_state
    )
    return train_part, val_part, test_part
# Dataset statistics report
def print_dataset_stats(data, subset_name):
    """Append descriptive statistics of *data* to ``stats.txt``.

    Writes the subset size, the output of ``data.describe()`` and, for every
    column, its ``value_counts()``.

    Args:
        data: pandas DataFrame to describe.
        subset_name: Label of the subset, used in the report headings.
    """
    # Explicit UTF-8: the headings contain non-ASCII characters, so the result
    # must not depend on the locale's default encoding.
    with open('stats.txt', 'a', encoding='utf-8') as stats_file:
        print(f"Statystyki dla zbioru {subset_name}:", file=stats_file)
        print(f"Wielkość zbioru {subset_name}: {len(data)}", file=stats_file)
        print("\nStatystyki wartości poszczególnych parametrów:", file=stats_file)
        print(data.describe(), file=stats_file)
        for column in data.columns:
            print(f"Rozkład częstości dla kolumny '{column}':", file=stats_file)
            print(data[column].value_counts(), file=stats_file)
            print("\n", file=stats_file)
# Data normalisation
def normalize_data(data):
    """Return a copy of *data* whose int/float columns are min-max scaled.

    The scaler is fitted on the numeric columns of *data* itself; columns of
    other dtypes are copied unchanged and *data* is not modified.
    """
    numeric_columns = data.select_dtypes(include=['int', 'float']).columns
    fitted_scaler = MinMaxScaler().fit(data[numeric_columns])
    scaled = data.copy()
    scaled[numeric_columns] = fitted_scaler.transform(scaled[numeric_columns])
    return scaled
# Data cleaning
def clean_dataset(data):
    """Return *data* without rows containing missing values or duplicates.

    A new frame is returned and the argument is left untouched; the previous
    in-place variant silently modified the caller's frame as well.
    """
    return data.dropna().drop_duplicates()
# main
url = "https://www.kaggle.com/datasets/gulczas/spotify-dataset?select=Spotify_Dataset.csv"
filename = "Spotify_Dataset.csv"
destination_folder = "datasets"

# Make sure the destination folder exists
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    print(f"Utworzono folder: {destination_folder}")
else:
    print(f"Folder {destination_folder} już istnieje.")

# Download only when the file has not been downloaded yet
if 'Spotify_Dataset.csv' not in os.listdir(destination_folder):
    filepath = download_file(url, filename, destination_folder)

# Load the data from the CSV file
data = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")

# Split the dataset into training, validation and test sets
train_data, val_data, test_data = split_dataset(data)

# Save the splits to separate CSV files
train_data.to_csv("datasets/train.csv", index=False)
val_data.to_csv("datasets/val.csv", index=False)
test_data.to_csv("datasets/test.csv", index=False)

# Write the statistics for each split
print_dataset_stats(train_data, "treningowego")
print("\n")
print_dataset_stats(val_data, "walidacyjnego")
print("\n")
print_dataset_stats(test_data, "testowego")

# Normalise and clean each split. Every split is processed from its OWN data;
# the validation and test sets were previously overwritten with the processed
# training set.
# NOTE(review): the CSV files above are written before this step, so they hold
# the raw, un-normalised data -- confirm that is intended.
train_data = clean_dataset(normalize_data(train_data))
val_data = clean_dataset(normalize_data(val_data))
test_data = clean_dataset(normalize_data(test_data))

View File

@ -1,13 +0,0 @@
#!/bin/bash
# Record the line count of every dataset split and store the reports in
# artifacts/.

# Print a banner for one split and write its `wc -l` output to
# stats_<split>.txt in the current directory.
#   $1 - label shown in the banner
#   $2 - split name (file stem under artifacts/)
report_split() {
    local label="$1" split="$2"
    echo "------------------ ${label} dataset stats ------------------"
    wc -l "artifacts/${split}.csv" > "stats_${split}.txt"
}

report_split "Train" train
report_split "Validation" validation
report_split "Test" test

mkdir -p data
mv stats_train.txt stats_validation.txt stats_test.txt artifacts/

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,26 +0,0 @@
#!/bin/bash
# Download the Spotify dataset from Kaggle, keep a random sample of <rows> data
# rows and split it into train / test / validation CSV files under artifacts/.
#
# Usage: download_dataset.sh <rows>

# Stop on the first failing command or unset variable instead of carrying on
# with empty or partial files.
set -eu

cutoff="${1:?usage: $0 <rows>}"

pip install kaggle --upgrade
kaggle datasets download -d gulczas/spotify-dataset
unzip -o spotify-dataset.zip

echo "------------------ Shufle ------------------"
# Shuffle the data rows only. Shuffling the whole file would move the CSV
# header to a random position and leave the splits without column names.
header=$(head -n 1 Spotify_Dataset.csv)
tail -n +2 Spotify_Dataset.csv | shuf -o shuffled_spotify.csv

echo "------------------ Cut off to top $cutoff rows ------------------"
head -n "$cutoff" shuffled_spotify.csv > cutoff_rows.csv
{ printf '%s\n' "$header"; cat cutoff_rows.csv; } > cutoff_spotify.csv

echo "------------------ Split ------------------"
# 10% test, 10% validation, the rest training (counted on data rows).
total_lines=$(wc -l < cutoff_rows.csv)
num_test=$((total_lines / 10))
num_validation=$num_test
num_train=$((total_lines - num_test - num_validation))

# Write the header followed by the rows read from stdin to the file "$1".
write_split() {
    { printf '%s\n' "$header"; cat; } > "$1"
}

head -n "$num_train" cutoff_rows.csv | write_split train.csv
tail -n "$((num_test + num_validation))" cutoff_rows.csv | head -n "$num_test" | write_split test.csv
tail -n "$num_validation" cutoff_rows.csv | write_split validation.csv
rm -f cutoff_rows.csv

mkdir -p artifacts
mv Spotify_Dataset.csv cutoff_spotify.csv train.csv validation.csv test.csv artifacts/

BIN
model.pkl

Binary file not shown.

View File

@ -1,7 +1,8 @@
import pandas as pd import pandas as pd
import os import os
import numpy as np import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi import shutil
import sys
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score from sklearn.metrics import accuracy_score
@ -9,37 +10,27 @@ from sklearn.preprocessing import StandardScaler
from sklearn. preprocessing import LabelEncoder from sklearn. preprocessing import LabelEncoder
import pickle import pickle
def download_dataset(dataset_address, destination_folder):
api = KaggleApi()
api.authenticate()
api.dataset_download_files(dataset_address, path=destination_folder, unzip=True)
def check_datasets_presence(): def check_datasets_presence():
dataset_1 = "Spotify_Dataset.csv" dataset_1 = "Spotify_Dataset.csv"
dataset_2 = "spotify_songs.csv" dataset_2 = "spotify_songs.csv"
destination_folder = "datasets" destination_folder = "artifacts"
if not os.path.exists(destination_folder): if not os.path.exists(destination_folder):
os.makedirs(destination_folder) raise FileNotFoundError(destination_folder + " folder not found")
print(f"Utworzono folder: {destination_folder}") if dataset_1 in os.listdir("/."):
else: shutil.move(dataset_1, destination_folder)
print(f"Folder {destination_folder} już istnieje.") elif dataset_1 not in os.listdir(destination_folder):
raise FileNotFoundError(dataset_1 + " not found")
if dataset_1 not in os.listdir(destination_folder):
download_dataset('gulczas/spotify-dataset', destination_folder) if dataset_2 in os.listdir("/."):
shutil.move(dataset_2, destination_folder)
if dataset_2 not in os.listdir(destination_folder): elif dataset_2 not in os.listdir(destination_folder):
download_dataset('joebeachcapital/30000-spotify-songs', destination_folder) raise FileNotFoundError(dataset_2 + " not found")
def datasets_preparation(): def datasets_preparation():
df_1 = pd.read_csv("datasets/spotify_songs.csv") df_1 = pd.read_csv("artifacts/spotify_songs.csv")
df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";") df_2 = pd.read_csv("artifacts/Spotify_Dataset.csv", sep=";")
df_1 = df_1.dropna() df_1 = df_1.dropna()
df_2 = df_2.dropna() df_2 = df_2.dropna()
@ -69,8 +60,8 @@ def datasets_preparation():
#df_1 = df_1.iloc[20:] #df_1 = df_1.iloc[20:]
if "docker_test_dataset.csv" not in os.listdir("datasets"): if "docker_test_dataset.csv" not in os.listdir("artifacts"):
diff_df.to_csv("datasets/docker_test_dataset.csv", index=False) diff_df.to_csv("artifacts/docker_test_dataset.csv", index=False)
result_df = pd.merge(df_1, df_2, on='track_name', how='inner') result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
result_df = result_df.drop_duplicates(subset=['track_name']) result_df = result_df.drop_duplicates(subset=['track_name'])
@ -89,7 +80,7 @@ check_datasets_presence()
result_df = datasets_preparation() result_df = datasets_preparation()
Y = result_df[['playlist_genre']] Y = result_df[['playlist_genre']]
X = result_df.drop(columns='playlist_genre') X = result_df.drop(columns='playlist_genre')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42) X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=float(sys.argv[1]), random_state=42)
Y_train = np.ravel(Y_train) Y_train = np.ravel(Y_train)
@ -100,7 +91,7 @@ numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns
X_train_scaled = scaler.fit_transform(X_train[numeric_columns]) X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns]) X_test_scaled = scaler.transform(X_test[numeric_columns])
model = LogisticRegression(max_iter=1000) model = LogisticRegression(max_iter=int(sys.argv[2]))
model.fit(X_train_scaled, Y_train) model.fit(X_train_scaled, Y_train)

View File

@ -1,3 +0,0 @@
Real:['edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm', 'edm']
Predicted: ['pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop', 'pop']
Accuracy:0.1521

View File

@ -1,3 +0,0 @@
#!/bin/bash
# Run the model training script with the system python3 interpreter.
python3 model_creator.py

View File

@ -1,36 +0,0 @@
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
# Evaluate the pickled model on the hold-out CSV and write a short report to
# model_predictions.txt.
np.set_printoptions(threshold=20)

file_path = 'model.pkl'
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load model files produced by a trusted training run.
with open(file_path, 'rb') as file:
    model = pickle.load(file)
print("Model został wczytany z pliku:", file_path)

test_df = pd.read_csv("datasets/docker_test_dataset.csv")
Y_test = test_df[['playlist_genre']]
X_test = test_df.drop(columns='playlist_genre')
# Flatten the single-column frame into a 1-D array of labels.
Y_test = np.ravel(Y_test)

# NOTE(review): this scaler is fitted on the evaluation data itself rather
# than reusing the one fitted during training -- confirm this is intended.
scaler = StandardScaler()
numeric_columns = X_test.select_dtypes(include=['int', 'float']).columns
X_test_scaled = scaler.fit_transform(X_test[numeric_columns])
Y_pred = model.predict(X_test_scaled)

# Integer class codes mapped back to readable genre names for the report.
labels_dict = {0: 'edm', 1: 'latin', 2: 'pop', 3: 'r&b', 4: 'rap', 5: 'rock'}
Y_test_labels = [labels_dict[number] for number in Y_test]
Y_pred_labels = [labels_dict[number] for number in Y_pred]
accuracy = accuracy_score(Y_test, Y_pred)

# Mode 'w' truncates an existing report, so a single open replaces the earlier
# "truncate, then reopen in append mode" sequence.
with open('model_predictions.txt', 'w') as f:
    f.write("Real:" + str(Y_test_labels[:20]) + " \nPredicted: " + str(Y_pred_labels[:20]))
    f.write("\nAccuracy:" + str(accuracy))