diff --git a/dvc.yaml b/dvc.yaml
new file mode 100644
index 0000000..21ca325
--- /dev/null
+++ b/dvc.yaml
@@ -0,0 +1,20 @@
+stages:
+  train:
+    cmd: python model_creator_dvc.py 0.2 100
+    deps:
+      - model_creator_dvc.py
+      - spotify_songs.csv
+      - Spotify_Dataset.csv
+    outs:
+      - model.pkl
+      - docker_test_dataset.csv
+
+  predict:
+    cmd: python use_model_dvc.py 1
+    deps:
+      - use_model_dvc.py
+      - model.pkl
+      - docker_test_dataset.csv
+    outs:
+      - spotify_genre_predictions.csv
+      - metrics_df.csv
diff --git a/model_creator_dvc.py b/model_creator_dvc.py
new file mode 100644
index 0000000..1163cb5
--- /dev/null
+++ b/model_creator_dvc.py
@@ -0,0 +1,116 @@
+"""Train the Spotify genre classifier for the DVC ``train`` stage.
+
+Usage: python model_creator_dvc.py <test_size> <max_iter>
+
+Reads spotify_songs.csv and Spotify_Dataset.csv, writes the held-out
+evaluation set to docker_test_dataset.csv and the fitted model to model.pkl.
+"""
+import pandas as pd
+import os
+import numpy as np
+import shutil
+import sys
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import LabelEncoder
+import pickle
+
+MODEL_PATH = 'model.pkl'
+TEST_DATASET_PATH = 'docker_test_dataset.csv'
+TARGET_COLUMN = 'playlist_genre'
+CATEGORICAL_COLUMNS = ['track_artist', 'playlist_name', 'playlist_genre']
+
+
+def encode_categorical_columns(df):
+    """Return a copy of ``df`` with the categorical columns label-encoded."""
+    encoded = df.copy()
+    for column in CATEGORICAL_COLUMNS:
+        encoded[column] = LabelEncoder().fit_transform(encoded[column])
+    return encoded
+
+
+def datasets_preparation():
+    """Build the training frame and write the held-out evaluation set.
+
+    Tracks of spotify_songs.csv whose name also appears in
+    Spotify_Dataset.csv form the training frame; up to 10000 of the other
+    tracks are written to docker_test_dataset.csv for the predict stage.
+
+    Returns:
+        DataFrame of encoded training rows, including ``playlist_genre``.
+    """
+    df_1 = pd.read_csv("spotify_songs.csv")
+    df_2 = pd.read_csv("Spotify_Dataset.csv", sep=";")
+
+    df_1 = df_1.dropna()
+    df_2 = df_2.dropna()
+    df_2 = df_2.rename(columns={'Title': 'track_name'})
+
+    columns_to_remove_df_1 = ['track_id', 'track_album_id', 'track_album_name', 'track_album_release_date',
+                              'playlist_id', 'playlist_subgenre']
+    columns_to_remove_df_2 = ['Date','# of Artist', 'Artist (Ind.)', '# of Nationality',
+                              'Nationality', 'Continent', 'Points (Total)',
+                              'Points (Ind for each Artist/Nat)', 'id', 'Song URL']
+
+    df_1 = df_1.drop(columns=columns_to_remove_df_1)
+    df_2 = df_2.drop(columns=columns_to_remove_df_2)
+    df_1 = df_1.drop_duplicates(subset=['track_name'])
+    df_2 = df_2.drop_duplicates(subset=['track_name'])
+
+    # Encode once, before splitting: training and evaluation rows must share
+    # the same integer codes, otherwise predictions made on the evaluation
+    # set are compared against differently numbered genres.
+    df_1 = encode_categorical_columns(df_1)
+
+    unique_names_df2 = df_2['track_name'].unique()
+    diff_df = df_1[~df_1['track_name'].isin(unique_names_df2)]
+    diff_df = diff_df.iloc[:10000]
+
+    # Always rewrite the file so it cannot go stale relative to the inputs.
+    diff_df.to_csv(TEST_DATASET_PATH, index=False)
+
+    result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
+    result_df = result_df.drop_duplicates(subset=['track_name'])
+    columns_to_remove_result_df = ['Rank', 'Artists', 'Danceability', 'Energy', 'Loudness',
+                                   'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']
+    result_df = result_df.drop(columns=columns_to_remove_result_df)
+
+    return result_df
+
+
+def main():
+    """Train the model, print its hold-out accuracy and pickle it."""
+    if len(sys.argv) < 3:
+        sys.exit("Usage: python model_creator_dvc.py <test_size> <max_iter>")
+    test_size = float(sys.argv[1])
+    max_iter = int(sys.argv[2])
+
+    result_df = datasets_preparation()
+    Y = result_df[TARGET_COLUMN]
+    X = result_df.drop(columns=TARGET_COLUMN)
+    X_train, X_test, Y_train, Y_test = train_test_split(
+        X, Y, test_size=test_size, random_state=42)
+
+    numeric_columns = X_train.select_dtypes(include='number').columns
+
+    # Scaler and classifier are pickled as one pipeline so the predict stage
+    # reuses the scaling learned here instead of refitting it on its own data.
+    model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=max_iter))
+    model.fit(X_train[numeric_columns], Y_train)
+
+    Y_pred = model.predict(X_test[numeric_columns])
+    accuracy = accuracy_score(Y_test, Y_pred)
+    print("Accuracy:", accuracy)
+
+    # Mode 'wb' truncates an existing file, so no prior removal is needed.
+    with open(MODEL_PATH, 'wb') as file:
+        pickle.dump(model, file)
+
+    print("Model został zapisany do pliku:", MODEL_PATH)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/use_model_dvc.py b/use_model_dvc.py
new file mode 100644
index 0000000..6405cc8
--- /dev/null
+++ b/use_model_dvc.py
@@ -0,0 +1,71 @@
+"""Evaluate the trained genre classifier for the DVC ``predict`` stage.
+
+Loads model.pkl, predicts the genres of docker_test_dataset.csv, writes the
+predictions to spotify_genre_predictions.csv and appends the metrics of the
+run to metrics_df.csv.
+"""
+import pickle
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
+import sys
+import os
+import matplotlib.pyplot as plt
+
+MODEL_PATH = 'model.pkl'
+TEST_DATASET_PATH = 'docker_test_dataset.csv'
+PREDICTIONS_PATH = 'spotify_genre_predictions.csv'
+METRICS_PATH = 'metrics_df.csv'
+TARGET_COLUMN = 'playlist_genre'
+
+
+def calculate_metrics(result):
+    """Append RMSE, macro F1 and accuracy of ``result`` to metrics_df.csv.
+
+    Args:
+        result: DataFrame with ``Real`` and ``Predictions`` label columns.
+    """
+    rmse = np.sqrt(mean_squared_error(result["Real"], result["Predictions"]))
+    f1 = f1_score(result["Real"], result["Predictions"], average='macro')
+    accuracy = accuracy_score(result["Real"], result["Predictions"])
+
+    metrics_df = pd.DataFrame({'RMSE': [rmse], 'F1 Score': [f1], 'Accuracy': [accuracy]})
+    if os.path.exists(METRICS_PATH):
+        # Keep the rows of earlier runs and add this run as the last row.
+        metrics_df = pd.concat([pd.read_csv(METRICS_PATH), metrics_df], ignore_index=True)
+
+    metrics_df.to_csv(METRICS_PATH, index=False)
+
+
+def main():
+    """Load the model, predict the evaluation set and store the results."""
+    np.set_printoptions(threshold=20)
+
+    # Unpickling can execute arbitrary code: only load a model.pkl written by
+    # the train stage, never a file from an untrusted source.
+    with open(MODEL_PATH, 'rb') as file:
+        model = pickle.load(file)
+    print("Model został wczytany z pliku:", MODEL_PATH)
+
+    test_df = pd.read_csv(TEST_DATASET_PATH)
+    Y_test = test_df[TARGET_COLUMN].to_numpy()
+    X_test = test_df.drop(columns=TARGET_COLUMN)
+
+    # The pickled pipeline carries the scaler fitted on the training data, so
+    # no scaler is fitted here; use the feature columns the pipeline was
+    # trained on when it recorded them.
+    feature_columns = getattr(model, 'feature_names_in_', None)
+    if feature_columns is None:
+        feature_columns = X_test.select_dtypes(include='number').columns
+
+    Y_pred = model.predict(X_test[list(feature_columns)])
+
+    result = pd.DataFrame({'Predictions': Y_pred, "Real": Y_test})
+    result.to_csv(PREDICTIONS_PATH, index=False)
+
+    calculate_metrics(result)
+
+
+if __name__ == "__main__":
+    main()