From af91b85a300d7f99c0a725bcb63746afcf1f5dcc Mon Sep 17 00:00:00 2001
From: Michal Gulczynski
Date: Tue, 11 Jun 2024 19:06:13 +0200
Subject: [PATCH] ium_07 sacred

---
 Dockerfile                     |  17 +-----
 Jenkinsfile_sacred             |  42 +++++++++++++
 sacred/sacred_model_creator.py | 108 +++++++++++++++++++++++++++++++++
 sacred/sacred_use_model.py     |  47 ++++++++++++++
 4 files changed, 198 insertions(+), 16 deletions(-)
 create mode 100644 Jenkinsfile_sacred
 create mode 100644 sacred/sacred_model_creator.py
 create mode 100644 sacred/sacred_use_model.py

diff --git a/Dockerfile b/Dockerfile
index 8ff1399..2368de7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,5 @@
 FROM ubuntu:latest
 
-ENV KAGGLE_USERNAME=gulczas
-ENV KAGGLE_KEY=default_key
-
 RUN apt-get update && \
     apt-get install -y \
     python3 \
@@ -11,16 +8,4 @@ RUN apt-get update && \
     unzip \
     && rm -rf /var/lib/apt/lists/*
 
-RUN pip3 install pandas scikit-learn requests kaggle numpy
-
-WORKDIR /app
-
-COPY model_creator.py /app/
-COPY use_model.py /app/
-COPY run_py_scripts.sh /app/
-
-
-RUN chmod +x model_creator.py
-RUN chmod +x use_model.py
-
-CMD ["bash", "run_py_scripts.sh"]
\ No newline at end of file
+RUN pip3 install pandas scikit-learn requests kaggle numpy sacred pymongo
\ No newline at end of file
diff --git a/Jenkinsfile_sacred b/Jenkinsfile_sacred
new file mode 100644
index 0000000..15a0940
--- /dev/null
+++ b/Jenkinsfile_sacred
@@ -0,0 +1,42 @@
+pipeline {
+    agent any
+
+    parameters {
+        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
+        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
+    }
+
+    stages {
+        stage('Clone Repository') {
+            steps {
+                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
+            }
+        }
+
+        stage('Build and Run Experiments') {
+            agent {
+                dockerfile {
+                    reuseNode true
+                }
+            }
+
+            environment {
+                KAGGLE_USERNAME = "${params.KAGGLE_USERNAME}"
+                KAGGLE_KEY = "${params.KAGGLE_KEY}"
+            }
+
+            steps {
+                sh 'chmod +x sacred/sacred_model_creator.py'
+                sh 'python3 sacred/sacred_model_creator.py'
+                sh 'chmod +x sacred/sacred_use_model.py'
+                sh 'python3 sacred/sacred_use_model.py'
+            }
+        }
+
+        stage('Archive Artifacts from Experiments') {
+            steps {
+                archiveArtifacts artifacts: 'my_experiment_logs/**', allowEmptyArchive: true
+            }
+        }
+    }
+}
diff --git a/sacred/sacred_model_creator.py b/sacred/sacred_model_creator.py
new file mode 100644
index 0000000..5d6eb71
--- /dev/null
+++ b/sacred/sacred_model_creator.py
@@ -0,0 +1,108 @@
+import pandas as pd
+import os
+import numpy as np
+from kaggle.api.kaggle_api_extended import KaggleApi
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import LabelEncoder
+import pickle
+from sacred import Experiment
+from sacred.observers import MongoObserver, FileStorageObserver
+
+# Create the experiment
+ex = Experiment('123456')  # Replace '123456' with your student ID number
+
+# Add observers
+ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@tzietkiewicz.vm.wmi.amu.edu.pl:27017'))
+ex.observers.append(FileStorageObserver('my_experiment_logs'))
+
+def download_dataset(dataset_address, destination_folder):
+    api = KaggleApi()
+    api.authenticate()
+    api.dataset_download_files(dataset_address, path=destination_folder, unzip=True)
+
+def check_datasets_presence():
+    dataset_1 = "Spotify_Dataset.csv"
+    dataset_2 = "spotify_songs.csv"
+    destination_folder = "datasets"
+
+    if not os.path.exists(destination_folder):
+        os.makedirs(destination_folder)
+        print(f"Created folder: {destination_folder}")
+    else:
+        print(f"Folder {destination_folder} already exists.")
+    if dataset_1 not in os.listdir(destination_folder):
+        download_dataset('gulczas/spotify-dataset', destination_folder)
+    if dataset_2 not in os.listdir(destination_folder):
+        download_dataset('joebeachcapital/30000-spotify-songs', destination_folder)
+
+def datasets_preparation():
+    df_1 = pd.read_csv("datasets/spotify_songs.csv")
+    df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")
+    df_1 = df_1.dropna()
+    df_2 = df_2.dropna()
+    df_2 = df_2.rename(columns={'Title': 'track_name'})
+    columns_to_remove_df_1 = ['track_id', 'track_album_id', 'track_album_name', 'track_album_release_date',
+                              'playlist_id', 'playlist_subgenre']
+    columns_to_remove_df_2 = ['Date', '# of Artist', 'Artist (Ind.)', '# of Nationality',
+                              'Nationality', 'Continent', 'Points (Total)',
+                              'Points (Ind for each Artist/Nat)', 'id', 'Song URL']
+    df_1 = df_1.drop(columns=columns_to_remove_df_1)
+    df_2 = df_2.drop(columns=columns_to_remove_df_2)
+    df_1 = df_1.drop_duplicates(subset=['track_name'])
+    df_2 = df_2.drop_duplicates(subset=['track_name'])
+    le = LabelEncoder()
+    unique_names_df2 = df_2['track_name'].unique()
+    diff_df = df_1[~df_1['track_name'].isin(unique_names_df2)]
+    diff_df = diff_df.iloc[:10000]
+    diff_df['track_artist'] = le.fit_transform(diff_df.track_artist)
+    diff_df['playlist_name'] = le.fit_transform(diff_df.playlist_name)
+    diff_df['playlist_genre'] = le.fit_transform(diff_df.playlist_genre)
+    if "docker_test_dataset.csv" not in os.listdir("datasets"):
+        diff_df.to_csv("datasets/docker_test_dataset.csv", index=False)
+    result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
+    result_df = result_df.drop_duplicates(subset=['track_name'])
+    columns_to_remove_result_df = ['Rank', 'Artists', 'Danceability', 'Energy', 'Loudness',
+                                   'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']
+    result_df = result_df.drop(columns=columns_to_remove_result_df)
+    result_df['track_artist'] = le.fit_transform(result_df.track_artist)
+    result_df['playlist_name'] = le.fit_transform(result_df.playlist_name)
+    result_df['playlist_genre'] = le.fit_transform(result_df.playlist_genre)
+    return result_df
+
+@ex.config
+def config():
+    test_size = 0.10
+    random_state = 42
+    model_filename = 'model.pkl'
+
+@ex.main
+def run_experiment(test_size, random_state, model_filename):
+    check_datasets_presence()
+    result_df = datasets_preparation()
+    Y = result_df[['playlist_genre']]
+    X = result_df.drop(columns='playlist_genre')
+    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
+    Y_train = np.ravel(Y_train)
+    Y_test = np.ravel(Y_test)
+    scaler = StandardScaler()
+    numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns
+    X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
+    X_test_scaled = scaler.transform(X_test[numeric_columns])
+    model = LogisticRegression(max_iter=1000)
+    model.fit(X_train_scaled, Y_train)
+    Y_pred = model.predict(X_test_scaled)
+    accuracy = accuracy_score(Y_test, Y_pred)
+    ex.log_scalar('accuracy', accuracy)
+    if os.path.exists(model_filename):
+        os.remove(model_filename)
+    with open(model_filename, 'wb') as file:
+        pickle.dump(model, file)
+    ex.add_artifact(model_filename)
+    ex.add_resource(__file__)
+    print("Accuracy:", accuracy)
+    return accuracy
+
+if __name__ == '__main__':
+    ex.run_commandline()
diff --git a/sacred/sacred_use_model.py b/sacred/sacred_use_model.py
new file mode 100644
index 0000000..837efd2
--- /dev/null
+++ b/sacred/sacred_use_model.py
@@ -0,0 +1,47 @@
+import pickle
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import accuracy_score
+from sacred import Experiment
+from sacred.observers import MongoObserver, FileStorageObserver
+
+ex = Experiment('464953_evaluation')
+
+# Add observers
+ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@tzietkiewicz.vm.wmi.amu.edu.pl:27017'))
+ex.observers.append(FileStorageObserver('my_experiment_logs'))
+
+@ex.config
+def config():
+    model_filename = 'model.pkl'
+    test_dataset_filename = 'datasets/docker_test_dataset.csv'
+
+@ex.main
+def run_evaluation(model_filename, test_dataset_filename):
+    with open(model_filename, 'rb') as file:
+        model = pickle.load(file)
+    print("Model loaded from file:", model_filename)
+    test_df = pd.read_csv(test_dataset_filename)
+    Y_test = test_df[['playlist_genre']]
+    X_test = test_df.drop(columns='playlist_genre')
+    Y_test = np.ravel(Y_test)
+    scaler = StandardScaler()
+    numeric_columns = X_test.select_dtypes(include=['int', 'float']).columns
+    X_test_scaled = scaler.fit_transform(X_test[numeric_columns])
+    Y_pred = model.predict(X_test_scaled)
+    labels_dict = {0: 'edm', 1: 'latin', 2: 'pop', 3: 'r&b', 4: 'rap', 5: 'rock'}
+    Y_test_labels = [labels_dict[number] for number in Y_test]
+    Y_pred_labels = [labels_dict[number] for number in Y_pred]
+    accuracy = accuracy_score(Y_test, Y_pred)
+    ex.log_scalar('accuracy', accuracy)
+    with open('model_predictions.txt', 'w') as f:
+        f.write("Real:" + str(Y_test_labels[:20]) + " \nPredicted: " + str(Y_pred_labels[:20]))
+        f.write("\nAccuracy:" + str(accuracy))
+    ex.add_artifact('model_predictions.txt')
+    ex.add_resource(__file__)
+    print("Accuracy:", accuracy)
+    return accuracy
+
+if __name__ == '__main__':
+    ex.run_commandline()