ium_07 sacred
This commit is contained in:
parent
5eb5fb7172
commit
af91b85a30
17
Dockerfile
17
Dockerfile
@ -1,8 +1,5 @@
|
||||
# Pin the base image instead of tracking the floating `latest` tag so builds
# are reproducible. 22.04 is chosen because newer Ubuntu releases mark the
# system Python as externally managed (PEP 668), which makes the plain
# `pip3 install` used later in this file fail.
FROM ubuntu:22.04
|
||||
|
||||
ENV KAGGLE_USERNAME=gulczas
|
||||
ENV KAGGLE_KEY=default_key
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
python3 \
|
||||
@ -11,16 +8,4 @@ RUN apt-get update && \
|
||||
unzip \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip3 install pandas scikit-learn requests kaggle numpy
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY model_creator.py /app/
|
||||
COPY use_model.py /app/
|
||||
COPY run_py_scripts.sh /app/
|
||||
|
||||
|
||||
RUN chmod +x model_creator.py
|
||||
RUN chmod +x use_model.py
|
||||
|
||||
CMD ["bash", "run_py_scripts.sh"]
|
||||
# Python dependencies for the training/evaluation scripts and Sacred observers.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir pandas scikit-learn requests kaggle numpy sacred pymongo
|
42
Jenkinsfile_sacred
Normal file
42
Jenkinsfile_sacred
Normal file
@ -0,0 +1,42 @@
|
||||
// Pipeline that trains and evaluates the model through the Sacred experiment
// scripts and archives the logs written by Sacred's file observer.
pipeline {
    agent any

    // Kaggle credentials are supplied per build; the key is a masked password parameter.
    parameters {
        string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    }

    stages {
        stage('Clone Repository') {
            steps {
                git 'https://git.wmi.amu.edu.pl/s464953/ium_464953.git'
            }
        }

        stage('Build and Run Experiments') {
            // Build the image from the repository's Dockerfile and run this stage
            // inside it; reuseNode keeps the stage on the same node/workspace.
            agent {
                dockerfile {
                    reuseNode true
                }
            }

            // Expose the build parameters as environment variables for the scripts.
            environment {
                KAGGLE_USERNAME = "${params.KAGGLE_USERNAME}"
                KAGGLE_KEY = "${params.KAGGLE_KEY}"
            }

            steps {
                // Training runs first; the evaluation script loads the model file
                // and test CSV that the training script writes.
                sh 'chmod +x sacred/sacred_model_creator.py'
                sh 'python3 sacred/sacred_model_creator.py'
                sh 'chmod +x sacred/sacred_use_model.py'
                sh 'python3 sacred/sacred_use_model.py'
            }
        }

        stage('Archive Artifacts from Experiments') {
            steps {
                // 'my_experiment_logs' is the FileObserver directory used by both
                // scripts; an empty archive does not fail the build.
                archiveArtifacts artifacts: 'my_experiment_logs/**', allowEmptyArchive: true
            }
        }
    }
}
|
108
sacred/sacred_model_creator.py
Normal file
108
sacred/sacred_model_creator.py
Normal file
@ -0,0 +1,108 @@
|
||||
import pandas as pd
|
||||
import os
|
||||
import numpy as np
|
||||
from kaggle.api.kaggle_api_extended import KaggleApi
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import accuracy_score
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
import pickle
|
||||
from sacred import Experiment
|
||||
from sacred.observers import MongoObserver, FileObserver
|
||||
|
||||
# Create the Sacred experiment. The name is the student index number, matching
# the '464953_evaluation' experiment used by the evaluation script (the
# previous value '123456' was an unreplaced placeholder).
ex = Experiment('464953')

# Attach observers: a MongoDB observer and a file observer that writes run
# records under 'my_experiment_logs'.
# NOTE(review): the MongoDB URL embeds credentials in source control —
# consider supplying it through configuration instead.
ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@tzietkiewicz.vm.wmi.amu.edu.pl:27017'))
ex.observers.append(FileObserver('my_experiment_logs'))
|
||||
|
||||
def download_dataset(dataset_address, destination_folder):
    """Download a Kaggle dataset and unzip it into a local folder.

    Args:
        dataset_address: Kaggle dataset identifier handed to the Kaggle API
            (callers in this file pass "owner/dataset-name" strings).
        destination_folder: Directory that receives the unzipped files.
    """
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(
        dataset_address,
        path=destination_folder,
        unzip=True,
    )
|
||||
|
||||
def check_datasets_presence():
    """Ensure the ``datasets`` folder exists and holds both source CSV files.

    The folder is created when missing (a message is printed either way).
    Each expected CSV that is not listed in the folder triggers a download
    of its Kaggle dataset through ``download_dataset``.
    """
    destination_folder = "datasets"
    # (expected file name, Kaggle dataset that provides it), in check order.
    required_files = (
        ("Spotify_Dataset.csv", 'gulczas/spotify-dataset'),
        ("spotify_songs.csv", 'joebeachcapital/30000-spotify-songs'),
    )

    if os.path.exists(destination_folder):
        print(f"Folder {destination_folder} już istnieje.")
    else:
        os.makedirs(destination_folder)
        print(f"Utworzono folder: {destination_folder}")

    for csv_name, kaggle_address in required_files:
        # The listing is re-read on every pass so that files delivered by an
        # earlier download are taken into account.
        if csv_name not in os.listdir(destination_folder):
            download_dataset(kaggle_address, destination_folder)
|
||||
|
||||
def datasets_preparation():
    """Build the training frame and, if absent, write the held-out test CSV.

    Reads ``datasets/spotify_songs.csv`` and ``datasets/Spotify_Dataset.csv``
    (semicolon separated), drops rows with missing values, removes unused
    columns and de-duplicates both frames on ``track_name``.

    Two frames are derived:

    * Up to 10000 tracks from ``spotify_songs.csv`` whose name does not occur
      in ``Spotify_Dataset.csv``. They are label-encoded and saved as
      ``datasets/docker_test_dataset.csv`` unless that file already exists.
    * The inner join of both sources on ``track_name``, with the columns
      listed in ``columns_to_remove_result_df`` dropped and label-encoded;
      this is the return value.

    Returns:
        pandas.DataFrame: the joined frame with ``track_artist``,
        ``playlist_name`` and ``playlist_genre`` replaced by integer codes.
    """
    # Columns replaced by integer codes in both derived frames.
    encoded_columns = ['track_artist', 'playlist_name', 'playlist_genre']

    df_1 = pd.read_csv("datasets/spotify_songs.csv")
    df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")
    df_1 = df_1.dropna()
    df_2 = df_2.dropna()
    # Align the join key name with df_1.
    df_2 = df_2.rename(columns={'Title': 'track_name'})
    columns_to_remove_df_1 = ['track_id', 'track_album_id', 'track_album_name', 'track_album_release_date',
                              'playlist_id', 'playlist_subgenre']
    columns_to_remove_df_2 = ['Date', '# of Artist', 'Artist (Ind.)', '# of Nationality',
                              'Nationality', 'Continent', 'Points (Total)',
                              'Points (Ind for each Artist/Nat)', 'id', 'Song URL']
    df_1 = df_1.drop(columns=columns_to_remove_df_1)
    df_2 = df_2.drop(columns=columns_to_remove_df_2)
    df_1 = df_1.drop_duplicates(subset=['track_name'])
    df_2 = df_2.drop_duplicates(subset=['track_name'])

    # NOTE(review): the encoder is refit for every column and for each frame,
    # so integer codes in the test CSV and in the training frame only line up
    # when both frames contain the same set of values — confirm this holds
    # for the columns the model relies on.
    le = LabelEncoder()

    unique_names_df2 = df_2['track_name'].unique()
    diff_df = df_1[~df_1['track_name'].isin(unique_names_df2)]
    # Take an explicit copy: assigning columns on a slice of a filtered frame
    # triggers pandas' SettingWithCopyWarning and relies on chained assignment.
    diff_df = diff_df.iloc[:10000].copy()
    for column in encoded_columns:
        diff_df[column] = le.fit_transform(diff_df[column])
    # An existing test CSV is kept untouched.
    if "docker_test_dataset.csv" not in os.listdir("datasets"):
        diff_df.to_csv("datasets/docker_test_dataset.csv", index=False)

    result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
    result_df = result_df.drop_duplicates(subset=['track_name'])
    columns_to_remove_result_df = ['Rank', 'Artists', 'Danceability', 'Energy', 'Loudness',
                                   'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']
    result_df = result_df.drop(columns=columns_to_remove_result_df)
    for column in encoded_columns:
        result_df[column] = le.fit_transform(result_df[column])
    return result_df
|
||||
|
||||
# Default Sacred configuration for the training experiment. The local names
# below are the configuration entries that run_experiment receives as
# arguments:
#   test_size      - forwarded to train_test_split as test_size
#   random_state   - forwarded to train_test_split as random_state
#   model_filename - path the pickled model is written to
@ex.config
def config():
    test_size = 0.10
    random_state = 42
    model_filename = 'model.pkl'
|
||||
|
||||
@ex.main
def run_experiment(test_size, random_state, model_filename):
    """Train a logistic-regression genre classifier and record the run.

    Makes sure the datasets are present, builds the training frame, splits
    it, standardises the numeric feature columns, fits the model, logs the
    test accuracy to the experiment and stores the pickled model as an
    artifact.

    Args:
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
        random_state: Value forwarded to ``train_test_split`` as ``random_state``.
        model_filename: Path the pickled model is written to.

    Returns:
        The accuracy score computed on the held-out split.
    """
    check_datasets_presence()
    dataset = datasets_preparation()

    target = dataset[['playlist_genre']]
    features = dataset.drop(columns='playlist_genre')
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    # Flatten the single-column target frames into 1-D arrays.
    target_train = np.ravel(target_train)
    target_test = np.ravel(target_test)

    # Only int/float columns are fed to the model; the scaler is fitted on the
    # training split and then applied to the test split.
    feature_scaler = StandardScaler()
    numeric_columns = features_train.select_dtypes(include=['int', 'float']).columns
    train_matrix = feature_scaler.fit_transform(features_train[numeric_columns])
    test_matrix = feature_scaler.transform(features_test[numeric_columns])

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(train_matrix, target_train)
    predictions = classifier.predict(test_matrix)
    accuracy = accuracy_score(target_test, predictions)
    ex.log_scalar('accuracy', accuracy)

    # Remove any previous model file before writing the new one.
    if os.path.exists(model_filename):
        os.remove(model_filename)
    with open(model_filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)

    ex.add_artifact(model_filename)
    ex.add_resource(__file__)
    print("Accuracy:", accuracy)
    return accuracy
|
||||
|
||||
if __name__ == '__main__':
|
||||
ex.run_commandline()
|
47
sacred/sacred_use_model.py
Normal file
47
sacred/sacred_use_model.py
Normal file
@ -0,0 +1,47 @@
|
||||
import pickle
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.metrics import accuracy_score
|
||||
from sacred import Experiment
|
||||
from sacred.observers import MongoObserver, FileObserver
|
||||
|
||||
# Sacred experiment for the evaluation run.
ex = Experiment('464953_evaluation')

# Attach observers: a MongoDB observer and a file observer that writes run
# records under 'my_experiment_logs'.
# NOTE(review): the MongoDB URL embeds credentials in source control —
# consider supplying it through configuration instead.
ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@tzietkiewicz.vm.wmi.amu.edu.pl:27017'))
ex.observers.append(FileObserver('my_experiment_logs'))
|
||||
|
||||
# Default Sacred configuration for the evaluation experiment. The local names
# below are the configuration entries that run_evaluation receives as
# arguments:
#   model_filename        - pickled model file to load
#   test_dataset_filename - CSV file with the evaluation data
@ex.config
def config():
    model_filename = 'model.pkl'
    test_dataset_filename = 'datasets/docker_test_dataset.csv'
|
||||
|
||||
@ex.main
def run_evaluation(model_filename, test_dataset_filename):
    """Evaluate a pickled model on the test CSV and record the results.

    Loads the model, standardises the numeric feature columns of the test
    data, predicts the ``playlist_genre`` codes, logs the accuracy to the
    experiment and writes the first 20 real/predicted genre names plus the
    accuracy to ``model_predictions.txt``, which is attached as an artifact.

    Args:
        model_filename: Path of the pickled model to load.
        test_dataset_filename: Path of the CSV holding the evaluation data;
            it must contain a ``playlist_genre`` column.

    Returns:
        The accuracy score of the predictions.
    """
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load model files from a trusted source.
    with open(model_filename, 'rb') as model_file:
        model = pickle.load(model_file)
    print("Model został wczytany z pliku:", model_filename)

    evaluation_frame = pd.read_csv(test_dataset_filename)
    true_codes = np.ravel(evaluation_frame[['playlist_genre']])
    feature_frame = evaluation_frame.drop(columns='playlist_genre')

    numeric_columns = feature_frame.select_dtypes(include=['int', 'float']).columns
    # NOTE(review): a new scaler is fitted on the evaluation data here rather
    # than reusing the scaler fitted during training — confirm this is intended.
    scaled_features = StandardScaler().fit_transform(feature_frame[numeric_columns])
    predicted_codes = model.predict(scaled_features)

    # Integer genre codes mapped back to their names.
    genre_names = {0: 'edm', 1: 'latin', 2: 'pop', 3: 'r&b', 4: 'rap', 5: 'rock'}
    true_names = [genre_names[code] for code in true_codes]
    predicted_names = [genre_names[code] for code in predicted_codes]

    accuracy = accuracy_score(true_codes, predicted_codes)
    ex.log_scalar('accuracy', accuracy)

    with open('model_predictions.txt', 'w') as report:
        report.write("Real:" + str(true_names[:20]) + " \nPredicted: " + str(predicted_names[:20]))
        report.write("\nAccuracy:" + str(accuracy))

    ex.add_artifact('model_predictions.txt')
    ex.add_resource(__file__)
    print("Accuracy:", accuracy)
    return accuracy
|
||||
|
||||
if __name__ == '__main__':
|
||||
ex.run_commandline()
|
Loading…
Reference in New Issue
Block a user