From 70e774b2fab1cee4d48daa3ed3d97268043fcb0f Mon Sep 17 00:00:00 2001 From: s464953 Date: Thu, 9 May 2024 00:47:34 +0200 Subject: [PATCH] added model training --- .ipynb_checkpoints/Dockerfile-checkpoint | 7 ++---- .../model_creator-checkpoint.py | 25 ++++++++----------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/.ipynb_checkpoints/Dockerfile-checkpoint b/.ipynb_checkpoints/Dockerfile-checkpoint index af6025c..d489d41 100644 --- a/.ipynb_checkpoints/Dockerfile-checkpoint +++ b/.ipynb_checkpoints/Dockerfile-checkpoint @@ -1,17 +1,14 @@ FROM ubuntu:latest -ENV KAGGLE_USERNAME=gulczas -ENV KAGGLE_KEY=default_key - RUN apt-get update && \ apt-get install -y \ - python3 \ + python3 \ python3-pip \ wget \ unzip \ && rm -rf /var/lib/apt/lists/* -RUN pip3 install pandas scikit-learn requests kaggle numpy +RUN pip3 install pandas scikit-learn requests numpy WORKDIR /app diff --git a/.ipynb_checkpoints/model_creator-checkpoint.py b/.ipynb_checkpoints/model_creator-checkpoint.py index fab60b2..f35b62f 100644 --- a/.ipynb_checkpoints/model_creator-checkpoint.py +++ b/.ipynb_checkpoints/model_creator-checkpoint.py @@ -3,7 +3,6 @@ import os import numpy as np import shutil import sys -from kaggle.api.kaggle_api_extended import KaggleApi from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score @@ -15,27 +14,23 @@ def check_datasets_presence(): dataset_1 = "Spotify_Dataset.csv" dataset_2 = "spotify_songs.csv" - destination_folder = "datasets" + destination_folder = "artifacts" if not os.path.exists(destination_folder): - os.makedirs(destination_folder) - print(f"Utworzono folder: {destination_folder}") - else: - print(f"Folder {destination_folder} już istnieje.") - + raise FileNotFoundError(destination_folder + " folder not found") if dataset_1 in os.listdir("/."): shutil.move(dataset_1, destination_folder) elif dataset_1 not in os.listdir(destination_folder): - raise FileNotFoundError(dataset_1 + "not found") + raise FileNotFoundError(dataset_1 + " not found") if dataset_2 in os.listdir("/."): shutil.move(dataset_2, destination_folder) elif dataset_2 not in os.listdir(destination_folder): - raise FileNotFoundError(dataset_2 + "not found") + raise FileNotFoundError(dataset_2 + " not found") def datasets_preparation(): - df_1 = pd.read_csv("datasets/spotify_songs.csv") - df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";") + df_1 = pd.read_csv("artifacts/spotify_songs.csv") + df_2 = pd.read_csv("artifacts/Spotify_Dataset.csv", sep=";") df_1 = df_1.dropna() df_2 = df_2.dropna() @@ -65,8 +60,8 @@ def datasets_preparation(): #df_1 = df_1.iloc[20:] - if "docker_test_dataset.csv" not in os.listdir("datasets"): - diff_df.to_csv("datasets/docker_test_dataset.csv", index=False) + if "docker_test_dataset.csv" not in os.listdir("artifacts"): + diff_df.to_csv("artifacts/docker_test_dataset.csv", index=False) result_df = pd.merge(df_1, df_2, on='track_name', how='inner') result_df = result_df.drop_duplicates(subset=['track_name']) @@ -85,7 +80,7 @@ check_datasets_presence() result_df = datasets_preparation() Y = result_df[['playlist_genre']] X = result_df.drop(columns='playlist_genre') -X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=sys.argv[1], random_state=42) +X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=float(sys.argv[1]), random_state=42) Y_train = np.ravel(Y_train) @@ -96,7 +91,7 @@ numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns X_train_scaled = scaler.fit_transform(X_train[numeric_columns]) X_test_scaled = scaler.transform(X_test[numeric_columns]) -model = LogisticRegression(max_iter=sys.argv[2]) +model = LogisticRegression(max_iter=int(sys.argv[2])) model.fit(X_train_scaled, Y_train)