added model training

This commit is contained in:
s464953 2024-05-09 00:47:34 +02:00
parent 9614bea42a
commit 70e774b2fa
2 changed files with 12 additions and 20 deletions

View File

@@ -1,8 +1,5 @@
FROM ubuntu:latest
ENV KAGGLE_USERNAME=gulczas
ENV KAGGLE_KEY=default_key
RUN apt-get update && \
apt-get install -y \
python3 \
@@ -11,7 +8,7 @@ RUN apt-get update && \
unzip \
&& rm -rf /var/lib/apt/lists/*
RUN pip3 install pandas scikit-learn requests kaggle numpy
RUN pip3 install pandas scikit-learn requests numpy
WORKDIR /app

View File

@@ -3,7 +3,6 @@ import os
import numpy as np
import shutil
import sys
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
@@ -15,27 +14,23 @@ def check_datasets_presence():
dataset_1 = "Spotify_Dataset.csv"
dataset_2 = "spotify_songs.csv"
destination_folder = "datasets"
destination_folder = "artifacts"
if not os.path.exists(destination_folder):
os.makedirs(destination_folder)
print(f"Utworzono folder: {destination_folder}")
else:
print(f"Folder {destination_folder} już istnieje.")
raise FileNotFoundError(destination_folder + " folder not found")
if dataset_1 in os.listdir("/."):
shutil.move(dataset_1, destination_folder)
elif dataset_1 not in os.listdir(destination_folder):
raise FileNotFoundError(dataset_1 + "not found")
raise FileNotFoundError(dataset_1 + " not found")
if dataset_2 in os.listdir("/."):
shutil.move(dataset_2, destination_folder)
elif dataset_2 not in os.listdir(destination_folder):
raise FileNotFoundError(dataset_2 + "not found")
raise FileNotFoundError(dataset_2 + " not found")
def datasets_preparation():
df_1 = pd.read_csv("datasets/spotify_songs.csv")
df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")
df_1 = pd.read_csv("artifacts/spotify_songs.csv")
df_2 = pd.read_csv("artifacts/Spotify_Dataset.csv", sep=";")
df_1 = df_1.dropna()
df_2 = df_2.dropna()
@@ -65,8 +60,8 @@ def datasets_preparation():
#df_1 = df_1.iloc[20:]
if "docker_test_dataset.csv" not in os.listdir("datasets"):
diff_df.to_csv("datasets/docker_test_dataset.csv", index=False)
if "docker_test_dataset.csv" not in os.listdir("artifacts"):
diff_df.to_csv("artifacts/docker_test_dataset.csv", index=False)
result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
result_df = result_df.drop_duplicates(subset=['track_name'])
@@ -85,7 +80,7 @@ check_datasets_presence()
result_df = datasets_preparation()
Y = result_df[['playlist_genre']]
X = result_df.drop(columns='playlist_genre')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=sys.argv[1], random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=float(sys.argv[1]), random_state=42)
Y_train = np.ravel(Y_train)
@@ -96,7 +91,7 @@ numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])
model = LogisticRegression(max_iter=sys.argv[2])
model = LogisticRegression(max_iter=int(sys.argv[2]))
model.fit(X_train_scaled, Y_train)