added model training
This commit is contained in:
parent
9614bea42a
commit
70e774b2fa
@ -1,8 +1,5 @@
|
||||
FROM ubuntu:latest
|
||||
|
||||
ENV KAGGLE_USERNAME=gulczas
|
||||
ENV KAGGLE_KEY=default_key
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
python3 \
|
||||
@ -11,7 +8,7 @@ RUN apt-get update && \
|
||||
unzip \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip3 install pandas scikit-learn requests kaggle numpy
|
||||
RUN pip3 install pandas scikit-learn requests numpy
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
@ -3,7 +3,6 @@ import os
|
||||
import numpy as np
|
||||
import shutil
|
||||
import sys
|
||||
from kaggle.api.kaggle_api_extended import KaggleApi
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import accuracy_score
|
||||
@ -15,27 +14,23 @@ def check_datasets_presence():
|
||||
|
||||
dataset_1 = "Spotify_Dataset.csv"
|
||||
dataset_2 = "spotify_songs.csv"
|
||||
destination_folder = "datasets"
|
||||
destination_folder = "artifacts"
|
||||
|
||||
if not os.path.exists(destination_folder):
|
||||
os.makedirs(destination_folder)
|
||||
print(f"Utworzono folder: {destination_folder}")
|
||||
else:
|
||||
print(f"Folder {destination_folder} już istnieje.")
|
||||
|
||||
raise FileNotFoundError(destination_folder + " folder not found")
|
||||
if dataset_1 in os.listdir("/."):
|
||||
shutil.move(dataset_1, destination_folder)
|
||||
elif dataset_1 not in os.listdir(destination_folder):
|
||||
raise FileNotFoundError(dataset_1 + "not found")
|
||||
raise FileNotFoundError(dataset_1 + " not found")
|
||||
|
||||
if dataset_2 in os.listdir("/."):
|
||||
shutil.move(dataset_2, destination_folder)
|
||||
elif dataset_2 not in os.listdir(destination_folder):
|
||||
raise FileNotFoundError(dataset_2 + "not found")
|
||||
raise FileNotFoundError(dataset_2 + " not found")
|
||||
|
||||
def datasets_preparation():
|
||||
df_1 = pd.read_csv("datasets/spotify_songs.csv")
|
||||
df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")
|
||||
df_1 = pd.read_csv("artifacts/spotify_songs.csv")
|
||||
df_2 = pd.read_csv("artifacts/Spotify_Dataset.csv", sep=";")
|
||||
|
||||
df_1 = df_1.dropna()
|
||||
df_2 = df_2.dropna()
|
||||
@ -65,8 +60,8 @@ def datasets_preparation():
|
||||
|
||||
#df_1 = df_1.iloc[20:]
|
||||
|
||||
if "docker_test_dataset.csv" not in os.listdir("datasets"):
|
||||
diff_df.to_csv("datasets/docker_test_dataset.csv", index=False)
|
||||
if "docker_test_dataset.csv" not in os.listdir("artifacts"):
|
||||
diff_df.to_csv("artifacts/docker_test_dataset.csv", index=False)
|
||||
|
||||
result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
|
||||
result_df = result_df.drop_duplicates(subset=['track_name'])
|
||||
@ -85,7 +80,7 @@ check_datasets_presence()
|
||||
result_df = datasets_preparation()
|
||||
Y = result_df[['playlist_genre']]
|
||||
X = result_df.drop(columns='playlist_genre')
|
||||
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=sys.argv[1], random_state=42)
|
||||
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=float(sys.argv[1]), random_state=42)
|
||||
|
||||
|
||||
Y_train = np.ravel(Y_train)
|
||||
@ -96,7 +91,7 @@ numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns
|
||||
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
|
||||
X_test_scaled = scaler.transform(X_test[numeric_columns])
|
||||
|
||||
model = LogisticRegression(max_iter=sys.argv[2])
|
||||
model = LogisticRegression(max_iter=int(sys.argv[2]))
|
||||
model.fit(X_train_scaled, Y_train)
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user