added model training
This commit is contained in:
parent
9614bea42a
commit
70e774b2fa
@ -1,17 +1,14 @@
|
|||||||
FROM ubuntu:latest
|
FROM ubuntu:latest
|
||||||
|
|
||||||
ENV KAGGLE_USERNAME=gulczas
|
|
||||||
ENV KAGGLE_KEY=default_key
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
python3 \
|
python3 \
|
||||||
python3-pip \
|
python3-pip \
|
||||||
wget \
|
wget \
|
||||||
unzip \
|
unzip \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN pip3 install pandas scikit-learn requests kaggle numpy
|
RUN pip3 install pandas scikit-learn requests numpy
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
@ -3,7 +3,6 @@ import os
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from sklearn.metrics import accuracy_score
|
from sklearn.metrics import accuracy_score
|
||||||
@ -15,27 +14,23 @@ def check_datasets_presence():
|
|||||||
|
|
||||||
dataset_1 = "Spotify_Dataset.csv"
|
dataset_1 = "Spotify_Dataset.csv"
|
||||||
dataset_2 = "spotify_songs.csv"
|
dataset_2 = "spotify_songs.csv"
|
||||||
destination_folder = "datasets"
|
destination_folder = "artifacts"
|
||||||
|
|
||||||
if not os.path.exists(destination_folder):
|
if not os.path.exists(destination_folder):
|
||||||
os.makedirs(destination_folder)
|
raise FileNotFoundError(destination_folder + " folder not found")
|
||||||
print(f"Utworzono folder: {destination_folder}")
|
|
||||||
else:
|
|
||||||
print(f"Folder {destination_folder} już istnieje.")
|
|
||||||
|
|
||||||
if dataset_1 in os.listdir("/."):
|
if dataset_1 in os.listdir("/."):
|
||||||
shutil.move(dataset_1, destination_folder)
|
shutil.move(dataset_1, destination_folder)
|
||||||
elif dataset_1 not in os.listdir(destination_folder):
|
elif dataset_1 not in os.listdir(destination_folder):
|
||||||
raise FileNotFoundError(dataset_1 + "not found")
|
raise FileNotFoundError(dataset_1 + " not found")
|
||||||
|
|
||||||
if dataset_2 in os.listdir("/."):
|
if dataset_2 in os.listdir("/."):
|
||||||
shutil.move(dataset_2, destination_folder)
|
shutil.move(dataset_2, destination_folder)
|
||||||
elif dataset_2 not in os.listdir(destination_folder):
|
elif dataset_2 not in os.listdir(destination_folder):
|
||||||
raise FileNotFoundError(dataset_2 + "not found")
|
raise FileNotFoundError(dataset_2 + " not found")
|
||||||
|
|
||||||
def datasets_preparation():
|
def datasets_preparation():
|
||||||
df_1 = pd.read_csv("datasets/spotify_songs.csv")
|
df_1 = pd.read_csv("artifacts/spotify_songs.csv")
|
||||||
df_2 = pd.read_csv("datasets/Spotify_Dataset.csv", sep=";")
|
df_2 = pd.read_csv("artifacts/Spotify_Dataset.csv", sep=";")
|
||||||
|
|
||||||
df_1 = df_1.dropna()
|
df_1 = df_1.dropna()
|
||||||
df_2 = df_2.dropna()
|
df_2 = df_2.dropna()
|
||||||
@ -65,8 +60,8 @@ def datasets_preparation():
|
|||||||
|
|
||||||
#df_1 = df_1.iloc[20:]
|
#df_1 = df_1.iloc[20:]
|
||||||
|
|
||||||
if "docker_test_dataset.csv" not in os.listdir("datasets"):
|
if "docker_test_dataset.csv" not in os.listdir("artifacts"):
|
||||||
diff_df.to_csv("datasets/docker_test_dataset.csv", index=False)
|
diff_df.to_csv("artifacts/docker_test_dataset.csv", index=False)
|
||||||
|
|
||||||
result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
|
result_df = pd.merge(df_1, df_2, on='track_name', how='inner')
|
||||||
result_df = result_df.drop_duplicates(subset=['track_name'])
|
result_df = result_df.drop_duplicates(subset=['track_name'])
|
||||||
@ -85,7 +80,7 @@ check_datasets_presence()
|
|||||||
result_df = datasets_preparation()
|
result_df = datasets_preparation()
|
||||||
Y = result_df[['playlist_genre']]
|
Y = result_df[['playlist_genre']]
|
||||||
X = result_df.drop(columns='playlist_genre')
|
X = result_df.drop(columns='playlist_genre')
|
||||||
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=sys.argv[1], random_state=42)
|
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=float(sys.argv[1]), random_state=42)
|
||||||
|
|
||||||
|
|
||||||
Y_train = np.ravel(Y_train)
|
Y_train = np.ravel(Y_train)
|
||||||
@ -96,7 +91,7 @@ numeric_columns = X_train.select_dtypes(include=['int', 'float']).columns
|
|||||||
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
|
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
|
||||||
X_test_scaled = scaler.transform(X_test[numeric_columns])
|
X_test_scaled = scaler.transform(X_test[numeric_columns])
|
||||||
|
|
||||||
model = LogisticRegression(max_iter=sys.argv[2])
|
model = LogisticRegression(max_iter=int(sys.argv[2]))
|
||||||
model.fit(X_train_scaled, Y_train)
|
model.fit(X_train_scaled, Y_train)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user