class Bayes:
    """Gaussian naive Bayes classifier that is persisted to a pickle file.

    On construction the classifier is restored from ``path`` when that file
    exists; otherwise a fresh, unfitted ``GaussianNB`` is created. ``train``
    fits the classifier and saves it back to ``path``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, path: str):
        """Load the pickled classifier at *path*, or create a new one.

        Args:
            path: Location of the pickled model file. It does not have to
                exist yet; ``train`` creates it.
        """
        # Where the fitted classifier is stored on disk.
        self.path = path
        # True when a stored model was found (or has been written by train).
        self.model_exists = os.path.isfile(self.path)
        if self.model_exists:
            # NOTE(security): unpickling can execute arbitrary code. Only
            # point ``path`` at model files this application wrote itself.
            with open(self.path, 'rb') as file:
                self.classifier = pickle.load(file)
        else:
            self.classifier = GaussianNB()

    def train(self, X: pd.DataFrame, Y: pd.Series) -> None:
        """Fit the classifier on ``X``/``Y`` and persist it to ``self.path``.

        The model is first dumped to a temporary sibling file and then moved
        into place. Opening ``self.path`` directly in ``'wb'`` mode would
        truncate it immediately, so a failed dump would leave an empty or
        partial file that a later ``Bayes(path)`` would try to unpickle.

        Args:
            X: Feature rows to fit on.
            Y: Target label for each row of ``X``.

        Raises:
            Whatever ``fit``, ``pickle.dump`` or the file operations raise;
            in that case ``self.path`` is left untouched.
        """
        self.classifier.fit(X, Y)
        tmp_path = self.path + '.tmp'
        try:
            with open(tmp_path, 'wb') as file:
                pickle.dump(self.classifier, file)
            # Replaces any previous model in a single rename step.
            os.replace(tmp_path, self.path)
        finally:
            # After a successful replace the temp file is gone; it only
            # still exists here when dumping or renaming failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        self.model_exists = True

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Return the classifier's predictions for the rows of ``X``."""
        return self.classifier.predict(X)

    def eval(self, Y: pd.Series, Y_pred: np.ndarray) -> typing.Tuple[np.ndarray, np.float64]:
        """Compare true labels with predictions.

        Args:
            Y: True labels.
            Y_pred: Predicted labels, in the same order as ``Y``.

        Returns:
            A ``(confusion_matrix, accuracy)`` pair computed with
            ``sklearn.metrics.confusion_matrix`` and ``accuracy_score``.
        """
        cm = confusion_matrix(Y, Y_pred)
        ac = accuracy_score(Y, Y_pred)
        return (cm, ac)
def train_test_split(
    df: pd.DataFrame,
    test_size: float = 0.20,
    random_state: int = 0,
) -> typing.Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split a prepared frame into train/test features and labels.

    The ``genre`` column is used as the target; every other column is a
    feature.

    Args:
        df: Prepared data containing a ``genre`` column.
        test_size: Share of the rows placed in the test split.
        random_state: Seed for the shuffle. The default ``0`` is an explicit
            integer; the previous ``False`` was a boolean standing in for
            an integer seed (``bool`` is an ``int`` subclass).

    Returns:
        ``(X_train, X_test, Y_train, Y_test)``.

    Raises:
        KeyError: If ``df`` has no ``genre`` column.
    """
    # Imported under an alias because this function has the same name as the
    # sklearn helper; the alias keeps the call below from ever resolving to
    # this function itself.
    from sklearn.model_selection import train_test_split as sk_train_test_split

    X = df.drop(columns=["genre"])
    Y = df["genre"]
    # Unpacked so that a real tuple is returned, as the annotation promises.
    X_train, X_test, Y_train, Y_test = sk_train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, Y_train, Y_test