Improved performance and accuracy tests for different classifiers

- added more tests for classifiers with different arguments
- added check for overlapping rows in test and train datasets
This commit is contained in:
Ilya Dziamidchyk 2023-09-19 11:18:38 +02:00
parent c7503596f6
commit 773aea2f05

View File

@ -1,22 +1,20 @@
import os import os
import pickle
import time import time
import pandas as pd import pandas as pd
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, confusion_matrix, \ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
matthews_corrcoef, cohen_kappa_score from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVR, SVC
from sklearn import preprocessing from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
TRAIN_DATA_DIR = "datasets_train" TRAIN_DATA_DIR = "datasets_train_raw"
TEST_DATA_DIR = "datasets_test" TEST_DATA_DIR = "datasets_test"
def invoke_and_measure(func, *args, **kwargs): def invoke_and_measure(func, *args, **kwargs):
start_time = time.time() start_time = time.time()
result = func(*args, **kwargs) result = func(*args, **kwargs)
@ -25,6 +23,7 @@ def invoke_and_measure(func, *args, **kwargs):
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
return result, elapsed_time return result, elapsed_time
train_df_list = [] train_df_list = []
for file in os.listdir(TRAIN_DATA_DIR): for file in os.listdir(TRAIN_DATA_DIR):
file_path = os.path.join(TRAIN_DATA_DIR, file) file_path = os.path.join(TRAIN_DATA_DIR, file)
@ -42,42 +41,88 @@ for file in os.listdir(TEST_DATA_DIR):
"collapsed"]) "collapsed"])
test_df_list.append(df) test_df_list.append(df)
data_test = pd.concat(train_df_list, ignore_index=True).sample(frac=1, random_state=42) data_test = pd.concat(test_df_list, ignore_index=True).sample(frac=1, random_state=42)
merged_data = pd.merge(data_train, data_test, indicator=True, how='outer')
overlap_rows = merged_data[merged_data['_merge'] == 'both']
if overlap_rows.empty:
print("There are no overlapping rows between train and test datasets.")
else:
print("Train and test datasets have following overlapping rows: ")
print(overlap_rows)
X_train = data_train.iloc[:, 1:-1].values X_train = data_train.iloc[:, 1:-1].values
y_train = data_train.iloc[:, -1].values y_train = data_train.iloc[:, -1].values
lab = preprocessing.LabelEncoder() lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y_train) y_train_transformed = lab.fit_transform(y_train)
X_test = data_test.iloc[:, 1:-1].values X_test = data_test.iloc[:, 1:-1].values
y_test = data_test.iloc[:, -1].values
y_test_transformed = lab.fit_transform(y_test)
names = [ classifiers_and_parameters = [
"Nearest Neighbors", {
"Decision Tree", "name": "Nearest Neighbors",
"Random Forest", "classifier": KNeighborsClassifier(),
"Naive Bayes", "parameters": {
"QDA", "n_neighbors": [3, 5, 10, 50]
"Gradient Boosting" }
},
{
"name": "Decision Tree",
"classifier": DecisionTreeClassifier(),
"parameters": {
"max_depth": [10, 20, 50]
}
},
{
"name": "Random Forest",
"classifier": RandomForestClassifier(),
"parameters": {
"max_depth": [10, 20, 50],
"n_estimators": [10, 50, 100],
"max_features": ['sqrt', 'log2']
}
},
{
"name": "Naive Bayes",
"classifier": GaussianNB(),
"parameters": {
"var_smoothing": [1e-09, 1e-08, 1e-07]
}
},
{
"name": "QDA",
"classifier": QuadraticDiscriminantAnalysis(),
"parameters": {
"reg_param": [0.0, 0.5, 1.0]
}
},
{
"name": "Gradient Boosting",
"classifier": GradientBoostingClassifier(),
"parameters": {
"learning_rate": [0.01, 0.05, 0.1],
"n_estimators": [50, 100, 200]
}
}
] ]
classifiers = { for item in classifiers_and_parameters:
KNeighborsClassifier(3), name = item["name"]
DecisionTreeClassifier(max_depth=5), clf = item["classifier"]
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), param_grid = ParameterGrid(item["parameters"])
GaussianNB(),
QuadraticDiscriminantAnalysis(),
GradientBoostingClassifier()
}
for name, clf in zip(names, classifiers): for params in param_grid:
_, fit_time = invoke_and_measure(clf.fit, X_train, y_transformed) clf.set_params(**params)
y_pred, pred_time = invoke_and_measure(clf.predict, X_test) _, fit_time = invoke_and_measure(clf.fit, X_train, y_train_transformed)
accuracy = accuracy_score(y_train, y_pred) y_pred, pred_time = invoke_and_measure(clf.predict, X_test)
precision = precision_score(y_train, y_pred)
recall = recall_score(y_train, y_pred) accuracy = accuracy_score(y_test_transformed, y_pred)
f1 = f1_score(y_train, y_pred) precision = precision_score(y_test_transformed, y_pred)
print( recall = recall_score(y_test_transformed, y_pred)
f"{name}: accuracy={accuracy * 100:.2f}% precision={precision * 100:.2f}% recall={recall * 100:.2f}% " f1 = f1_score(y_test_transformed, y_pred)
f"f1={f1 * 100:.2f}% " print(
f"train_time={fit_time:.5f}s predict_time={pred_time:.5f}s") f"{name} with params {params}: accuracy={accuracy * 100:.2f}% precision={precision * 100:.2f}% recall={recall * 100:.2f}% "
f"train_time={fit_time:.5f}s predict_time={pred_time:.5f}s")