Improved performance and accuracy tests for different classifiers
- Added additional tests for classifiers with different arguments
- Added a check for overlapping rows in the test and train datasets
This commit is contained in:
parent
c7503596f6
commit
773aea2f05
@ -1,22 +1,20 @@
|
|||||||
import os
|
import os
|
||||||
import pickle
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
|
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
|
||||||
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, confusion_matrix, \
|
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
||||||
matthews_corrcoef, cohen_kappa_score
|
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
||||||
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
|
|
||||||
from sklearn.naive_bayes import GaussianNB
|
from sklearn.naive_bayes import GaussianNB
|
||||||
from sklearn.neighbors import KNeighborsClassifier
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
from sklearn.neural_network import MLPClassifier
|
|
||||||
from sklearn.svm import SVR, SVC
|
|
||||||
from sklearn import preprocessing
|
from sklearn import preprocessing
|
||||||
from sklearn.tree import DecisionTreeClassifier
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
|
from sklearn.model_selection import ParameterGrid
|
||||||
|
|
||||||
TRAIN_DATA_DIR = "datasets_train"
|
TRAIN_DATA_DIR = "datasets_train_raw"
|
||||||
TEST_DATA_DIR = "datasets_test"
|
TEST_DATA_DIR = "datasets_test"
|
||||||
|
|
||||||
|
|
||||||
def invoke_and_measure(func, *args, **kwargs):
|
def invoke_and_measure(func, *args, **kwargs):
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
result = func(*args, **kwargs)
|
result = func(*args, **kwargs)
|
||||||
@ -25,6 +23,7 @@ def invoke_and_measure(func, *args, **kwargs):
|
|||||||
elapsed_time = end_time - start_time
|
elapsed_time = end_time - start_time
|
||||||
return result, elapsed_time
|
return result, elapsed_time
|
||||||
|
|
||||||
|
|
||||||
train_df_list = []
|
train_df_list = []
|
||||||
for file in os.listdir(TRAIN_DATA_DIR):
|
for file in os.listdir(TRAIN_DATA_DIR):
|
||||||
file_path = os.path.join(TRAIN_DATA_DIR, file)
|
file_path = os.path.join(TRAIN_DATA_DIR, file)
|
||||||
@ -42,42 +41,88 @@ for file in os.listdir(TEST_DATA_DIR):
|
|||||||
"collapsed"])
|
"collapsed"])
|
||||||
test_df_list.append(df)
|
test_df_list.append(df)
|
||||||
|
|
||||||
data_test = pd.concat(train_df_list, ignore_index=True).sample(frac=1, random_state=42)
|
data_test = pd.concat(test_df_list, ignore_index=True).sample(frac=1, random_state=42)
|
||||||
|
|
||||||
|
merged_data = pd.merge(data_train, data_test, indicator=True, how='outer')
|
||||||
|
overlap_rows = merged_data[merged_data['_merge'] == 'both']
|
||||||
|
if overlap_rows.empty:
|
||||||
|
print("There are no overlapping rows between train and test datasets.")
|
||||||
|
else:
|
||||||
|
print("Train and test datasets have following overlapping rows: ")
|
||||||
|
print(overlap_rows)
|
||||||
|
|
||||||
X_train = data_train.iloc[:, 1:-1].values
|
X_train = data_train.iloc[:, 1:-1].values
|
||||||
y_train = data_train.iloc[:, -1].values
|
y_train = data_train.iloc[:, -1].values
|
||||||
|
|
||||||
lab = preprocessing.LabelEncoder()
|
lab = preprocessing.LabelEncoder()
|
||||||
y_transformed = lab.fit_transform(y_train)
|
y_train_transformed = lab.fit_transform(y_train)
|
||||||
|
|
||||||
X_test = data_test.iloc[:, 1:-1].values
|
X_test = data_test.iloc[:, 1:-1].values
|
||||||
|
y_test = data_test.iloc[:, -1].values
|
||||||
|
y_test_transformed = lab.fit_transform(y_test)
|
||||||
|
|
||||||
names = [
|
classifiers_and_parameters = [
|
||||||
"Nearest Neighbors",
|
{
|
||||||
"Decision Tree",
|
"name": "Nearest Neighbors",
|
||||||
"Random Forest",
|
"classifier": KNeighborsClassifier(),
|
||||||
"Naive Bayes",
|
"parameters": {
|
||||||
"QDA",
|
"n_neighbors": [3, 5, 10, 50]
|
||||||
"Gradient Boosting"
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Decision Tree",
|
||||||
|
"classifier": DecisionTreeClassifier(),
|
||||||
|
"parameters": {
|
||||||
|
"max_depth": [10, 20, 50]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Random Forest",
|
||||||
|
"classifier": RandomForestClassifier(),
|
||||||
|
"parameters": {
|
||||||
|
"max_depth": [10, 20, 50],
|
||||||
|
"n_estimators": [10, 50, 100],
|
||||||
|
"max_features": ['sqrt', 'log2']
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Naive Bayes",
|
||||||
|
"classifier": GaussianNB(),
|
||||||
|
"parameters": {
|
||||||
|
"var_smoothing": [1e-09, 1e-08, 1e-07]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "QDA",
|
||||||
|
"classifier": QuadraticDiscriminantAnalysis(),
|
||||||
|
"parameters": {
|
||||||
|
"reg_param": [0.0, 0.5, 1.0]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Gradient Boosting",
|
||||||
|
"classifier": GradientBoostingClassifier(),
|
||||||
|
"parameters": {
|
||||||
|
"learning_rate": [0.01, 0.05, 0.1],
|
||||||
|
"n_estimators": [50, 100, 200]
|
||||||
|
}
|
||||||
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
classifiers = {
|
for item in classifiers_and_parameters:
|
||||||
KNeighborsClassifier(3),
|
name = item["name"]
|
||||||
DecisionTreeClassifier(max_depth=5),
|
clf = item["classifier"]
|
||||||
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
|
param_grid = ParameterGrid(item["parameters"])
|
||||||
GaussianNB(),
|
|
||||||
QuadraticDiscriminantAnalysis(),
|
|
||||||
GradientBoostingClassifier()
|
|
||||||
}
|
|
||||||
|
|
||||||
for name, clf in zip(names, classifiers):
|
for params in param_grid:
|
||||||
_, fit_time = invoke_and_measure(clf.fit, X_train, y_transformed)
|
clf.set_params(**params)
|
||||||
|
_, fit_time = invoke_and_measure(clf.fit, X_train, y_train_transformed)
|
||||||
y_pred, pred_time = invoke_and_measure(clf.predict, X_test)
|
y_pred, pred_time = invoke_and_measure(clf.predict, X_test)
|
||||||
accuracy = accuracy_score(y_train, y_pred)
|
|
||||||
precision = precision_score(y_train, y_pred)
|
accuracy = accuracy_score(y_test_transformed, y_pred)
|
||||||
recall = recall_score(y_train, y_pred)
|
precision = precision_score(y_test_transformed, y_pred)
|
||||||
f1 = f1_score(y_train, y_pred)
|
recall = recall_score(y_test_transformed, y_pred)
|
||||||
|
f1 = f1_score(y_test_transformed, y_pred)
|
||||||
print(
|
print(
|
||||||
f"{name}: accuracy={accuracy * 100:.2f}% precision={precision * 100:.2f}% recall={recall * 100:.2f}% "
|
f"{name} with params {params}: accuracy={accuracy * 100:.2f}% precision={precision * 100:.2f}% recall={recall * 100:.2f}% "
|
||||||
f"f1={f1 * 100:.2f}% "
|
|
||||||
f"train_time={fit_time:.5f}s predict_time={pred_time:.5f}s")
|
f"train_time={fit_time:.5f}s predict_time={pred_time:.5f}s")
|
||||||
|
Loading…
Reference in New Issue
Block a user