Changes to classifiers tests
- changed train and test datasets
- removed classifiers which are too slow or produce unsatisfactory results
- added more metrics to evaluate classification performance
parent f6f82463b6
commit c7503596f6
@@ -1,18 +1,21 @@
+import os
+import pickle
 import time
 
 import pandas as pd
-from sklearn import preprocessing
 from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, confusion_matrix, \
+    matthews_corrcoef, cohen_kappa_score
 from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
-from sklearn.gaussian_process import GaussianProcessClassifier
-from sklearn.gaussian_process.kernels import RBF
-from sklearn.metrics import accuracy_score
 from sklearn.naive_bayes import GaussianNB
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.neural_network import MLPClassifier
-from sklearn.svm import SVC
+from sklearn.svm import SVR, SVC
+from sklearn import preprocessing
 from sklearn.tree import DecisionTreeClassifier
 
+TRAIN_DATA_DIR = "datasets_train"
+TEST_DATA_DIR = "datasets_test"
 
 def invoke_and_measure(func, *args, **kwargs):
     start_time = time.time()
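For orientation, the timing helper that both hunks cut across reads in full roughly as below; only its first and last lines appear in the diff, so the body in between (the call itself and the end_time stamp) is an assumption implied by the elapsed_time arithmetic:

    import time

    def invoke_and_measure(func, *args, **kwargs):
        # Run func and return its result together with the wall-clock time taken.
        start_time = time.time()
        result = func(*args, **kwargs)  # assumed: implied by "return result"
        end_time = time.time()          # assumed: implied by the elapsed_time line
        elapsed_time = end_time - start_time
        return result, elapsed_time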
@@ -22,13 +25,24 @@ def invoke_and_measure(func, *args, **kwargs):
     elapsed_time = end_time - start_time
     return result, elapsed_time
 
-data = pd.read_csv('starclusters-global-parameters2.dat',skiprows=1 ,delim_whitespace=True, header=None)
+train_df_list = []
+for file in os.listdir(TRAIN_DATA_DIR):
+    file_path = os.path.join(TRAIN_DATA_DIR, file)
+    df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
+                                       names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
+                                              "collapsed"])
+    train_df_list.append(df)
+data_train = pd.concat(train_df_list, ignore_index=True).sample(frac=1, random_state=42)
 
-shuffled_data = data.sample(frac=1, random_state=42)
-n = int(0.8 * len(shuffled_data))
+test_df_list = []
+for file in os.listdir(TEST_DATA_DIR):
+    file_path = os.path.join(TEST_DATA_DIR, file)
+    df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
+                                       names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
+                                              "collapsed"])
+    test_df_list.append(df)
 
-data_train = shuffled_data[:n]
-data_test = shuffled_data[n:]
+data_test = pd.concat(test_df_list, ignore_index=True).sample(frac=1, random_state=42)
 
 X_train = data_train.iloc[:, 1:-1].values
 y_train = data_train.iloc[:, -1].values
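The two loading loops added above differ only in the directory they scan, so a follow-up could fold them into one helper. A minimal sketch, assuming only names visible in the diff (load_dir itself is hypothetical):

    import os
    import pandas as pd

    COLUMNS = ["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2",
               "sm1", "sm2", "a", "e", "collapsed"]

    def load_dir(directory):
        # Hypothetical helper: read every whitespace-delimited file in the
        # directory (skipping its header row) and shuffle the concatenated result.
        frames = [pd.read_csv(os.path.join(directory, f), delim_whitespace=True,
                              skiprows=1, names=COLUMNS)
                  for f in os.listdir(directory)]
        return pd.concat(frames, ignore_index=True).sample(frac=1, random_state=42)

    data_train = load_dir("datasets_train")
    data_test = load_dir("datasets_test")

Deduplicating the loop this way also removes the risk of concatenating the wrong list when the block is copy-pasted.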
@@ -42,29 +56,29 @@ names = [
     "Nearest Neighbors",
     "Decision Tree",
     "Random Forest",
-    "Neural Net",
-    "AdaBoost",
     "Naive Bayes",
     "QDA",
-    "SVC",
     "Gradient Boosting"
 ]
 
 classifiers = [
     KNeighborsClassifier(3),
     DecisionTreeClassifier(max_depth=5),
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
-    MLPClassifier(alpha=1, max_iter=1000),
-    AdaBoostClassifier(),
     GaussianNB(),
     QuadraticDiscriminantAnalysis(),
-    SVC(),
     GradientBoostingClassifier()
 ]
-
 
 for name, clf in zip(names, classifiers):
     _, fit_time = invoke_and_measure(clf.fit, X_train, y_transformed)
     y_pred, pred_time = invoke_and_measure(clf.predict, X_test)
-    accuracy = accuracy_score(lab.transform(data_test.iloc[:, -1].values), y_pred)
-    print(f"{name}: accuracy={accuracy * 100:.2f}% train={fit_time:.5f}s predict={pred_time:.5f}s")
+    y_test = lab.transform(data_test.iloc[:, -1].values)
+    accuracy = accuracy_score(y_test, y_pred)
+    precision = precision_score(y_test, y_pred)
+    recall = recall_score(y_test, y_pred)
+    f1 = f1_score(y_test, y_pred)
+    print(
+        f"{name}: accuracy={accuracy * 100:.2f}% precision={precision * 100:.2f}% recall={recall * 100:.2f}% "
+        f"f1={f1 * 100:.2f}% "
+        f"train_time={fit_time:.5f}s predict_time={pred_time:.5f}s")
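The widened metrics import also brings in log_loss, confusion_matrix, matthews_corrcoef and cohen_kappa_score, which the loop does not print yet. A sketch of how they could be reported inside the same loop, assuming y_test, y_pred, name, clf and X_test are bound as above, and noting that log_loss needs class probabilities rather than hard predictions:

    from sklearn.metrics import (confusion_matrix, log_loss,
                                 matthews_corrcoef, cohen_kappa_score)

    mcc = matthews_corrcoef(y_test, y_pred)    # chance-corrected single-number summary
    kappa = cohen_kappa_score(y_test, y_pred)  # agreement corrected for chance
    print(f"{name}: mcc={mcc:.3f} kappa={kappa:.3f}")
    print(confusion_matrix(y_test, y_pred))
    if hasattr(clf, "predict_proba"):
        # not every classifier exposes predict_proba (e.g. SVC without probability=True)
        print(f"{name}: log_loss={log_loss(y_test, clf.predict_proba(X_test)):.4f}")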