Added scaler

This commit is contained in:
ilydzi 2023-11-18 10:57:16 +01:00
parent 12b7f17838
commit f779dfc4e0

View File

@ -8,6 +8,7 @@ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid from sklearn.model_selection import ParameterGrid
@ -24,24 +25,20 @@ def invoke_and_measure(func, *args, **kwargs):
return result, elapsed_time return result, elapsed_time
train_df_list = [] def load_dataset(directory):
for file in os.listdir(TRAIN_DATA_DIR): df_list = []
file_path = os.path.join(TRAIN_DATA_DIR, file) for file in os.listdir(directory):
file_path = os.path.join(directory, file)
df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1, df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e", names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a",
"e",
"collapsed"]) "collapsed"])
train_df_list.append(df) df_list.append(df)
data_train = pd.concat(train_df_list, ignore_index=True).sample(frac=1, random_state=42) return pd.concat(df_list, ignore_index=True).sample(frac=0.01, random_state=42)
test_df_list = []
for file in os.listdir(TEST_DATA_DIR):
file_path = os.path.join(TEST_DATA_DIR, file)
df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
"collapsed"])
test_df_list.append(df)
data_test = pd.concat(test_df_list, ignore_index=True).sample(frac=1, random_state=42) data_train = load_dataset(TRAIN_DATA_DIR)
data_test = load_dataset(TEST_DATA_DIR)
merged_data = pd.merge(data_train, data_test, indicator=True, how='outer') merged_data = pd.merge(data_train, data_test, indicator=True, how='outer')
overlap_rows = merged_data[merged_data['_merge'] == 'both'] overlap_rows = merged_data[merged_data['_merge'] == 'both']
@ -109,6 +106,10 @@ classifiers_and_parameters = [
} }
] ]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
for item in classifiers_and_parameters: for item in classifiers_and_parameters:
name = item["name"] name = item["name"]
clf = item["classifier"] clf = item["classifier"]
@ -116,8 +117,8 @@ for item in classifiers_and_parameters:
for params in param_grid: for params in param_grid:
clf.set_params(**params) clf.set_params(**params)
_, fit_time = invoke_and_measure(clf.fit, X_train, y_train_transformed) _, fit_time = invoke_and_measure(clf.fit, X_train_scaled, y_train_transformed)
y_pred, pred_time = invoke_and_measure(clf.predict, X_test) y_pred, pred_time = invoke_and_measure(clf.predict, X_test_scaled)
accuracy = accuracy_score(y_test_transformed, y_pred) accuracy = accuracy_score(y_test_transformed, y_pred)
precision = precision_score(y_test_transformed, y_pred) precision = precision_score(y_test_transformed, y_pred)