Added scaler

This commit is contained in:
ilydzi 2023-11-18 10:57:16 +01:00
parent 12b7f17838
commit f779dfc4e0

View File

@ -8,6 +8,7 @@ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
@ -24,24 +25,20 @@ def invoke_and_measure(func, *args, **kwargs):
return result, elapsed_time
# Load the training set: read every entry of TRAIN_DATA_DIR as a
# whitespace-delimited table (first line skipped, fixed column names) and
# collect one DataFrame per file.
# NOTE(review): load_dataset(directory) in this file performs the same read
# logic; presumably this loop is the pre-refactor version -- confirm.
# NOTE(review): os.listdir returns entries in arbitrary order, so the row
# order before shuffling depends on the filesystem.
train_df_list = []
for file in os.listdir(TRAIN_DATA_DIR):
file_path = os.path.join(TRAIN_DATA_DIR, file)
# load_time is returned by invoke_and_measure but is not used in the lines
# visible here.
df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
"collapsed"])
train_df_list.append(df)
# Concatenate all files with a fresh 0..n-1 index, then shuffle every row
# (frac=1) using a fixed seed.
data_train = pd.concat(train_df_list, ignore_index=True).sample(frac=1, random_state=42)
def load_dataset(directory, frac=0.01):
    """Load every file in *directory* into one randomly sampled DataFrame.

    Each directory entry is read as a whitespace-delimited table whose first
    line is skipped and whose columns receive the fixed names listed in
    ``columns`` below. The per-file frames are concatenated and a seeded
    random sample of the rows is returned.

    Args:
        directory: Path of the directory whose entries are all read.
        frac: Fraction of the concatenated rows to return. Defaults to
            0.01, the value that was previously hard-coded; pass 1 to keep
            every row (shuffled).

    Returns:
        A DataFrame with the sampled rows (``random_state=42``).

    Raises:
        ValueError: If *directory* contains no entries.
    """
    columns = [
        "tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a",
        "e", "collapsed",
    ]

    # os.listdir returns entries in arbitrary order. Sorting fixes the row
    # order of the concatenated frame, so the seeded sample below selects
    # the same rows on every machine.
    file_names = sorted(os.listdir(directory))
    if not file_names:
        # pd.concat would raise an opaque "No objects to concatenate";
        # fail early with the offending path instead (same exception type).
        raise ValueError(f"No data files found in {directory!r}")

    # sep=r"\s+" is the non-deprecated equivalent of delim_whitespace=True.
    # invoke_and_measure returns (result, elapsed_time); only the frame is
    # needed here.
    frames = [
        invoke_and_measure(
            pd.read_csv,
            os.path.join(directory, file_name),
            sep=r"\s+",
            skiprows=1,
            names=columns,
        )[0]
        for file_name in file_names
    ]
    return pd.concat(frames, ignore_index=True).sample(frac=frac, random_state=42)
# Load the test set: read every entry of TEST_DATA_DIR as a
# whitespace-delimited table (first line skipped, fixed column names) and
# collect one DataFrame per file.
# NOTE(review): load_dataset(directory) in this file performs the same read
# logic; presumably this loop is the pre-refactor version -- confirm.
# NOTE(review): os.listdir returns entries in arbitrary order, so the row
# order before shuffling depends on the filesystem.
test_df_list = []
for file in os.listdir(TEST_DATA_DIR):
file_path = os.path.join(TEST_DATA_DIR, file)
# load_time is returned by invoke_and_measure but is not used in the lines
# visible here.
df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
"collapsed"])
test_df_list.append(df)
# Concatenate all files with a fresh 0..n-1 index, then shuffle every row
# (frac=1) using a fixed seed.
data_test = pd.concat(test_df_list, ignore_index=True).sample(frac=1, random_state=42)
# Build the train and test frames from their respective directories.
data_train = load_dataset(TRAIN_DATA_DIR)
data_test = load_dataset(TEST_DATA_DIR)

# Outer-join the two frames on their shared columns; the indicator column
# "_merge" records whether each row came from the left frame, the right
# frame, or both.
merged_data = data_train.merge(data_test, how='outer', indicator=True)

# Rows tagged 'both' occur in the training and the test data alike.
overlap_rows = merged_data.loc[merged_data['_merge'].eq('both')]
@ -109,6 +106,10 @@ classifiers_and_parameters = [
}
]
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both splits so the test data does not influence the
# scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
for item in classifiers_and_parameters:
name = item["name"]
clf = item["classifier"]
@ -116,8 +117,8 @@ for item in classifiers_and_parameters:
for params in param_grid:
clf.set_params(**params)
_, fit_time = invoke_and_measure(clf.fit, X_train, y_train_transformed)
y_pred, pred_time = invoke_and_measure(clf.predict, X_test)
_, fit_time = invoke_and_measure(clf.fit, X_train_scaled, y_train_transformed)
y_pred, pred_time = invoke_and_measure(clf.predict, X_test_scaled)
accuracy = accuracy_score(y_test_transformed, y_pred)
precision = precision_score(y_test_transformed, y_pred)