Added scaler
This commit is contained in:
parent
12b7f17838
commit
f779dfc4e0
@ -8,6 +8,7 @@ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn import preprocessing
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.model_selection import ParameterGrid
|
||||
|
||||
@ -24,24 +25,20 @@ def invoke_and_measure(func, *args, **kwargs):
|
||||
return result, elapsed_time
|
||||
|
||||
|
||||
train_df_list = []
|
||||
for file in os.listdir(TRAIN_DATA_DIR):
|
||||
file_path = os.path.join(TRAIN_DATA_DIR, file)
|
||||
df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
|
||||
names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
|
||||
"collapsed"])
|
||||
train_df_list.append(df)
|
||||
data_train = pd.concat(train_df_list, ignore_index=True).sample(frac=1, random_state=42)
|
||||
def load_dataset(directory, frac=0.01, random_state=42):
    """Load every file in *directory* into one shuffled, sampled DataFrame.

    Each directory entry is parsed with ``pd.read_csv`` as a
    whitespace-delimited table; the first line of every file is skipped and
    the fixed column names below are applied. The per-file frames are
    concatenated with a fresh index and then randomly sampled.

    Args:
        directory: Path of the directory whose entries are read.
        frac: Fraction of the concatenated rows to keep (passed to
            ``DataFrame.sample``). Defaults to 0.01.
        random_state: Seed passed to ``DataFrame.sample``. Defaults to 42.

    Returns:
        A ``pandas.DataFrame`` holding the sampled rows.

    Raises:
        ValueError: If *directory* contains no entries.
    """
    column_names = [
        "tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
        "collapsed",
    ]

    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the concatenated frame stable, so the seeded sample below
    # selects the same rows on every machine.
    for file in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file)
        # sep=r"\s+" is the documented replacement for the deprecated
        # delim_whitespace=True. The elapsed time is intentionally discarded.
        df, _load_time = invoke_and_measure(
            pd.read_csv, file_path, sep=r"\s+", skiprows=1, names=column_names,
        )
        frames.append(df)

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No data files found in {directory!r}")

    combined = pd.concat(frames, ignore_index=True)
    return combined.sample(frac=frac, random_state=random_state)
|
||||
|
||||
test_df_list = []
|
||||
for file in os.listdir(TEST_DATA_DIR):
|
||||
file_path = os.path.join(TEST_DATA_DIR, file)
|
||||
df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
|
||||
names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
|
||||
"collapsed"])
|
||||
test_df_list.append(df)
|
||||
|
||||
data_test = pd.concat(test_df_list, ignore_index=True).sample(frac=1, random_state=42)
|
||||
data_train = load_dataset(TRAIN_DATA_DIR)
|
||||
data_test = load_dataset(TEST_DATA_DIR)
|
||||
|
||||
merged_data = pd.merge(data_train, data_test, indicator=True, how='outer')
|
||||
overlap_rows = merged_data[merged_data['_merge'] == 'both']
|
||||
@ -109,6 +106,10 @@ classifiers_and_parameters = [
|
||||
}
|
||||
]
|
||||
|
||||
scaler = StandardScaler()
|
||||
X_train_scaled = scaler.fit_transform(X_train)
|
||||
X_test_scaled = scaler.transform(X_test)
|
||||
|
||||
for item in classifiers_and_parameters:
|
||||
name = item["name"]
|
||||
clf = item["classifier"]
|
||||
@ -116,8 +117,8 @@ for item in classifiers_and_parameters:
|
||||
|
||||
for params in param_grid:
|
||||
clf.set_params(**params)
|
||||
_, fit_time = invoke_and_measure(clf.fit, X_train, y_train_transformed)
|
||||
y_pred, pred_time = invoke_and_measure(clf.predict, X_test)
|
||||
_, fit_time = invoke_and_measure(clf.fit, X_train_scaled, y_train_transformed)
|
||||
y_pred, pred_time = invoke_and_measure(clf.predict, X_test_scaled)
|
||||
|
||||
accuracy = accuracy_score(y_test_transformed, y_pred)
|
||||
precision = precision_score(y_test_transformed, y_pred)
|
||||
|
Loading…
Reference in New Issue
Block a user