Added scaler
This commit is contained in:
parent
12b7f17838
commit
f779dfc4e0
@ -8,6 +8,7 @@ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
|||||||
from sklearn.naive_bayes import GaussianNB
|
from sklearn.naive_bayes import GaussianNB
|
||||||
from sklearn.neighbors import KNeighborsClassifier
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
from sklearn import preprocessing
|
from sklearn import preprocessing
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
from sklearn.tree import DecisionTreeClassifier
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
from sklearn.model_selection import ParameterGrid
|
from sklearn.model_selection import ParameterGrid
|
||||||
|
|
||||||
@ -24,24 +25,20 @@ def invoke_and_measure(func, *args, **kwargs):
|
|||||||
return result, elapsed_time
|
return result, elapsed_time
|
||||||
|
|
||||||
|
|
||||||
train_df_list = []
|
def load_dataset(directory):
|
||||||
for file in os.listdir(TRAIN_DATA_DIR):
|
df_list = []
|
||||||
file_path = os.path.join(TRAIN_DATA_DIR, file)
|
for file in os.listdir(directory):
|
||||||
|
file_path = os.path.join(directory, file)
|
||||||
df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
|
df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
|
||||||
names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
|
names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a",
|
||||||
|
"e",
|
||||||
"collapsed"])
|
"collapsed"])
|
||||||
train_df_list.append(df)
|
df_list.append(df)
|
||||||
data_train = pd.concat(train_df_list, ignore_index=True).sample(frac=1, random_state=42)
|
return pd.concat(df_list, ignore_index=True).sample(frac=0.01, random_state=42)
|
||||||
|
|
||||||
test_df_list = []
|
|
||||||
for file in os.listdir(TEST_DATA_DIR):
|
|
||||||
file_path = os.path.join(TEST_DATA_DIR, file)
|
|
||||||
df, load_time = invoke_and_measure(pd.read_csv, file_path, delim_whitespace=True, skiprows=1,
|
|
||||||
names=["tbid", "tphys", "r", "vr", "vt", "ik1", "ik2", "sm1", "sm2", "a", "e",
|
|
||||||
"collapsed"])
|
|
||||||
test_df_list.append(df)
|
|
||||||
|
|
||||||
data_test = pd.concat(test_df_list, ignore_index=True).sample(frac=1, random_state=42)
|
data_train = load_dataset(TRAIN_DATA_DIR)
|
||||||
|
data_test = load_dataset(TEST_DATA_DIR)
|
||||||
|
|
||||||
merged_data = pd.merge(data_train, data_test, indicator=True, how='outer')
|
merged_data = pd.merge(data_train, data_test, indicator=True, how='outer')
|
||||||
overlap_rows = merged_data[merged_data['_merge'] == 'both']
|
overlap_rows = merged_data[merged_data['_merge'] == 'both']
|
||||||
@ -109,6 +106,10 @@ classifiers_and_parameters = [
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
scaler = StandardScaler()
|
||||||
|
X_train_scaled = scaler.fit_transform(X_train)
|
||||||
|
X_test_scaled = scaler.transform(X_test)
|
||||||
|
|
||||||
for item in classifiers_and_parameters:
|
for item in classifiers_and_parameters:
|
||||||
name = item["name"]
|
name = item["name"]
|
||||||
clf = item["classifier"]
|
clf = item["classifier"]
|
||||||
@ -116,8 +117,8 @@ for item in classifiers_and_parameters:
|
|||||||
|
|
||||||
for params in param_grid:
|
for params in param_grid:
|
||||||
clf.set_params(**params)
|
clf.set_params(**params)
|
||||||
_, fit_time = invoke_and_measure(clf.fit, X_train, y_train_transformed)
|
_, fit_time = invoke_and_measure(clf.fit, X_train_scaled, y_train_transformed)
|
||||||
y_pred, pred_time = invoke_and_measure(clf.predict, X_test)
|
y_pred, pred_time = invoke_and_measure(clf.predict, X_test_scaled)
|
||||||
|
|
||||||
accuracy = accuracy_score(y_test_transformed, y_pred)
|
accuracy = accuracy_score(y_test_transformed, y_pred)
|
||||||
precision = precision_score(y_test_transformed, y_pred)
|
precision = precision_score(y_test_transformed, y_pred)
|
||||||
|
Loading…
Reference in New Issue
Block a user