Switch the pipeline to the adult_{train,dev,test}.csv splits.

(Text before the first "diff --git" header is commentary; patch tools skip it.)

predictions.py: load model.h5 through keras, predict on adult_test.csv and
write the raw prediction matrix with numpy.savetxt (model.predict returns an
array, which has no to_csv method).

script.py: train_dev_test() now writes whole-frame splits, roughly 70% train,
20% test and 10% dev (0.3 held out, then 0.33 of that held-out part becomes
dev). get_statistics() and create_model() read the new file names.

create_model() deliberately stays on softmax + categorical_crossentropy and
keeps saving to model.h5: the targets are one-hot encoded by to_categorical()
over 17 mutually exclusive classes, and predictions.py loads "model.h5".

NOTE(review): the post-image blob id on the script.py "index" line was taken
before create_model() was amended, so it is stale; regenerate the patch with
`git diff` if exact blob ids matter.
NOTE(review): indentation of the hunk lines was reconstructed (4 spaces,
aligned continuation lines) -- confirm against the working tree.

diff --git a/predictions.py b/predictions.py
index 8c8e433..d3ddf95 100644
--- a/predictions.py
+++ b/predictions.py
@@ -1,12 +1,12 @@
-import tensorflow
+import keras
+import numpy as np
+import tensorflow as tf
 import pandas as pd
 
-model = tensorflow.keras.models.load_model('model.h5')
-X_test_data = pd.read_csv("X_test.csv").astype(float)
-Y_test_data = pd.read_csv("Y_test.csv").astype(float)
+test_data = pd.read_csv("adult_test.csv")
 
-model.evaluate(X_test_data, Y_test_data)
+model = keras.models.load_model("model.h5")
 
-predictions = model.predict(X_test_data)
+predictions = model.predict(test_data)
 
-predictions.to_csv('predictions.csv', index=False)
+np.savetxt("predictions.csv", predictions, delimiter=",")
diff --git a/script.py b/script.py
index 46f4da0..e7fdad8 100644
--- a/script.py
+++ b/script.py
@@ -53,9 +53,9 @@ def check_if_data_set_has_division_into_subsets(file_name):
 
 
 def get_statistics(data):
-    train_data = pd.read_csv("X_train.csv", dtype={"income": "category"})
-    dev_data = pd.read_csv("X_dev.csv", dtype={"income": "category"})
-    test_data = pd.read_csv("X_test.csv", dtype={"income": "category"})
+    train_data = pd.read_csv("adult_train.csv", dtype={"income": "category"})
+    dev_data = pd.read_csv("adult_dev.csv", dtype={"income": "category"})
+    test_data = pd.read_csv("adult_test.csv", dtype={"income": "category"})
     print("Wielkość zbioru: ", len(data))
     print("Wielkość zbioru treningowego: ", len(train_data))
 
@@ -106,34 +106,31 @@ def clean(data):
 
 
 def train_dev_test(data):
-    X = data.copy()
-    y = pandas.DataFrame(data.pop('education-num'))
-    X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
-    X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.3, random_state=1)
+    train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
 
-    X_train.to_csv('X_train.csv', index=False)
-    X_dev.to_csv('X_dev.csv', index=False)
-    X_test.to_csv('X_test.csv', index=False)
-    Y_test.to_csv('Y_test.csv', index=False)
-    Y_train.to_csv('Y_train.csv', index=False)
-    Y_dev.to_csv('Y_dev.csv', index=False)
-    return X_train, X_dev, X_test
+    test_data, dev_data = train_test_split(test_data, test_size=0.33, random_state=42)
+
+    train_data.to_csv("adult_train.csv", index=False)
+    dev_data.to_csv("adult_dev.csv", index=False)
+    test_data.to_csv("adult_test.csv", index=False)
+
+    return train_data, dev_data, test_data
 
 
 def create_model():
-    data = pd.read_csv('X_train.csv')
+    data = pd.read_csv('adult_train.csv')
     X = data.copy()
     y = data["education-num"]
     X_train_encoded = pd.get_dummies(X)
     y_train_cat = to_categorical(y)
     model = Sequential()
     model.add(Dense(64, activation='relu', input_dim=X_train_encoded.shape[1]))
     model.add(Dense(17, activation='softmax'))
     model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
     model.fit(X_train_encoded, y_train_cat, epochs=10, batch_size=32, validation_data=(X_train_encoded, y_train_cat))
     model.save('model.h5')
 
 
 if __name__ == '__main__':