"""Download the UCI Adult data set, split it, print statistics and train a model.

Running the module as a script performs the whole pipeline and leaves
``adult.csv``, the ``X_*.csv`` / ``Y_*.csv`` split files and ``model.h5`` in
the current working directory.
"""
import os
import urllib.request
from os.path import exists

import numpy as np
import pandas
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
RAW_FILE = "adult.data"
CSV_FILE = "adult.csv"

COLUMN_NAMES = ["age", "workclass", "fnlwgt", "education", "education-num",
                "marital-status", "occupation", "relationship", "race", "sex",
                "capital-gain", "capital-loss", "hours-per-week",
                "native-country", "income"]
NUMERIC_FEATURES = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

# Column predicted by the model, and the width of the softmax output layer.
TARGET = 'education-num'
NUM_CLASSES = 17


def download_file():
    """Fetch the raw data set and convert it to CSV unless the CSV already exists.

    The existence check uses a path relative to the working directory, which
    is where ``convert_data_to_csv`` writes the file.  (Checking
    ``'/adult.csv'`` looked in the filesystem root and therefore triggered a
    download on every run.)
    """
    if exists(CSV_FILE):
        return
    urllib.request.urlretrieve(DATA_URL, RAW_FILE)
    convert_data_to_csv()


def convert_data_to_csv():
    """Rewrite the downloaded ``adult.data`` as ``adult.csv`` and delete the raw file.

    The raw file has no header row, so pandas assigns integer column labels,
    which become the header line of the CSV.
    """
    # Fields in the raw file are separated by ", "; without
    # ``skipinitialspace`` every string value keeps a leading blank, so the
    # missing-value marker would be " ?" and ``clean`` would never match it.
    raw = pd.read_csv(RAW_FILE, header=None, skipinitialspace=True)
    raw.to_csv(CSV_FILE, index=False)
    os.remove(RAW_FILE)


def add_subsets_to_csv_file(data):
    """Name the columns of *data*, write the train/dev/test split and report sizes.

    Args:
        data: Frame holding the 15 Adult columns in their original order.
            Its column labels are replaced in place.

    Returns:
        The same frame, with named columns.

    Raises:
        ValueError: If *data* does not have exactly ``len(COLUMN_NAMES)`` columns.
    """
    if data.shape[1] != len(COLUMN_NAMES):
        raise ValueError(
            f"expected {len(COLUMN_NAMES)} columns, got {data.shape[1]}; "
            f"delete {CSV_FILE} and run again to regenerate it"
        )
    data.columns = list(COLUMN_NAMES)

    X_train_data, X_dev_data, X_test_data = train_dev_test(data)
    print("Data set: ", data.shape)
    print("Train Data set: ", X_train_data.shape)
    print("Dev Data set: ", X_dev_data.shape)
    print("Test Data set: ", X_test_data.shape)
    return data


def check_if_data_set_has_division_into_subsets(file_name):
    """Create the split files when *file_name* has no train/dev/test columns.

    When any of the three marker columns is missing, the data is split and
    the CSV is rewritten with named columns.

    Args:
        file_name: Path of the CSV file to inspect and possibly rewrite.
    """
    data = pd.read_csv(file_name)
    if not {"train", "dev", "test"}.issubset(data.columns):
        data_set = add_subsets_to_csv_file(data)
        data_set.to_csv(file_name, index=False)


def get_statistics(data):
    """Print set sizes, age statistics and ``income`` class frequencies.

    Args:
        data: The full data set with named columns.  The train/dev/test
            subsets are read from ``X_train.csv``, ``X_dev.csv`` and
            ``X_test.csv`` in the working directory.
    """
    income_dtype = {"income": "category"}
    train_data = pd.read_csv("X_train.csv", dtype=income_dtype)
    dev_data = pd.read_csv("X_dev.csv", dtype=income_dtype)
    test_data = pd.read_csv("X_test.csv", dtype=income_dtype)

    print("Wielkość zbioru: ", len(data))
    print("Wielkość zbioru treningowego: ", len(train_data))
    print("Wielkość zbioru walidacyjnego: ", len(dev_data))
    print("Wielkość zbioru testowego: ", len(test_data))

    age = data["age"]
    print("Średnia wartość wieku: ", np.mean(age))
    print("Minimalna wartość wieku: ", np.min(age))
    print("Maksymalna wartość wieku: ", np.max(age))
    print("Odchylenie standardowe wartości wieku: ", np.std(age))
    print("Mediana wartości wieku: ", np.median(age))

    print("Rozkład częstości klas: ")
    freq_dist_all = data['income'].value_counts()
    print('Rozkład częstości etykiet klas na całym zbiorze danych:')
    print(freq_dist_all)
    freq_dist_train = train_data['income'].value_counts()
    print('Rozkład częstości etykiet klas na zbiorze treningowym:')
    print(freq_dist_train)
    freq_dist_test = test_data['income'].value_counts()
    print('Rozkład częstości etykiet klas na zbiorze testowym:')
    print(freq_dist_test)
    freq_dist_dev = dev_data['income'].value_counts()
    print('Rozkład częstości etykiet klas na zbiorze walidacyjnym:')
    print(freq_dist_dev)


def normalization(data):
    """Standardize the numeric feature columns of *data* in place and print a preview.

    Args:
        data: Frame containing every column listed in ``NUMERIC_FEATURES``.
    """
    scaler = StandardScaler()
    data[NUMERIC_FEATURES] = scaler.fit_transform(data[NUMERIC_FEATURES])
    print(data.head())


def clean(data):
    """Drop rows with missing values and duplicates from *data*, in place.

    ``'?'`` cells are treated as missing.  The numeric feature columns are
    converted with ``pd.to_numeric`` afterwards.

    Args:
        data: Frame containing every column listed in ``NUMERIC_FEATURES``.
    """
    data.replace('?', np.nan, inplace=True)
    data.dropna(inplace=True)
    data.drop_duplicates(inplace=True)
    data[NUMERIC_FEATURES] = data[NUMERIC_FEATURES].apply(pd.to_numeric)


def train_dev_test(data):
    """Split *data* into train/dev/test parts and write them to CSV files.

    The ``X_*.csv`` files hold complete rows (target column included, since
    ``create_model`` and ``get_statistics`` read it from there); the
    ``Y_*.csv`` files hold only the ``education-num`` column.

    Args:
        data: Frame with named columns.  It is not modified.

    Returns:
        Tuple ``(X_train, X_dev, X_test)`` of the row subsets.
    """
    X = data.copy()
    # Select rather than ``pop``: popping removed the target from the
    # caller's frame, which was then written back to adult.csv without it.
    y = data[[TARGET]].copy()

    X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
    X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.3, random_state=1)

    X_train.to_csv('X_train.csv', index=False)
    X_dev.to_csv('X_dev.csv', index=False)
    X_test.to_csv('X_test.csv', index=False)
    Y_test.to_csv('Y_test.csv', index=False)
    Y_train.to_csv('Y_train.csv', index=False)
    Y_dev.to_csv('Y_dev.csv', index=False)
    return X_train, X_dev, X_test


def _encode_features(frame, columns=None):
    """One-hot encode *frame* without the target column, as float32.

    When *columns* is given, the result is aligned to exactly those columns
    (missing ones filled with 0, extra ones dropped) so that a dev/test set
    matches the layout the model was built for.
    """
    features = pd.get_dummies(frame.drop(columns=[TARGET]))
    if columns is not None:
        features = features.reindex(columns=columns, fill_value=0)
    return features.astype('float32')


def create_model():
    """Train a small dense classifier for ``education-num`` and save it as ``model.h5``.

    Training rows come from ``X_train.csv`` and validation rows from
    ``X_dev.csv``.  The target column is removed from the features before
    encoding so the network does not receive the label as an input.
    """
    train = pd.read_csv('X_train.csv')
    dev = pd.read_csv('X_dev.csv')

    # NOTE(review): the textual "education" column is still a feature and
    # presumably maps one-to-one onto the target - confirm whether it should
    # be excluded as well.
    X_train_encoded = _encode_features(train)
    X_dev_encoded = _encode_features(dev, columns=X_train_encoded.columns)

    # Pin the class count so the label width always matches the output
    # layer, whichever label values happen to occur in a subset.
    y_train_cat = to_categorical(train[TARGET], num_classes=NUM_CLASSES)
    y_dev_cat = to_categorical(dev[TARGET], num_classes=NUM_CLASSES)

    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=X_train_encoded.shape[1]))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train_encoded, y_train_cat, epochs=10, batch_size=32,
              validation_data=(X_dev_encoded, y_dev_cat))
    model.save('model.h5')


def main():
    """Run the pipeline: download, split, report statistics, train."""
    download_file()
    check_if_data_set_has_division_into_subsets(CSV_FILE)
    data = pd.read_csv(CSV_FILE, dtype={"income": "category"})
    get_statistics(data)
    # NOTE(review): normalization() and clean() only change this in-memory
    # frame; create_model() reads the split CSVs written earlier, so neither
    # step currently affects training.  Confirm the intended order.
    normalization(data)
    clean(data)
    create_model()


if __name__ == '__main__':
    main()