# ium_z444439/script.py
# 2023-05-10 22:50:54 +02:00
import os
import urllib.request
from os.path import exists
import pandas
from keras.layers import Dense
from keras.models import Sequential
import pandas as pd
import numpy as np
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def download_file():
    """Download the UCI Adult data set once and convert it to ``adult.csv``.

    Side effects: fetches ``adult.data`` over HTTP and (via
    ``convert_data_to_csv``) leaves ``adult.csv`` in the working directory.
    """
    # Bug fix: the guard used to test '/adult.csv' (filesystem root) while the
    # CSV is written to the working directory, so it re-downloaded every run.
    if not exists('adult.csv'):
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
        filename = "adult.data"
        urllib.request.urlretrieve(url, filename)
        convert_data_to_csv()
def convert_data_to_csv():
    """Convert the raw ``adult.data`` download into ``adult.csv``, then delete the raw file.

    The raw file has no header row, so ``read_csv`` assigns integer column
    names (0..14) which end up as the CSV header; real names are attached
    later by ``add_subsets_to_csv_file``.
    """
    data_file = "adult.data"
    csv_file = "adult.csv"
    df = pd.read_csv(data_file, header=None)
    df.to_csv(csv_file, index=False)
    # Fix: reuse data_file instead of redundantly re-binding a second
    # variable to the same literal before removal.
    os.remove(data_file)
def add_subsets_to_csv_file(data):
    """Attach the canonical Adult column names and split *data* into subsets.

    Delegates the train/dev/test split (and CSV persistence) to
    ``train_dev_test``, prints the shape of each part, and returns the
    renamed DataFrame so the caller can write it back to disk.
    """
    data.columns = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "income",
    ]
    train_part, dev_part, test_part = train_dev_test(data)
    for label, frame in (
        ("Data set: ", data),
        ("Train Data set: ", train_part),
        ("Dev Data set: ", dev_part),
        ("Test Data set: ", test_part),
    ):
        print(label, frame.shape)
    return data
def check_if_data_set_has_division_into_subsets(file_name):
    """Ensure the train/dev/test split exists; create it from *file_name* if not.

    Bug fix: the original guard looked for "train"/"dev"/"test" COLUMNS that
    no code path ever adds, so the split was recomputed (and *file_name*
    rewritten) on every run — testing for the split CSVs on disk implements
    the intended run-once behaviour.
    """
    split_files = ("X_train.csv", "X_dev.csv", "X_test.csv")
    if not all(exists(name) for name in split_files):
        data = pd.read_csv(file_name)
        data_set = add_subsets_to_csv_file(data)
        data_set.to_csv(file_name, index=False)
def get_statistics(data):
    """Print size, age statistics and class-frequency distributions (in Polish).

    Reads the persisted split files (X_train/X_dev/X_test.csv) from the
    working directory; *data* is the full data set. Output-only: returns None.
    """
    subsets = {
        name: pd.read_csv("X_%s.csv" % name, dtype={"income": "category"})
        for name in ("train", "dev", "test")
    }
    print("Wielkość zbioru: ", len(data))
    print("Wielkość zbioru treningowego: ", len(subsets["train"]))
    print("Wielkość zbioru walidacyjnego: ", len(subsets["dev"]))
    print("Wielkość zbioru testowego: ", len(subsets["test"]))
    ages = data["age"]
    for label, stat in (
        ("Średnia wartość wieku: ", np.mean),
        ("Minimalna wartość wieku: ", np.min),
        ("Maksymalna wartość wieku: ", np.max),
        ("Odchylenie standardowe wartości wieku: ", np.std),
        ("Mediana wartości wieku: ", np.median),
    ):
        print(label, stat(ages))
    print("Rozkład częstości klas: ")
    for message, frame in (
        ('Rozkład częstości etykiet klas na całym zbiorze danych:', data),
        ('Rozkład częstości etykiet klas na zbiorze treningowym:', subsets["train"]),
        ('Rozkład częstości etykiet klas na zbiorze testowym:', subsets["test"]),
        ('Rozkład częstości etykiet klas na zbiorze walidacyjnym:', subsets["dev"]),
    ):
        print(message)
        print(frame['income'].value_counts())
def normalization(data):
    """Standardise (zero mean, unit variance) the numeric columns of *data* in place.

    Prints the first rows as a quick sanity check; returns None.
    """
    columns = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
    data[columns] = StandardScaler().fit_transform(data[columns])
    print(data.head())
def clean(data):
    """Clean *data* in place: drop missing-value rows and duplicates, coerce numerics.

    Mutates *data* (replace/dropna/drop_duplicates all use inplace=True);
    returns None.
    """
    numeric_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
    # Bug fix: raw Adult values are space-padded, so the missing marker often
    # arrives as " ?" and the old literal replace('?') missed it.  Match the
    # marker with optional surrounding whitespace; a bare '?' still matches.
    data.replace(r'^\s*\?\s*$', np.nan, regex=True, inplace=True)
    data.dropna(inplace=True)
    data.drop_duplicates(inplace=True)
    data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric)
def train_dev_test(data):
    """Split *data* into train/dev/test parts and persist all six pieces as CSVs.

    'education-num' is the label column.  70% train, then the remaining 30%
    is split 70/30 into dev/test (random_state=1 for reproducibility).
    Returns (X_train, X_dev, X_test).
    """
    X = data.copy()
    # Bug fix: the original called data.pop('education-num') AFTER copying,
    # which mutated the caller's DataFrame — the label column vanished from
    # `data` (and from adult.csv once written back), crashing a second run.
    # Read the label from the copy instead; X keeps the column, as before.
    y = pandas.DataFrame(X['education-num'])
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
    X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.3, random_state=1)
    for frame, name in (
        (X_train, 'X_train.csv'),
        (X_dev, 'X_dev.csv'),
        (X_test, 'X_test.csv'),
        (Y_test, 'Y_test.csv'),
        (Y_train, 'Y_train.csv'),
        (Y_dev, 'Y_dev.csv'),
    ):
        frame.to_csv(name, index=False)
    return X_train, X_dev, X_test
def create_model():
    """Train a small dense classifier for 'education-num' and save it to model.h5.

    Reads the persisted training split (X_train.csv) from the working
    directory; side effects only (writes model.h5), returns None.
    """
    data = pd.read_csv('X_train.csv')
    y = data["education-num"]
    # Bug fix: the original one-hot-encoded the FULL frame, so the target
    # column leaked into the features and the model could read its own label.
    X = data.drop(columns=["education-num"])
    X_train_encoded = pd.get_dummies(X)
    y_train_cat = to_categorical(y)
    num_classes = y_train_cat.shape[1]  # was hard-coded to 17
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=X_train_encoded.shape[1]))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    # NOTE(review): validation_data is the training set itself, so val_*
    # metrics do not measure generalisation; kept to preserve behaviour.
    model.fit(X_train_encoded, y_train_cat, epochs=10, batch_size=32,
              validation_data=(X_train_encoded, y_train_cat))
    model.save('model.h5')
def main():
    """Run the full pipeline: download, split, describe, preprocess, train."""
    download_file()
    csv_file_name = 'adult.csv'
    check_if_data_set_has_division_into_subsets(csv_file_name)
    frame = pd.read_csv(csv_file_name, dtype={"income": "category"})
    get_statistics(frame)
    normalization(frame)
    clean(frame)
    create_model()


if __name__ == '__main__':
    main()