2023-04-19 17:21:39 +02:00
|
|
|
import os
|
|
|
|
import urllib.request
|
2023-05-10 22:50:54 +02:00
|
|
|
from os.path import exists
|
|
|
|
|
2023-04-19 17:21:39 +02:00
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
|
|
|
|
|
|
|
|
def download_file():
    """Download the UCI Adult data set and convert it to ``adult.csv``.

    Nothing happens when ``adult.csv`` already exists in the current working
    directory. Otherwise the raw file is fetched to ``adult.data`` and
    ``convert_data_to_csv()`` is called to produce the CSV.
    """
    # Check the same relative path the conversion step writes to. The check
    # used to look at '/adult.csv' in the filesystem root, which does not
    # match the output location unless the working directory is '/'.
    if exists('adult.csv'):
        return

    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    filename = "adult.data"
    urllib.request.urlretrieve(url, filename)
    convert_data_to_csv()
|
2023-04-19 17:21:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
def convert_data_to_csv():
    """Rewrite the raw ``adult.data`` file as ``adult.csv`` and delete the raw file.

    The raw file is read without a header row, so the CSV that is written
    carries pandas' default integer column labels as its header line. The
    row index is not written.
    """
    source_path = "adult.data"
    target_path = "adult.csv"

    pd.read_csv(source_path, header=None).to_csv(target_path, index=False)

    # The raw download is no longer needed once the CSV has been written.
    os.remove(source_path)
|
2023-04-19 17:21:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
def add_subsets_to_csv_file(data):
    """Name the columns of *data*, split it into subsets and report their sizes.

    The column labels of *data* are overwritten in place with the fifteen
    attribute names listed below. ``train_dev_test`` is then called on the
    frame, and the shape of the full frame and of each returned subset is
    printed.

    Returns:
        The same *data* object that was passed in, with its columns renamed.
    """
    column_names = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "income",
    ]
    data.columns = column_names

    train_part, dev_part, test_part = train_dev_test(data)

    shape_report = (
        ("Data set: ", data),
        ("Train Data set: ", train_part),
        ("Dev Data set: ", dev_part),
        ("Test Data set: ", test_part),
    )
    for label, frame in shape_report:
        print(label, frame.shape)

    return data
|
2023-04-19 17:21:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
def check_if_data_set_has_division_into_subsets(file_name):
    """Run the subset step on the CSV at *file_name* unless it is already marked.

    The file is loaded with pandas. If it has columns named "train", "dev"
    and "test", nothing else happens. Otherwise the frame is passed through
    ``add_subsets_to_csv_file`` and the returned frame is written back to
    *file_name* without the row index.
    """
    data = pd.read_csv(file_name)

    already_divided = all(
        marker in data.columns for marker in ("train", "dev", "test")
    )
    if already_divided:
        return

    add_subsets_to_csv_file(data).to_csv(file_name, index=False)
|
2023-04-19 17:21:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
def get_statistics(data):
    """Print size, age and class-frequency statistics for the data set.

    The train, dev and test subsets are loaded from ``adult_train.csv``,
    ``adult_dev.csv`` and ``adult_test.csv`` (with "income" read as a
    categorical column). The function then prints, with Polish labels:

    * the number of rows in *data* and in each subset,
    * mean, minimum, maximum, standard deviation and median of ``data["age"]``,
    * the "income" value counts for *data*, then the train, test and dev
      subsets, in that order.

    Args:
        data: Frame holding at least the "age" and "income" columns.
    """
    train_data = pd.read_csv("adult_train.csv", dtype={"income": "category"})
    dev_data = pd.read_csv("adult_dev.csv", dtype={"income": "category"})
    test_data = pd.read_csv("adult_test.csv", dtype={"income": "category"})

    # Row counts of the full set and of every subset.
    size_report = (
        ("Wielkość zbioru: ", data),
        ("Wielkość zbioru treningowego: ", train_data),
        ("Wielkość zbioru walidacyjnego: ", dev_data),
        ("Wielkość zbioru testowego: ", test_data),
    )
    for label, frame in size_report:
        print(label, len(frame))

    # Descriptive statistics of the age column of the full set.
    age = data["age"]
    age_report = (
        ("Średnia wartość wieku: ", np.mean),
        ("Minimalna wartość wieku: ", np.min),
        ("Maksymalna wartość wieku: ", np.max),
        ("Odchylenie standardowe wartości wieku: ", np.std),
        ("Mediana wartości wieku: ", np.median),
    )
    for label, statistic in age_report:
        print(label, statistic(age))

    # Class label frequencies; the test subset is reported before the dev one.
    print("Rozkład częstości klas: ")
    frequency_report = (
        ('Rozkład częstości etykiet klas na całym zbiorze danych:', data),
        ('Rozkład częstości etykiet klas na zbiorze treningowym:', train_data),
        ('Rozkład częstości etykiet klas na zbiorze testowym:', test_data),
        ('Rozkład częstości etykiet klas na zbiorze walidacyjnym:', dev_data),
    )
    for heading, frame in frequency_report:
        counts = frame['income'].value_counts()
        print(heading)
        print(counts)
|
2023-04-19 17:21:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
def normalization(data):
    """Scale the numeric columns of *data* in place and print a preview.

    A fresh ``StandardScaler`` is fitted on the five numeric feature columns
    and its transformed output is written back over those columns. The first
    rows of the updated frame are then printed via ``data.head()``. Nothing
    is returned.

    Args:
        data: Frame containing the columns listed in ``numeric_features``.
    """
    numeric_features = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

    # Fit and transform in one step, overwriting the original columns.
    data[numeric_features] = StandardScaler().fit_transform(data[numeric_features])

    print(data.head())
|
2023-04-19 17:21:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
def clean(data):
    """Remove incomplete and duplicate rows from *data* and coerce numeric columns.

    All steps modify *data* in place; nothing is returned:

    1. '?' placeholders are replaced with NaN.
    2. Rows containing any NaN are dropped.
    3. Duplicate rows are dropped.
    4. The five numeric feature columns are converted with ``pd.to_numeric``.

    Args:
        data: Frame containing the columns listed in ``numeric_columns``.
    """
    numeric_columns = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

    # The CSV is read without skipinitialspace, so a placeholder that follows
    # a ", " separator arrives as ' ?'. Match both spellings; otherwise such
    # rows are never recognised as missing and survive dropna().
    data.replace(['?', ' ?'], np.nan, inplace=True)
    data.dropna(inplace=True)
    data.drop_duplicates(inplace=True)

    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)
|
|
|
|
|
|
|
|
|
|
|
|
def train_dev_test(data):
    """Split *data* into train, dev and test subsets, save each one and return them.

    The first split holds out 30% of the rows. The second split takes 33% of
    that held-out part as the dev subset and keeps the remainder as the test
    subset. Both splits use ``random_state=42``. The subsets are written to
    ``adult_train.csv``, ``adult_dev.csv`` and ``adult_test.csv`` without the
    row index.

    Returns:
        A ``(train, dev, test)`` tuple of the three subsets.
    """
    train_part, held_out = train_test_split(data, test_size=0.3, random_state=42)
    test_part, dev_part = train_test_split(held_out, test_size=0.33, random_state=42)

    outputs = (
        ("adult_train.csv", train_part),
        ("adult_dev.csv", dev_part),
        ("adult_test.csv", test_part),
    )
    for path, subset in outputs:
        subset.to_csv(path, index=False)

    return train_part, dev_part, test_part
|
2023-05-10 22:50:54 +02:00
|
|
|
|
|
|
|
|
2023-04-19 17:21:39 +02:00
|
|
|
if __name__ == '__main__':
    # Make sure adult.csv is available, downloading and converting if needed.
    download_file()

    # Single source of truth for the CSV path used by the steps below.
    csv_file_name = 'adult.csv'
    check_if_data_set_has_division_into_subsets(csv_file_name)

    data = pd.read_csv(csv_file_name, dtype={"income": "category"})
    get_statistics(data)

    # Clean before scaling, so the scaler is fitted only on the rows that are
    # kept and on columns that have already been coerced to numeric.
    clean(data)
    normalization(data)
|