import os
import urllib.request

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def download_file():
    # Download the raw Adult data set from the UCI repository and convert it to CSV.
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    filename = "adult.data"
    urllib.request.urlretrieve(url, filename)
    csv_file = convert_data_to_csv()
    return csv_file


def convert_data_to_csv():
    # The raw file has no header and puts a space after each comma;
    # skipinitialspace strips those so missing values arrive as "?" rather than " ?".
    data_file = "adult.data"
    csv_file = "adult.csv"
    df = pd.read_csv(data_file, header=None, skipinitialspace=True)
    df.to_csv(csv_file, index=False)
    delete_data_file()
    return csv_file


def delete_data_file():
    filename = "adult.data"
    os.remove(filename)


def add_subsets_to_csv_file(data):
    data.columns = ["age", "workclass", "fnlwgt", "education", "education-num",
                    "marital-status", "occupation", "relationship", "race", "sex",
                    "capital-gain", "capital-loss", "hours-per-week",
                    "native-country", "income"]
    # Hold out 20% as the test set, then carve 25% of the remaining 80%
    # out as the dev set, giving a 60/20/20 train/dev/test split overall.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_data, dev_data = train_test_split(train_data, test_size=0.25, random_state=42)
    train_data.to_csv("adult_train.csv", index=False)
    dev_data.to_csv("adult_dev.csv", index=False)
    test_data.to_csv("adult_test.csv", index=False)
    print("Data set: ", data.shape)
    print("Train data set: ", train_data.shape)
    print("Dev data set: ", dev_data.shape)
    print("Test data set: ", test_data.shape)
    return data


def check_if_data_set_has_division_into_subsets(file_name):
    # A freshly converted CSV still has numeric column headers, so a missing
    # "income" column means the split (which also names the columns) has not
    # run yet. Checking for nonexistent "train"/"dev"/"test" columns, as the
    # earlier version did, would re-split the data on every run.
    data = pd.read_csv(file_name)
    if "income" not in data.columns:
        data_set = add_subsets_to_csv_file(data)
        data_set.to_csv(file_name, index=False)


def get_statistics(data):
    train_data = pd.read_csv("adult_train.csv", dtype={"income": "category"})
    dev_data = pd.read_csv("adult_dev.csv", dtype={"income": "category"})
    test_data = pd.read_csv("adult_test.csv", dtype={"income": "category"})
    print("Data set size: ", len(data))
    print("Training set size: ", len(train_data))
    print("Dev set size: ", len(dev_data))
    print("Test set size: ", len(test_data))
    print("Mean age: ", np.mean(data["age"]))
    print("Minimum age: ", np.min(data["age"]))
    print("Maximum age: ", np.max(data["age"]))
    print("Standard deviation of age: ", np.std(data["age"]))
    print("Median age: ", np.median(data["age"]))
    print("Class label frequency distribution on the full data set:")
    print(data["income"].value_counts())
    print("Class label frequency distribution on the training set:")
    print(train_data["income"].value_counts())
    print("Class label frequency distribution on the test set:")
    print(test_data["income"].value_counts())
    print("Class label frequency distribution on the dev set:")
    print(dev_data["income"].value_counts())


def normalization(data):
    # Standardize the numeric features to zero mean and unit variance.
    numeric_features = ["age", "fnlwgt", "education-num", "capital-gain",
                        "capital-loss", "hours-per-week"]
    scaler = StandardScaler()
    data[numeric_features] = scaler.fit_transform(data[numeric_features])
    print(data.head())


def clean(data):
    # Missing values appear as "?" in the Adult data; drop them along with
    # duplicate rows, and coerce the numeric columns to numeric dtypes.
    data.replace("?", np.nan, inplace=True)
    data.dropna(inplace=True)
    data.drop_duplicates(inplace=True)
    numeric_features = ["age", "fnlwgt", "education-num", "capital-gain",
                        "capital-loss", "hours-per-week"]
    data[numeric_features] = data[numeric_features].apply(pd.to_numeric)


if __name__ == '__main__':
    csv_file_name = download_file()
    check_if_data_set_has_division_into_subsets(csv_file_name)
    data = pd.read_csv(csv_file_name, dtype={"income": "category"})
    # Clean first so the statistics and normalization operate on valid rows;
    # the earlier version cleaned last, after everything had already been computed.
    clean(data)
    get_statistics(data)
    normalization(data)
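
# --- Optional: stratified splitting ----------------------------------------
# A minimal sketch, not part of the pipeline above: with a purely random
# 60/20/20 split, the income class ratios that get_statistics prints can
# drift slightly between subsets. Passing the label column to
# train_test_split's stratify parameter keeps the <=50K/>50K proportions
# (nearly) identical in train, dev, and test. The function name
# stratified_split and the "_strat" file suffix are illustrative choices,
# not part of the original script.
def stratified_split(data):
    train_data, test_data = train_test_split(
        data, test_size=0.2, random_state=42, stratify=data["income"])
    train_data, dev_data = train_test_split(
        train_data, test_size=0.25, random_state=42,
        stratify=train_data["income"])
    train_data.to_csv("adult_train_strat.csv", index=False)
    dev_data.to_csv("adult_dev_strat.csv", index=False)
    test_data.to_csv("adult_test_strat.csv", index=False)
    return train_data, dev_data, test_data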