This commit is contained in:
s444439 2023-04-19 20:08:15 +02:00
parent 5ef8e1bd74
commit 69dbec64e7
2 changed files with 7 additions and 7 deletions

View File

@@ -3,7 +3,7 @@ import os
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
CUTOFF = int(os.environ['CUTOFF']) CUTOFF = int(os.environ['CUTOFF'])
adults = pandas.read_csv('adult.csv', engine='python', encoding='ISO-8859-1', sep=',') adults = pandas.read_csv('adult.csv')
adults = adults.dropna() adults = adults.dropna()
adults = adults.sample(CUTOFF) adults = adults.sample(CUTOFF)

View File

@@ -19,7 +19,7 @@ def convert_data_to_csv():
csv_file = "adult.csv" csv_file = "adult.csv"
df = pd.read_csv(data_file, header=None) df = pd.read_csv(data_file, header=None)
df.to_csv(csv_file, index=False) df.to_csv(csv_file, index=False)
delete_data_file() # delete_data_file()
return csv_file return csv_file
@@ -113,8 +113,8 @@ def clean(data):
if __name__ == '__main__': if __name__ == '__main__':
csv_file_name = download_file() csv_file_name = download_file()
check_if_data_set_has_division_into_subsets(csv_file_name) # check_if_data_set_has_division_into_subsets(csv_file_name)
data = pd.read_csv(csv_file_name, dtype={"income": "category"}) # data = pd.read_csv(csv_file_name, dtype={"income": "category"})
get_statistics(data) # get_statistics(data)
normalization(data) # normalization(data)
clean(data) # clean(data)