diff --git a/main.py b/main.py
index aedd2fa..3c4f1de 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,6 @@
-import jupyter
+from sklearn import preprocessing
 import numpy as np
 import pandas as pd
-from sklearn.model_selection import train_test_split
 import kaggle
 import pandas
 import os
@@ -18,11 +17,74 @@ def read_data_from_file(filename):
     return pandas.read_csv(filename)
 
 
+def split_data(data):
+    return np.split(data.sample(frac=1, random_state=42), [int(.6 * len(data)), int(.8 * len(data))])
+
+
+def calculate_stats(data, col_name):
+    col_values = data[col_name]
+    return [
+        len(data),
+        np.min(col_values),
+        np.max(col_values),
+        np.std(col_values),
+        np.median(col_values)
+    ]
+
+
+def calculate_value_counts(data, col_name):
+    return data[col_name].value_counts()
+
+
+def normalize_data(data):
+    data = data.iloc[:, 1:]
+
+    numeric_columns = ['realSum', 'person_capacity', 'multi', 'biz', 'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms', 'dist', 'metro_dist', 'attr_index', 'attr_index_norm', 'rest_index', 'rest_index_norm', 'lng', 'lat']
+    non_numeric_columns = ['room_type', 'room_shared', 'room_private', 'host_is_superhost', ]
+    numeric_data = data[numeric_columns]
+    non_numeric_data = data[non_numeric_columns]
+
+    enc = preprocessing.OrdinalEncoder()
+    non_numeric_data_norm = enc.fit_transform(non_numeric_data)
+
+    scaler = preprocessing.MinMaxScaler()
+    numeric_data_norm = scaler.fit_transform(numeric_data)
+
+    data[numeric_columns] = numeric_data_norm
+    data[non_numeric_columns] = non_numeric_data_norm
+
+    return data
+
+
 if __name__ == '__main__':
+    # 1.1
     if not os.path.isdir('data'):
         download_data()
-    df = read_data_from_file(path_to_data + '/barcelona_weekends.csv')
-    print(df.head())
+    whole_set = read_data_from_file(path_to_data + '/barcelona_weekends.csv')
+    print(whole_set.head())
+    # 1.2
+    train_set, dev_set, test_set = split_data(whole_set)
+    # 1.3
+    whole_set_stats = calculate_stats(whole_set, 'realSum')
+    train_set_stats = calculate_stats(train_set, 'realSum')
+    dev_set_stats = calculate_stats(dev_set, 'realSum')
+    test_set_stats = calculate_stats(test_set, 'realSum')
+    columns = ['size', 'minimum', 'maximum', 'standard deviation', 'median']
+    rows = ['whole set', 'train', 'dev', 'test']
+    print(pd.DataFrame(
+        data=np.array([whole_set_stats, train_set_stats, dev_set_stats, test_set_stats]),
+        index=rows,
+        columns=columns),
+        end='\n\n')
+
+    print('Whole set', calculate_value_counts(whole_set, 'person_capacity'), end='\n\n')
+    print('Train', calculate_value_counts(train_set, 'person_capacity'), end='\n\n')
+    print('Dev', calculate_value_counts(dev_set, 'person_capacity'), end='\n\n')
+    print('Test', calculate_value_counts(test_set, 'person_capacity'), end='\n\n')
+
+    # 1.4 & 1.5
+    normalized_data = normalize_data(whole_set)
+    print(normalized_data, '\n\n')
 
 
 