from sklearn import preprocessing
import numpy as np
import pandas as pd
import kaggle
import pandas
import os

# Local directory into which the Kaggle dataset is downloaded and unzipped.
path_to_data = './data'
# Directory where the normalized train/dev/test splits are written.
output_dir = './barcelona_weekends_datasets'


def download_data():
    """Authenticate against the Kaggle API and download + unzip the
    'Airbnb prices in European cities' dataset into ``path_to_data``.

    Requires valid Kaggle API credentials (see kaggle.api.authenticate).
    """
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files(
        'thedevastator/airbnb-prices-in-european-cities',
        path=path_to_data,
        unzip=True,
    )


def read_data_from_file(filename):
    """Read a single CSV file into a DataFrame."""
    return pandas.read_csv(filename)


def split_data(data):
    """Shuffle ``data`` (fixed seed for reproducibility) and split it
    60/20/20 into [train, dev, test] DataFrames."""
    shuffled = data.sample(frac=1, random_state=42)
    return np.split(shuffled, [int(.6 * len(data)), int(.8 * len(data))])


def calculate_stats(data, col_name):
    """Return [size, min, max, std, median] for column ``col_name``.

    Note: np.std uses ddof=0 (population std), kept as in the original.
    """
    col_values = data[col_name]
    return [
        len(data),
        np.min(col_values),
        np.max(col_values),
        np.std(col_values),
        np.median(col_values),
    ]


def calculate_value_counts(data, col_name):
    """Return the frequency of each distinct value in ``col_name``."""
    return data[col_name].value_counts()


def normalize_data(data):
    """Normalize one split: min-max scale the numeric columns,
    ordinal-encode the categorical ones, and move the target column
    ('guest_satisfaction_overall') to the last position.

    Returns a new DataFrame; the caller's frame is left untouched.
    """
    # Drop the first (row-index) column. .copy() makes the assignments
    # below write into a fresh frame instead of a view of the caller's
    # data (avoids SettingWithCopyWarning and accidental mutation —
    # the original assigned into an iloc slice of the argument).
    data = data.iloc[:, 1:].copy()
    numeric_columns = ['realSum', 'person_capacity', 'multi', 'biz',
                       'cleanliness_rating', 'guest_satisfaction_overall',
                       'bedrooms', 'dist', 'metro_dist', 'attr_index',
                       'attr_index_norm', 'rest_index', 'rest_index_norm',
                       'lng', 'lat']
    non_numeric_columns = ['room_type', 'room_shared', 'room_private',
                           'host_is_superhost', ]

    # Ordinal-encode the categorical columns, min-max scale the numeric ones.
    enc = preprocessing.OrdinalEncoder()
    data[non_numeric_columns] = enc.fit_transform(data[non_numeric_columns])
    scaler = preprocessing.MinMaxScaler()
    data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

    # Reorder so the prediction target is the last column.
    y_col = 'guest_satisfaction_overall'
    new_order = list(data.columns.drop(y_col)) + [y_col]
    return data.reindex(columns=new_order)


if __name__ == '__main__':
    # 1.1 -- download the dataset only when it is not already on disk.
    # (Was a hard-coded 'data'; kept in sync with path_to_data.)
    if not os.path.isdir(path_to_data):
        download_data()
    whole_set = read_data_from_file(path_to_data + '/barcelona_weekends.csv')
    print(whole_set.head())

    # 1.2 -- 60/20/20 train/dev/test split
    train_set, dev_set, test_set = split_data(whole_set)

    # 1.3 -- descriptive statistics of the price column per split
    whole_set_stats = calculate_stats(whole_set, 'realSum')
    train_set_stats = calculate_stats(train_set, 'realSum')
    dev_set_stats = calculate_stats(dev_set, 'realSum')
    test_set_stats = calculate_stats(test_set, 'realSum')
    columns = ['size', 'minimum', 'maximum', 'standard deviation', 'median']
    rows = ['whole set', 'train', 'dev', 'test']
    print(pd.DataFrame(
        data=np.array([whole_set_stats, train_set_stats,
                       dev_set_stats, test_set_stats]),
        index=rows,
        columns=columns), end='\n\n')

    print('Whole set', calculate_value_counts(whole_set, 'person_capacity'), end='\n\n')
    print('Train', calculate_value_counts(train_set, 'person_capacity'), end='\n\n')
    print('Dev', calculate_value_counts(dev_set, 'person_capacity'), end='\n\n')
    print('Test', calculate_value_counts(test_set, 'person_capacity'), end='\n\n')

    # 1.4 & 1.5 -- normalization
    # NOTE(review): each split is normalized with its own fitted
    # scaler/encoder; to avoid train/test leakage the transformers would
    # normally be fit on the train split only. Kept as-is to preserve the
    # original behavior of the exercise -- confirm with the assignment spec.
    whole_set_normalized = normalize_data(whole_set)
    train_set_normalized = normalize_data(train_set)
    dev_set_normalized = normalize_data(dev_set)
    test_set_normalized = normalize_data(test_set)
    print(whole_set_normalized, '\n\n')

    # Save all sets into files. Create the output directory first:
    # DataFrame.to_csv raises OSError if the target directory is missing,
    # and nothing earlier in the script creates it.
    os.makedirs(output_dir, exist_ok=True)
    train_set_normalized.to_csv(output_dir + '/train_set.csv', index=False)
    dev_set_normalized.to_csv(output_dir + '/dev_set.csv', index=False)
    test_set_normalized.to_csv(output_dir + '/test_set.csv', index=False)