add homework [2.Dane]
This commit is contained in:
parent
4aed81c73e
commit
8eb731b53f
70
main.py
70
main.py
@ -1,7 +1,6 @@
|
|||||||
import jupyter
|
from sklearn import preprocessing
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
import kaggle
|
import kaggle
|
||||||
import pandas
|
import pandas
|
||||||
import os
|
import os
|
||||||
@ -18,11 +17,74 @@ def read_data_from_file(filename):
|
|||||||
return pandas.read_csv(filename)
|
return pandas.read_csv(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def split_data(data):
    """Shuffle ``data`` and split it into train, dev and test parts.

    The rows are shuffled deterministically (``random_state=42``) and then
    cut at 60% and 80% of the row count, giving a 60/20/20 split.

    Args:
        data: pandas DataFrame to split.

    Returns:
        A list ``[train, dev, test]`` of DataFrames that together contain
        every row of ``data`` exactly once.
    """
    shuffled = data.sample(frac=1, random_state=42)
    train_end = int(.6 * len(data))
    dev_end = int(.8 * len(data))
    # Positional slicing instead of np.split: np.split on a DataFrame relies
    # on DataFrame.swapaxes, which is deprecated in recent pandas releases.
    return [
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:dev_end],
        shuffled.iloc[dev_end:],
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_stats(data, col_name):
    """Return basic descriptive statistics for one column of ``data``.

    Args:
        data: pandas DataFrame holding the column.
        col_name: label of the column to summarise.

    Returns:
        A list ``[size, minimum, maximum, standard deviation, median]``.
        ``size`` is the number of rows in ``data``; the standard deviation
        is the population one (``ddof=0``). Missing values are skipped by
        all four statistics.
    """
    col_values = data[col_name]
    # Use the Series methods throughout so NaN handling is consistent:
    # np.median() on a Series converts it to an ndarray and yields NaN as
    # soon as one value is missing, while min/max/std skip missing values.
    return [
        len(data),
        col_values.min(),
        col_values.max(),
        col_values.std(ddof=0),
        col_values.median(),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_value_counts(data, col_name):
    """Count how often each distinct value occurs in a column.

    Args:
        data: pandas DataFrame holding the column.
        col_name: label of the column to count.

    Returns:
        The result of ``value_counts()`` on the selected column.
    """
    column = data[col_name]
    return column.value_counts()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_data(data):
    """Return a normalized copy of ``data`` without its first column.

    The listed numeric columns are rescaled with ``MinMaxScaler`` and the
    listed non-numeric columns are replaced by ``OrdinalEncoder`` codes.
    Both transformers are fitted on ``data`` itself.

    Args:
        data: pandas DataFrame containing every column named below.
            The first column is dropped (presumably a leftover row index
            from the CSV -- verify against the input file).

    Returns:
        A new DataFrame; the caller's frame is left unmodified.
    """
    # Copy the slice: assigning columns into the bare result of iloc[]
    # triggers SettingWithCopyWarning and leaves it ambiguous whether the
    # caller's frame is written to.
    data = data.iloc[:, 1:].copy()

    numeric_columns = [
        'realSum', 'person_capacity', 'multi', 'biz', 'cleanliness_rating',
        'guest_satisfaction_overall', 'bedrooms', 'dist', 'metro_dist',
        'attr_index', 'attr_index_norm', 'rest_index', 'rest_index_norm',
        'lng', 'lat',
    ]
    non_numeric_columns = [
        'room_type', 'room_shared', 'room_private', 'host_is_superhost',
    ]

    # Encode the categorical/boolean columns as ordinal codes.
    enc = preprocessing.OrdinalEncoder()
    non_numeric_data_norm = enc.fit_transform(data[non_numeric_columns])

    # Rescale the numeric columns with min-max scaling.
    scaler = preprocessing.MinMaxScaler()
    numeric_data_norm = scaler.fit_transform(data[numeric_columns])

    data[numeric_columns] = numeric_data_norm
    data[non_numeric_columns] = non_numeric_data_norm

    return data
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # 1.1 Download the data if the 'data' directory is missing, then load
    # the CSV once under a single name (the stale `df` read/print is gone).
    if not os.path.isdir('data'):
        download_data()
    whole_set = read_data_from_file(
        os.path.join(path_to_data, 'barcelona_weekends.csv'))
    print(whole_set.head())

    # 1.2 Split into train / dev / test subsets.
    train_set, dev_set, test_set = split_data(whole_set)

    # 1.3 Statistics of 'realSum' for the whole set and for each subset.
    whole_set_stats = calculate_stats(whole_set, 'realSum')
    train_set_stats = calculate_stats(train_set, 'realSum')
    dev_set_stats = calculate_stats(dev_set, 'realSum')
    test_set_stats = calculate_stats(test_set, 'realSum')

    columns = ['size', 'minimum', 'maximum', 'standard deviation', 'median']
    rows = ['whole set', 'train', 'dev', 'test']
    print(pd.DataFrame(
        data=np.array([whole_set_stats, train_set_stats, dev_set_stats, test_set_stats]),
        index=rows,
        columns=columns),
        end='\n\n')

    # Value frequencies of 'person_capacity' in each subset.
    print('Whole set', calculate_value_counts(whole_set, 'person_capacity'), end='\n\n')
    print('Train', calculate_value_counts(train_set, 'person_capacity'), end='\n\n')
    print('Dev', calculate_value_counts(dev_set, 'person_capacity'), end='\n\n')
    print('Test', calculate_value_counts(test_set, 'person_capacity'), end='\n\n')

    # 1.4 & 1.5 Normalize the whole data set and show the result.
    normalized_data = normalize_data(whole_set)
    print(normalized_data, '\n\n')
|
||||||
|
Loading…
Reference in New Issue
Block a user