42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
|
|
|
|
def calculate_stats(data, col_name):
    """Return summary statistics for one column of *data*.

    Args:
        data: Container indexable by ``col_name`` and supporting ``len()``
            (the script passes a pandas DataFrame).
        col_name: Key of the column to summarise.

    Returns:
        A list ``[size, minimum, maximum, standard deviation, median]``
        where ``size`` is ``len(data)`` and the remaining entries are
        computed with NumPy over ``data[col_name]``.
    """
    col_values = data[col_name]
    return [
        len(data),
        np.min(col_values),
        np.max(col_values),
        # np.std is called with its defaults, i.e. ddof=0 (population form).
        np.std(col_values),
        np.median(col_values),
    ]


def calculate_value_counts(data, col_name):
    """Return the result of ``value_counts()`` on ``data[col_name]``.

    Args:
        data: Container whose ``col_name`` entry exposes ``value_counts()``
            (the script passes a pandas DataFrame).
        col_name: Key of the column whose values are counted.

    Returns:
        Whatever ``data[col_name].value_counts()`` returns; for a pandas
        column this is a Series mapping each distinct value to its count.
    """
    return data[col_name].value_counts()


if __name__ == '__main__':
    # Load the three dataset splits. Dict insertion order (train, dev, test)
    # determines both the row order of the table and the print order below.
    splits = {
        'train': pd.read_csv('barcelona_weekends.train.csv'),
        'dev': pd.read_csv('barcelona_weekends.dev.csv'),
        'test': pd.read_csv('barcelona_weekends.test.csv'),
    }

    # One row of summary statistics per split, computed on 'realSum'.
    stats_rows = [
        calculate_stats(frame, 'realSum') for frame in splits.values()
    ]

    columns = ['size', 'minimum', 'maximum', 'standard deviation', 'median']
    rows = list(splits)
    # NOTE(review): np.array gives every cell a single common dtype, so the
    # integer 'size' is stored as a float whenever the other stats are floats.
    df = pd.DataFrame(
        data=np.array(stats_rows),
        index=rows,
        columns=columns,
    )
    print(df)

    # Distribution of 'person_capacity' for each split, labelled
    # 'Train' / 'Dev' / 'Test' and separated by a blank line.
    for split_name, frame in splits.items():
        print(
            split_name.capitalize(),
            calculate_value_counts(frame, 'person_capacity'),
            end='\n\n',
        )

    # NOTE(review): index=False omits the train/dev/test row labels from the
    # written file; kept as-is so the existing stats.csv layout is unchanged.
    df.to_csv('stats.csv', index=False)