ium_z444510/dataset-stats-new.py

42 lines
1.2 KiB
Python
Raw Permalink Normal View History

2023-05-12 00:08:16 +02:00
import pandas as pd
import numpy as np
def calculate_stats(data: pd.DataFrame, col_name: str) -> list:
    """Summarise one column of a dataset split.

    Args:
        data: Table holding the split; its row count is reported as the size.
        col_name: Name of the column to summarise.

    Returns:
        A five-element list in the order
        ``[row count, minimum, maximum, standard deviation, median]``.

    Raises:
        KeyError: If ``col_name`` is not a column of ``data``.
    """
    col_values = data[col_name]
    return [
        len(data),
        np.min(col_values),
        np.max(col_values),
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        np.std(col_values),
        np.median(col_values),
    ]
def calculate_value_counts(data: pd.DataFrame, col_name: str) -> pd.Series:
    """Count how many rows carry each distinct value of a column.

    Args:
        data: Table to inspect.
        col_name: Name of the column whose values are counted.

    Returns:
        The result of ``Series.value_counts()`` with pandas' default
        settings: a Series indexed by the distinct values, holding the
        number of occurrences of each.

    Raises:
        KeyError: If ``col_name`` is not a column of ``data``.
    """
    return data[col_name].value_counts()
def main():
    """Print and save summary statistics for the train/dev/test splits.

    Reads the three ``barcelona_weekends.*.csv`` files from the current
    working directory, prints a table of ``realSum`` statistics (one row per
    split), prints the ``person_capacity`` value counts of each split, and
    writes the statistics table to ``stats.csv``.
    """
    split_files = (
        ('train', 'barcelona_weekends.train.csv'),
        ('dev', 'barcelona_weekends.dev.csv'),
        ('test', 'barcelona_weekends.test.csv'),
    )
    # Load every split before producing any output, so an unreadable file
    # aborts the run before anything is printed or written.
    splits = [(name, pd.read_csv(path)) for name, path in split_files]

    columns = ['size', 'minimum', 'maximum', 'standard deviation', 'median']
    # NOTE(review): np.array stores the whole table with one common dtype, so
    # the integer row count ends up as a float next to the float statistics.
    df = pd.DataFrame(
        data=np.array([calculate_stats(frame, 'realSum') for _, frame in splits]),
        index=[name for name, _ in splits],
        columns=columns)
    print(df)

    for name, frame in splits:
        # Printed labels are the capitalised split names: Train, Dev, Test.
        print(name.capitalize(),
              calculate_value_counts(frame, 'person_capacity'),
              end='\n\n')

    # NOTE(review): index=False leaves the train/dev/test labels out of the
    # file; the rows are written in that order. Confirm with consumers of
    # stats.csv before adding the labels as a column.
    df.to_csv('stats.csv', index=False)


if __name__ == '__main__':
    main()