diff --git a/README.md b/README.md new file mode 100644 index 0000000..8c8cdfe --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ + +![frequency of departures at given hour](./pics/departure_time_frequency.png) +![popularity of trip destinations](./pics/stop_headsign_popularity.png) diff --git a/pics/departure_time_frequency.png b/pics/departure_time_frequency.png new file mode 100644 index 0000000..d8f2bfb Binary files /dev/null and b/pics/departure_time_frequency.png differ diff --git a/pics/stop_headsign_popularity.png b/pics/stop_headsign_popularity.png new file mode 100644 index 0000000..1301ec5 Binary files /dev/null and b/pics/stop_headsign_popularity.png differ diff --git a/prepare-ztm-data.sh b/prepare-ztm-data.sh index 85af93f..c6d6142 100755 --- a/prepare-ztm-data.sh +++ b/prepare-ztm-data.sh @@ -29,4 +29,8 @@ for k in "${keep[@]}"; do fi done -./normalize stop_times.normalized.tsv +if [ ! -f "stop_times.normalized.tsv" ]; then + ./normalize stop_times.normalized.tsv + ./split_train_valid_test.py +fi +./stats.py diff --git a/split_train_valid_test.py b/split_train_valid_test.py old mode 100644 new mode 100755 index 914da57..e4e7fee --- a/split_train_valid_test.py +++ b/split_train_valid_test.py @@ -1,12 +1,11 @@ +#!/usr/bin/env python3 import pandas as pd from sklearn.model_selection import train_test_split -TEST_SIZE = 25 -VALID_SIZE = 25 - data = pd.read_csv('./stop_times.normalized.tsv', sep='\t') -train, test = train_test_split(data, test_size=TEST_SIZE+VALID_SIZE) -valid, test = train_test_split(test, test_size=TEST_SIZE) + +train, test = train_test_split(data, test_size=0.5) +valid, test = train_test_split(test, test_size=0.5) train.to_csv('stop_times.train.tsv', sep='\t') test.to_csv('stop_times.test.tsv', sep='\t') diff --git a/stats.py b/stats.py new file mode 100755 index 0000000..0fbc094 --- /dev/null +++ b/stats.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +import math +import os +import pandas as pd +import contextlib + 
# Column types forced when loading any ./stop_times.<name>.tsv file.
_STOP_TIMES_DTYPES = {'departure_time': float, 'stop_id': str, 'stop_headsign': str}

# Directory the generated charts are written to.
_PICS_DIR = "pics"

# (heading printed before the table, GroupBy aggregation method name).
# Headings are emitted verbatim.
_HEADSIGN_REPORTS = (
    ("--- Minmum departure time per stop headsign ------------------", "min"),
    ("--- Maximum departure time per stop headsign -----------------", "max"),
    ("--- Mean departure time per stop headsign --------------------", "mean"),
)


def float2time(d: float) -> str:
    """Format a fraction of a day as a zero-padded ``HH:MM`` string.

    ``d`` is multiplied by 24 * 60 to get minutes, so 0.5 becomes ``"12:00"``.
    Values of 1.0 or more yield hours of 24 and above (e.g. ``"25:30"``).

    Hours and minutes are both derived from a single floored minute count.
    Computing them from two separate floating-point expressions can make
    them disagree at a boundary and produce a minute field of 60.
    """
    total_minutes = math.floor(d * 24 * 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}"


def _load_stop_times(name: str) -> pd.DataFrame:
    """Read ``./stop_times.<name>.tsv`` as a tab-separated frame."""
    return pd.read_csv(f'./stop_times.{name}.tsv', sep='\t', dtype=_STOP_TIMES_DTYPES)


def _save_plot(axes, path: str) -> None:
    """Save the figure owning ``axes`` to ``path``, clear it, and print the path."""
    figure = axes.get_figure()
    figure.savefig(path)
    # pandas draws a Series plot onto the current matplotlib axes when a
    # figure is already open. Without clearing, the next chart would be
    # painted on top of this one and saved together with it.
    figure.clf()
    print(path)


def _write_pictures(data: pd.DataFrame) -> None:
    """Render the departure-hour histogram and the headsign pie chart."""
    # exist_ok tolerates a pre-existing directory, while genuine failures
    # (e.g. permissions) are raised instead of being silently swallowed.
    os.makedirs(_PICS_DIR, exist_ok=True)

    departure_hours = data["departure_time"] * 24
    _save_plot(
        departure_hours.plot(kind='hist', title="Częstotliwość czasu odjazdu"),
        'pics/departure_time_frequency.png',
    )

    headsign_counts = data["stop_headsign"].value_counts()
    _save_plot(
        headsign_counts.plot(kind='pie', title="Popularność celu"),
        'pics/stop_headsign_popularity.png',
    )


def _report_per_headsign(data: pd.DataFrame) -> None:
    """Print min, max and mean of the numeric columns grouped by headsign."""
    grouped = data.groupby('stop_headsign')
    for heading, method in _HEADSIGN_REPORTS:
        print(heading)
        shgroup = getattr(grouped, method)(numeric_only=True)
        # Show the aggregated departure time as HH:MM instead of a raw float.
        shgroup["departure_time"] = shgroup["departure_time"].map(float2time)
        print(shgroup)
        print()


def main() -> None:
    """Generate the charts and print statistics for every data set."""
    pd.set_option('display.float_format', lambda x: '%.5f' % x)

    data = _load_stop_times('normalized')

    print("--- Pictures -------------------------------------------------")
    _write_pictures(data)

    _report_per_headsign(data)

    print("--- Normalized data statistics -------------------------------")
    print(data.describe(include='all'))

    for subset in ['train', 'valid', 'test']:
        print(f"--- {subset.title()} data statistics -------------------------------")
        print(_load_stop_times(subset).describe(include='all'))


if __name__ == "__main__":
    main()