Data description
This commit is contained in:
parent
a72841a7e3
commit
4c6f16e215
3
README.md
Normal file
3
README.md
Normal file
@ -0,0 +1,3 @@
|
||||
|
||||
![frequency of departures at given hour](./pics/departure_time_frequency.png)
|
||||
![popularity of trip destinations](./pics/stop_headsign_popularity.png)
|
BIN
pics/departure_time_frequency.png
Normal file
BIN
pics/departure_time_frequency.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 15 KiB |
BIN
pics/stop_headsign_popularity.png
Normal file
BIN
pics/stop_headsign_popularity.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 175 KiB |
@ -29,4 +29,8 @@ for k in "${keep[@]}"; do
|
||||
fi
|
||||
done
|
||||
|
||||
./normalize <stop_times.tsv >stop_times.normalized.tsv
|
||||
if [ ! -f "stop_times.normalized.tsv" ]; then
|
||||
./normalize <stop_times.tsv >stop_times.normalized.tsv
|
||||
./split_train_valid_test.py
|
||||
fi
|
||||
./stats.py
|
||||
|
9
split_train_valid_test.py
Normal file → Executable file
9
split_train_valid_test.py
Normal file → Executable file
@ -1,12 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
TEST_SIZE = 25
|
||||
VALID_SIZE = 25
|
||||
|
||||
data = pd.read_csv('./stop_times.normalized.tsv', sep='\t')
|
||||
train, test = train_test_split(data, test_size=TEST_SIZE+VALID_SIZE)
|
||||
valid, test = train_test_split(test, test_size=TEST_SIZE)
|
||||
|
||||
train, test = train_test_split(data, test_size=0.5)
|
||||
valid, test = train_test_split(test, test_size=0.5)
|
||||
|
||||
# Persist every split. stats.py reads stop_times.{train,valid,test}.tsv, so the
# validation subset must be written alongside the other two; previously it was
# computed and then discarded.
train.to_csv('stop_times.train.tsv', sep='\t')
valid.to_csv('stop_times.valid.tsv', sep='\t')
test.to_csv('stop_times.test.tsv', sep='\t')
|
||||
|
52
stats.py
Executable file
52
stats.py
Executable file
@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python3
|
||||
import math
|
||||
import os
|
||||
import pandas as pd
|
||||
import contextlib
|
||||
|
||||
# Render every float that pandas prints with exactly five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)
|
||||
|
||||
def float2time(d: float):
    """Format a fraction of a day as a zero-padded ``HH:MM`` string.

    ``d`` is multiplied by 24 to obtain hours; the whole hours and the whole
    minutes of the remainder are both taken with ``math.floor``, so any
    leftover seconds are discarded rather than rounded.
    """
    day_hours = d * 24
    hours = math.floor(day_hours)
    # Same arithmetic order as (d * 24 * 60 - hours * 60) to keep float results identical.
    minutes = math.floor(day_hours * 60 - hours * 60)
    return f"{hours:02d}:{minutes:02d}"
|
||||
|
||||
# Load the normalized stop times with explicit column types so identifiers
# stay strings and departure times stay floats.
data = pd.read_csv(
    './stop_times.normalized.tsv',
    sep='\t',
    dtype={'departure_time': float, 'stop_id': str, 'stop_headsign': str},
)
|
||||
|
||||
print("--- Pictures -------------------------------------------------")

# Make sure the output directory exists. ``exist_ok=True`` tolerates reruns,
# while genuine failures (missing permissions, a regular file named "pics")
# are raised here instead of being swallowed and resurfacing later when the
# figures are saved into this directory.
os.makedirs("pics", exist_ok=True)
|
||||
|
||||
# Histogram of departure times; the column is multiplied by 24 before plotting
# (presumably converting day fractions to hours, as float2time does).
departure_hours = data["departure_time"] * 24
hist_axes = departure_hours.plot(kind='hist', title="Częstotliwość czasu odjazdu")
hist_axes.get_figure().savefig('pics/departure_time_frequency.png')
print("pics/departure_time_frequency.png")

# Pie chart of how often each stop headsign occurs.
# NOTE(review): pandas may draw a Series plot on the current axes when a figure
# is already open, so this pie could end up on the histogram's figure — verify
# the saved image and open a fresh figure between the plots if needed.
headsign_counts = data["stop_headsign"].value_counts()
pie_axes = headsign_counts.plot(kind='pie', title="Popularność celu")
pie_axes.get_figure().savefig('pics/stop_headsign_popularity.png')
print("pics/stop_headsign_popularity.png")
|
||||
|
||||
|
||||
# Per-headsign summaries of the numeric columns. Each entry pairs the section
# header with the name of the GroupBy aggregation to run; the three sections
# were previously copy-pasted stanzas (and the first header was misspelled).
for header, aggregate in (
    ("--- Minimum departure time per stop headsign -----------------", "min"),
    ("--- Maximum departure time per stop headsign -----------------", "max"),
    ("--- Mean departure time per stop headsign --------------------", "mean"),
):
    print(header)
    shgroup = getattr(data.groupby('stop_headsign'), aggregate)(numeric_only=True)
    # Show the aggregated departure time as HH:MM instead of a raw float.
    shgroup["departure_time"] = shgroup["departure_time"].map(float2time)
    print(shgroup)
    print()
|
||||
|
||||
|
||||
print("--- Normalized data statistics -------------------------------")
print(data.describe(include='all'))

# Repeat the summary for each split file written by split_train_valid_test.py.
for subset in ['train', 'valid', 'test']:
    # Pad with dashes to the same 62-column width as the fixed headers above;
    # a hard-coded dash run gave a different width for every subset name.
    print(f"--- {subset.title()} data statistics ".ljust(62, '-'))
    data = pd.read_csv(
        f'./stop_times.{subset}.tsv',
        sep='\t',
        dtype={'departure_time': float, 'stop_id': str, 'stop_headsign': str},
    )
    print(data.describe(include='all'))
|
Loading…
Reference in New Issue
Block a user