From a72841a7e350d4370fd62896b062ee05b51efd90 Mon Sep 17 00:00:00 2001
From: Robert Bendun
Date: Tue, 21 Mar 2023 01:00:51 +0100
Subject: [PATCH] Data normalization

---
 .gitignore                             |   3 +
 Makefile                               |   5 +
 normalize.cc                           | 138 +++++++++++++++++++++
 download-ztm.sh => prepare-ztm-data.sh |  13 ++-
 split_train_valid_test.py              |  13 +++
 ztm-data.txt                           | 126 ++-----------------------
 6 files changed, 176 insertions(+), 122 deletions(-)
 create mode 100644 normalize.cc
 rename download-ztm.sh => prepare-ztm-data.sh (59%)
 create mode 100644 split_train_valid_test.py

diff --git a/.gitignore b/.gitignore
index 0c00f18..48ef6ac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,10 @@ languages.*.tsv
 columns.pruned.tsv
 columns.original.tsv
 *.csv
+*.tsv
 csv2tsv/csv2tsv
 .cache
 compile_commands.json
 tsv2json/tsv2json
+normalize
+data
diff --git a/Makefile b/Makefile
index fdb2bd1..485f2e2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,8 @@
+normalize: normalize.cc
+	g++ -std=c++20 -Wall -Wextra -O3 -o $@ $<
+
 csv2tsv/csv2tsv: csv2tsv/csv2tsv.go
 	cd csv2tsv; go build
 
+clean:
+	rm -f *.csv *.tsv stop*.txt trips.txt
diff --git a/normalize.cc b/normalize.cc
new file mode 100644
index 0000000..4723af1
--- /dev/null
+++ b/normalize.cc
@@ -0,0 +1,138 @@
+#include <concepts>
+#include <cstddef>
+#include <iostream>
+#include <iterator>
+#include <optional>
+#include <string>
+#include <string_view>
+
+// Lazy splitting of text into cells separated by a single character
+namespace split
+{
+	// End-of-cells marker that split::iterator is compared against
+	struct sentinel {};
+
+	// Input iterator over cells of `source`, split on `delim`. It only views the
+	// text, so the viewed string must outlive it. Iteration stops as soon as
+	// nothing is left to split, so an empty cell after a final delimiter is skipped.
+	struct iterator
+	{
+		using difference_type = std::ptrdiff_t;
+		using value_type = std::string_view;
+		using iterator_category = std::input_iterator_tag;
+		using pointer = void;
+		using reference = std::string_view&;
+
+		explicit iterator(std::convertible_to<std::string_view> auto&& source, char delim)
+			: source{source}
+			, delim{delim}
+		{
+			++*this; // Compute first cell
+		}
+
+		inline iterator begin() const
+		{
+			return *this;
+		}
+
+		inline sentinel end() const
+		{
+			return sentinel{};
+		}
+
+		inline bool operator==(sentinel) const
+		{
+			return reached_end;
+		}
+
+		// Moves to the next cell, or marks the end once the source is used up
+		inline iterator& operator++()
+		{
+			if (source.empty()) {
+				reached_end = true;
+				return *this;
+			}
+
+			if (auto tab = source.find(delim); tab != std::string_view::npos) {
+				current = source.substr(0, tab);
+				source.remove_prefix(tab+1);
+			} else {
+				current = source;
+				source = {};
+			}
+			return *this;
+		}
+
+		inline iterator operator++(int)
+		{
+			auto copy = *this;
+			++*this;
+			return copy;
+		}
+
+		inline std::string_view operator*() const
+		{
+			return current;
+		}
+
+		std::string_view current; // Cell the iterator currently points at
+		std::string_view source;  // Text that has not been split yet
+		char delim;
+		bool reached_end = false;
+	};
+}
+
+// Converts a time of day written as hh:mm:ss into the fraction of the day that
+// has passed, a value in range [0, 1). Seconds are checked but not used.
+// Returns std::nullopt when text is malformed or doesn't fit in a single day.
+std::optional<float> time2float(std::string_view time)
+{
+	if (time.size() != 8) return std::nullopt; // Expected time in format hh:mm:ss
+	if (time[2] != ':' || time[5] != ':') return std::nullopt;
+
+	// Value of two decimal digits starting at `at`, or -1 if either isn't a digit.
+	// Without this check arbitrary characters would be turned into bogus numbers.
+	auto const two_digits = [time](std::size_t at) -> int {
+		auto const hi = time[at] - '0';
+		auto const lo = time[at+1] - '0';
+		if (hi < 0 || hi > 9 || lo < 0 || lo > 9) return -1;
+		return hi * 10 + lo;
+	};
+
+	auto const hours = two_digits(0);
+	auto const minutes = two_digits(3);
+	auto const seconds = two_digits(6);
+	if (hours < 0 || minutes < 0 || seconds < 0) return std::nullopt;
+	if (hours >= 24) return std::nullopt; // Single day has only 24 hours
+	if (minutes >= 60 || seconds >= 60) return std::nullopt;
+
+	return (static_cast<float>(hours) * 60 + static_cast<float>(minutes)) / (60 * 24);
+}
+
+// Reads tab-separated rows from standard input and writes to standard output
+// only the cells at positions 2, 3 and 5 (counting from 0), named below
+// departure_time, stop_id and stop_headsign. The first row with enough cells is
+// treated as a header and copied as is; in later rows the time is replaced by the
+// time2float() result. Rows with too few cells or an unusable time are dropped.
+int main()
+{
+	std::cout.sync_with_stdio(false);
+
+	bool passed_header = false;
+	for (std::string line; std::getline(std::cin, line);) {
+		split::iterator row(line, '\t');
+		std::advance(row, 2); if (row == split::sentinel{}) continue; std::string_view const departure_time = *row;
+		std::advance(row, 1); if (row == split::sentinel{}) continue; std::string_view const stop_id = *row;
+		std::advance(row, 2); if (row == split::sentinel{}) continue; std::string_view const stop_headsign = *row;
+
+		if (!passed_header) {
+			std::cout << departure_time << '\t' << stop_id << '\t' << stop_headsign << '\n';
+			passed_header = true;
+			continue;
+		}
+
+		auto const departure = time2float(departure_time); if (!departure) continue;
+
+		std::cout << *departure << '\t' << stop_id << '\t' << stop_headsign << '\n';
+	}
+}
diff --git a/download-ztm.sh b/prepare-ztm-data.sh
similarity index 59%
rename from download-ztm.sh
rename to prepare-ztm-data.sh
index 4158256..85af93f 100755
--- a/download-ztm.sh
+++ b/prepare-ztm-data.sh
@@ -1,6 +1,8 @@
 #!/usr/bin/env bash
 
-set -e -o pipefail
+set -xe -o pipefail
+
+make normalize csv2tsv/csv2tsv
 
keep=(stops.txt trips.txt stop_times.txt) @@ -19,5 +21,12 @@ done cd .. for k in "${keep[@]}"; do - cat $(find data -name "$k") > "$k" + csv="${k%.txt}.csv" + tsv="${k%.txt}.tsv" + if [ ! -f "$tsv" ]; then + cat $(find data -name "$k") > "$csv" + csv2tsv/csv2tsv <"$csv" >"$tsv" + fi done + +./normalize stop_times.normalized.tsv diff --git a/split_train_valid_test.py b/split_train_valid_test.py new file mode 100644 index 0000000..914da57 --- /dev/null +++ b/split_train_valid_test.py @@ -0,0 +1,13 @@ +import pandas as pd +from sklearn.model_selection import train_test_split + +TEST_SIZE = 25 +VALID_SIZE = 25 + +data = pd.read_csv('./stop_times.normalized.tsv', sep='\t') +train, test = train_test_split(data, test_size=TEST_SIZE+VALID_SIZE) +valid, test = train_test_split(test, test_size=TEST_SIZE) + +train.to_csv('stop_times.train.tsv', sep='\t') +test.to_csv('stop_times.test.tsv', sep='\t') +valid.to_csv('stop_times.valid.tsv', sep='\t') diff --git a/ztm-data.txt b/ztm-data.txt index e47dd94..fe0a155 100644 --- a/ztm-data.txt +++ b/ztm-data.txt @@ -1,121 +1,7 @@ -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230317_20230430.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230316_20230430.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230313_20230430.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230309_20230331.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230306_20230331.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230301_20230331.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230227_20230228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230224_20230228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230223_20230228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230222_20230222.zip 
-https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230215_20230225.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230212_20230228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230208_20230228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230204_20230228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230129_20230228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230113_20230131.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230107_20230128.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230106_20230106.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230105_20230105.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230101_20230104.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221226_20221231.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221223_20221225.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221218_20221222.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221210_20221223.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221201_20221210.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221129_20221210.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221126_20221210.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221119_20221125.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221115_20221130.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221114_20221130.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221112_20221113.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221110_20221111.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221106_20221130.zip 
-https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221103_20221130.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221102_20221130.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221031_20221102.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221028_20221030.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221026_20221028.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221022_20221028.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221019_20221028.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221017_20221031.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221012_20221031.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221010_20221031.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221005_20221031.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221004_20221031.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221001_20221031.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220924_20220930.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220923_20220929.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220917_20220930.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220915_20220930.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220911_20220930.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220909_20220930.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220906_20220930.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220904_20220930.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220902_20220903.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220901_20220930.zip 
-https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220826_20220831.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220825_20220831.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220822_20220831.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220819_20220827.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220815_20220821.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220812_20220814.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220805_20220831.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220801_20220831.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220730_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220729_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220722_20220728.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220718_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220714_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220711_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220707_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220706_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220705_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220704_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220701_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220627_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220625_20220731.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220618_20220624.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220615_20220617.zip 
-https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220613_20220624.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220610_20220624.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220609_20220624.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220608_20220624.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220604_20220624.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220601_20220624.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220526_20220531.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220520_20220531.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220519_20220531.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220517_20220531.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220514_20220529.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220512_20220513.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220507_20220531.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220504_20220531.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220502_20220503.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220429_20220501.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220423_20220430.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220421_20220422.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220420_20220422.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220418_20220419.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220414_20220417.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220409_20220430.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220401_20220430.zip 
-https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220327_20220415.zip +https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20210301_20210331.zip https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220321_20220410.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220319_20220410.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220315_20220410.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220314_20220410.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220311_20220410.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220303_20220331.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220301_20220331.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220224_20220331.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220219_20220331.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220217_20220331.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220215_20220331.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220212_20220312.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220204_20220228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220131_20220228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220122_20220228.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220115_20220220.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220108_20220220.zip -https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220105_20220107.zip +https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230223_20230228.zip +https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230224_20230228.zip +https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230227_20230228.zip 
+https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230301_20230331.zip +https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230317_20230430.zip