Data normalization
This commit is contained in:
parent
e27bc920d4
commit
a72841a7e3
3
.gitignore
vendored
3
.gitignore
vendored
@ -2,7 +2,10 @@ languages.*.tsv
|
||||
columns.pruned.tsv
|
||||
columns.original.tsv
|
||||
*.csv
|
||||
*.tsv
|
||||
csv2tsv/csv2tsv
|
||||
.cache
|
||||
compile_commands.json
|
||||
tsv2json/tsv2json
|
||||
normalize
|
||||
data
|
||||
|
5
Makefile
5
Makefile
@ -1,3 +1,8 @@
|
||||
# Build the stop_times normalizer from its single C++20 source file.
normalize: normalize.cc
	g++ -std=c++20 -Wall -Wextra -O3 -o $@ $<

# Build the CSV-to-TSV converter inside its own directory.
# `&&` keeps `go build` from running in the wrong directory when `cd` fails.
csv2tsv/csv2tsv: csv2tsv/csv2tsv.go
	cd csv2tsv && go build

# `clean` never produces a file called "clean", so mark it phony: it then runs
# even if such a file happens to exist.
.PHONY: clean
clean:
	rm -f *.csv *.tsv stop*.txt trips.txt
|
||||
|
108
normalize.cc
Normal file
108
normalize.cc
Normal file
@ -0,0 +1,108 @@
|
||||
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <optional>
#include <string>
#include <string_view>
#include <unordered_set>
|
||||
|
||||
namespace split
{
    /// Tag type that marks the end of a split sequence; iterators compare equal
    /// to it once every cell has been consumed.
    struct sentinel {};

    /// Input iterator that lazily yields the cells of a text separated by a
    /// single delimiter character.
    ///
    /// The iterator only borrows the text through std::string_view, so the
    /// viewed characters must outlive the iterator and every cell obtained
    /// from it. It also acts as its own range: begin() returns a copy of the
    /// iterator and end() returns a sentinel.
    ///
    /// N delimiters produce N + 1 cells, including empty cells between or after
    /// delimiters. An empty source produces no cells at all.
    struct iterator
    {
        using difference_type = std::ptrdiff_t;
        using value_type = std::string_view;
        using iterator_category = std::input_iterator_tag;
        using pointer = void;
        using reference = std::string_view; // operator* returns cells by value

        /// Starts splitting `source` on `delim` and immediately loads the first
        /// cell, so the iterator can be dereferenced right after construction
        /// (unless it already equals the sentinel).
        explicit iterator(std::string_view source, char delim)
            : source{source}
            , delim{delim}
        {
            ++*this; // Compute first cell
        }

        /// Returns a copy of this iterator, positioned on the same cell.
        iterator begin() const
        {
            return *this;
        }

        /// Returns the end marker for this sequence.
        sentinel end() const
        {
            return sentinel{};
        }

        /// True once the iterator has been advanced past the last cell.
        bool operator==(sentinel) const
        {
            return reached_end;
        }

        /// Moves to the next cell. Advancing an iterator that is already at the
        /// end is harmless and leaves it at the end.
        iterator& operator++()
        {
            // Nothing left to scan and no cell owed after a trailing delimiter.
            if (source.empty() && !cell_pending) {
                reached_end = true;
                return *this;
            }

            if (auto const pos = source.find(delim); pos != std::string_view::npos) {
                current = source.substr(0, pos);
                source.remove_prefix(pos + 1);
                // Whatever follows a delimiter is a cell, even when it is empty.
                cell_pending = true;
            } else {
                current = source;
                source = {};
                cell_pending = false;
            }
            return *this;
        }

        /// Post-increment: advances and returns the iterator as it was before.
        iterator operator++(int)
        {
            auto const previous = *this;
            ++*this;
            return previous;
        }

        /// Returns the cell the iterator currently points at.
        std::string_view operator*() const
        {
            return current;
        }

        std::string_view current;   // cell most recently extracted
        std::string_view source;    // text that has not been split yet
        char delim;                 // cell separator
        bool reached_end = false;   // set once advanced past the last cell
        bool cell_pending = false;  // a delimiter was consumed, so one more cell follows
    };
}
|
||||
|
||||
/// Converts a clock time written as `hh:mm:ss` into a fraction of a day.
///
/// Only the hours and minutes contribute to the result; the seconds are
/// validated but do not change the value. Midnight maps to 0 and the result
/// stays below 1.
///
/// @param time  Text expected to be exactly eight characters, `dd:dd:dd`.
/// @return The fraction of the day, or std::nullopt when the text is not in
///         that shape, the hour is 24 or more, or the minutes or seconds are
///         60 or more.
std::optional<float> time2float(std::string_view time)
{
    if (time.size() != 8) return std::nullopt; // Expected time in format hh:mm:ss
    if (time[2] != ':' || time[5] != ':') return std::nullopt;

    // Reads the two-digit number starting at `at`, rejecting non-digit
    // characters instead of turning them into garbage values.
    auto const two_digits = [time](std::string_view::size_type at) -> std::optional<int> {
        auto const is_digit = [](char c) { return c >= '0' && c <= '9'; };
        if (!is_digit(time[at]) || !is_digit(time[at + 1])) return std::nullopt;
        return (time[at] - '0') * 10 + (time[at + 1] - '0');
    };

    auto const hours = two_digits(0);
    auto const minutes = two_digits(3);
    auto const seconds = two_digits(6);
    if (!hours || !minutes || !seconds) return std::nullopt;

    if (*hours >= 24) return std::nullopt; // Single day has only 24 hours
    if (*minutes >= 60 || *seconds >= 60) return std::nullopt;

    return (float(*hours) * 60 + float(*minutes)) / (60 * 24);
}
|
||||
|
||||
int main()
|
||||
{
|
||||
std::cout.sync_with_stdio(false);
|
||||
|
||||
bool passed_header = false;
|
||||
for (std::string line; std::getline(std::cin, line);) {
|
||||
split::iterator row(line, '\t');
|
||||
std::advance(row, 2); if (row == split::sentinel{}) continue; std::string_view const departure_time = *row;
|
||||
std::advance(row, 1); if (row == split::sentinel{}) continue; std::string_view const stop_id = *row;
|
||||
std::advance(row, 2); if (row == split::sentinel{}) continue; std::string_view const stop_headsign = *row;
|
||||
|
||||
if (!passed_header) {
|
||||
std::cout << departure_time << '\t' << stop_id << '\t' << stop_headsign << '\n';
|
||||
passed_header = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto const departure = time2float(departure_time); if (!departure) continue;
|
||||
|
||||
std::cout << *departure << '\t' << stop_id << '\t' << stop_headsign << '\n';
|
||||
}
|
||||
}
|
@ -1,6 +1,8 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e -o pipefail
|
||||
set -xe -o pipefail
|
||||
|
||||
make normalize csv2tsv/csv2tsv
|
||||
|
||||
keep=(stops.txt trips.txt stop_times.txt)
|
||||
|
||||
@ -19,5 +21,12 @@ done
|
||||
cd ..
|
||||
|
||||
for k in "${keep[@]}"; do
|
||||
cat $(find data -name "$k") > "$k"
|
||||
csv="${k%.txt}.csv"
|
||||
tsv="${k%.txt}.tsv"
|
||||
if [ ! -f "$tsv" ]; then
|
||||
cat $(find data -name "$k") > "$csv"
|
||||
csv2tsv/csv2tsv <"$csv" >"$tsv"
|
||||
fi
|
||||
done
|
||||
|
||||
./normalize <stop_times.tsv >stop_times.normalized.tsv
|
13
split_train_valid_test.py
Normal file
13
split_train_valid_test.py
Normal file
@ -0,0 +1,13 @@
|
||||
import pandas as pd
from sklearn.model_selection import train_test_split

# Sizes of the held-out sets. They are passed to train_test_split as integers,
# which scikit-learn interprets as an absolute number of rows.
TEST_SIZE = 25
VALID_SIZE = 25

# Load the normalized stop times as a tab-separated table.
data = pd.read_csv('./stop_times.normalized.tsv', sep='\t')

# Carve off TEST_SIZE + VALID_SIZE rows first, then split that holdout again
# into the validation and test sets.
train, holdout = train_test_split(data, test_size=TEST_SIZE + VALID_SIZE)
valid, test = train_test_split(holdout, test_size=TEST_SIZE)

# Write each subset to its own tab-separated file.
for frame, path in (
    (train, 'stop_times.train.tsv'),
    (test, 'stop_times.test.tsv'),
    (valid, 'stop_times.valid.tsv'),
):
    frame.to_csv(path, sep='\t')
|
126
ztm-data.txt
126
ztm-data.txt
@ -1,121 +1,7 @@
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230317_20230430.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230316_20230430.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230313_20230430.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230309_20230331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230306_20230331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230301_20230331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230227_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230224_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230223_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230222_20230222.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230215_20230225.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230212_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230208_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230204_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230129_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230113_20230131.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230107_20230128.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230106_20230106.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230105_20230105.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230101_20230104.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221226_20221231.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221223_20221225.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221218_20221222.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221210_20221223.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221201_20221210.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221129_20221210.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221126_20221210.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221119_20221125.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221115_20221130.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221114_20221130.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221112_20221113.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221110_20221111.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221106_20221130.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221103_20221130.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221102_20221130.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221031_20221102.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221028_20221030.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221026_20221028.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221022_20221028.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221019_20221028.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221017_20221031.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221012_20221031.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221010_20221031.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221005_20221031.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221004_20221031.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221001_20221031.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220924_20220930.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220923_20220929.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220917_20220930.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220915_20220930.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220911_20220930.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220909_20220930.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220906_20220930.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220904_20220930.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220902_20220903.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220901_20220930.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220826_20220831.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220825_20220831.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220822_20220831.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220819_20220827.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220815_20220821.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220812_20220814.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220805_20220831.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220801_20220831.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220730_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220729_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220722_20220728.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220718_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220714_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220711_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220707_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220706_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220705_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220704_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220701_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220627_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220625_20220731.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220618_20220624.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220615_20220617.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220613_20220624.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220610_20220624.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220609_20220624.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220608_20220624.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220604_20220624.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220601_20220624.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220526_20220531.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220520_20220531.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220519_20220531.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220517_20220531.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220514_20220529.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220512_20220513.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220507_20220531.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220504_20220531.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220502_20220503.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220429_20220501.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220423_20220430.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220421_20220422.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220420_20220422.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220418_20220419.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220414_20220417.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220409_20220430.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220401_20220430.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220327_20220415.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20210301_20210331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220321_20220410.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220319_20220410.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220315_20220410.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220314_20220410.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220311_20220410.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220303_20220331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220301_20220331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220224_20220331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220219_20220331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220217_20220331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220215_20220331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220212_20220312.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220204_20220228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220131_20220228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220122_20220228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220115_20220220.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220108_20220220.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220105_20220107.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230223_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230224_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230227_20230228.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230301_20230331.zip
|
||||
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230317_20230430.zip
|
||||
|
Loading…
Reference in New Issue
Block a user