Data normalization

This commit is contained in:
Robert Bendun 2023-03-21 01:00:51 +01:00
parent e27bc920d4
commit a72841a7e3
6 changed files with 146 additions and 122 deletions

3
.gitignore vendored
View File

@ -2,7 +2,10 @@ languages.*.tsv
columns.pruned.tsv
columns.original.tsv
*.csv
*.tsv
csv2tsv/csv2tsv
.cache
compile_commands.json
tsv2json/tsv2json
normalize
data

View File

@ -1,3 +1,8 @@
# Build the `normalize` executable from normalize.cc with optimizations and warnings enabled.
normalize: normalize.cc
g++ -std=c++20 -Wall -Wextra -O3 -o $@ $<
# Build the csv2tsv tool by running `go build` inside its own directory.
csv2tsv/csv2tsv: csv2tsv/csv2tsv.go
cd csv2tsv; go build
# Delete every *.csv and *.tsv file plus the stop*.txt and trips.txt files in the current directory.
clean:
rm -f *.csv *.tsv stop*.txt trips.txt

108
normalize.cc Normal file
View File

@ -0,0 +1,108 @@
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <optional>
#include <string>
#include <string_view>
#include <unordered_set>
namespace split
{
    /// End marker: an `iterator` compares equal to it once every field has been consumed.
    struct sentinel {};

    /// Single-pass iterator over the fields of a text row separated by `delim`.
    ///
    /// The iterator only views the text it was given; the caller must keep that
    /// text alive and unmodified for as long as the iterator (or any field it
    /// returned) is in use.
    ///
    /// Empty fields are preserved, including one after a trailing delimiter
    /// ("a\t" yields "a" and ""). An empty row yields no fields at all.
    /// Incrementing an exhausted iterator is harmless and keeps it exhausted.
    struct iterator
    {
        using difference_type = std::ptrdiff_t;
        using value_type = std::string_view;
        using iterator_category = std::input_iterator_tag;
        using pointer = void;
        using reference = std::string_view; // operator* hands out fields by value

        /// @param source  row to split; anything implicitly convertible to std::string_view
        /// @param delim   character that separates the fields
        explicit iterator(std::string_view source, char delim)
            : source{source}
            , delim{delim}
            , field_pending{!source.empty()}
        {
            ++*this; // Compute first cell
        }

        /// The iterator doubles as its own range so it can be used where begin()/end() are expected.
        iterator begin() const
        {
            return *this;
        }

        sentinel end() const
        {
            return sentinel{};
        }

        /// True once the iterator has been advanced past the last field.
        bool operator==(sentinel) const
        {
            return reached_end;
        }

        /// Moves to the next field, or marks the iterator as finished when none is left.
        iterator& operator++()
        {
            if (!field_pending) {
                reached_end = true;
                return *this;
            }
            if (auto const cut = source.find(delim); cut != std::string_view::npos) {
                current = source.substr(0, cut);
                source.remove_prefix(cut + 1);
                // A delimiter always announces one more field, even if nothing
                // follows it, so field_pending stays set.
            } else {
                current = source;
                source = {};
                field_pending = false;
            }
            return *this;
        }

        iterator operator++(int)
        {
            auto const before = *this;
            ++*this;
            return before;
        }

        /// The field the iterator currently points at.
        std::string_view operator*() const
        {
            return current;
        }

        std::string_view current;   // field most recently extracted
        std::string_view source;    // text that has not been split yet
        char delim;
        bool reached_end = false;
        bool field_pending = false; // a field (possibly empty) is still waiting in `source`
    };
}
/// Converts a time of day written as "hh:mm:ss" into the elapsed fraction of
/// a day, in the range [0, 1).
///
/// Only hours and minutes contribute to the result; seconds are checked for
/// validity but otherwise ignored.
///
/// @param time  text to parse; must be exactly eight characters long
/// @return the fraction of the day, or std::nullopt when the text is not
///         two-digit groups separated by ':' or when hours >= 24,
///         minutes >= 60 or seconds >= 60
std::optional<float> time2float(std::string_view time)
{
    if (time.size() != 8) return std::nullopt; // Expected time in format hh:mm:ss
    if (time[2] != ':' || time[5] != ':') return std::nullopt;

    // Reads the two decimal digits starting at `at`; nothing when either is not a digit.
    // Without this check a character below '0' (e.g. a space) produced a negative
    // number that slipped through the range checks.
    auto const two_digits = [time](std::size_t at) -> std::optional<int> {
        auto const is_digit = [](char c) { return c >= '0' && c <= '9'; };
        if (!is_digit(time[at]) || !is_digit(time[at + 1])) return std::nullopt;
        return (time[at] - '0') * 10 + (time[at + 1] - '0');
    };

    auto const hours = two_digits(0);
    auto const minutes = two_digits(3);
    auto const seconds = two_digits(6);
    if (!hours || !minutes || !seconds) return std::nullopt;

    if (*hours >= 24) return std::nullopt; // Single day has only 24 hours
    if (*minutes >= 60 || *seconds >= 60) return std::nullopt;

    return (static_cast<float>(*hours) * 60 + static_cast<float>(*minutes)) / (60 * 24);
}
int main()
{
std::cout.sync_with_stdio(false);
bool passed_header = false;
for (std::string line; std::getline(std::cin, line);) {
split::iterator row(line, '\t');
std::advance(row, 2); if (row == split::sentinel{}) continue; std::string_view const departure_time = *row;
std::advance(row, 1); if (row == split::sentinel{}) continue; std::string_view const stop_id = *row;
std::advance(row, 2); if (row == split::sentinel{}) continue; std::string_view const stop_headsign = *row;
if (!passed_header) {
std::cout << departure_time << '\t' << stop_id << '\t' << stop_headsign << '\n';
passed_header = true;
continue;
}
auto const departure = time2float(departure_time); if (!departure) continue;
std::cout << *departure << '\t' << stop_id << '\t' << stop_headsign << '\n';
}
}

View File

@ -1,6 +1,8 @@
#!/usr/bin/env bash
set -e -o pipefail
set -xe -o pipefail
make normalize csv2tsv/csv2tsv
keep=(stops.txt trips.txt stop_times.txt)
@ -19,5 +21,12 @@ done
cd ..
for k in "${keep[@]}"; do
cat $(find data -name "$k") > "$k"
csv="${k%.txt}.csv"
tsv="${k%.txt}.tsv"
if [ ! -f "$tsv" ]; then
cat $(find data -name "$k") > "$csv"
csv2tsv/csv2tsv <"$csv" >"$tsv"
fi
done
./normalize <stop_times.tsv >stop_times.normalized.tsv

13
split_train_valid_test.py Normal file
View File

@ -0,0 +1,13 @@
import pandas as pd
from sklearn.model_selection import train_test_split
# Sizes handed to train_test_split.
# NOTE(review): as integers these are absolute row counts (25 rows each), not
# percentages; use floats such as 0.25 if fractions were intended -- confirm.
TEST_SIZE = 25
VALID_SIZE = 25

# Load the normalized stop times table (tab-separated, first row is the header).
data = pd.read_csv('./stop_times.normalized.tsv', sep='\t')

# Carve off the rows for validation + test first, then divide that holdout
# so that TEST_SIZE rows become the test set and the rest the validation set.
train, test = train_test_split(data, test_size=TEST_SIZE+VALID_SIZE)
valid, test = train_test_split(test, test_size=TEST_SIZE)

# index=False keeps the shuffled DataFrame index out of the files; otherwise
# every output gains an extra unnamed first column that the input never had.
train.to_csv('stop_times.train.tsv', sep='\t', index=False)
test.to_csv('stop_times.test.tsv', sep='\t', index=False)
valid.to_csv('stop_times.valid.tsv', sep='\t', index=False)

View File

@ -1,121 +1,7 @@
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230317_20230430.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230316_20230430.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230313_20230430.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230309_20230331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230306_20230331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230301_20230331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230227_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230224_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230223_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230222_20230222.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230215_20230225.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230212_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230208_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230204_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230129_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230113_20230131.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230107_20230128.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230106_20230106.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230105_20230105.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230101_20230104.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221226_20221231.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221223_20221225.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221218_20221222.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221210_20221223.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221201_20221210.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221129_20221210.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221126_20221210.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221119_20221125.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221115_20221130.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221114_20221130.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221112_20221113.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221110_20221111.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221106_20221130.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221103_20221130.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221102_20221130.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221031_20221102.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221028_20221030.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221026_20221028.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221022_20221028.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221019_20221028.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221017_20221031.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221012_20221031.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221010_20221031.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221005_20221031.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221004_20221031.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20221001_20221031.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220924_20220930.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220923_20220929.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220917_20220930.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220915_20220930.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220911_20220930.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220909_20220930.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220906_20220930.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220904_20220930.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220902_20220903.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220901_20220930.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220826_20220831.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220825_20220831.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220822_20220831.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220819_20220827.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220815_20220821.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220812_20220814.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220805_20220831.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220801_20220831.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220730_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220729_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220722_20220728.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220718_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220714_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220711_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220707_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220706_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220705_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220704_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220701_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220627_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220625_20220731.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220618_20220624.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220615_20220617.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220613_20220624.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220610_20220624.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220609_20220624.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220608_20220624.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220604_20220624.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220601_20220624.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220526_20220531.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220520_20220531.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220519_20220531.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220517_20220531.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220514_20220529.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220512_20220513.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220507_20220531.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220504_20220531.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220502_20220503.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220429_20220501.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220423_20220430.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220421_20220422.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220420_20220422.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220418_20220419.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220414_20220417.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220409_20220430.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220401_20220430.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220327_20220415.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20210301_20210331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220321_20220410.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220319_20220410.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220315_20220410.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220314_20220410.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220311_20220410.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220303_20220331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220301_20220331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220224_20220331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220219_20220331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220217_20220331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220215_20220331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220212_20220312.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220204_20220228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220131_20220228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220122_20220228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220115_20220220.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220108_20220220.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20220105_20220107.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230223_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230224_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230227_20230228.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230301_20230331.zip
https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile/?file=20230317_20230430.zip