diff --git a/.gitignore b/.gitignore
index 48ef6ac..57a8114 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,7 +3,7 @@ columns.pruned.tsv
 columns.original.tsv
 *.csv
 *.tsv
-csv2tsv/csv2tsv
+src/csv2tsv/csv2tsv
 .cache
 compile_commands.json
 tsv2json/tsv2json
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b0ca6ce
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,5 @@
+FROM ubuntu:22.04
+
+RUN apt update && apt install -y vim make python3 python3-pip python-is-python3 gcc g++ golang wget unzip
+RUN pip install pandas matplotlib scikit-learn
+CMD "bash"
diff --git a/prepare-ztm-data.sh b/prepare-ztm-data.sh
deleted file mode 100755
index 4b10638..0000000
--- a/prepare-ztm-data.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env bash
-
-set -xe -o pipefail
-
-# Disable to allow to work in bare jenkins
-# make normalize csv2tsv/csv2tsv
-
-keep=stop_times.txt
-
-mkdir -p data && cd data
-xargs -- wget --no-verbose --no-clobber <../ztm-data.txt
-
-for file in $(find . -name 'index*.zip'); do
-  dir="${file##*=}"
-  dir="${dir%.zip}"
-  if [ ! -d "$dir" ]; then
-    mkdir "$dir"
-    unzip "$file" -d "$dir" "${keep[@]}"
-  fi
-done
-
-cd ..
-
-k=$keep
-csv="${k%.txt}.csv"
-cat $(find data -name "$k") | shuf > "$csv"
-train_size=$(( $(wc -l "$csv" | cut -f1 -d' ') * 8 / 10 ))
-echo $train_size
-
-head -n $train_size $csv >train.csv
-tail -n +$train_size $csv >test.csv
-
-# Disable to allow to work in bare jenkins
-# for k in "${keep[@]}"; do
-#   csv="${k%.txt}.csv"
-#   tsv="${k%.txt}.tsv"
-#   if [ ! -f "$tsv" ]; then
-#     cat $(find data -name "$k") > "$csv"
-#     csv2tsv/csv2tsv <"$csv" >"$tsv"
-#   fi
-# done
-#
-# if [ ! -f "stop_times.normalized.tsv" ]; then
-#   ./normalize stop_times.normalized.tsv
-#   ./split_train_valid_test.py
-# fi
-# ./stats.py
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..563148f
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+set -xe
+
+docker build -t ium .
+docker run -it ium
diff --git a/Makefile b/src/Makefile
similarity index 100%
rename from Makefile
rename to src/Makefile
diff --git a/csv2tsv/csv2tsv.go b/src/csv2tsv/csv2tsv.go
similarity index 100%
rename from csv2tsv/csv2tsv.go
rename to src/csv2tsv/csv2tsv.go
diff --git a/csv2tsv/go.mod b/src/csv2tsv/go.mod
similarity index 100%
rename from csv2tsv/go.mod
rename to src/csv2tsv/go.mod
diff --git a/normalize.cc b/src/normalize.cc
similarity index 100%
rename from normalize.cc
rename to src/normalize.cc
diff --git a/src/prepare-ztm-data.sh b/src/prepare-ztm-data.sh
new file mode 100755
index 0000000..50f54f9
--- /dev/null
+++ b/src/prepare-ztm-data.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+set -xe -o pipefail
+
+make normalize csv2tsv/csv2tsv
+
+keep=stop_times.txt
+
+mkdir -p data && cd data
+xargs -- wget --no-verbose --no-clobber <../ztm-data.txt
+
+for file in $(find . -name 'index*.zip'); do
+  dir="${file##*=}"
+  dir="${dir%.zip}"
+  if [ ! -d "$dir" ]; then
+    mkdir "$dir"
+    unzip "$file" -d "$dir" "${keep[@]}"
+  fi
+done
+
+cd ..
+
+for k in "${keep[@]}"; do
+  csv="${k%.txt}.csv"
+  tsv="${k%.txt}.tsv"
+  if [ ! -f "$tsv" ]; then
+    cat $(find data -name "$k") > "$csv"
+    csv2tsv/csv2tsv <"$csv" >"$tsv"
+  fi
+done
+
+if [ ! -f "stop_times.normalized.tsv" ]; then
+  ./normalize stop_times.normalized.tsv
+fi
+
+if [ ! \( -f "stop_times.train.tsv" -a -f "stop_times.test.tsv" -a -f "stop_times.valid.tsv" \) ]; then
+  ./split_train_valid_test.py
+fi
+
+
+./stats.py
diff --git a/split_train_valid_test.py b/src/split_train_valid_test.py
similarity index 100%
rename from split_train_valid_test.py
rename to src/split_train_valid_test.py
diff --git a/stats.py b/src/stats.py
similarity index 100%
rename from stats.py
rename to src/stats.py
diff --git a/stats.sh b/src/stats.sh
similarity index 100%
rename from stats.sh
rename to src/stats.sh
diff --git a/ztm-data.txt b/src/ztm-data.txt
similarity index 100%
rename from ztm-data.txt
rename to src/ztm-data.txt