Support docker; restore previous implementation

This commit is contained in:
Robert Bendun 2023-04-04 23:32:48 +02:00
parent 0bc033069d
commit e241dd2594
13 changed files with 53 additions and 48 deletions

2
.gitignore vendored
View File

@ -3,7 +3,7 @@ columns.pruned.tsv
columns.original.tsv
*.csv
*.tsv
csv2tsv/csv2tsv
src/csv2tsv/csv2tsv
.cache
compile_commands.json
tsv2json/tsv2json

5
Dockerfile Normal file
View File

@ -0,0 +1,5 @@
# Interactive development image: Ubuntu 22.04 with the C/C++, Go and Python
# tooling installed below; starts a bash shell by default.
FROM ubuntu:22.04

# apt-get is the scripting-stable front end (plain `apt` warns that its CLI is
# unstable). DEBIAN_FRONTEND is set inline so package configuration never waits
# for input during the build and the setting does not leak into the runtime
# environment. The package lists are removed in the same layer that fetched
# them so they do not end up in the image.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        g++ \
        gcc \
        golang \
        make \
        python-is-python3 \
        python3 \
        python3-pip \
        unzip \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir pandas matplotlib scikit-learn

# Exec form: bash is started directly instead of through `/bin/sh -c`.
CMD ["bash"]

View File

@ -1,47 +0,0 @@
#!/usr/bin/env bash
# Fetches every entry of ztm-data.txt with wget into data/, extracts
# stop_times.txt from each downloaded index*.zip, concatenates and shuffles the
# extracted files into stop_times.csv, then splits that file 80/20 into
# train.csv and test.csv.
set -xe -o pipefail

# Disable to allow to work in bare jenkins
# make normalize csv2tsv/csv2tsv

keep=stop_times.txt

mkdir -p data && cd data
xargs -- wget --no-verbose --no-clobber <../ztm-data.txt

for file in $(find . -name 'index*.zip'); do
	# Directory name = whatever follows the last '=' in the path, minus ".zip".
	dir="${file##*=}"
	dir="${dir%.zip}"
	# An existing directory means this archive was already unpacked.
	if [ ! -d "$dir" ]; then
		mkdir "$dir"
		unzip "$file" -d "$dir" "${keep[@]}"
	fi
done

cd ..

k=$keep
csv="${k%.txt}.csv"
cat $(find data -name "$k") | shuf > "$csv"

# First 80% of the shuffled lines (integer division) go to the training set.
train_size=$(( $(wc -l "$csv" | cut -f1 -d' ') * 8 / 10 ))
echo $train_size
head -n "$train_size" "$csv" >train.csv
# `tail -n +K` starts AT line K, so the test set must begin at train_size + 1;
# starting at train_size would put that line in both train.csv and test.csv.
tail -n +"$(( train_size + 1 ))" "$csv" >test.csv

# Disable to allow to work in bare jenkins
# for k in "${keep[@]}"; do
# csv="${k%.txt}.csv"
# tsv="${k%.txt}.tsv"
# if [ ! -f "$tsv" ]; then
# cat $(find data -name "$k") > "$csv"
# csv2tsv/csv2tsv <"$csv" >"$tsv"
# fi
# done
#
# if [ ! -f "stop_times.normalized.tsv" ]; then
# ./normalize <stop_times.tsv >stop_times.normalized.tsv
# ./split_train_valid_test.py
# fi
# ./stats.py

6
run.sh Executable file
View File

@ -0,0 +1,6 @@
#!/bin/sh
# Builds the image described by the Dockerfile in the current directory and
# opens an interactive container from it.

# Abort on the first failing command and echo each command before it runs.
set -o errexit -o xtrace

# Build the current directory as the image tagged "ium".
docker build --tag ium .

# Start a container from that image with stdin attached and a terminal.
docker run --interactive --tty ium

41
src/prepare-ztm-data.sh Executable file
View File

@ -0,0 +1,41 @@
#!/usr/bin/env bash
# Prepares the stop_times data set, skipping steps whose output already exists:
#   1. builds the `normalize` and `csv2tsv/csv2tsv` make targets,
#   2. fetches every entry of ztm-data.txt with wget into data/ and extracts
#      the files listed in `keep` from each downloaded index*.zip,
#   3. concatenates the extracted files into a .csv and pipes it through
#      csv2tsv/csv2tsv to produce a .tsv,
#   4. runs ./normalize, ./split_train_valid_test.py and ./stats.py.
# All paths are relative to the directory the script is started from.
set -xe -o pipefail

make normalize csv2tsv/csv2tsv

# Files to extract from every archive; an array so more names can be added.
keep=(stop_times.txt)

mkdir -p data && cd data
xargs -- wget --no-verbose --no-clobber <../ztm-data.txt

# Paths are read NUL-delimited so names containing whitespace or glob
# characters survive intact. The list arrives on fd 3 so that unzip cannot
# consume it through stdin.
while IFS= read -r -d '' file <&3; do
	# Directory name = whatever follows the last '=' in the path, minus ".zip".
	dir="${file##*=}"
	dir="${dir%.zip}"
	# An existing directory means this archive was already unpacked.
	if [ ! -d "$dir" ]; then
		mkdir "$dir"
		unzip "$file" -d "$dir" "${keep[@]}"
	fi
done 3< <(find . -name 'index*.zip' -print0)

cd ..

for k in "${keep[@]}"; do
	csv="${k%.txt}.csv"
	tsv="${k%.txt}.tsv"
	if [ ! -f "$tsv" ]; then
		# Let find invoke cat itself: no word splitting of the paths, and no
		# cat blocking on stdin when nothing matches.
		find data -name "$k" -exec cat {} + >"$csv"
		csv2tsv/csv2tsv <"$csv" >"$tsv"
	fi
done

if [ ! -f "stop_times.normalized.tsv" ]; then
	./normalize <stop_times.tsv >stop_times.normalized.tsv
fi

# Re-run the split when any one of its three outputs is missing.
if [[ ! -f stop_times.train.tsv || ! -f stop_times.test.tsv || ! -f stop_times.valid.tsv ]]; then
	./split_train_valid_test.py
fi

./stats.py