mass-scraper/parishwebsites/deal-with-not-completed.sh
siulkilulki 9b76f4e8aa Add robust recrawling of not completed data.
Add annotator.py (highlighting hour within context done)
Enhance parish2text.py (enable more flags, convert button)
2018-04-16 23:54:03 +02:00

25 lines
1001 B
Bash
Executable File

#!/usr/bin/env bash
#
# Resets the bookkeeping for crawls that did not complete so they can be
# crawled again.
#
# Run from the directory that holds find-not-completed.sh, processed.txt,
# spider-commands.txt, data/ and logs/.
#
# Reads:   processed.txt (tab-separated; the file name is field 2 and ends
#          the line), spider-commands.txt
# Writes:  not-completed, processed.txt (rewritten in place),
#          spider-commands-add.txt
# Deletes: data/<name> and logs/<name> for every <name> in not-completed
# Needs:   sponge (moreutils); a sed that expands \t in a replacement
#          (GNU sed)
#
# `set -e` is deliberately not used: grep exits 1 whenever it selects no
# lines, which is a normal outcome for several steps below.
set -u

# Prints the number of non-hidden entries in directory $1
# (0 when the directory is empty or missing).
# Runs in a subshell so that nullglob does not leak into the caller.
count_entries() (
  shopt -s nullglob
  entries=("$1"/*)
  printf '%s\n' "${#entries[@]}"
)

# Removes from directory $1 every file named in ./not-completed and reports
# the directory's entry count before and after.
purge_listed_files() {
  local dir=$1
  echo "$dir directory file count: $(count_entries "$dir")"
  # The cd happens inside a subshell, so a missing directory can never move
  # the rest of the script somewhere else. The list is opened before the cd
  # (redirection on the subshell), so no ../ path is needed.
  # NUL-delimited xargs keeps names with blanks, quotes or backslashes
  # intact; `--` protects names that start with a dash.
  (
    cd -- "$dir" \
      && tr '\n' '\0' | xargs -0 rm -f --
  ) < not-completed
  echo "$dir directory file count: $(count_entries "$dir")"
}

./find-not-completed.sh > not-completed
# cat duplicate-data >> not-completed

# Drop from processed.txt the entries that did not truly finish. Each name
# becomes the pattern "<TAB>name$"; regex metacharacters in the name are
# escaped first so that e.g. a dot only matches a dot.
grep -v -f <(sed -e 's@[[\.*^$]@\\&@g' -e 's@^@\t@' -e 's@$@\$@' not-completed) \
  processed.txt | sponge processed.txt

# Append the file names that appear in spider-commands.txt but not in
# processed.txt, then de-duplicate the list.
comm -13 \
  <(cut -f2 processed.txt | sort -u) \
  <(grep -o 'data/.*" 2>' spider-commands.txt \
    | sed -Ee 's@data/|" 2>@@g' | sort) \
  >> not-completed
sort -u -o not-completed not-completed

# Remove the artefacts that belong to the not-completed entries.
purge_listed_files data
purge_listed_files logs

# Collect the spider commands that have to be run again. The names are
# matched as fixed strings ("data/<name>"), not as regular expressions.
grep -F -f <(sed -e 's@^@"data/@' -e 's@$@"@' not-completed) \
  spider-commands.txt > spider-commands-add.txt