#!/bin/bash
#
# corpus/wiki.sh - build a plain-text corpus from the latest English
# Wikipedia "pages-articles" dump.
#
# Steps:
#   1. Download the compressed XML dump into the current directory.
#   2. Stream-decompress it and reduce it to rough plain text.
#   3. Delete the downloaded dump once the corpus was written successfully.
#
# Output: enwiki-latest-corpus.txt in the current directory.
# Requires: wget, bzcat, awk, sed (GNU: the filter uses the \t escape), tr.

# Fail fast: without this, a failed download or a failed filter stage would
# still reach the final rm and throw the dump away next to an empty corpus.
set -euo pipefail

readonly DUMP_URL='https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2'
readonly DUMP_FILE='enwiki-latest-pages-articles.xml.bz2'
readonly CORPUS_FILE='enwiki-latest-corpus.txt'

#######################################
# Reduce dump XML to rough plain text.
# Arguments: none
# Inputs:    decompressed dump XML on stdin
# Outputs:   cleaned text lines on stdout
# Returns:   non-zero if any stage of the pipeline fails (pipefail)
#######################################
extract_text() {
  # Stage 1 - keep only lines belonging to <text> elements.
  # NOTE(review): the script previously used an empty pattern here
  # (sed -n '//p'), which sed rejects because there is no previous regular
  # expression to reuse. Selecting the <text>...</text> spans is the presumed
  # intent - confirm against the dump schema. awk is used instead of a sed
  # range so that an element opening and closing on the same line does not
  # run on into the following page.
  #
  # Stage 2 - sed, applied per line in this order:
  #   - strip anything that looks like a tag
  #   - drop lines starting with "{{", empty lines, lines with a tab
  #     followed by "{{"
  #   - turn "|" and "=" into spaces
  #   - drop lines containing an "&...;" sequence
  #   - drop whitespace-only lines, then trim leading whitespace
  #
  # Stage 3 - remove leftover quote/bracket/brace characters.
  awk '
    /<text[^>]*\/>/ { next }          # self-closing element: nothing to keep
    /<text[ >]/     { in_text = 1 }
    in_text         { print }
    /<\/text>/      { in_text = 0 }
  ' \
    | sed \
        -e 's/<[^>]*>//g' \
        -e '/^{{/d' \
        -e '/^$/d' \
        -e '/\t{{/d' \
        -e 's/|/ /g' \
        -e 's/=/ /g' \
        -e '/&.*;/d' \
        -e '/^[[:space:]]*$/d' \
        -e 's/^[[:space:]]*//' \
    | tr -d "'[]{}"
}

#######################################
# Download the dump, build the corpus, then clean up.
# Globals:   DUMP_URL, DUMP_FILE, CORPUS_FILE (read)
# Arguments: none
# Returns:   0 on success; the script aborts on the first failing step
#######################################
main() {
  # -O pins the local file name; otherwise wget would save to "<name>.1" when
  # an older copy exists and the stale copy would be the one processed below.
  wget -O "${DUMP_FILE}" "${DUMP_URL}"

  bzcat -- "${DUMP_FILE}" | extract_text > "${CORPUS_FILE}"

  # Only reached when download and extraction both succeeded.
  rm -- "${DUMP_FILE}"
}

main "$@"