#!/bin/bash
#
# Build a plain-text corpus from the latest English Wikipedia dump.
#
# Usage: wiki.sh        (no arguments; operates in the current directory)
#
# Steps:
#   1. Download enwiki-latest-pages-articles.xml.bz2 from dumps.wikimedia.org.
#   2. Stream-decompress it, keep only the contents of <text> elements, strip
#      markup and write the result to enwiki-latest-corpus.txt.
#   3. Remove the downloaded dump (only if every previous step succeeded).
#
# Requires: wget, bzcat, awk, GNU sed (the \t and \s escapes are GNU
# extensions), tr.
#
# Origin: corpus/wiki.sh, added by commit 375fa28 ("restructurized directory",
# Kuba, 2023-03-21).

# Abort on the first failing command or pipeline stage so that a failed
# download or extraction never reaches the final `rm` of the dump.
set -euo pipefail

readonly DUMP_FILE='enwiki-latest-pages-articles.xml.bz2'
readonly DUMP_URL="https://dumps.wikimedia.org/enwiki/latest/${DUMP_FILE}"
readonly CORPUS_FILE='enwiki-latest-corpus.txt'

#######################################
# Keep only the lines that belong to <text>...</text> elements.
# A state flag is used instead of a sed address range because a range's end
# pattern is only tested from the line *after* the start line, so single-line
# elements (e.g. redirects) or self-closing <text ... /> elements would make
# the range swallow the page metadata that follows them.
# Inputs:  MediaWiki XML export on stdin
# Outputs: matching lines (tags still present) on stdout
#######################################
extract_article_text() {
  awk '
    /<text/                   { in_text = 1 }
    in_text                   { print }
    /<\/text>|<text[^>]*\/>/  { in_text = 0 }
  '
}

#######################################
# Strip tags and wiki markup from extracted article lines.
# Inputs:  lines on stdin
# Outputs: cleaned lines on stdout
#######################################
clean_wikitext() {
  sed '
    # Remove XML/HTML tags.
    s/<[^>]*>//g
    # Drop lines that start with a template, or have one right after a tab.
    /^{{/d
    /\t{{/d
    # Replace pipe and equals characters with spaces.
    s/|/ /g
    s/=/ /g
    # Drop lines that still contain an entity-like "&...;" sequence.
    /&.*;/d
    # Drop empty / whitespace-only lines, then strip leading whitespace.
    /^\s*$/d
    s/^[[:space:]]*//
  ' \
    | tr -d "'[]{}"
}

main() {
  # -O pins the output name; otherwise an existing file would make wget save
  # the fresh copy as "<name>.1" and the stale dump would be processed.
  wget -O "${DUMP_FILE}" "${DUMP_URL}"

  bzcat "${DUMP_FILE}" \
    | extract_article_text \
    | clean_wikitext \
    > "${CORPUS_FILE}"

  # Clean up the (large) dump only after the corpus was written successfully.
  rm -- "${DUMP_FILE}"
}

main "$@"