#!/bin/bash
#
# Builds a rough plain-text corpus from the latest English Wikipedia dump:
# downloads enwiki-latest-pages-articles.xml.bz2, extracts the article text
# into enwiki-latest-corpus.txt, then removes the downloaded dump.
#
# Usage: run from the directory where the corpus should be written.

# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails, so a broken download or extraction never
# falls through to the step that deletes the dump.
set -euo pipefail

# Download the latest Wikipedia dump file.
# -O pins the output file name: without it, wget keeps a leftover file from an
# earlier run and saves the new download as "<name>.1", so the following steps
# would read the stale copy.
wget -O enwiki-latest-pages-articles.xml.bz2 \
  https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2

# Extract the article text from the dump file with standard stream filters

#######################################
# Reduce a MediaWiki XML export to rough plain text.
# Arguments: none
# Inputs:    XML export on stdin
# Outputs:   cleaned text lines on stdout
#######################################
extract_wiki_text() {
  local tab=$'\t'

  # Keep only the lines that belong to <text>...</text> elements. A flag is
  # used instead of a sed range because a sed range only starts looking for
  # its end pattern on the line after the start line: an element that opens
  # and closes on one line (e.g. a redirect) would leave the range open and
  # leak the following page metadata into the corpus.
  awk '
    /<text[^>]*\/>/ { next }         # self-closing element: nothing to keep
    /<text/         { inside = 1 }
    inside          { print }
    /<\/text>/      { inside = 0 }
  ' |
    # One sed pass, applied in this order:
    #   1. strip XML/HTML tags
    #   2. drop lines that start with "{{" or contain a tab followed by "{{"
    #   3. turn "|" and "=" into spaces
    #   4. drop lines that contain an "&...;" entity
    #   5. drop blank lines, then strip leading whitespace
    #   6. remove quotes, brackets and braces, and drop lines this leaves
    #      blank (e.g. the closing "}}" of a multi-line template)
    sed \
      -e 's/<[^>]*>//g' \
      -e '/^{{/d' \
      -e "/${tab}{{/d" \
      -e 's/[|=]/ /g' \
      -e '/&.*;/d' \
      -e '/^[[:space:]]*$/d' \
      -e 's/^[[:space:]]*//' \
      -e "s/[]'[{}]//g" \
      -e '/^[[:space:]]*$/d'
}

bzcat enwiki-latest-pages-articles.xml.bz2 |
  extract_wiki_text > enwiki-latest-corpus.txt

# Clean up: the compressed dump is not needed once the corpus has been written
rm -- enwiki-latest-pages-articles.xml.bz2