From 17253595a325e54495e1a07c8eb39ac48f3db310 Mon Sep 17 00:00:00 2001
From: Kuba
Date: Wed, 15 Mar 2023 23:59:24 +0100
Subject: [PATCH] Create corpus script

---
 .gitignore |  2 ++
 readme.md  | 32 ++++++++++++++++++++++++++++++++
 wiki.sh    | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 readme.md
 create mode 100755 wiki.sh

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8567826
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.txt
+*.bz2
\ No newline at end of file
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..c0e90f6
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,32 @@
+# Text corpus
+based on Wikipedia EN Latest Articles -- enwiki-latest-pages-articles.xml.bz2
+created with wiki.sh script
+
+# Code
+ - include only article text in between `<text>` marks
+ - exclude lines without alphabet letters
+ - clear excessive white spaces
+ - remove special characters
+ - remove unintentional empty lines
+
+# Stats
+## Number of lines
+```bash
+bzcat enwiki-latest-corpus.txt.bz2 | wc -l
+
+```
+## Size
+```bash
+du -sh enwiki-latest-corpus.txt.bz2
+
+```
+## Head of file
+```bash
+bzcat enwiki-latest-corpus.txt.bz2 | head -n 5
+
+```
+## Random lines from file
+```bash
+bzcat enwiki-latest-corpus.txt.bz2 | shuf -n 5
+
+```
\ No newline at end of file
diff --git a/wiki.sh b/wiki.sh
new file mode 100755
index 0000000..ab75603
--- /dev/null
+++ b/wiki.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Build a plain-text corpus from the English Wikipedia articles dump.
+# Reads enwiki-latest-pages-articles.xml.bz2 and writes
+# enwiki-latest-corpus.txt, both in the current directory.
+
+# Stop on errors, unset variables and a failure in any pipeline stage, so a
+# missing or corrupt dump makes the script fail instead of exiting with 0.
+set -euo pipefail
+
+# Download the latest Wikipedia dump file
+#wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
+
+# Extract the article text from the dump using standard Unix text tools:
+#   1. keep only the lines from an opening <text ...> tag to the next </text>
+#   2. strip the remaining XML/HTML tags
+#   3. drop lines starting with {{ or containing a tab followed by {{, empty
+#      lines, and lines with & followed by ; then replace | and = with spaces
+#   4. drop lines that are now blank
+#   5. trim leading whitespace
+#   6. delete the characters ' [ ] { }
+bzcat enwiki-latest-pages-articles.xml.bz2 \
+  | sed -n '/<text/,/<\/text>/p' \
+  | sed -e 's/<[^>]*>//g' \
+  | sed '/^{{/d; /^$/d; /\t{{/d; s/|/ /g; s/=/ /g; /&.*;/d' \
+  | sed '/^\s*$/d; /^$/d' \
+  | sed 's/^[[:space:]]*//' \
+  | tr -d "'[]{}" \
+  > enwiki-latest-corpus.txt
+
+# Clean up
+#rm enwiki-latest-pages-articles.xml.bz2