Create corpus script

This commit is contained in:
Kuba 2023-03-15 23:59:24 +01:00
commit 17253595a3
3 changed files with 51 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*.txt
*.bz2

32
readme.md Normal file
View File

@ -0,0 +1,32 @@
# Text corpus
Based on the latest English Wikipedia articles dump (`enwiki-latest-pages-articles.xml.bz2`).
Created with the `wiki.sh` script.
# Code
- include only the article text between `<text>` tags
- exclude lines without alphabet letters
- clear excessive whitespace
- remove special characters
- remove unintentional empty lines
# Stats
## Number of lines
```bash
bzcat enwiki-latest-corpus.txt.bz2 | wc -l
```
## Size
```bash
du -sh enwiki-latest-corpus.txt.bz2
```
## Head of file
```bash
bzcat enwiki-latest-corpus.txt.bz2 | head -n 5
```
## Random lines from file
```bash
bzcat enwiki-latest-corpus.txt.bz2 | shuf -n 5
```

17
wiki.sh Executable file
View File

@ -0,0 +1,17 @@
#!/bin/bash
#
# Build a plain-text corpus from the English Wikipedia articles dump.
#
# Input:  enwiki-latest-pages-articles.xml.bz2 (in the current directory)
# Output: enwiki-latest-corpus.txt             (in the current directory)
#
# Exits non-zero if the dump is unreadable or any pipeline stage fails; in
# that case an existing corpus file is left untouched.

# Abort on errors and unset variables, and make a failure in any pipeline
# stage (e.g. bzcat on a corrupt archive) fail the whole pipeline instead of
# silently producing an empty or truncated corpus.
set -euo pipefail

readonly DUMP_FILE='enwiki-latest-pages-articles.xml.bz2'
readonly CORPUS_FILE='enwiki-latest-corpus.txt'
# Output is written here first and renamed only after the pipeline succeeds.
readonly PARTIAL_FILE="${CORPUS_FILE}.part"
# Literal tab for use in sed patterns (the \t escape is a GNU sed extension).
readonly TAB=$'\t'

# Download the latest Wikipedia dump file
#wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2

if [[ ! -r "${DUMP_FILE}" ]]; then
  printf 'error: cannot read %s\n' "${DUMP_FILE}" >&2
  exit 1
fi

# Remove a leftover partial file on any exit path; after a successful rename
# the file no longer exists and rm -f is a no-op.
trap 'rm -f -- "${PARTIAL_FILE}"' EXIT

# Extract the text from the dump file.
#
# First sed (applied per line, in this order):
#   1. keep only lines from one matching "<text" through the next matching
#      "</text>"
#   2. strip anything that looks like a tag (<...>)
#   3. drop lines that start with "{{" or contain a tab followed by "{{"
#   4. replace every "|" and "=" with a space
#   5. drop lines containing an "&...;" sequence
# tr:
#   6. delete the characters ' [ ] { }
# Second sed:
#   7. strip leading whitespace, then drop empty lines. This runs last on
#      purpose: steps 2-6 can leave a line empty or whitespace-only (e.g. a
#      line consisting solely of "}}"), and those must not reach the corpus.
bzcat "${DUMP_FILE}" \
  | sed \
      -e '/<text/,/<\/text>/!d' \
      -e 's/<[^>]*>//g' \
      -e '/^{{/d' \
      -e "/${TAB}{{/d" \
      -e 's/[|=]/ /g' \
      -e '/&.*;/d' \
  | tr -d "'[]{}" \
  | sed \
      -e 's/^[[:space:]]*//' \
      -e '/^$/d' \
  > "${PARTIAL_FILE}"

mv -- "${PARTIAL_FILE}" "${CORPUS_FILE}"

# Clean up
#rm enwiki-latest-pages-articles.xml.bz2