Create corpus script
This commit is contained in:
commit
17253595a3
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
*.txt
|
||||
*.bz2
|
32
readme.md
Normal file
32
readme.md
Normal file
@ -0,0 +1,32 @@
|
||||
# Text corpus
|
||||
based on Wikipedia EN Latest Articles -- enwiki-latest-pages-articles.xml.bz2
|
||||
created with wiki.sh script
|
||||
|
||||
# Code
|
||||
- include only article text between `<text>` tags
|
||||
- exclude lines without alphabet letters
|
||||
- clear excessive white spaces
|
||||
- remove special characters
|
||||
- remove unintentional empty lines
|
||||
|
||||
# Stats
|
||||
## Number of lines
|
||||
```bash
|
||||
bzcat enwiki-latest-corpus.txt.bz2 | wc -l
|
||||
|
||||
```
|
||||
## Size
|
||||
```bash
|
||||
du -sh enwiki-latest-corpus.txt.bz2
|
||||
|
||||
```
|
||||
## Head of file
|
||||
```bash
|
||||
bzcat enwiki-latest-corpus.txt.bz2 | head -n 5
|
||||
|
||||
```
|
||||
## Random lines from file
|
||||
```bash
|
||||
bzcat enwiki-latest-corpus.txt.bz2 | shuf -n 5
|
||||
|
||||
```
|
17
wiki.sh
Executable file
17
wiki.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/bin/bash
#
# Build a plain-text corpus from the English Wikipedia article dump.
#
# Input:  enwiki-latest-pages-articles.xml.bz2 (read from the current directory)
# Output: enwiki-latest-corpus.txt             (written to the current directory)
#
# GNU sed is assumed: the \t escape used in one address is a GNU extension.

# -e is deliberately not set: main checks every step that can fail, and its
# return status becomes the exit status of the script.
set -u -o pipefail

readonly DUMP_FILE='enwiki-latest-pages-articles.xml.bz2'
readonly CORPUS_FILE='enwiki-latest-corpus.txt'

# Download the latest Wikipedia dump file
#wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2

#######################################
# Turn a MediaWiki XML export into plain text lines.
#
# Stages, in order:
#   1. awk  keeps only the lines from a "<text" line through the matching
#           "</text>" line. An element that opens and closes on one line
#           (or is self-closing) ends the region on that same line, so the
#           page metadata that follows is not pulled in.
#   2. sed  strips anything that looks like a tag, drops lines that start
#           with "{{" or contain a tab followed by "{{", turns "|" and "="
#           into spaces, and drops lines where "&" is later followed by ";".
#   3. tr   deletes the characters ' [ ] { }.
#   4. sed  strips leading whitespace and then drops empty lines. This runs
#           last so that lines emptied by stage 3 are removed as well.
#
# Arguments: none
# Inputs:    XML export on stdin
# Outputs:   cleaned text on stdout
# Returns:   non-zero if any stage of the pipeline fails
#######################################
clean_wiki_text() {
  awk '
    /<text/ { in_text = 1 }
    in_text { print }
    /<\/text>/ || /<text[^>]*\/>/ { in_text = 0 }
  ' \
    | sed -e 's/<[^>]*>//g' \
          -e '/^{{/d; /\t{{/d; s/|/ /g; s/=/ /g; /&.*;/d' \
    | tr -d "'[]{}" \
    | sed -e 's/^[[:space:]]*//' -e '/^$/d'
}

#######################################
# Decompress the dump, clean it and write the corpus file.
#
# Globals:   DUMP_FILE (read), CORPUS_FILE (read)
# Arguments: none
# Outputs:   writes CORPUS_FILE; diagnostics go to stderr
# Returns:   0 on success, 1 if the dump is missing or the pipeline fails
#######################################
main() {
  # Check first so a missing dump does not leave an empty corpus behind.
  if [[ ! -r "${DUMP_FILE}" ]]; then
    printf 'error: cannot read %s\n' "${DUMP_FILE}" >&2
    return 1
  fi

  # Extract the text from the dump file
  if ! bzcat -- "${DUMP_FILE}" | clean_wiki_text > "${CORPUS_FILE}"; then
    printf 'error: extraction failed; %s may be incomplete\n' "${CORPUS_FILE}" >&2
    return 1
  fi
}

main

# Clean up
#rm enwiki-latest-pages-articles.xml.bz2
|
Loading…
Reference in New Issue
Block a user