Create corpus script
This commit is contained in:
commit
17253595a3
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
*.txt
|
||||||
|
*.bz2
|
32
readme.md
Normal file
32
readme.md
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
# Text corpus
|
||||||
|
based on Wikipedia EN Latest Articles -- enwiki-latest-pages-articles.xml.bz2
|
||||||
|
created with wiki.sh script
|
||||||
|
|
||||||
|
# Code
|
||||||
|
- include only the article text between `<text>` tags
|
||||||
|
- exclude lines without alphabet letters
|
||||||
|
- clear excessive white spaces
|
||||||
|
- remove special characters
|
||||||
|
- remove unintentional empty lines
|
||||||
|
|
||||||
|
# Stats
|
||||||
|
## Number of lines
|
||||||
|
```bash
|
||||||
|
wc -l
|
||||||
|
|
||||||
|
```
|
||||||
|
## Size
|
||||||
|
```bash
|
||||||
|
du -sh enwiki-latest-corpus.txt.bz2
|
||||||
|
|
||||||
|
```
|
||||||
|
## Head of file
|
||||||
|
```bash
|
||||||
|
bzcat enwiki-latest-corpus.txt.bz2 | head -n 5
|
||||||
|
|
||||||
|
```
|
||||||
|
## Random lines from file
|
||||||
|
```bash
|
||||||
|
bzcat enwiki-latest-corpus.txt.bz2 | shuf -n 5
|
||||||
|
|
||||||
|
```
|
17
wiki.sh
Executable file
17
wiki.sh
Executable file
@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash
#
# Build a plain-text corpus from a bzip2-compressed Wikipedia XML dump.
#
# Usage: wiki.sh [DUMP_FILE [CORPUS_FILE]]
#   DUMP_FILE    compressed XML dump to read
#                (default: enwiki-latest-pages-articles.xml.bz2)
#   CORPUS_FILE  plain-text file to write
#                (default: enwiki-latest-corpus.txt)

# Stop on the first failing command, on unset variables, and when any stage
# of a pipeline fails. Without pipefail a failing bzcat would still leave an
# empty or truncated corpus behind while the script exits with status 0.
set -euo pipefail

dump_file=${1:-enwiki-latest-pages-articles.xml.bz2}
corpus_file=${2:-enwiki-latest-corpus.txt}

# Download the latest Wikipedia dump file
#wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2

if [[ ! -r "$dump_file" ]]; then
  printf 'error: cannot read dump file: %s\n' "$dump_file" >&2
  exit 1
fi

# Extract the text from the dump file using standard command-line tools.
#
# Stage 1 (awk): keep only the lines from an opening <text ...> tag through
#   the matching </text>. The end condition is tested on the opening line as
#   well, so an element that opens and closes on one line (or a self-closing
#   <text ... />) does not leave the range open; a sed "/start/,/end/" range
#   only starts looking for the end on the following line and would let the
#   page metadata after such an element leak into the corpus.
# Stage 2 (sed), applied to each line in this order:
#   - strip every remaining <...> tag
#   - drop lines that start with "{{", are empty, or contain a tab followed
#     by "{{"; turn "|" and "=" into spaces; drop lines that contain an
#     "&...;" entity
#   - drop lines that are empty or whitespace-only after the steps above
#   - strip leading whitespace
# Stage 3 (tr): delete the characters ' [ ] { }
bzcat -- "$dump_file" \
  | awk '
      /<text/                       { in_text = 1 }
      in_text                       { print }
      /<\/text>/ || /<text[^>]*\/>/ { in_text = 0 }
    ' \
  | sed -e 's/<[^>]*>//g' \
        -e '/^{{/d; /^$/d; /\t{{/d; s/|/ /g; s/=/ /g; /&.*;/d' \
        -e '/^\s*$/d' \
        -e 's/^[[:space:]]*//' \
  | tr -d "'[]{}" \
  > "$corpus_file"

# Clean up
#rm enwiki-latest-pages-articles.xml.bz2
|
Loading…
Reference in New Issue
Block a user