Compare commits
8 commits: 4f5a13fb50 ... b2b6e0ad30
| Author | SHA1 | Date |
|---|---|---|
| | b2b6e0ad30 | |
| | 6dd47bcdb5 | |
| | d6be2a2ce9 | |
| | 2afff87ba0 | |
| | 3e0ee10fd9 | |
| | 99b6e1f59a | |
| | 737ab3f6bd | |
| | 375fa28dd5 | |
analysis/advanced.py (75 lines, Normal file)
@@ -0,0 +1,75 @@
from matplotlib import pyplot as plt
from math import log


def log_rang_log_freq(vals: list[int], fname: str = "fig.png"):
    # Plot log(rank) against log(frequency); under Zipf's law this should
    # be close to a straight line.
    plt.figure().clear()
    plt.plot([log(x) for x in range(1, len(vals)+1)], [log(y) for y in vals])
    plt.savefig('../figures/' + fname)
    plt.show()
    return plt


def words_freq_dict(filename: str = "word_freq.txt") -> dict:
    # Build {word_length: {'min': ..., 'max': ...}} holding the least and most
    # frequent word of each length, from `uniq -c`-style lines: "<count> <word>".
    words = {}
    with open(filename) as f:
        for line in f.readlines():
            try:
                occ, word = line.strip().split(' ')
                occ = int(occ)
            except ValueError:
                # Skip malformed lines; a bare `pass` here would silently
                # reuse the previous line's `occ` and `word`.
                continue
            if len(word) in words:
                if words[len(word)]['min']['count'] > occ:
                    words[len(word)]['min']['count'] = occ
                    words[len(word)]['min']['word'] = word

                if words[len(word)]['max']['count'] < occ:
                    words[len(word)]['max']['count'] = occ
                    words[len(word)]['max']['word'] = word
            else:
                words[len(word)] = {
                    'min': {
                        'word': word,
                        'count': occ
                    },
                    'max': {
                        'word': word,
                        'count': occ
                    }
                }

    return words


def word_len_occ(filename: str = "len_freq.txt") -> tuple[list[int], list[int]]:
    word_len, word_occ = [], []
    with open(filename) as f:
        for line in f.readlines():
            occ, l = line.strip().split(' ')
            word_len.append(int(l))
            word_occ.append(int(occ))
    # Drop the first row (likely the empty-token bucket from the shell pipeline).
    return word_len[1:], word_occ[1:]


def bigram_len_occ(filename: str = "bigram_freq.txt") -> tuple[list[int], list[int]]:
    bigram_len, bigram_occ = [], []
    with open(filename) as f:
        for line in f.readlines():
            occ, l1, l2 = line.strip().split(' ')
            bigram_len.append(len(l1) + len(l2))
            bigram_occ.append(int(occ))
    return bigram_len[1:], bigram_occ[1:]


# Zipf law figure
log_rang_log_freq(word_len_occ()[1], 'zipf-words.png')

# Zipf law for bigram figure
log_rang_log_freq(bigram_len_occ()[1], 'zipf-bigrams.png')


# Most & least frequent words from text
def disturbing_words():
    words = words_freq_dict()
    for i, w in sorted(words.items()):
        if w['min']['word'] != w['max']['word']:
            print(f'{i} - {w}')
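To make the dictionary shape concrete: a minimal, hand-made sketch of what `words_freq_dict` returns and how `disturbing_words` filters it. The sample values are illustrative, not actual corpus output:

```python
# Illustrative only: a hand-made example of the structure that
# words_freq_dict() builds, keyed by word length.
sample = {
    3: {'min': {'word': 'zyx', 'count': 1},
        'max': {'word': 'the', 'count': 999}},
    5: {'min': {'word': 'quine', 'count': 2},
        'max': {'word': 'quine', 'count': 2}},
}

# disturbing_words() prints a length bucket only when its rarest and its
# most frequent word differ; the length-5 bucket above is therefore skipped.
for i, w in sorted(sample.items()):
    if w['min']['word'] != w['max']['word']:
        print(f'{i} - {w}')
```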
analysis/advanced.sh (23 lines, Executable file)
@@ -0,0 +1,23 @@
#!/bin/bash

# Count commas, dots, spaces and line breaks
echo "Amount of commas"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cd ',' | wc -c
echo "Amount of dots"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cd '.' | wc -c
echo "Amount of white spaces"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cd ' ' | wc -c
echo "Amount of line breaks"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cd '\n' | wc -c

# Calculate the number of occurrences of each word
echo "calculate the number of occurrences of each word"
bzcat "$1" | tr -s '[:punct:][:space:]' '\n' | grep -E "^[^\x00-\x7F]*[[:alpha:]][^\x00-\x7F]*$" | sort | uniq -c | sort -nr > word_freq.txt

# Convert text to lowercase and replace non-alphabetic characters with spaces, then use awk to print bigrams
echo "calculate bigram frequency in the text"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cs '[:alpha:]' ' ' | fold -w 1000 | sed -e 's/\b\s\+\b/ /g' | awk '{for(i=1;i<NF;i++)print $i,$(i+1)}' | sort | uniq -c > bigram_freq.txt

# Calculate the number of occurrences of words by length
echo "calculate the number of occurrences of words by length"
bzcat "$1" | tr -cs '[:alnum:]' ' ' | tr ' ' '\n' | awk '{print length}' | sort -n | uniq -c > len_freq.txt
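For orientation, the three output files are in `uniq -c` format: a whitespace-padded count, then the token or tokens. The lines below are made-up examples, but the strip-then-split parsing matches what advanced.py does:

```python
# Made-up sample lines in the format produced by `sort | uniq -c`:
word_line   = '  61281 the'       # word_freq.txt: count, word
len_line    = '   4021 7'         # len_freq.txt: count, word length
bigram_line = '    312 of the'    # bigram_freq.txt: count, word1, word2

# After strip() the fields are separated by single spaces, so split(' ')
# recovers them, exactly as in advanced.py above.
occ, word = word_line.strip().split(' ')
print(int(occ), word)  # 61281 the
```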
analysis/bigrams_freq.sh (4 lines, Executable file)
@@ -0,0 +1,4 @@
#!/bin/bash

# Convert text to lowercase and replace non-alphabetic characters with spaces, then use awk to print bigrams
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cs '[:alpha:]' ' ' | fold -w 1000 | sed -e 's/\b\s\+\b/ /g' | awk '{for(i=1;i<NF;i++)print $i,$(i+1)}' | sort | uniq -c > bigram_freq.txt
analysis/len_freq.sh (3 lines, Normal file)
@@ -0,0 +1,3 @@
#!/bin/bash
# Calculate the number of occurrences of words by length
bzcat "$1" | tr -cs '[:alnum:]' ' ' | tr ' ' '\n' | awk '{print length}' | sort -n | uniq -c > len_freq.txt
analysis/word_freq.sh (3 lines, Executable file)
@@ -0,0 +1,3 @@
#!/bin/bash
# Calculate the number of occurrences of each word
bzcat "$1" | tr -s '[:punct:][:space:]' '\n' | grep -E "^[^\x00-\x7F]*[[:alpha:]][^\x00-\x7F]*$" | sort | uniq -c | sort -nr > word_freq.txt
corpus/wiki.sh (17 lines, Executable file)
@@ -0,0 +1,17 @@
#!/bin/bash

# Download the latest Wikipedia dump file
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2

# Extract the text from the dump file using standard command-line tools
bzcat enwiki-latest-pages-articles.xml.bz2 | \
    sed -n '/<text/,/<\/text>/p' | \
    sed -e 's/<[^>]*>//g' | \
    sed '/^{{/d; /^$/d; /\t{{/d; s/|/ /g; s/=/ /g; /&.*;/d' | \
    sed '/^\s*$/d; /^$/d' | \
    sed 's/^[[:space:]]*//' | \
    tr -d "'[]{}" > \
    enwiki-latest-corpus.txt

# Clean up
rm enwiki-latest-pages-articles.xml.bz2
figures/zipf-bigrams.png (BIN, Normal file, 19 KiB)
Binary file not shown.
figures/zipf-words.png (BIN, Normal file, 21 KiB)
Binary file not shown.
readme.md (71 lines changed)
@@ -2,6 +2,77 @@
based on Wikipedia EN Latest Articles -- enwiki-latest-pages-articles.xml.bz2
created with wiki.sh script
---

# Task 2 - advanced analysis

## Zipf law
![Zipf law for words](figures/zipf-words.png)

## Zipf law for bigrams
![Zipf law for bigrams](figures/zipf-bigrams.png)

*For this corpus, Zipf's law does not apply to bigrams*
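For reference, Zipf's law predicts that the r-th most frequent item occurs with frequency

$$f(r) \approx \frac{C}{r^s}, \qquad s \approx 1,$$

so $\log f(r) \approx \log C - s \log r$, and a log-rank vs log-frequency plot should be close to a straight line with slope about $-s$. That straight-line shape is what the two figures above are checked against.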
## Words that disturb Zipf's law the most

### Short words with low frequency
*These are mostly words from other languages and alphabets*
- 1 - 'min': {'word': '𝜑', 'count': 1}
- 2 - 'min': {'word': '𝘰𝘧', 'count': 1}
- 3 - 'min': {'word': '𝘢𝘯𝘥', 'count': 1}
- 4 - 'min': {'word': '𝔷𝔴𝔢𝔶', 'count': 1}
- 5 - 'min': {'word': '𝔳𝔞𝔱𝔢𝔯', 'count': 1}
- 6 - 'min': {'word': '𝕿𝖚𝖗𝖙𝖑𝖊', 'count': 1}
- 7 - 'min': {'word': '𝘕𝘢𝘯𝘺𝘢𝘯𝘨', 'count': 1}
- 8 - 'min': {'word': '𝘙𝘦𝘱𝘶𝘣𝘭𝘪𝘤', 'count': 1}
- 9 - 'min': {'word': '𝔲𝔣𝔤𝔢𝔭𝔬𝔥𝔰𝔱', 'count': 1}
- 10 - 'min': {'word': '𝔱𝔯𝔦𝔠𝔨𝔩𝔦𝔰𝔠𝔥', 'count': 1}
- 11 - 'min': {'word': '𝔤𝔢𝔰𝔠𝔥𝔪𝔦𝔰𝔷𝔢𝔫', 'count': 1}
- 12 - 'min': {'word': '𝔯𝔢𝔠𝔥𝔱𝔰𝔠𝔥𝔞𝔣𝔣𝔢', 'count': 1}
### Long words with high frequency
*Just weird words used multiple times*
- 41 - 'max': {'word': 'ConductionsystemoftheheartwithouttheHeart', 'count': 8}
- 42 - 'max': {'word': 'RightHumanPosteriorDistalRadiusUlnaCarpals', 'count': 6}
- 43 - 'max': {'word': 'ayantheObituariesOfEminentMenByIbnKhallikan', 'count': 15}
- 44 - 'max': {'word': 'MujahidinAnHistoricalWorkInTheArabicLanguage', 'count': 18}
- 45 - 'max': {'word': 'Pneumonoultramicroscopicsilicovolcanoconiosis', 'count': 12}
- 66 - 'max': {'word': 'SyrianGentlemanAndWarriorInThePeriodOfTheCrusadesMemoirsOfUsamaIbn', 'count': 3}
- 67 - 'max': {'word': 'GayLiberationAndSocialismDocumentsFromTheDiscussionsOnGayLiberation', 'count': 3}
- 70 - 'max': {'word': 'FieldMarshallVonMackensenAndCrownPrinceBorisReviewingBulgarianRegiment', 'count': 2}
- 72 - 'max': {'word': 'ExploringSharedHistoryPreservingSharedHeritagePenangsLinksToASiamesePast', 'count': 2}
- 76 - 'max': {'word': 'IsuPemilikanWilayahPantaiTimurSabahSatuPenelusurandaripadaSudutSumberSejarah', 'count': 2}
- 79 - 'max': {'word': 'Donaudampfschiffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft', 'count': 2}
- 88 - 'max': {'word': 'AhmadIbnMuhammadThalabiAraisAlMajalisFiQisasAlAnbiyaLivesOfTheProphetsLivesOfTheProphets', 'count': 2}
- 94 - 'max': {'word': 'Llanshyfairshypwllshygwynshygyllshygogeryshychwyrnshydrobshywlllshylantyshysilioshygogoshygoch', 'count': 2}
- 100 - 'max': {'word': 'Bababadalgharaghtakamminarronnkonnbronntonnerronntuonnthunntrovarrhounawnskawntoohoohoordenenthurnuk', 'count': 2}
## Custom metrics

### How complex were sentences - commas ~ dots ratio

22120137 / 24536140 ~= 0.9015328817

*This indicates that less than 10% of sentences were highly complex*

### Words used in a single paragraph - white spaces ~ new lines ratio

500903582 / 60208160 ~= 8.31952981124

*We can see that, on average, a paragraph has barely over 8 words, which indicates that the corpus has a lot of short and simple paragraphs*
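The two ratios follow directly from the counts that advanced.sh prints; a quick sanity check using the numbers reported above:

```python
# Counts reported above for this corpus.
commas, dots = 22_120_137, 24_536_140
spaces, newlines = 500_903_582, 60_208_160

print(commas / dots)      # ~0.9015: commas per sentence-ending dot
print(spaces / newlines)  # ~8.32: word gaps per line break
```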
## Code
Data for the above analysis were gathered with the *advanced.sh* script and processed with the *advanced.py* script, both located in the analysis folder
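A minimal sketch of the run order, assuming a bzip2-compressed corpus (the path below is a placeholder; advanced.sh reads its argument with bzcat) and that both scripts are run from the analysis folder:

```python
# Hypothetical driver; the corpus path is an assumption, not part of the repo.
import subprocess

# Shell stage: writes word_freq.txt, bigram_freq.txt and len_freq.txt.
subprocess.run(['./advanced.sh', '../corpus/enwiki-latest-corpus.txt.bz2'], check=True)

# Python stage: reads those files and saves the figures into ../figures/.
subprocess.run(['python3', 'advanced.py'], check=True)
```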
---

# Task 1 - corpus extraction & preparation

# Code
- include only article text in between `<text>` marks
- exclude lines without alphabet letters