Compare commits: 4f5a13fb50...b2b6e0ad30

8 commits:
- b2b6e0ad30
- 6dd47bcdb5
- d6be2a2ce9
- 2afff87ba0
- 3e0ee10fd9
- 99b6e1f59a
- 737ab3f6bd
- 375fa28dd5
analysis/advanced.py (new file, 75 lines)
@@ -0,0 +1,75 @@
from matplotlib import pyplot as plt
from math import log


def log_rang_log_freq(vals: list[int], fname: str = "fig.png"):
    # Plot log(rank) against log(frequency); a roughly straight line indicates Zipf's law.
    plt.figure().clear()
    plt.plot([log(x) for x in range(1, len(vals) + 1)], [log(y) for y in vals])
    plt.savefig('../figures/' + fname)
    plt.show()
    return plt


def words_freq_dict(filename: str = "word_freq.txt") -> dict:
    # For each word length, track the least and the most frequent word of that length.
    words = {}
    with open(filename) as f:
        for line in f.readlines():
            try:
                occ, word = line.split()
                occ = int(occ)
            except ValueError:
                # skip malformed lines instead of reusing the previous word/occ pair
                continue
            if len(word) in words:
                if words[len(word)]['min']['count'] > occ:
                    words[len(word)]['min']['count'] = occ
                    words[len(word)]['min']['word'] = word

                if words[len(word)]['max']['count'] < occ:
                    words[len(word)]['max']['count'] = occ
                    words[len(word)]['max']['word'] = word
            else:
                words[len(word)] = {
                    'min': {'word': word, 'count': occ},
                    'max': {'word': word, 'count': occ}
                }

    return words


def word_len_occ(filename: str = "len_freq.txt") -> tuple[list[int], list[int]]:
    # Read "occurrences length" pairs; drop the first row (empty-string tokens).
    word_len, word_occ = [], []
    with open(filename) as f:
        for line in f.readlines():
            occ, l = line.split()
            word_len.append(int(l))
            word_occ.append(int(occ))
    return word_len[1:], word_occ[1:]


def bigram_len_occ(filename: str = "bigram_freq.txt") -> tuple[list[int], list[int]]:
    # Read "occurrences word1 word2" triples; record the combined bigram length.
    bigram_len, bigram_occ = [], []
    with open(filename) as f:
        for line in f.readlines():
            occ, l1, l2 = line.split()
            bigram_len.append(len(l1) + len(l2))
            bigram_occ.append(int(occ))
    return bigram_len[1:], bigram_occ[1:]


# Zipf law figure
log_rang_log_freq(word_len_occ()[1], 'zipf-words.png')

# Zipf law for bigrams figure
log_rang_log_freq(bigram_len_occ()[1], 'zipf-bigrams.png')


# Most & least frequent words from the text, per word length
def disturbing_words():
    words = words_freq_dict()
    for i, w in sorted(words.items()):
        if w['min']['word'] != w['max']['word']:
            print(f'{i} - {w}')


disturbing_words()
analysis/advanced.sh (new executable file, 23 lines)
@@ -0,0 +1,23 @@
#!/bin/bash

# count commas, dots, white spaces and line breaks
echo "Amount of commas"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cd ',' | wc -c
echo "Amount of dots"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cd '.' | wc -c
echo "Amount of white spaces"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cd ' ' | wc -c
echo "Amount of line breaks"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cd '\n' | wc -c

# count the occurrences of each word
echo "calculate the number of occurrences of each word"
bzcat "$1" | tr -s '[:punct:][:space:]' '\n' | grep -E "^[^\x00-\x7F]*[[:alpha:]][^\x00-\x7F]*$" | sort | uniq -c | sort -nr > word_freq.txt

# convert text to lowercase and replace non-alphabetic characters with spaces, then use awk to print bigrams
echo "calculate bigram frequency in the text"
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cs '[:alpha:]' ' ' | fold -w 1000 | sed -e 's/\b\s\+\b/ /g' | awk '{for(i=1;i<NF;i++)print $i,$(i+1)}' | sort | uniq -c > bigram_freq.txt

# count the occurrences of words by length
echo "calculate the number of occurrences of words by length"
bzcat "$1" | tr -cs '[:alnum:]' ' ' | tr ' ' '\n' | awk '{print length}' | sort -n | uniq -c > len_freq.txt
analysis/bigrams_freq.sh (new executable file, 4 lines)
@@ -0,0 +1,4 @@
#!/bin/bash

# convert text to lowercase and replace non-alphabetic characters with spaces, then use awk to print bigrams
bzcat "$1" | tr '[:upper:]' '[:lower:]' | tr -cs '[:alpha:]' ' ' | fold -w 1000 | sed -e 's/\b\s\+\b/ /g' | awk '{for(i=1;i<NF;i++)print $i,$(i+1)}' | sort | uniq -c > bigram_freq.txt
analysis/len_freq.sh (new file, 3 lines)
@@ -0,0 +1,3 @@
#!/bin/bash
# count the occurrences of words by length
bzcat "$1" | tr -cs '[:alnum:]' ' ' | tr ' ' '\n' | awk '{print length}' | sort -n | uniq -c > len_freq.txt
analysis/word_freq.sh (new executable file, 3 lines)
@@ -0,0 +1,3 @@
#!/bin/bash
# count the occurrences of each word
bzcat "$1" | tr -s '[:punct:][:space:]' '\n' | grep -E "^[^\x00-\x7F]*[[:alpha:]][^\x00-\x7F]*$" | sort | uniq -c | sort -nr > word_freq.txt
corpus/wiki.sh (new executable file, 17 lines)
@@ -0,0 +1,17 @@
#!/bin/bash

# Download the latest Wikipedia dump file
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2

# Extract the article text from the dump file using standard Unix text tools
bzcat enwiki-latest-pages-articles.xml.bz2 | \
    sed -n '/<text/,/<\/text>/p' | \
    sed -e 's/<[^>]*>//g' | \
    sed '/^{{/d; /^$/d; /\t{{/d; s/|/ /g; s/=/ /g; /&.*;/d' | \
    sed '/^\s*$/d; /^$/d' | \
    sed 's/^[[:space:]]*//' | \
    tr -d "'[]{}" \
    > enwiki-latest-corpus.txt

# Clean up
rm enwiki-latest-pages-articles.xml.bz2
figures/zipf-bigrams.png (new binary file, 19 KiB; not shown)
figures/zipf-words.png (new binary file, 21 KiB; not shown)
readme.md (71 lines changed)
@@ -2,6 +2,77 @@
based on Wikipedia EN Latest Articles -- enwiki-latest-pages-articles.xml.bz2
created with the wiki.sh script

---

# Task 2 - advanced analysis

## Zipf's law
![Zipf's law for words](figures/zipf-words.png)

## Zipf's law for bigrams
![Zipf's law for bigrams](figures/zipf-bigrams.png)

*For this corpus, Zipf's law does not apply to bigrams*
## Words that disturb Zipf's law the most

### Short words with low frequency
*These are mostly words from other languages and alphabets*
- 1 - 'min': {'word': '𝜑', 'count': 1}
- 2 - 'min': {'word': '𝘰𝘧', 'count': 1}
- 3 - 'min': {'word': '𝘢𝘯𝘥', 'count': 1}
- 4 - 'min': {'word': '𝔷𝔴𝔢𝔶', 'count': 1}
- 5 - 'min': {'word': '𝔳𝔞𝔱𝔢𝔯', 'count': 1}
- 6 - 'min': {'word': '𝕿𝖚𝖗𝖙𝖑𝖊', 'count': 1}
- 7 - 'min': {'word': '𝘕𝘢𝘯𝘺𝘢𝘯𝘨', 'count': 1}
- 8 - 'min': {'word': '𝘙𝘦𝘱𝘶𝘣𝘭𝘪𝘤', 'count': 1}
- 9 - 'min': {'word': '𝔲𝔣𝔤𝔢𝔭𝔬𝔥𝔰𝔱', 'count': 1}
- 10 - 'min': {'word': '𝔱𝔯𝔦𝔠𝔨𝔩𝔦𝔰𝔠𝔥', 'count': 1}
- 11 - 'min': {'word': '𝔤𝔢𝔰𝔠𝔥𝔪𝔦𝔰𝔷𝔢𝔫', 'count': 1}
- 12 - 'min': {'word': '𝔯𝔢𝔠𝔥𝔱𝔰𝔠𝔥𝔞𝔣𝔣𝔢', 'count': 1}
### Long words with high frequency
*Just weird words used multiple times*
- 41 - 'max': {'word': 'ConductionsystemoftheheartwithouttheHeart', 'count': 8}
- 42 - 'max': {'word': 'RightHumanPosteriorDistalRadiusUlnaCarpals', 'count': 6}
- 43 - 'max': {'word': 'ayantheObituariesOfEminentMenByIbnKhallikan', 'count': 15}
- 44 - 'max': {'word': 'MujahidinAnHistoricalWorkInTheArabicLanguage', 'count': 18}
- 45 - 'max': {'word': 'Pneumonoultramicroscopicsilicovolcanoconiosis', 'count': 12}
- 66 - 'max': {'word': 'SyrianGentlemanAndWarriorInThePeriodOfTheCrusadesMemoirsOfUsamaIbn', 'count': 3}
- 67 - 'max': {'word': 'GayLiberationAndSocialismDocumentsFromTheDiscussionsOnGayLiberation', 'count': 3}
- 70 - 'max': {'word': 'FieldMarshallVonMackensenAndCrownPrinceBorisReviewingBulgarianRegiment', 'count': 2}
- 72 - 'max': {'word': 'ExploringSharedHistoryPreservingSharedHeritagePenangsLinksToASiamesePast', 'count': 2}
- 76 - 'max': {'word': 'IsuPemilikanWilayahPantaiTimurSabahSatuPenelusurandaripadaSudutSumberSejarah', 'count': 2}
- 79 - 'max': {'word': 'Donaudampfschiffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft', 'count': 2}
- 88 - 'max': {'word': 'AhmadIbnMuhammadThalabiAraisAlMajalisFiQisasAlAnbiyaLivesOfTheProphetsLivesOfTheProphets', 'count': 2}
- 94 - 'max': {'word': 'Llanshyfairshypwllshygwynshygyllshygogeryshychwyrnshydrobshywlllshylantyshysilioshygogoshygoch', 'count': 2}
- 100 - 'max': {'word': 'Bababadalgharaghtakamminarronnkonnbronntonnerronntuonnthunntrovarrhounawnskawntoohoohoordenenthurnuk', 'count': 2}
## Custom metrics

### How complex were sentences - commas to dots ratio

22120137 / 24536140 ~= 0.9015328817

*This indicates that fewer than 10% of sentences were highly complex*

### Words used in a single paragraph - white spaces to new lines ratio

500903582 / 60208160 ~= 8.31952981124

*We can see that, on average, a paragraph has barely over 8 words, which indicates that the corpus has a lot of short and simple paragraphs*
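Both ratios can be recomputed directly from the counts printed by *advanced.sh*; a minimal sketch with `bc`, using the counts reported above:

```bash
# commas to dots ratio - a rough proxy for sentence complexity
echo "scale=10; 22120137 / 24536140" | bc    # ~= .9015328817

# white spaces to new lines ratio - average words per paragraph
echo "scale=11; 500903582 / 60208160" | bc   # ~= 8.31952981124
```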
## Code

Data for the above analysis were gathered with the *advanced.sh* script and processed with the *advanced.py* script, both in the *analysis* folder; a sketch of the full run is below.
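The end-to-end invocation is roughly the following (a sketch, not a definitive recipe: recompressing the corpus with bzip2 before feeding it to *advanced.sh* is an assumption, since the analysis scripts read their input through `bzcat`, and the scripts are run from inside *analysis*, where *advanced.py* expects its input files and writes figures to *../figures*):

```bash
# Build the corpus, then compress it so the analysis scripts can bzcat it
# (assumption: wiki.sh output is recompressed before analysis).
(cd corpus && ./wiki.sh && bzip2 enwiki-latest-corpus.txt)

# Gather the counts and frequency tables, then draw the figures.
cd analysis
./advanced.sh ../corpus/enwiki-latest-corpus.txt.bz2
python3 advanced.py
```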
---
# Task 1 - corpus extraction & preparation

## Code
- include only article text between `<text>` tags (see the sketch below)
- exclude lines without alphabetic characters
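As a toy illustration of the `<text>` extraction performed by *wiki.sh* (hypothetical input, not real dump data):

```bash
# Keep only the lines between <text> and </text>, then strip the tags.
printf '<page>\n  <text>Hello, world.\nSecond line.</text>\n</page>\n' \
    | sed -n '/<text/,/<\/text>/p' \
    | sed -e 's/<[^>]*>//g'
# Output (later stages in wiki.sh strip the leading whitespace):
#   Hello, world.
# Second line.
```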