From fb82cdfc04aa5c8b05ece76baad6c0957046f71c Mon Sep 17 00:00:00 2001
From: Adam Wojdyla <adam.wojdyla@outlook.com>
Date: Wed, 22 Mar 2023 04:32:34 +0100
Subject: [PATCH] statistics script

---
 {Lab1-PrepareCorpora => Lab1}/README.md |   0
 {Lab1-PrepareCorpora => Lab1}/clean.py  |   3 +-
 Lab2/README.md                          |  46 +++++++
 Lab2/statistics.py                      | 153 ++++++++++++++++++++++++
 4 files changed, 201 insertions(+), 1 deletion(-)
 rename {Lab1-PrepareCorpora => Lab1}/README.md (100%)
 rename {Lab1-PrepareCorpora => Lab1}/clean.py (93%)
 create mode 100644 Lab2/README.md
 create mode 100644 Lab2/statistics.py

diff --git a/Lab1-PrepareCorpora/README.md b/Lab1/README.md
similarity index 100%
rename from Lab1-PrepareCorpora/README.md
rename to Lab1/README.md
diff --git a/Lab1-PrepareCorpora/clean.py b/Lab1/clean.py
similarity index 93%
rename from Lab1-PrepareCorpora/clean.py
rename to Lab1/clean.py
index 3157959..a64b594 100644
--- a/Lab1-PrepareCorpora/clean.py
+++ b/Lab1/clean.py
@@ -27,6 +27,7 @@ def clean_with_regex(text):
     return []
   out = list(filter(lambda item: filter_line(item), out))
   out = list(map(lambda item: re.sub("(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", "", item), out))
+  out = list(map(lambda item: re.sub("[^\w\d\s\\\)\(\/-]", "", item), out))
   if out:
     out.pop(len(out)-1)
   return out
@@ -41,7 +42,7 @@ def print_text(text, sort=False):
 def save_to_file(paragraph_list, file_name):
   with open(file_name, 'a') as f:
     for line in paragraph_list:
-      f.write("%s\n" % line.strip())
+      f.write("%s\n" % line.strip().lower())
     f.close()
 
 
diff --git a/Lab2/README.md b/Lab2/README.md
new file mode 100644
index 0000000..5c72445
--- /dev/null
+++ b/Lab2/README.md
@@ -0,0 +1,46 @@
+# Statystyki  
+
+## Statystyki podstawowe
+
+### 10 najdłuższych słów
+
+```
+MarineStrategyFrameworkDirectiveClassificationValue
+OtherFinancialProfessionalAndInformationServices
+GuineaPeruPhilippinesQatarRomaniaRussiaRwandaSao
+MarineStrategyFrameworkDirectiveClassificationValue
+AustraliaArgentinaBotswanaBrazilChileNamibiaNew
+ManufacturingOfElectricalAndOpticalEquipment
+ClassificationAndQuantificationFrameworkValue
+FinancialProfessionalAndInformationServices
+measuredIndicatedAndInferredMineralResource
+AnthropogenicGeomorphologicFeatureTypeValue
+```
+
+### Prawo Zipfa dla słów
+
+![title](images/zipf-law-words.png)
+
+### Prawo Zipfa dla trigramów ze słów
+
+![title](images/zipf-law-3grams.png)
+
+### Słowa łamiące prawo łączące długość z częstością
+
+- aunt (4 znaki, 31 wystąpień)
+- cave (4 znaki, 31 wystąpień)
+- amateur (7 znaków, 31 wystąpień)
+- CommissionFranz (15 znaków, 2090 wystąpień)
+- responsibilities (16 znaków, 2087 wystąpień)
+- Interventionsstelle (19 znaków, 231 wystąpień)
+- hydrogenorthophosphate (22 znaki, 148 wystąpień)
+- polytetrafluoroethylene (23 znaki, 148 wystąpień)
+
+### Częstotliwość zaimków
+
+![title](images/pt-pronouns.png)
+
+### Ilość wystąpień dat (lata)
+
+`['1999', '1975', '1987', '1992', '1985', '1981', '1988', '1986', '1995', '1991', '1993', '1990', '1994', '1983', '1989'...`
+![title](images/pt-years.png)
diff --git a/Lab2/statistics.py b/Lab2/statistics.py
new file mode 100644
index 0000000..70d5372
--- /dev/null
+++ b/Lab2/statistics.py
@@ -0,0 +1,153 @@
+import matplotlib.pyplot as plt
+from collections import Counter
+from collections import OrderedDict
+import regex as re
+from math import log
+
+# Path of the corpus to analyse, resolved against the current working directory.
+file_path = "Lab1/out-merged.txt"
+file_content = None
+
+# Load the entire corpus into memory; all statistics below reuse this string.
+with open(file_path, 'r') as file:
+  file_content = file.read()
+
+def get_characters(t):  # generator over the items of t (single characters when t is a str)
+    yield from t
+
+def freq_list(g, top=None):  # returns an item -> count mapping, most frequent item first
+  c = Counter(g)  # consumes the iterable g and counts every distinct item
+  # top=None keeps every item; otherwise only the `top` most common ones are kept.
+  if top is None:
+     items = c.items()
+  else:
+     items = c.most_common(top)
+  # Negating the count in the sort key gives descending order; OrderedDict preserves it.
+  return OrderedDict(sorted(items, key=lambda t: -t[1]))
+
+def get_words(t):  # generator over the tokens of string t; `re` here is the third-party `regex` module
+  for m in re.finditer(r'[\p{L}0-9\*]+', t):  # token = run of Unicode letters (\p{L}), digits 0-9 or '*'
+     yield m.group(0)
+
+def rang_freq_with_labels(name, g, top=None):  # bar chart of item frequencies, saved as a PNG
+   freq = freq_list(g, top)
+   # One bar per item, labelled with the item itself, most frequent first.
+   plt.figure(figsize=(12, 3))
+   plt.ylabel('liczba wystąpień')
+   plt.bar(freq.keys(), freq.values())
+   # The relative path is resolved against the current working directory.
+   fname = f'Lab2/images/{name}.png'
+   
+   plt.savefig(fname)
+   # Hand the path of the written image back to the caller.
+   return fname
+
+def log_rang_log_freq(name, g):  # log(rank) versus log(frequency) line plot, saved as a PNG
+   freq = freq_list(g)
+   # Start from a new, empty figure so the curve is not drawn over an earlier plot.
+   plt.figure().clear()
+   plt.plot([log(x) for x in range(1, len(freq.values())+1)], [log(y) for y in freq.values()])
+   # x = natural log of the 1-based rank, y = natural log of the count at that rank.
+   fname = f'Lab2/images/{name}.png'
+
+   plt.savefig(fname)
+   # Hand the path of the written image back to the caller.
+   return fname
+
+def ngrams(iter, size):  # generator: every run of `size` consecutive items, as a tuple (NOTE: `iter` shadows the builtin)
+    ngram = []  # sliding window holding the most recent items
+    for item in iter:
+       ngram.append(item)
+       if len(ngram) == size:
+          yield tuple(ngram)
+          ngram = ngram[1:]  # drop the oldest item so consecutive windows overlap by size-1 items
+
+def get_ngrams(t, size):  # generator: character n-grams taken inside each word of t
+   for word in get_words(t):  # windows restart for every word, so n-grams never span two words
+    for m in ngrams(word, size):  # word is a str, so each n-gram is a tuple of single characters
+        yield m
+
+def get_w_freq_by_w_len(word_len):  # generator: (count, word) for every word of exactly word_len characters
+    for word, count in freq.items():  # `freq` is the module-level frequency table assigned further down the script
+        if len(word) == word_len:
+            yield (count, word)  # count comes first, so sorting the pairs orders them by frequency
+            
+def get_average_freq_by_w_len(word_lenghts):  # maps word length -> mean count of the words of that length
+    results = dict()
+    for l in word_lenghts:
+        word_freq = list(get_w_freq_by_w_len(l))  # (count, word) pairs for this length
+        if len(word_freq) == 0:
+            continue  # no word of this length: skip it rather than divide by zero
+        average = sum([w[0] for w in word_freq]) / len(word_freq)
+        results[l] = average
+    # Lengths that matched no word have no entry in the result.
+    return results
+
+def get_low_high_freq_by_w_len(word_lenghts):
+    """
+    Returns top 5 most frequent and non frequent words for each word length + average frequency.
+    """
+    results = []
+    for l in word_lenghts:
+        word_freq = list(get_w_freq_by_w_len(l))
+        word_freq.sort()
+        word_freq = list(filter(lambda t: re.findall("\d",str(t[1])) == [] and t[0] > 30, word_freq))
+        word_stats = {
+            'word_len': l,
+            'average_freq': average_freq[l],
+            'low_freq': word_freq[:10],
+            'high_freq': word_freq[-10:]
+        }
+        results.append(word_stats)
+    return results
+
+def get_pronouns_stats(freqs):  # bar chart of pronoun counts; returns the (word, count) pairs that were plotted
+    pronouns = ["i", "you", "he", "she", "it"]  # compared verbatim, so only these lowercase forms match
+    pronoun_words_freq = [f for f in freqs.items() if f[0] in pronouns]
+    # Split the (word, count) pairs into bar labels and bar heights.
+    x = [f[0] for f in pronoun_words_freq]
+    y = [f[1] for f in pronoun_words_freq]
+    # Bars appear in the iteration order of freqs, not in the order of `pronouns`.
+    plt.figure(figsize=(12, 3))
+    plt.ylabel('liczba wystąpień')
+    plt.bar(x, y)
+    plt.savefig("Lab2/images/pt-pronouns.png")
+
+    return pronoun_words_freq
+
+def get_years_stats(freqs):  # bar chart of year-like tokens; returns the (token, count) pairs that were plotted
+    years_word_freq = [f for f in freqs.items() if re.findall(r"\b1{1}[0-9]{3}\b", f[0])]
+    x = [f[0] for f in years_word_freq]
+    y = [f[1] for f in years_word_freq]
+    # The pattern keeps keys containing a standalone 4-digit number that starts with 1 (1000-1999).
+    plt.figure(figsize=(12, 3))
+    plt.ylabel('liczba wystąpień')
+    plt.bar(x, y)
+    plt.savefig("Lab2/images/pt-years.png")
+
+    return years_word_freq
+
+print("Generating statistics...")
+
+# 10 most frequent words in the text
+rang_freq_with_labels('most-freq-words-20', get_words(file_content), top=20)
+
+# Zipf's law
+log_rang_log_freq('zipf-law-words', get_words(file_content))
+
+# Zipf's law for 3-grams
+log_rang_log_freq('zipf-law-2grams', get_ngrams(file_content, 3))
+
+# Words breaking the Zipf's law
+freq = freq_list(get_words(file_content))
+lenghts = [*set(len(f[0]) for f in freq.items())]
+average_freq = get_average_freq_by_w_len(lenghts)
+get_low_high_freq_by_w_len(lenghts)
+
+# Frequency of pronouns
+get_pronouns_stats(freq)
+
+print("Done")
+
+# Number of years in words
+get_years_stats(freq)
\ No newline at end of file