statistics script

2023-03-22 04:32:34 +01:00 · 2023-03-22 04:32:34 +01:00 · fb82cdfc04
commit fb82cdfc04
parent 893f4409e5
4 changed files with 201 additions and 1 deletions
--- a/Lab1-PrepareCorpora/README.md
+++ b/Lab1-PrepareCorpora/README.md
--- a/Lab1-PrepareCorpora/clean.py
+++ b/Lab1-PrepareCorpora/clean.py
@ -27,6 +27,7 @@ def clean_with_regex(text):
    return []
  out = list(filter(lambda item: filter_line(item), out))
  out = list(map(lambda item: re.sub("(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", "", item), out))
  out = list(map(lambda item: re.sub("[^\w\d\s\\\)\(\/-]", "", item), out))
  if out:
    out.pop(len(out)-1)
  return out
@ -41,7 +42,7 @@ def print_text(text, sort=False):
 def save_to_file(paragraph_list, file_name):
  with open(file_name, 'a') as f:
    for line in paragraph_list:
-      f.write("%s\n" % line.strip())
+      f.write("%s\n" % line.strip().lower())
    f.close()
--- a/Lab2/README.md
+++ b/Lab2/README.md
@ -0,0 +1,46 @@
 # Statystyki  
 ## Statystyki podstawowe
 ### 10 najdłuższych słów
 ```
 MarineStrategyFrameworkDirectiveClassificationValue
 OtherFinancialProfessionalAndInformationServices
 GuineaPeruPhilippinesQatarRomaniaRussiaRwandaSao
 MarineStrategyFrameworkDirectiveClassificationValue
 AustraliaArgentinaBotswanaBrazilChileNamibiaNew
 ManufacturingOfElectricalAndOpticalEquipment
 ClassificationAndQuantificationFrameworkValue
 FinancialProfessionalAndInformationServices
 measuredIndicatedAndInferredMineralResource
 AnthropogenicGeomorphologicFeatureTypeValue
 ```
 ### Prawo Zipfa dla słów
 ![title](images/zipf-law-words.png)
 ### Prawo Zipfa dla trigramów ze słów
 ![title](images/zipf-law-3grams.png)
 ### Słowa łamiące prawo łączące długość z częstością
 - aunt (4 znaki, 31 wystąpień)
 - cave (4 znaki, 31 wystąpień)
 - amateur (7 znaków, 31 wystąpień)
 - CommissionFranz (15 znaków, 2090 wystąpień)
 - responsibilities (16 znaków, 2087 wystąpień)
 - Interventionsstelle (19 znaków, 231 wystąpień)
 - hydrogenorthophosphate (22 znaki, 148 wystąpień)
 - polytetrafluoroethylene (23 znaki, 148 wystąpień)  
 ### Częstotliwość zaimków
 ![title](images/pt-pronouns.png)
 ### Ilość wystąpień dat (lata)
 `['1999', '1975', '1987', '1992', '1985', '1981', '1988', '1986', '1995', '1991', '1993', '1990', '1994', '1983', '1989'...`
 ![title](images/pt-years.png)
--- a/Lab2/statistics.py
+++ b/Lab2/statistics.py
@ -0,0 +1,153 @@
 import matplotlib.pyplot as plt
 from collections import Counter
 from collections import OrderedDict
 import regex as re
 from math import log
 # Location of the merged corpus, relative to the directory the script is run from.
 file_path = "Lab1/out-merged.txt"
 # Read the whole corpus into memory; the statistics below tokenise this string.
 with open(file_path, 'r') as file:
     file_content = file.read()
 # Debugging aid: uncomment to work on a short prefix of the corpus only.
 # file_content = file_content[:100]
 def get_characters(t):
     """Yield the elements of ``t`` one by one (single characters when ``t`` is a string)."""
     for character in t:
         yield character
 def freq_list(g, top=None):
     """Count the items of ``g`` and return them ordered by falling frequency.

     Args:
         g: Iterable of hashable items to count.
         top: When given, keep only the ``top`` most common items; ``None`` keeps all.

     Returns:
         An ``OrderedDict`` mapping item -> count, most frequent first. Items with
         equal counts keep the relative order in which they were handed to the sort.
     """
     counts = Counter(g)
     pairs = counts.items() if top is None else counts.most_common(top)
     # A reversed sort is still stable, so ties are ordered exactly as before.
     ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
     return OrderedDict(ranked)
 def get_words(t):
     """Yield every maximal run of letters, ASCII digits and ``*`` found in ``t``.

     The Unicode letter class used in the pattern is a feature of the third-party
     ``regex`` module, which this file imports under the name ``re``.
     """
     word_pattern = r'[\p{L}0-9\*]+'
     yield from (match.group(0) for match in re.finditer(word_pattern, t))
 def rang_freq_with_labels(name, g, top=None):
     """Save a labelled bar chart of item frequencies and return the image path.

     Args:
         name: File name stem; the chart is written to ``Lab2/images/<name>.png``.
         g: Iterable of items to count.
         top: Optional limit on the number of most frequent items that are plotted.

     Returns:
         The path the figure was saved to.
     """
     counts = freq_list(g, top)
     fig = plt.figure(figsize=(12, 3))
     try:
         plt.ylabel('liczba wystąpień')
         # Materialise the dict views so matplotlib receives plain sequences.
         plt.bar(list(counts.keys()), list(counts.values()))
         fname = f'Lab2/images/{name}.png'
         plt.savefig(fname)
     finally:
         # pyplot keeps every figure alive until it is closed explicitly; release it
         # so repeated calls do not accumulate open figures.
         plt.close(fig)
     return fname
 def log_rang_log_freq(name, g):
     """Save a log(rank) versus log(frequency) line plot and return the image path.

     Items of ``g`` are counted and ranked by falling frequency (rank 1 is the most
     frequent item); both axes use the natural logarithm.

     Args:
         name: File name stem; the plot is written to ``Lab2/images/<name>.png``.
         g: Iterable of items to count.

     Returns:
         The path the figure was saved to.
     """
     counts = freq_list(g)
     log_ranks = [log(rank) for rank in range(1, len(counts) + 1)]
     log_freqs = [log(count) for count in counts.values()]
     # A freshly created figure is already empty, so no explicit clear() is needed.
     fig = plt.figure()
     try:
         plt.plot(log_ranks, log_freqs)
         fname = f'Lab2/images/{name}.png'
         plt.savefig(fname)
     finally:
         # Close the figure so it is not kept alive by pyplot after saving.
         plt.close(fig)
     return fname
 def ngrams(iter, size):
     """Yield every run of ``size`` consecutive items of ``iter`` as a tuple.

     Inputs shorter than ``size`` produce nothing.
     """
     window = []
     for element in iter:
         window.append(element)
         if len(window) != size:
             continue
         yield tuple(window)
         # Slide the window forward by dropping its oldest element.
         del window[0]
 def get_ngrams(t, size):
     """Yield character n-grams of length ``size`` taken inside each word of ``t``.

     Every word returned by ``get_words`` is windowed on its own, so an n-gram
     never spans two words.
     """
     for token in get_words(t):
         yield from ngrams(token, size)
 def get_w_freq_by_w_len(word_len):
     """Yield ``(count, word)`` for each word of length ``word_len``.

     Words and counts are read from the module-level ``freq`` mapping, which must
     be defined before the generator is consumed.
     """
     matching = (
         (count, word)
         for word, count in freq.items()
         if len(word) == word_len
     )
     yield from matching
 def get_average_freq_by_w_len(word_lenghts):
     """Map each word length to the mean occurrence count of the words of that length.

     Lengths for which ``get_w_freq_by_w_len`` yields nothing are left out of the
     returned dict.
     """
     averages = {}
     for length in word_lenghts:
         counts = [count for count, _word in get_w_freq_by_w_len(length)]
         if not counts:
             continue
         averages[length] = sum(counts) / len(counts)
     return averages
 def get_low_high_freq_by_w_len(word_lenghts):
     """Collect the least and most frequent words for every word length.

     Words that contain a digit, or that occur 30 times or fewer, are ignored.

     Args:
         word_lenghts: Iterable of word lengths to report on. Each length must be a
             key of the module-level ``average_freq`` mapping.

     Returns:
         A list with one dict per length holding ``word_len``, ``average_freq``
         (looked up in the module-level ``average_freq``), ``low_freq`` (up to 10
         least frequent ``(count, word)`` pairs) and ``high_freq`` (up to 10 most
         frequent pairs). Both lists are sorted in ascending order.
     """
     results = []
     for length in word_lenghts:
         # Raw-string pattern: "\d" in a plain literal is an invalid escape sequence.
         word_freq = sorted(
             (count, word)
             for count, word in get_w_freq_by_w_len(length)
             if count > 30 and not re.search(r"\d", str(word))
         )
         results.append({
             'word_len': length,
             'average_freq': average_freq[length],
             'low_freq': word_freq[:10],
             'high_freq': word_freq[-10:],
         })
     return results
 def get_pronouns_stats(freqs):
     """Plot how often selected English pronouns occur and return their counts.

     The bar chart is written to ``Lab2/images/pt-pronouns.png``.

     Args:
         freqs: Mapping word -> number of occurrences.

     Returns:
         A list of ``(word, count)`` pairs for the pronouns present in ``freqs``,
         in the iteration order of ``freqs``.
     """
     # Matching is case-sensitive, so only lower-case forms are found.
     # NOTE(review): presumably the corpus is lower-cased upstream - confirm.
     pronouns = ["i", "you", "he", "she", "it"]
     pronoun_words_freq = [item for item in freqs.items() if item[0] in pronouns]
     words = [word for word, _count in pronoun_words_freq]
     counts = [count for _word, count in pronoun_words_freq]
     fig = plt.figure(figsize=(12, 3))
     try:
         plt.ylabel('liczba wystąpień')
         plt.bar(words, counts)
         plt.savefig("Lab2/images/pt-pronouns.png")
     finally:
         # Close the figure so pyplot does not keep it alive after saving.
         plt.close(fig)
     return pronoun_words_freq
 def get_years_stats(freqs):
     """Plot how often year-like tokens occur and return their counts.

     A token qualifies when it contains a standalone four-digit number whose first
     digit is 1. The bar chart is written to ``Lab2/images/pt-years.png``.

     Args:
         freqs: Mapping word -> number of occurrences.

     Returns:
         A list of ``(word, count)`` pairs for the qualifying tokens, in the
         iteration order of ``freqs``.
     """
     years_word_freq = [
         item for item in freqs.items()
         if re.search(r"\b1{1}[0-9]{3}\b", item[0])
     ]
     years = [word for word, _count in years_word_freq]
     counts = [count for _word, count in years_word_freq]
     fig = plt.figure(figsize=(12, 3))
     try:
         plt.ylabel('liczba wystąpień')
         plt.bar(years, counts)
         plt.savefig("Lab2/images/pt-years.png")
     finally:
         # Close the figure so pyplot does not keep it alive after saving.
         plt.close(fig)
     return years_word_freq
 print("Generating statistics...")
 # 20 most frequent words in the text
 rang_freq_with_labels('most-freq-words-20', get_words(file_content), top=20)
 # Zipf's law for words
 log_rang_log_freq('zipf-law-words', get_words(file_content))
 # Zipf's law for 3-grams; the image name matches the n-gram size that is used
 log_rang_log_freq('zipf-law-3grams', get_ngrams(file_content, 3))
 # Words whose frequency is unusual for their length.
 # `freq` and `average_freq` must stay module-level: the helper functions read them.
 freq = freq_list(get_words(file_content))
 lenghts = [*{len(word) for word in freq}]
 average_freq = get_average_freq_by_w_len(lenghts)
 # Kept in a variable for inspection; the script itself does not write it anywhere.
 low_high_freq = get_low_high_freq_by_w_len(lenghts)
 # Frequency of pronouns
 get_pronouns_stats(freq)
 # Occurrences of year-like tokens
 get_years_stats(freq)
 # Report completion only after every statistic has been generated.
 print("Done")