153 lines
3.8 KiB
Python
153 lines
3.8 KiB
Python
|
import matplotlib.pyplot as plt
|
||
|
from collections import Counter
|
||
|
from collections import OrderedDict
|
||
|
import regex as re
|
||
|
from math import log
|
||
|
|
||
|
file_path = "Lab1/out-merged.txt"
file_content = None

# Read the whole corpus into memory.
# Fix: explicit encoding — the corpus contains non-ASCII (Polish) text, and
# the previous bare open() used the platform-default encoding, which mangles
# it on e.g. Windows (cp1250).
with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

# file_content = file_content[:100]
|
||
|
|
||
|
def get_characters(t):
    """Yield each character of *t*, one at a time, in order."""
    for ch in t:
        yield ch
|
||
|
|
||
|
def freq_list(g, top=None):
    """Return an OrderedDict mapping item -> count, sorted by descending count.

    Parameters
    ----------
    g : iterable of hashable items to count.
    top : optional int — when given, keep only the *top* most common items.

    Counter.most_common(top) already covers both cases: with top=None it
    returns every item sorted by descending count (ties keep first-seen
    order, matching the stable sort the original applied by hand), so the
    explicit branch and re-sort were redundant.
    """
    return OrderedDict(Counter(g).most_common(top))
|
||
|
|
||
|
def get_words(t):
    """Yield word tokens from *t*: maximal runs of Unicode letters,
    digits, or '*'.

    NOTE: relies on the third-party ``regex`` module (imported as ``re``)
    for the ``\\p{L}`` Unicode-letter class; stdlib ``re`` does not
    support it.
    """
    for match in re.finditer(r'[\p{L}0-9\*]+', t):
        yield match[0]
|
||
|
|
||
|
def rang_freq_with_labels(name, g, top=None):
    """Draw a bar chart of item frequencies and save it as a PNG.

    Parameters
    ----------
    name : str — basename of the output image (saved under Lab2/images/).
    g : iterable of items to count (e.g. the get_words() generator).
    top : optional int — plot only the *top* most frequent items.

    Returns
    -------
    str — path of the saved PNG file.
    """
    freq = freq_list(g, top)

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(freq.keys(), freq.values())

    fname = f'Lab2/images/{name}.png'
    plt.savefig(fname)
    # Fix: close the figure — previously every call leaked an open figure,
    # which matplotlib warns about and which grows memory across calls.
    plt.close()

    return fname
|
||
|
|
||
|
def log_rang_log_freq(name, g):
    """Plot log(rank) vs log(frequency) (a Zipf plot) and save it as a PNG.

    Parameters
    ----------
    name : str — basename of the output image (saved under Lab2/images/).
    g : iterable of items whose frequency distribution is plotted.

    Returns
    -------
    str — path of the saved PNG file.
    """
    freq = freq_list(g)

    # Fix: was plt.figure().clear() — clearing a freshly created (empty)
    # figure is a no-op; plt.figure() alone gives the same clean canvas.
    plt.figure()
    ranks = range(1, len(freq) + 1)
    plt.plot([log(r) for r in ranks], [log(c) for c in freq.values()])

    fname = f'Lab2/images/{name}.png'
    plt.savefig(fname)
    # Fix: release the figure so repeated calls do not accumulate
    # open matplotlib figures.
    plt.close()

    return fname
|
||
|
|
||
|
def ngrams(iterable, size):
    """Yield successive overlapping tuples of length *size* (sliding window).

    Parameters
    ----------
    iterable : any iterable (renamed from ``iter``, which shadowed the
        builtin; the in-file caller passes it positionally).
    size : int — window length; inputs shorter than *size* yield nothing.
    """
    window = []
    for item in iterable:
        window.append(item)
        if len(window) == size:
            yield tuple(window)
            # Slide by one; O(size) per step, fine for small n-grams.
            del window[0]
|
||
|
|
||
|
def get_ngrams(t, size):
    """Yield every character *size*-gram of every word token found in *t*.

    Tokenization is delegated to get_words(); windowing to ngrams().
    """
    for token in get_words(t):
        yield from ngrams(token, size)
|
||
|
|
||
|
def get_w_freq_by_w_len(word_len):
    """Yield (count, word) pairs for words of exactly *word_len* characters.

    NOTE(review): reads the module-level ``freq`` dict built in the script
    body — must be called after ``freq`` is assigned.
    """
    for token, occurrences in freq.items():
        if len(token) != word_len:
            continue
        yield (occurrences, token)
|
||
|
|
||
|
def get_average_freq_by_w_len(word_lenghts):
    """Map each word length to the mean frequency of words of that length.

    Parameters
    ----------
    word_lenghts : iterable of int word lengths (parameter name kept
        as-is, typo included, for caller compatibility).

    Lengths with no matching words are omitted from the result dict.
    """
    averages = dict()
    for length in word_lenghts:
        counts = [count for count, _ in get_w_freq_by_w_len(length)]
        if not counts:
            continue
        averages[length] = sum(counts) / len(counts)

    return averages
|
||
|
|
||
|
def get_low_high_freq_by_w_len(word_lenghts):
    """
    For each word length, return the 10 lowest- and 10 highest-frequency
    words (restricted to digit-free words with more than 30 occurrences),
    together with the average frequency for that length.

    NOTE(review): the original docstring said "top 5" while the code
    slices 10 entries; this documents what the code actually does.
    Depends on the module-level ``freq`` (via get_w_freq_by_w_len) and
    ``average_freq`` globals — raises KeyError if a requested length is
    missing from ``average_freq``.
    """
    results = []
    for l in word_lenghts:
        # Sorting (count, word) tuples orders primarily by count.
        word_freq = sorted(get_w_freq_by_w_len(l))
        # Fix: raw string for the regex (plain "\d" is an invalid string
        # escape), and `not re.search(...)` instead of `findall(...) == []`.
        word_freq = [t for t in word_freq
                     if not re.search(r"\d", str(t[1])) and t[0] > 30]
        word_stats = {
            'word_len': l,
            'average_freq': average_freq[l],
            'low_freq': word_freq[:10],
            'high_freq': word_freq[-10:],
        }
        results.append(word_stats)
    return results
|
||
|
|
||
|
def get_pronouns_stats(freqs):
    """Bar-chart the frequencies of basic English pronouns found in *freqs*.

    Parameters
    ----------
    freqs : mapping of word -> count.

    Returns
    -------
    list of (pronoun, count) pairs that were plotted; the chart is saved
    to Lab2/images/pt-pronouns.png.
    """
    pronouns = ["i", "you", "he", "she", "it"]
    pronoun_words_freq = [f for f in freqs.items() if f[0] in pronouns]

    x = [f[0] for f in pronoun_words_freq]
    y = [f[1] for f in pronoun_words_freq]

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(x, y)
    plt.savefig("Lab2/images/pt-pronouns.png")
    # Fix: close the figure — it was previously left open (leak).
    plt.close()

    return pronoun_words_freq
|
||
|
|
||
|
def get_years_stats(freqs):
    """Bar-chart the frequencies of tokens containing a year-like number
    1000–1999.

    Parameters
    ----------
    freqs : mapping of word -> count.

    Returns
    -------
    list of (token, count) pairs that were plotted; the chart is saved
    to Lab2/images/pt-years.png.
    """
    # Fix: dropped the redundant {1} quantifier (r"1{1}" == r"1");
    # pattern behavior is unchanged.
    years_word_freq = [f for f in freqs.items()
                      if re.findall(r"\b1[0-9]{3}\b", f[0])]
    x = [f[0] for f in years_word_freq]
    y = [f[1] for f in years_word_freq]

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(x, y)
    plt.savefig("Lab2/images/pt-years.png")
    # Fix: close the figure — it was previously left open (leak).
    plt.close()

    return years_word_freq
|
||
|
|
||
|
print("Generating statistics...")

# 20 most frequent words in the text
# (the original comment said 10; the call uses top=20)
rang_freq_with_labels('most-freq-words-20', get_words(file_content), top=20)

# Zipf's law
log_rang_log_freq('zipf-law-words', get_words(file_content))

# Zipf's law for 3-grams
# NOTE(review): the output name says "2grams" while size=3 is passed —
# the string is kept verbatim so existing artifacts keep their paths;
# confirm which was intended.
log_rang_log_freq('zipf-law-2grams', get_ngrams(file_content, 3))

# Words breaking Zipf's law.
# These three globals feed get_w_freq_by_w_len / get_low_high_freq_by_w_len,
# so the names `freq` and `average_freq` must stay as-is.
freq = freq_list(get_words(file_content))
lenghts = [*set(len(word) for word in freq)]
average_freq = get_average_freq_by_w_len(lenghts)
get_low_high_freq_by_w_len(lenghts)

# Frequency of pronouns
get_pronouns_stats(freq)

print("Done")

# Number of years in words
get_years_stats(freq)
|