178 lines
4.8 KiB
Python
178 lines
4.8 KiB
Python
import matplotlib.pyplot as plt
|
|
from collections import Counter
|
|
from collections import OrderedDict
|
|
import regex as re
|
|
from math import log
|
|
import argparse
|
|
import os
|
|
|
|
# Command-line interface: optional --filepath pointing at the corpus to analyse.
parser = argparse.ArgumentParser()
parser.add_argument("--filepath")
args = parser.parse_args()

# Fall back to the lab's merged corpus when no --filepath is given.
FILE_PATH = "Lab1/out-merged.txt" if args.filepath is None else args.filepath
# All plots are written next to this script, under ./images.
IMAGES_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "images")

file_content = None

# BUG FIX: the explicit file.close() inside the `with` block was redundant —
# the context manager already closes the file on exit. Also pin the text
# encoding so results don't depend on the platform's default codec.
with open(FILE_PATH, 'r', encoding='utf-8') as file:
    file_content = file.read()
|
|
|
|
# file_content = file_content[:10000000]
|
|
|
|
def get_characters(t):
    """Yield the characters of text *t* one at a time."""
    for ch in t:
        yield ch
|
|
|
|
def freq_list(g, top=None):
    """Count the items of iterable *g* and return an OrderedDict mapping
    item -> count, ordered from most to least frequent.

    When *top* is given, only the *top* most common items are kept.
    """
    counts = Counter(g)
    if top is None:
        # Stable descending sort: ties keep their first-seen order,
        # exactly like Counter.most_common().
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    else:
        ranked = counts.most_common(top)
    return OrderedDict(ranked)
|
|
|
|
def get_words(t):
    """Yield each word-like token of *t*: maximal runs of Unicode letters
    (``\\p{L}``), ASCII digits, or ``*``.

    NOTE(review): relies on the third-party ``regex`` module (imported as
    ``re``) for the ``\\p{L}`` property class.
    """
    token_pattern = r'[\p{L}0-9\*]+'
    for match in re.finditer(token_pattern, t):
        yield match.group(0)
|
|
|
|
def rang_freq_with_labels(name, g, top=None):
    """Save a labelled rank/frequency bar chart for iterable *g*.

    Counts the items of *g* (optionally keeping only the *top* most common),
    draws a bar per item, writes ``<IMAGES_PATH>/<name>.png`` and returns the
    relative ``/<name>.png`` file name.
    """
    frequencies = freq_list(g, top)

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(frequencies.keys(), frequencies.values())

    out_name = '/' + name + '.png'
    plt.savefig(IMAGES_PATH + out_name)
    return out_name
|
|
|
|
def log_rang_log_freq(name, g):
    """Save a log(rank) vs log(frequency) line plot (Zipf-style) for *g*.

    Writes ``<IMAGES_PATH>/<name>.png`` and returns the relative
    ``/<name>.png`` file name.
    """
    counts = freq_list(g)

    # Ranks are 1-based so log() is defined for the first point.
    log_ranks = [log(rank) for rank in range(1, len(counts) + 1)]
    log_counts = [log(count) for count in counts.values()]

    plt.figure().clear()
    plt.plot(log_ranks, log_counts)

    out_name = '/' + name + '.png'
    plt.savefig(IMAGES_PATH + out_name)
    return out_name
|
|
|
|
def ngrams(iter, size):
    """Yield consecutive overlapping n-grams of length *size* from *iter*,
    as tuples. Yields nothing when *iter* has fewer than *size* items or
    when *size* is not positive.

    NOTE(review): the parameter name ``iter`` shadows the builtin; it is
    kept unchanged for backward compatibility with keyword callers.
    """
    # Local import: the file's top-level imports only bring in
    # Counter/OrderedDict from collections.
    from collections import deque

    if size <= 0:
        # The original yielded nothing for non-positive sizes while
        # silently buffering the whole input; keep the contract, drop
        # the unbounded buffering.
        return
    # PERF FIX: deque(maxlen=size) slides the window in O(1) instead of
    # the original O(size) list re-slice (ngram = ngram[1:]) per item.
    window = deque(maxlen=size)
    for item in iter:
        window.append(item)
        if len(window) == size:
            yield tuple(window)
|
|
|
|
def get_ngrams(t, size):
    """Yield every character n-gram of length *size* from the words of *t*.

    N-grams are taken per word (via get_words), so they never span a
    word boundary.
    """
    for token in get_words(t):
        yield from ngrams(token, size)
|
|
|
|
def get_w_freq_by_w_len(freq, word_len):
    """Yield (count, word) pairs from mapping *freq* for words whose
    length is exactly *word_len*, preserving the mapping's order."""
    pairs = (
        (count, word)
        for word, count in freq.items()
        if len(word) == word_len
    )
    yield from pairs
|
|
|
|
def get_average_freq_by_w_len(freq, word_lenghts):
    """Return {word_length: mean occurrence count} for each length in
    *word_lenghts*; lengths with no matching words are omitted.

    (The 'word_lenghts' spelling is kept for keyword-caller compatibility.)
    """
    results = {}
    for length in word_lenghts:
        counts = [count for count, _ in get_w_freq_by_w_len(freq, length)]
        if counts:
            results[length] = sum(counts) / len(counts)
    return results
|
|
|
|
def get_low_high_freq_by_w_len(freq, word_lenghts, average_freq):
    """
    Returns top 5 most frequent and non frequent words for each word length + average frequency.

    For each length in *word_lenghts*, words are restricted to digit-free
    ones occurring more than 30 times; the result is a list of dicts with
    keys 'word_len', 'average_freq', 'low_freq' (5 rarest (count, word)
    pairs) and 'high_freq' (5 most frequent pairs).
    """
    results = []
    for l in word_lenghts:
        word_freq = list(get_w_freq_by_w_len(freq, l))
        # (count, word) tuples sort primarily by count, ties broken
        # alphabetically by word.
        word_freq.sort()
        # Keep only digit-free words seen more than 30 times.
        # BUG FIX: the pattern was the non-raw string "\d" — an invalid
        # escape sequence (SyntaxWarning on modern Python); use r"\d".
        word_freq = list(filter(lambda t: re.findall(r"\d", str(t[1])) == [] and t[0] > 30, word_freq))
        word_stats = {
            'word_len': l,
            'average_freq': average_freq[l],
            'low_freq': word_freq[:5],
            'high_freq': word_freq[-5:]
        }
        results.append(word_stats)
    return results
|
|
|
|
def get_pronouns_stats(freqs):
    """Bar-chart the occurrence counts of the English pronouns
    i/you/he/she/it found in *freqs*, save it as pt-pronouns.png and
    return the matching (word, count) pairs."""
    pronouns = ["i", "you", "he", "she", "it"]
    pronoun_words_freq = [pair for pair in freqs.items() if pair[0] in pronouns]

    labels = [word for word, _ in pronoun_words_freq]
    counts = [count for _, count in pronoun_words_freq]

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(labels, counts)
    plt.savefig(IMAGES_PATH + "/pt-pronouns.png")

    return pronoun_words_freq
|
|
|
|
def get_years_stats(freqs):
    """Bar-chart tokens from *freqs* that look like 4-digit years starting
    with 1 (i.e. 1000-1999), save it as pt-years.png and return their
    (word, count) pairs.

    NOTE(review): the pattern only matches years beginning with '1';
    presumably intentional for this corpus — confirm if 2xxx matters.
    """
    year_pattern = r"\b1{1}[0-9]{3}\b"
    years_word_freq = [pair for pair in freqs.items() if re.findall(year_pattern, pair[0])]

    labels = [word for word, _ in years_word_freq]
    counts = [count for _, count in years_word_freq]

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(labels, counts)
    plt.savefig(IMAGES_PATH + "/pt-years.png")

    return years_word_freq
|
|
|
|
def get_longest_words(top):
    """Return the *top* longest distinct words of the corpus, longest first.

    BUG FIX: the previous implementation sorted a ``set`` by length and
    relied on the set's iteration order to break ties between equal-length
    words — that order changes between interpreter runs (string hash
    randomization), making the output nondeterministic. Ties are now broken
    alphabetically so results are reproducible. (Also drops the
    ``deduplicated_word_listr`` typo'd temporary.)
    """
    unique_words = set(get_words(file_content))
    # Longest first; equal lengths ordered alphabetically for determinism.
    return sorted(unique_words, key=lambda w: (-len(w), w))[:top]
|
|
|
|
# Driver: run each statistic over the loaded corpus and report progress.
print("Generating statistics...")

# 10 longest words
print("Calculating 10 longest words...")
print(get_longest_words(10))

# 10 most frequent words in the text
print("Calculating 10 most frequent words in the text...")
rang_freq_with_labels('most-freq-words-10', get_words(file_content), top=10)

# Zipf's law
print("Calculating Zipf's law...")
log_rang_log_freq('zipf-law-words', get_words(file_content))

# Zipf's law for 3-grams
print("Calculating Zipf's law for 3-grams...")
log_rang_log_freq('zipf-law-3grams', get_ngrams(file_content, 3))

# Words breaking the Zipf's law
print("Calculating words breaking the Zipf's law...")
freq = freq_list(get_words(file_content))
observed_lengths = list({len(word) for word in freq})
average_freq = get_average_freq_by_w_len(freq, observed_lengths)
get_low_high_freq_by_w_len(freq, observed_lengths, average_freq)

# Frequency of pronouns
print("Calculating frequency of pronouns...")
get_pronouns_stats(freq)

# Number of years in words
print("Calculating number of years in words...")
get_years_stats(freq)

print("Done")