"""Word and character-n-gram statistics for a text corpus.

Run as a script::

    python <this file> [--filepath PATH]

The script prints the longest words and writes several charts (most frequent
words, rank/frequency log-log plots, pronoun counts, year counts) as PNG
files into an ``images`` directory located next to this file.
"""
import argparse
import os
from collections import Counter, OrderedDict, defaultdict, deque
from math import log

import matplotlib.pyplot as plt
import regex as re

# Corpus used when --filepath is not given on the command line.
DEFAULT_FILE_PATH = "Lab1/out-merged.txt"
# Directory (next to this file) that receives every generated chart.
IMAGES_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "images")

# Full corpus text; populated by main() and used as the default input of
# get_longest_words().
file_content = None

# A "word" is a run of Unicode letters, ASCII digits and '*'.
# (\p{L} needs the third-party ``regex`` module, imported here as ``re``.)
_WORD_RE = re.compile(r'[\p{L}0-9\*]+')
_DIGIT_RE = re.compile(r"\d")
_YEAR_RE = re.compile(r"\b1{1}[0-9]{3}\b")


def get_characters(t):
    """Yield the items of ``t`` one by one (characters when ``t`` is a str)."""
    yield from t


def freq_list(g, top=None):
    """Count the items of ``g`` and return them ordered by descending count.

    Args:
        g: Iterable of hashable items.
        top: If given, keep only the ``top`` most common items.

    Returns:
        OrderedDict mapping item -> count, most frequent first. Items with
        equal counts keep the order in which they were first seen.
    """
    # most_common() already sorts by descending count with a stable sort,
    # so no additional sorting pass is needed.
    return OrderedDict(Counter(g).most_common(top))


def get_words(t):
    """Yield every word (run of letters, digits or '*') found in text ``t``."""
    for match in _WORD_RE.finditer(t):
        yield match.group(0)


def _save_current_figure(filename):
    """Save the current matplotlib figure into IMAGES_PATH and close it."""
    # savefig does not create missing directories.
    os.makedirs(IMAGES_PATH, exist_ok=True)
    plt.savefig(os.path.join(IMAGES_PATH, filename))
    # Close the figure so repeated plotting does not accumulate open figures.
    plt.close()


def _save_bar_chart(labels, values, filename):
    """Draw a wide bar chart of ``values`` labelled by ``labels`` and save it."""
    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(labels, values)
    _save_current_figure(filename)


def rang_freq_with_labels(name, g, top=None):
    """Save a bar chart of item frequencies, most frequent first.

    Args:
        name: Base name of the image; the chart is written to
            ``IMAGES_PATH/<name>.png``.
        g: Iterable of items to count.
        top: If given, plot only the ``top`` most common items.

    Returns:
        The image file name with a leading slash (``'/<name>.png'``).
    """
    freq = freq_list(g, top)
    _save_bar_chart(list(freq.keys()), list(freq.values()), f'{name}.png')
    return f'/{name}.png'


def log_rang_log_freq(name, g):
    """Save a plot of log(count) against log(rank) for the items of ``g``.

    Args:
        name: Base name of the image; the plot is written to
            ``IMAGES_PATH/<name>.png``.
        g: Iterable of items to count.

    Returns:
        The image file name with a leading slash (``'/<name>.png'``).
    """
    counts = list(freq_list(g).values())
    plt.figure()
    plt.plot(
        [log(rank) for rank in range(1, len(counts) + 1)],
        [log(count) for count in counts],
    )
    _save_current_figure(f'{name}.png')
    return f'/{name}.png'


def ngrams(iter, size):
    """Yield every run of ``size`` consecutive items of ``iter`` as a tuple.

    Nothing is yielded when ``iter`` has fewer than ``size`` items or when
    ``size`` is smaller than 1.

    NOTE: the parameter name ``iter`` shadows the builtin; it is kept so that
    existing keyword callers keep working.
    """
    if size < 1:
        return
    # A bounded deque drops the oldest item automatically on append, which
    # avoids copying the window on every step.
    window = deque(maxlen=size)
    for item in iter:
        window.append(item)
        if len(window) == size:
            yield tuple(window)


def get_ngrams(t, size):
    """Yield character n-grams of length ``size`` taken within each word of ``t``.

    N-grams never span two words.
    """
    for word in get_words(t):
        yield from ngrams(word, size)


def get_w_freq_by_w_len(freq, word_len):
    """Yield ``(count, word)`` for every word in ``freq`` of length ``word_len``.

    Args:
        freq: Mapping word -> count.
        word_len: Exact word length to select.
    """
    for word, count in freq.items():
        if len(word) == word_len:
            yield (count, word)


def _group_by_word_len(freq):
    """Group ``freq`` in one pass into {word length: [(count, word), ...]}."""
    groups = defaultdict(list)
    for word, count in freq.items():
        groups[len(word)].append((count, word))
    return groups


def get_average_freq_by_w_len(freq, word_lenghts):
    """Compute the mean count of the words of each requested length.

    Args:
        freq: Mapping word -> count.
        word_lenghts: Iterable of word lengths to report.

    Returns:
        Dict mapping length -> average count. Lengths for which ``freq``
        contains no word are omitted.
    """
    groups = _group_by_word_len(freq)
    results = {}
    for length in word_lenghts:
        entries = groups.get(length)
        if not entries:
            continue
        results[length] = sum(count for count, _ in entries) / len(entries)
    return results


def get_low_high_freq_by_w_len(freq, word_lenghts, average_freq):
    """Return the 5 least and 5 most frequent words for each word length.

    Only words that contain no digit and occur more than 30 times are
    considered.

    Args:
        freq: Mapping word -> count.
        word_lenghts: Iterable of word lengths to report.
        average_freq: Mapping length -> average count, as returned by
            get_average_freq_by_w_len().

    Returns:
        A list with one dict per length, with keys ``'word_len'``,
        ``'average_freq'``, ``'low_freq'`` and ``'high_freq'``. The last two
        are lists of ``(count, word)`` tuples in ascending order; they overlap
        when fewer than 10 words qualify.

    Raises:
        KeyError: If a requested length is missing from ``average_freq``.
    """
    groups = _group_by_word_len(freq)
    results = []
    for length in word_lenghts:
        candidates = sorted(
            (count, word)
            for count, word in groups.get(length, ())
            if count > 30 and not _DIGIT_RE.search(str(word))
        )
        results.append({
            'word_len': length,
            'average_freq': average_freq[length],
            'low_freq': candidates[:5],
            'high_freq': candidates[-5:],
        })
    return results


def get_pronouns_stats(freqs):
    """Save a bar chart of pronoun counts as ``pt-pronouns.png``.

    Args:
        freqs: Mapping word -> count. Matching is case-sensitive against
            "i", "you", "he", "she" and "it".

    Returns:
        List of ``(word, count)`` tuples for the pronouns present in
        ``freqs``, in the iteration order of ``freqs``.
    """
    pronouns = {"i", "you", "he", "she", "it"}
    pronoun_words_freq = [item for item in freqs.items() if item[0] in pronouns]
    _save_bar_chart(
        [word for word, _ in pronoun_words_freq],
        [count for _, count in pronoun_words_freq],
        "pt-pronouns.png",
    )
    return pronoun_words_freq


def get_years_stats(freqs):
    """Save a bar chart of year-like word counts as ``pt-years.png``.

    A word qualifies when it contains a standalone 4-digit number that
    starts with 1 (1000-1999).

    Args:
        freqs: Mapping word -> count.

    Returns:
        List of ``(word, count)`` tuples for the qualifying words, in the
        iteration order of ``freqs``.
    """
    years_word_freq = [item for item in freqs.items() if _YEAR_RE.search(item[0])]
    _save_bar_chart(
        [word for word, _ in years_word_freq],
        [count for _, count in years_word_freq],
        "pt-years.png",
    )
    return years_word_freq


def get_longest_words(top, text=None):
    """Return the ``top`` longest distinct words of the corpus.

    Args:
        top: Number of words to return.
        text: Text to analyse; defaults to the module-level ``file_content``.

    Returns:
        List of words, longest first. Words of equal length are ordered
        alphabetically so the result is deterministic.
    """
    if text is None:
        text = file_content
    unique_words = set(get_words(text))
    return sorted(unique_words, key=lambda word: (-len(word), word))[:top]


def main():
    """Parse the command line, load the corpus and generate all statistics."""
    global file_content

    parser = argparse.ArgumentParser()
    parser.add_argument("--filepath")
    args = parser.parse_args()
    file_path = DEFAULT_FILE_PATH if args.filepath is None else args.filepath

    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, 'r') as file:
        file_content = file.read()
    # For quick experiments the corpus can be truncated here,
    # e.g. file_content = file_content[:10000000]

    print("Generating statistics...")

    # 10 longest words
    print("Calculating 10 longest words...")
    print(get_longest_words(10))

    # 10 most frequent words in the text
    print("Calculating 10 most frequent words in the text...")
    rang_freq_with_labels('most-freq-words-10', get_words(file_content), top=10)

    # Zipf's law
    print("Calculating Zipf's law...")
    log_rang_log_freq('zipf-law-words', get_words(file_content))

    # Zipf's law for 3-grams
    print("Calculating Zipf's law for 3-grams...")
    log_rang_log_freq('zipf-law-3grams', get_ngrams(file_content, 3))

    # Words breaking the Zipf's law
    print("Calculating words breaking the Zipf's law...")
    freq = freq_list(get_words(file_content))
    lengths = list({len(word) for word in freq})
    average_freq = get_average_freq_by_w_len(freq, lengths)
    get_low_high_freq_by_w_len(freq, lengths, average_freq)

    # Frequency of pronouns
    print("Calculating frequency of pronouns...")
    get_pronouns_stats(freq)

    # Number of years in words
    print("Calculating number of years in words...")
    get_years_stats(freq)

    print("Done")


if __name__ == "__main__":
    main()