From fb82cdfc04aa5c8b05ece76baad6c0957046f71c Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Wed, 22 Mar 2023 04:32:34 +0100 Subject: [PATCH] statistics script --- {Lab1-PrepareCorpora => Lab1}/README.md | 0 {Lab1-PrepareCorpora => Lab1}/clean.py | 3 +- Lab2/README.md | 46 +++++++ Lab2/statistics.py | 153 ++++++++++++++++++++++++ 4 files changed, 201 insertions(+), 1 deletion(-) rename {Lab1-PrepareCorpora => Lab1}/README.md (100%) rename {Lab1-PrepareCorpora => Lab1}/clean.py (93%) create mode 100644 Lab2/README.md create mode 100644 Lab2/statistics.py diff --git a/Lab1-PrepareCorpora/README.md b/Lab1/README.md similarity index 100% rename from Lab1-PrepareCorpora/README.md rename to Lab1/README.md diff --git a/Lab1-PrepareCorpora/clean.py b/Lab1/clean.py similarity index 93% rename from Lab1-PrepareCorpora/clean.py rename to Lab1/clean.py index 3157959..a64b594 100644 --- a/Lab1-PrepareCorpora/clean.py +++ b/Lab1/clean.py @@ -27,6 +27,7 @@ def clean_with_regex(text): return [] out = list(filter(lambda item: filter_line(item), out)) out = list(map(lambda item: re.sub("(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", "", item), out)) + out = list(map(lambda item: re.sub("[^\w\d\s\\\)\(\/-]", "", item), out)) if out: out.pop(len(out)-1) return out @@ -41,7 +42,7 @@ def print_text(text, sort=False): def save_to_file(paragraph_list, file_name): with open(file_name, 'a') as f: for line in paragraph_list: - f.write("%s\n" % line.strip()) + f.write("%s\n" % line.strip().lower()) f.close() diff --git a/Lab2/README.md b/Lab2/README.md new file mode 100644 index 0000000..5c72445 --- /dev/null +++ b/Lab2/README.md @@ -0,0 +1,46 @@ +# Statystyki + +## Statystyki podstawowe + +### 10 najdłuższych słów + +` +MarineStrategyFrameworkDirectiveClassificationValue +OtherFinancialProfessionalAndInformationServices +GuineaPeruPhilippinesQatarRomaniaRussiaRwandaSao +MarineStrategyFrameworkDirectiveClassificationValue +AustraliaArgentinaBotswanaBrazilChileNamibiaNew 
+ManufacturingOfElectricalAndOpticalEquipment +ClassificationAndQuantificationFrameworkValue +FinancialProfessionalAndInformationServices +measuredIndicatedAndInferredMineralResource +AnthropogenicGeomorphologicFeatureTypeValue +` + +### Prawo Zipfa dla słów + +![title](images/zipf-law-words.png) + +### Prawo Zipfa dla trigramów ze słów + +![title](images/zipf-law-3grams.png) + +### Słowa łamiące prawo łączące długość z częstością + +- aunt (4 znaki, 31 wystąpień) +- cave (4 znaki, 31 wystąpień) +- amateur (7 znaków, 31 wystąpień) +- CommissionFranz (15 znaków, 2090 wystąpień) +- responsibilities (16 znaków, 2087 wystąpień) +- Interventionsstelle (19 znaków, 231 wystąpień) +- hydrogenorthophosphate (22 znaki, 148 wystąpień) +- polytetrafluoroethylene (23 znaki, 148 wystąpień) + +### Częstotliwość zaimków + +![title](images/pt-pronouns.png) + +### Ilość wystąpień dat (lata) + +`['1999', '1975', '1987', '1992', '1985', '1981', '1988', '1986', '1995', '1991', '1993', '1990', '1994', '1983', '1989'...` +![title](images/pt-years.png) diff --git a/Lab2/statistics.py b/Lab2/statistics.py new file mode 100644 index 0000000..70d5372 --- /dev/null +++ b/Lab2/statistics.py @@ -0,0 +1,153 @@ +import matplotlib.pyplot as plt +from collections import Counter +from collections import OrderedDict +import regex as re +from math import log + +file_path = "Lab1/out-merged.txt" +file_content = None + +with open(file_path, 'r') as file: + file_content = file.read() + +# file_content = file_content[:100] + +def get_characters(t): + yield from t + +def freq_list(g, top=None): + c = Counter(g) + + if top is None: + items = c.items() + else: + items = c.most_common(top) + + return OrderedDict(sorted(items, key=lambda t: -t[1])) + +def get_words(t): + for m in re.finditer(r'[\p{L}0-9\*]+', t): + yield m.group(0) + +def rang_freq_with_labels(name, g, top=None): + freq = freq_list(g, top) + + plt.figure(figsize=(12, 3)) + plt.ylabel('liczba wystąpień') + plt.bar(freq.keys(), freq.values()) + 
+ fname = f'Lab2/images/{name}.png' + + plt.savefig(fname) + + return fname + +def log_rang_log_freq(name, g): + freq = freq_list(g) + + plt.figure().clear() + plt.plot([log(x) for x in range(1, len(freq.values())+1)], [log(y) for y in freq.values()]) + + fname = f'Lab2/images/{name}.png' + + plt.savefig(fname) + + return fname + +def ngrams(iter, size): + ngram = [] + for item in iter: + ngram.append(item) + if len(ngram) == size: + yield tuple(ngram) + ngram = ngram[1:] + +def get_ngrams(t, size): + for word in get_words(t): + for m in ngrams(word, size): + yield m + +def get_w_freq_by_w_len(word_len): + for word, count in freq.items(): + if len(word) == word_len: + yield (count, word) + +def get_average_freq_by_w_len(word_lenghts): + results = dict() + for l in word_lenghts: + word_freq = list(get_w_freq_by_w_len(l)) + if len(word_freq) == 0: + continue + average = sum([w[0] for w in word_freq]) / len(word_freq) + results[l] = average + + return results + +def get_low_high_freq_by_w_len(word_lenghts): + """ + Returns up to 10 least and 10 most frequent words (digit-free, count > 30) for each + word length, together with that length's average frequency. 
+ """ + results = [] + for l in word_lenghts: + word_freq = list(get_w_freq_by_w_len(l)) + word_freq.sort() + word_freq = list(filter(lambda t: re.findall("\d",str(t[1])) == [] and t[0] > 30, word_freq)) + word_stats = { + 'word_len': l, + 'average_freq': average_freq[l], + 'low_freq': word_freq[:10], + 'high_freq': word_freq[-10:] + } + results.append(word_stats) + return results + +def get_pronouns_stats(freqs): + pronouns = ["i", "you", "he", "she", "it"] + pronoun_words_freq = [f for f in freqs.items() if f[0] in pronouns] + + x = [f[0] for f in pronoun_words_freq] + y = [f[1] for f in pronoun_words_freq] + + plt.figure(figsize=(12, 3)) + plt.ylabel('liczba wystąpień') + plt.bar(x, y) + plt.savefig("Lab2/images/pt-pronouns.png") + + return pronoun_words_freq + +def get_years_stats(freqs): + years_word_freq = [f for f in freqs.items() if re.findall(r"\b1{1}[0-9]{3}\b", f[0])] + x = [f[0] for f in years_word_freq] + y = [f[1] for f in years_word_freq] + + plt.figure(figsize=(12, 3)) + plt.ylabel('liczba wystąpień') + plt.bar(x, y) + plt.savefig("Lab2/images/pt-years.png") + + return years_word_freq + +print("Generating statistics...") + +# 20 most frequent words in the text +rang_freq_with_labels('most-freq-words-20', get_words(file_content), top=20) + +# Zipf's law for words +log_rang_log_freq('zipf-law-words', get_words(file_content)) + +# Zipf's law for character 3-grams taken from words +log_rang_log_freq('zipf-law-2grams', get_ngrams(file_content, 3)) + +# Words breaking the law linking word length to frequency +freq = freq_list(get_words(file_content)) +lenghts = [*set(len(f[0]) for f in freq.items())] +average_freq = get_average_freq_by_w_len(lenghts) +get_low_high_freq_by_w_len(lenghts) + +# Frequency of pronouns +get_pronouns_stats(freq) + +print("Done") + +# Occurrence counts of years (four-digit tokens starting with 1) +get_years_stats(freq) \ No newline at end of file