statistics script
This commit is contained in:
parent
893f4409e5
commit
fb82cdfc04
@ -27,6 +27,7 @@ def clean_with_regex(text):
|
||||
return []
|
||||
out = list(filter(lambda item: filter_line(item), out))
|
||||
out = list(map(lambda item: re.sub("(?<=\d)(\(\d+\))(?=\s+)|(\(\d+\)\s+)|(\d+\.)+\s", "", item), out))
|
||||
out = list(map(lambda item: re.sub("[^\w\d\s\\\)\(\/-]", "", item), out))
|
||||
if out:
|
||||
out.pop(len(out)-1)
|
||||
return out
|
||||
@ -41,7 +42,7 @@ def print_text(text, sort=False):
|
||||
def save_to_file(paragraph_list, file_name):
    """Append the given paragraphs to a text file, one per line.

    Each entry is stripped of surrounding whitespace and lowercased before
    it is written, followed by a newline.

    Args:
        paragraph_list: Iterable of strings to write.
        file_name: Path of the file to append to; it is opened in append
            mode, so existing content is kept.
    """
    # The ``with`` block closes the file on exit, so no explicit close()
    # is needed.
    with open(file_name, 'a') as f:
        for line in paragraph_list:
            f.write("%s\n" % line.strip().lower())
|
||||
|
||||
|
46
Lab2/README.md
Normal file
46
Lab2/README.md
Normal file
@ -0,0 +1,46 @@
|
||||
# Statystyki
|
||||
|
||||
## Statystyki podstawowe
|
||||
|
||||
### 10 najdłuższych słów
|
||||
|
||||
```
|
||||
MarineStrategyFrameworkDirectiveClassificationValue
|
||||
OtherFinancialProfessionalAndInformationServices
|
||||
GuineaPeruPhilippinesQatarRomaniaRussiaRwandaSao
|
||||
MarineStrategyFrameworkDirectiveClassificationValue
|
||||
AustraliaArgentinaBotswanaBrazilChileNamibiaNew
|
||||
ManufacturingOfElectricalAndOpticalEquipment
|
||||
ClassificationAndQuantificationFrameworkValue
|
||||
FinancialProfessionalAndInformationServices
|
||||
measuredIndicatedAndInferredMineralResource
|
||||
AnthropogenicGeomorphologicFeatureTypeValue
|
||||
```
|
||||
|
||||
### Prawo Zipfa dla słów
|
||||
|
||||
![title](images/zipf-law-words.png)
|
||||
|
||||
### Prawo Zipfa dla trigramów ze słów
|
||||
|
||||
![title](images/zipf-law-3grams.png)
|
||||
|
||||
### Słowa łamiące prawo łączące długość z częstością
|
||||
|
||||
- aunt (4 znaki, 31 wystąpień)
|
||||
- cave (4 znaki, 31 wystąpień)
|
||||
- amateur (7 znaków, 31 wystąpień)
|
||||
- CommissionFranz (15 znaków, 2090 wystąpień)
|
||||
- responsibilities (16 znaków, 2087 wystąpień)
|
||||
- Interventionsstelle (19 znaków, 231 wystąpień)
|
||||
- hydrogenorthophosphate (22 znaki, 148 wystąpień)
|
||||
- polytetrafluoroethylene (23 znaki, 148 wystąpień)
|
||||
|
||||
### Częstotliwość zaimków
|
||||
|
||||
![title](images/pt-pronouns.png)
|
||||
|
||||
### Ilość wystąpień dat (lata)
|
||||
|
||||
`['1999', '1975', '1987', '1992', '1985', '1981', '1988', '1986', '1995', '1991', '1993', '1990', '1994', '1983', '1989'...`
|
||||
![title](images/pt-years.png)
|
153
Lab2/statistics.py
Normal file
153
Lab2/statistics.py
Normal file
@ -0,0 +1,153 @@
|
||||
import matplotlib.pyplot as plt
|
||||
from collections import Counter
|
||||
from collections import OrderedDict
|
||||
import regex as re
|
||||
from math import log
|
||||
|
||||
# Text corpus to analyse; the path is relative to the working directory.
file_path = "Lab1/out-merged.txt"

# Read the whole corpus into memory once; the statistics below reuse it.
with open(file_path) as corpus_file:
    file_content = corpus_file.read()

# Debugging aid: uncomment to run on a 100-character sample only.
# file_content = file_content[:100]
|
||||
|
||||
def get_characters(t):
    """Yield the elements of ``t`` one at a time, in order.

    For a string argument this produces its individual characters.
    """
    for char in t:
        yield char
|
||||
|
||||
def freq_list(g, top=None):
    """Count the items produced by ``g`` and order them by falling frequency.

    Args:
        g: Iterable of hashable items to count.
        top: When given, keep only the ``top`` most frequent items;
            ``None`` keeps every distinct item.

    Returns:
        An ``OrderedDict`` mapping item -> count, most frequent first.
        Items with equal counts stay in first-seen order.
    """
    # most_common(None) returns every item, and its ordering (descending
    # count, ties in first-seen order) is the same as a stable sort on the
    # negated count.
    ranked = Counter(g).most_common(top)
    return OrderedDict(ranked)
|
||||
|
||||
def get_words(t):
    """Yield every word found in ``t``, in order of appearance.

    A word is a maximal run of Unicode letters, ASCII digits and ``*``
    characters; any other character acts as a separator.
    """
    word_matches = re.finditer(r'[\p{L}0-9\*]+', t)
    yield from (match.group(0) for match in word_matches)
|
||||
|
||||
def rang_freq_with_labels(name, g, top=None):
    """Draw a labelled bar chart of item frequencies and save it as a PNG.

    Args:
        name: Base name of the image; the chart is written to
            ``Lab2/images/<name>.png``.
        g: Iterable of items to count (handed to ``freq_list``).
        top: Optional limit on how many of the most frequent items are
            plotted; ``None`` plots all of them.

    Returns:
        The path of the saved image.
    """
    counts = freq_list(g, top)
    labels, heights = counts.keys(), counts.values()

    # One bar per item, labelled with the item itself.
    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(labels, heights)

    fname = f'Lab2/images/{name}.png'
    plt.savefig(fname)
    return fname
|
||||
|
||||
def log_rang_log_freq(name, g):
    """Plot log(frequency) against log(rank) for the items of ``g``.

    Items are counted and ranked with ``freq_list`` (rank 1 is the most
    frequent item), and the curve is saved to ``Lab2/images/<name>.png``.

    Args:
        name: Base name of the output image.
        g: Iterable of items to count.

    Returns:
        The path of the saved image.
    """
    counts = list(freq_list(g).values())

    # Draw on a newly created, cleared figure.
    plt.figure().clear()

    log_ranks = [log(rank) for rank in range(1, len(counts) + 1)]
    log_counts = [log(count) for count in counts]
    plt.plot(log_ranks, log_counts)

    fname = f'Lab2/images/{name}.png'
    plt.savefig(fname)
    return fname
|
||||
|
||||
def ngrams(iter, size):
    """Yield every run of ``size`` consecutive items from ``iter`` as a tuple.

    A sliding window moves over the input one item at a time, so successive
    tuples overlap by ``size - 1`` items. An input with fewer than ``size``
    items yields nothing.
    """
    window = []
    for element in iter:
        window.append(element)
        if len(window) != size:
            continue
        yield tuple(window)
        # Drop the oldest item so the next one completes a new window.
        del window[0]
|
||||
|
||||
def get_ngrams(t, size):
    """Yield the n-grams of length ``size`` found inside each word of ``t``.

    Every word produced by ``get_words`` is passed to ``ngrams`` on its own,
    so no n-gram spans two words.
    """
    for token in get_words(t):
        yield from ngrams(token, size)
|
||||
|
||||
def get_w_freq_by_w_len(word_len):
    """Yield ``(count, word)`` for every word of exactly ``word_len`` characters.

    Reads the module-level ``freq`` mapping (word -> count), which must be
    assigned before the generator is consumed.
    """
    yield from (
        (occurrences, token)
        for token, occurrences in freq.items()
        if len(token) == word_len
    )
|
||||
|
||||
def get_average_freq_by_w_len(word_lenghts):
    """Compute the mean occurrence count of words for each requested length.

    Args:
        word_lenghts: Iterable of word lengths to evaluate.

    Returns:
        A dict mapping word length -> average count of the words of that
        length. Lengths for which no word exists are left out.
    """
    averages = {}
    for length in word_lenghts:
        counts = [count for count, _ in get_w_freq_by_w_len(length)]
        if counts:
            averages[length] = sum(counts) / len(counts)
    return averages
|
||||
|
||||
def get_low_high_freq_by_w_len(word_lenghts):
    """Collect the rarest and the most common words for each word length.

    Words that contain a digit and words occurring 30 times or fewer are
    ignored. Word counts come from ``get_w_freq_by_w_len`` and the averages
    from the module-level ``average_freq`` mapping.

    Args:
        word_lenghts: Iterable of word lengths to report on.

    Returns:
        A list with one dict per length, holding the keys ``word_len``,
        ``average_freq``, ``low_freq`` and ``high_freq``. The two frequency
        entries are lists of up to 10 ``(count, word)`` tuples in ascending
        order, taken from the bottom and the top of the ranking.

    Raises:
        KeyError: If a length has no entry in ``average_freq``.
    """
    results = []
    for length in word_lenghts:
        # Tuples sort by count first and by word second.
        candidates = sorted(
            (count, word)
            for count, word in get_w_freq_by_w_len(length)
            if count > 30 and not re.search(r"\d", str(word))
        )
        results.append({
            'word_len': length,
            'average_freq': average_freq[length],
            'low_freq': candidates[:10],
            'high_freq': candidates[-10:],
        })
    return results
|
||||
|
||||
def get_pronouns_stats(freqs):
    """Plot how often selected English pronouns occur and return the counts.

    Only the exact lowercase keys "i", "you", "he", "she" and "it" are
    considered. The bar chart is saved to ``Lab2/images/pt-pronouns.png``.

    Args:
        freqs: Mapping of word -> occurrence count.

    Returns:
        A list of ``(word, count)`` tuples for the pronouns present in
        ``freqs``, in the mapping's iteration order.
    """
    pronouns = ["i", "you", "he", "she", "it"]
    pronoun_words_freq = [
        (word, count) for word, count in freqs.items() if word in pronouns
    ]

    labels = [word for word, _ in pronoun_words_freq]
    heights = [count for _, count in pronoun_words_freq]

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(labels, heights)
    plt.savefig("Lab2/images/pt-pronouns.png")

    return pronoun_words_freq
|
||||
|
||||
def get_years_stats(freqs):
    """Plot how often year-like tokens occur and return their counts.

    A key is kept when the pattern finds in it a four-digit number that
    starts with ``1`` and is delimited by word boundaries. The bar chart is
    saved to ``Lab2/images/pt-years.png``.

    Args:
        freqs: Mapping of word -> occurrence count.

    Returns:
        A list of ``(word, count)`` tuples for the matching keys, in the
        mapping's iteration order.
    """
    year_pattern = r"\b1{1}[0-9]{3}\b"
    years_word_freq = [
        entry for entry in freqs.items() if re.findall(year_pattern, entry[0])
    ]

    labels = [word for word, _ in years_word_freq]
    heights = [count for _, count in years_word_freq]

    plt.figure(figsize=(12, 3))
    plt.ylabel('liczba wystąpień')
    plt.bar(labels, heights)
    plt.savefig("Lab2/images/pt-years.png")

    return years_word_freq
|
||||
|
||||
print("Generating statistics...")

# 20 most frequent words in the text
rang_freq_with_labels('most-freq-words-20', get_words(file_content), top=20)

# Zipf's law
log_rang_log_freq('zipf-law-words', get_words(file_content))

# Zipf's law for 3-grams; the file name matches the image linked from
# Lab2/README.md (images/zipf-law-3grams.png).
log_rang_log_freq('zipf-law-3grams', get_ngrams(file_content, 3))

# Words breaking the relation between word length and frequency.
# `freq` and `average_freq` are module globals read by
# get_w_freq_by_w_len and get_low_high_freq_by_w_len.
freq = freq_list(get_words(file_content))
lenghts = list({len(word) for word in freq})
average_freq = get_average_freq_by_w_len(lenghts)
get_low_high_freq_by_w_len(lenghts)

# Frequency of pronouns
get_pronouns_stats(freq)

# Number of years in words
get_years_stats(freq)

# Report completion only after every statistic has been generated.
print("Done")
|
Loading…
Reference in New Issue
Block a user