# Zad_1_lang_corpus_ML/zipf.py
#
# 64 lines
# 1.4 KiB
# Python
# Raw Normal View History
#
# 2023-03-21 23:51:37 +01:00
from collections import Counter
from collections import OrderedDict
from collections import deque
from itertools import islice
from math import log

import matplotlib.pyplot as plt
import regex as re
def freq_list(g, top=None):
    """Count the items of ``g`` and return them ordered by descending count.

    Args:
        g: Iterable of hashable items (e.g. characters, words or n-gram
            tuples).
        top: If given, keep only the ``top`` most frequent items; ``None``
            (the default) keeps every distinct item.

    Returns:
        An ``OrderedDict`` mapping item -> count, most frequent first.
        Items with equal counts keep the order in which they were first
        encountered.
    """
    # Counter.most_common already returns (item, count) pairs sorted by
    # descending count, and returns all of them when its argument is None,
    # so neither a separate branch nor a second sort is needed.
    return OrderedDict(Counter(g).most_common(top))
def log_rang_log_freq(name, g):
    """Plot log(rank) against log(frequency) for the items of ``g``.

    The items are counted with ``freq_list`` and ranked from most to least
    frequent (rank 1 is the most frequent item).  Natural logarithms are
    used on both axes.  The plot is written to ``f'{name}.png'``.

    Args:
        name: Output file name without the ``.png`` extension.
        g: Iterable of hashable items to count.

    Returns:
        The name of the image file that was written.
    """
    freq = freq_list(g)
    fname = f'{name}.png'
    # Keep a handle on the freshly created figure so it can be closed once
    # saved; otherwise every call leaves another open figure behind.
    fig = plt.figure()
    try:
        plt.plot(
            [log(rank) for rank in range(1, len(freq) + 1)],
            [log(count) for count in freq.values()],
        )
        plt.savefig(fname)
    finally:
        plt.close(fig)
    return fname
def get_words(t):
    r"""Lazily yield the word tokens found in the text ``t``, in order.

    A token is a maximal run of characters matched by the class
    ``[\p{L}0-9-\*]``: Unicode letters, ASCII digits and, as the class is
    written, ``-`` and ``*``.  ``re`` in this file is the ``regex`` module,
    which is what provides the ``\p{L}`` property.
    """
    yield from (
        match.group(0) for match in re.finditer(r'[\p{L}0-9-\*]+', t)
    )
def get_characters(t):
    """Yield the items of ``t`` one at a time (for a string, its characters)."""
    yield from t


# Read the whole corpus into memory once.  The context manager closes the
# file handle as soon as the text has been read instead of leaving it open
# for the lifetime of the process.
# NOTE(review): no encoding is passed, so the platform default is used --
# consider an explicit encoding once the corpus encoding is confirmed.
with open('News-Commentary-v16', 'r') as file:
    content = file.read()
# log_rang_log_freq('pt-words-log-log', get_words(content))
# b = freq_list(get_characters(content))
# print(b)
# a = list(islice(b, 0, 10))
# print(a)
# log_rang_log_freq('pt-chars-log-log', get_characters(content))
def ngrams(iter, size):
    """Yield every run of ``size`` consecutive items of ``iter`` as a tuple.

    The window slides forward one item at a time, so consecutive n-grams
    overlap by ``size - 1`` items.  An input with fewer than ``size`` items
    yields nothing.

    Args:
        iter: Iterable of items.  (The name shadows the builtin, but it is
            part of the signature and is therefore kept.)
        size: Number of items per n-gram.  Values below 1 yield nothing.

    Yields:
        Tuples of length ``size``, in input order.
    """
    if size < 1:
        # No n-gram of non-positive length exists; stop here rather than
        # buffering the entire input without ever yielding.
        return
    # A bounded deque drops the oldest item automatically on append, which
    # avoids copying the window on every step.
    window = deque(maxlen=size)
    for item in iter:
        window.append(item)
        if len(window) == size:
            yield tuple(window)
# ngram_list = list(islice(ngrams(get_words(content), 3), 0, 100 ))
# print(ngram_list)
# Produce the log-rank / log-frequency plots for 3-grams of characters and
# of words taken from the corpus text.
# NOTE(review): the file names say "3_unigram" although 3-grams are plotted;
# the names are kept as they are because they determine the output files.
for plot_name, tokens in (
    ('pt-3_unigram_chars-log-log', get_characters(content)),
    ('pt-3_unigram_words-log-log', get_words(content)),
):
    log_rang_log_freq(plot_name, ngrams(tokens, 3))