64 lines
1.4 KiB
Python
64 lines
1.4 KiB
Python
from collections import OrderedDict
from collections import Counter
from collections import deque
from itertools import islice
from math import log

import matplotlib.pyplot as plt
import regex as re
|
|
|
|
|
|
def freq_list(g, top=None):
    """Count the items of *g* and return them ordered by falling frequency.

    Args:
        g: Iterable of hashable items (anything ``Counter`` accepts).
        top: When not ``None``, only the ``top`` most common items are kept;
            ``None`` keeps every distinct item.

    Returns:
        An ``OrderedDict`` mapping item -> count, highest count first.
        The sort is stable, so items with equal counts stay in the order
        in which the counter handed them over.
    """
    counts = Counter(g)
    pairs = counts.items() if top is None else counts.most_common(top)

    ranked = OrderedDict()
    for item, count in sorted(pairs, key=lambda pair: -pair[1]):
        ranked[item] = count
    return ranked
|
|
|
|
def log_rang_log_freq(name, g):
    """Plot log(rank) against log(frequency) for the items of *g* and save it.

    The items are counted with ``freq_list`` (most frequent first), so the
    i-th value plotted belongs to the item of rank ``i`` (1-based).

    Args:
        name: Output file name without extension; ``.png`` is appended.
        g: Iterable of hashable items to count.

    Returns:
        The file name the plot was saved under (``f'{name}.png'``).
    """
    counts = list(freq_list(g).values())
    fname = f'{name}.png'

    # Keep a handle on the new figure so it can be closed afterwards:
    # pyplot keeps every figure it creates registered until it is closed,
    # so calling this function repeatedly would otherwise accumulate them.
    fig = plt.figure()
    try:
        plt.plot(
            [log(rank) for rank in range(1, len(counts) + 1)],
            [log(count) for count in counts],
        )
        plt.savefig(fname)
    finally:
        plt.close(fig)

    return fname
|
|
|
|
def get_words(t):
    """Lazily yield every word-like token found in the text *t*.

    A token is a maximal run of characters from the class
    ``[\\p{L}0-9-\\*]``: Unicode letters, ASCII digits, hyphens and
    asterisks. ``\\p{L}`` needs the third-party ``regex`` package, which
    this file imports under the name ``re``.
    """
    matches = re.finditer(r'[\p{L}0-9-\*]+', t)
    yield from (match.group(0) for match in matches)
|
|
|
|
# Open the corpus for reading in text mode; its contents are loaded into
# `content` further down in this script.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm that it matches the encoding of the corpus file.
file = open('News-Commentary-v16', mode='r')
|
|
|
|
def get_characters(t):
    """Lazily yield the elements of *t* one at a time.

    For a string argument these are its individual characters.
    """
    for char in t:
        yield char
|
|
|
|
|
|
# Load the whole corpus into memory, then close the handle: the script only
# works on `content` from here on, and leaving `file` open leaks the
# descriptor until interpreter shutdown.
try:
    content = file.read()
finally:
    file.close()
|
|
|
|
|
|
# log_rang_log_freq('pt-words-log-log', get_words(content))
|
|
# b = freq_list(get_characters(content))
|
|
# print(b)
|
|
# a = list(islice(b, 0, 10))
|
|
# print(a)
|
|
# log_rang_log_freq('pt-chars-log-log', get_characters(content))
|
|
|
|
def ngrams(iter, size):
    """Yield every run of *size* consecutive items of *iter* as a tuple.

    The windows overlap: each one starts one item after the previous one.
    Input with fewer than *size* items yields nothing.

    Args:
        iter: Any iterable; it is consumed lazily, one item per step.
            (The name shadows the builtin but is kept for callers that
            pass it by keyword.)
        size: Window length as an integer. Values below 1 yield nothing.

    Yields:
        Tuples of length *size*, in input order.
    """
    if size < 1:
        # A window can never have a non-positive length. Return right away
        # instead of buffering the whole input while waiting for it.
        return

    # A bounded deque drops the oldest item on append once it is full, so
    # sliding the window does not copy it on every step.
    window = deque(maxlen=size)
    for item in iter:
        window.append(item)
        if len(window) == size:
            yield tuple(window)
|
|
|
|
# ngram_list = list(islice(ngrams(get_words(content), 3), 0, 100 ))
|
|
# print(ngram_list)
|
|
|
|
|
|
# Save one log-rank/log-frequency plot of 3-grams per token stream:
# first over single characters, then over words. Both streams are lazy
# generators, so no work happens until the plotting call consumes them.
for plot_name, tokens in (
    ('pt-3_unigram_chars-log-log', get_characters(content)),
    ('pt-3_unigram_words-log-log', get_words(content)),
):
    log_rang_log_freq(plot_name, ngrams(tokens, 3))