"""Plot log(rank) against log(frequency) for n-grams of a text corpus.

The script reads the file ``News-Commentary-v16`` from the current working
directory, builds character and word 3-grams, and saves one PNG plot for each.
"""

from collections import Counter
from collections import OrderedDict
from collections import deque
from itertools import islice
from math import log

import matplotlib.pyplot as plt
import regex as re


def freq_list(g, top=None):
    """Count the items of *g* and return them ordered by descending count.

    Args:
        g: Iterable of hashable items to count.
        top: When given, keep only the ``top`` most frequent items;
            ``None`` keeps every distinct item.

    Returns:
        An ``OrderedDict`` mapping item -> count, most frequent first.
        Items with equal counts keep the order in which they were first seen.
    """
    # Counter.most_common already returns (item, count) pairs sorted by
    # descending count with a stable tie order, for both top=None and top=k,
    # so no additional sort is required.
    return OrderedDict(Counter(g).most_common(top))


def log_rang_log_freq(name, g):
    """Save a plot of log(rank) versus log(frequency) for the items of *g*.

    Rank 1 is the most frequent item. Both axes use the natural logarithm.

    Args:
        name: Output file name without extension; ``.png`` is appended.
        g: Iterable of hashable items whose frequencies are plotted.

    Returns:
        The name of the PNG file that was written.
    """
    counts = list(freq_list(g).values())
    log_ranks = [log(rank) for rank in range(1, len(counts) + 1)]
    log_counts = [log(count) for count in counts]

    fname = f'{name}.png'
    fig = plt.figure()
    try:
        plt.plot(log_ranks, log_counts)
        plt.savefig(fname)
    finally:
        # Release the figure; otherwise every call leaves one more open
        # figure registered with pyplot.
        plt.close(fig)
    return fname


def get_words(t):
    """Yield every maximal run of letters, digits, ``-`` and ``*`` in *t*.

    Args:
        t: Text to tokenize.

    Yields:
        Each matched token as a string, in order of appearance.
    """
    # \p{L} (any Unicode letter) is supported by the `regex` package that is
    # imported here under the name `re`.
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


def get_characters(t):
    """Yield the elements of *t* one by one (the characters of a string)."""
    yield from t


def ngrams(iter, size):
    """Yield every run of *size* consecutive items of *iter* as a tuple.

    Args:
        iter: Iterable of items. The name shadows the builtin but is kept
            so that existing keyword callers continue to work.
        size: Length of each n-gram.

    Yields:
        Tuples of length ``size``, advancing one item at a time. Nothing is
        yielded when the input has fewer than ``size`` items or when
        ``size`` is smaller than 1.
    """
    if size < 1:
        return
    # A bounded deque drops the oldest item automatically on append, which
    # avoids copying the window for every input item.
    window = deque(maxlen=size)
    for item in iter:
        window.append(item)
        if len(window) == size:
            yield tuple(window)


# The corpus is assumed to be UTF-8 text (TODO confirm); the explicit encoding
# keeps the result independent of the platform's default locale. The `with`
# block closes the handle as soon as the text has been read.
with open('News-Commentary-v16', 'r', encoding='utf-8') as file:
    content = file.read()

# Earlier experiments, currently disabled:
# log_rang_log_freq('pt-words-log-log', get_words(content))

# b = freq_list(get_characters(content))
# print(b)
# a = list(islice(b, 0, 10))
# print(a)
# log_rang_log_freq('pt-chars-log-log', get_characters(content))

# ngram_list = list(islice(ngrams(get_words(content), 3), 0, 100 ))
# print(ngram_list)

log_rang_log_freq('pt-3_unigram_chars-log-log', ngrams(get_characters(content), 3))
log_rang_log_freq('pt-3_unigram_words-log-log', ngrams(get_words(content), 3))