# N-gram (trigram/tetragram) language-model training script:
# builds a vocabulary and a tetragram probability model from a TSV corpus.
import lzma
|
|
import matplotlib.pyplot as plt
|
|
from math import log
|
|
from collections import OrderedDict
|
|
from collections import Counter
|
|
import regex as re
|
|
from itertools import islice
|
|
import json
|
|
import tqdm
|
|
# Words seen fewer than this many times are treated as rare: replaced by
# the '<UNK>' sentinel in n-grams and excluded from the vocabulary file.
ignore_rare = 4000

# Version tag embedded in the output filenames (vocab_<v>.txt, model_<v>.tsv).
model_v = '4000'
|
|
|
|
|
|
def freq_list(g, top=None):
    """Return an OrderedDict mapping item -> count, sorted by descending count.

    g: any iterable of hashable items.
    top: if given, keep only the `top` most frequent items; None keeps all.
    """
    # Counter.most_common already returns (item, count) pairs in descending
    # count order — for top=None as well — with ties kept in first-seen order,
    # so the original explicit re-sort was redundant.
    return OrderedDict(Counter(g).most_common(top))
|
|
|
|
def get_words(t):
    """Yield each token of *t* that is a run of letters, digits, '-' or '*'.

    Relies on the third-party `regex` module (imported as `re`) for the
    Unicode letter class \p{L}.
    """
    yield from re.findall(r'[\p{L}0-9-\*]+', t)
|
|
|
|
def ngrams(tokens, size, w_counter, min_count=None):
    """Slide a window of length `size` over `tokens`, yielding n-gram tuples.

    Tokens whose corpus frequency in `w_counter` is below the rarity
    threshold are replaced by the '<UNK>' sentinel.

    tokens: iterable of tokens (renamed from `iter`, which shadowed the builtin).
    size: window length (3 for trigrams, 4 for tetragrams).
    w_counter: mapping token -> corpus frequency (e.g. a Counter).
    min_count: rarity threshold; defaults to the module-level `ignore_rare`
        so existing three-argument callers behave exactly as before.
    """
    if min_count is None:
        min_count = ignore_rare
    ngram = []
    for item in tokens:
        # Map rare tokens to the sentinel before they enter the window.
        ngram.append(item if w_counter[item] >= min_count else '<UNK>')
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]  # slide the window by one token
|
|
|
|
# Directory holding the training data (in.tsv.xz + expected.tsv).
PREFIX_TRAIN = 'train'
# Flat token stream accumulated over all training lines.
words = []

counter_lines = 0
# in.tsv.xz is xz-compressed and read as bytes (hence the decode below);
# expected.tsv holds the target (gap) word for each input line.
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        t_line = t_line.decode("utf-8")

        t_line = t_line.rstrip()
        e_line = e_line.rstrip()

        t_line_splitted_by_tab = t_line.split('\t')

        # Rebuild the full sentence: left context + expected word + right
        # context (assumes the last two TSV fields are the two contexts —
        # TODO confirm against the corpus format).
        t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]

        t_line_cleared = t_line_cleared.lower()

        # Tokenize on runs of Unicode letters (\p{L} needs the `regex` module).
        words += re.findall(r'\p{L}+', t_line_cleared)

        # t_line_splitted = t_line_cleared.split()

        # words += t_line_splitted

        # NOTE(review): with the 70000-line cap below this progress print
        # can only ever fire once, at line 0.
        if counter_lines % 100000 == 0:

            print(counter_lines)

        counter_lines+=1
        # Cap the corpus size to bound memory use.
        if counter_lines > 70000: # 50000 12gb ram

            break
|
|
|
|
# Corpus-wide word frequencies.
words_c = Counter(words)

# Persist the vocabulary: one line per word at or above the rarity threshold.
with open(f'vocab_{model_v}.txt', 'w') as f:
    f.writelines(word + '\n' for word in words_c if words_c[word] >= ignore_rare)


# Lazy sliding-window generators over the token stream (consumed once each
# by the Counter calls further down).
trigrams_ = ngrams(words, 3, words_c)
tetragrams_ = ngrams(words, 4, words_c)
|
|
|
|
|
|
def create_probabilities_bigrams(trigrams, tetragrams):
    """Compute conditional probabilities for every tetragram.

    For each tetragram with count c, pairs it with
    (c / count of its trigram prefix, c / count of its trigram suffix),
    i.e. P(last word | first three) and P(first word | last three),
    both stringified for TSV output.

    trigrams: mapping trigram tuple -> count.
    tetragrams: mapping tetragram tuple -> count.
    Returns a dict tetragram -> (str(p_right), str(p_left)).
    """
    return {
        gram: (str(amount / trigrams[gram[:-1]]),
               str(amount / trigrams[gram[1:]]))
        for gram, amount in tetragrams.items()
    }
|
|
|
|
# Materialize the n-gram counts (this consumes the generators).
trigram_c = Counter(trigrams_)
word_ = ''  # NOTE(review): appears unused; kept in case another module reads it
tetragrams_ = Counter(tetragrams_)
probabilities = create_probabilities_bigrams(trigram_c, tetragrams_)


items = probabilities.items()
# Sort by descending probability pair. The stored probabilities are strings,
# so compare their float values — a plain string comparison misorders
# values like '6.25e-05' vs '0.5'.
probabilities = OrderedDict(
    sorted(items, key=lambda t: (float(t[1][0]), float(t[1][1])), reverse=True)
)
items = ''  # release the reference to keep peak memory down


# Write the model: tetragram words then the two probabilities, tab-separated.
with open(f'model_{model_v}.tsv', 'w') as f:
    for tetragram, left_right_p in probabilities.items():
        f.write("\t".join(tetragram) + "\t" + "\t".join(left_right_p) + '\n')
|
|
|