challenging-america-word-ga.../train.py
Mikołaj Pokrywka 65d889d652 tetragram
2023-04-12 20:56:08 +02:00

112 lines
2.7 KiB
Python

import lzma
import matplotlib.pyplot as plt
from math import log
from collections import OrderedDict
from collections import Counter
import regex as re
from itertools import islice
import json
import tqdm
# Frequency threshold: a word whose corpus count is below this value is
# replaced by '<UNK>' when n-grams are built and is left out of the vocab file.
ignore_rare = 4000
# Tag inserted into the output file names (vocab_<tag>.txt, model_<tag>.tsv).
model_v = '4000'
def freq_list(g, top=None):
    """Count the items of ``g`` and return them ordered by falling frequency.

    Args:
        g: iterable of hashable items to count.
        top: when given, keep only the ``top`` most frequent items;
            ``None`` keeps every distinct item.

    Returns:
        An ``OrderedDict`` mapping item -> count, most frequent first.
        Items with equal counts stay in first-seen order.
    """
    # most_common() already yields (item, count) pairs in stable descending
    # order for both the limited and the unlimited case.
    ranked_pairs = Counter(g).most_common(top)
    return OrderedDict(ranked_pairs)
def get_words(t):
    """Lazily yield each token found in the text ``t``, in order of appearance.

    A token is a maximal run of characters accepted by the character class
    in the pattern below (Unicode letters, ASCII digits, hyphen, asterisk).
    """
    # The Unicode property class in the pattern needs the ``regex`` module,
    # which this file imports under the name ``re``.
    yield from (found.group(0) for found in re.finditer(r'[\p{L}0-9-\*]+', t))
def ngrams(iter, size, w_counter, min_count=None):
    """Yield every run of ``size`` consecutive tokens, masking rare tokens.

    Args:
        iter: iterable of tokens. (The name shadows the builtin; it is kept
            so existing callers are unaffected.)
        size: length of each n-gram. Values below 1 yield nothing.
        w_counter: mapping token -> frequency, read as ``w_counter[item]``.
        min_count: tokens whose frequency is below this value are emitted as
            ``'<UNK>'``. Defaults to the module-level ``ignore_rare``.

    Yields:
        Tuples of exactly ``size`` tokens, one per window position.
    """
    if min_count is None:
        # Backward-compatible default: the threshold configured for the script.
        min_count = ignore_rare
    if size < 1:
        # No window can ever be complete; stop instead of buffering tokens forever.
        return
    window = []
    for item in iter:
        window.append(item if w_counter[item] >= min_count else '<UNK>')
        if len(window) == size:
            yield tuple(window)
            # Slide the window forward by dropping its oldest token.
            del window[0]
# Directory holding the training input (in.tsv.xz) and the expected words.
PREFIX_TRAIN = 'train'
# Flat list of every lower-cased word read from the corpus, in reading order.
words = []
counter_lines = 0
# expected.tsv is read as UTF-8 explicitly so it matches the decoding applied to
# the compressed input instead of depending on the platform default encoding.
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, \
        open(f'{PREFIX_TRAIN}/expected.tsv', 'r', encoding='utf-8') as expected:
    for t_line, e_line in zip(train, expected):
        # Remove only the line terminator. A bare rstrip() would also remove a
        # trailing tab, dropping an empty last column and shifting the [-2]/[-1]
        # lookups below onto the wrong fields.
        t_line = t_line.decode("utf-8").rstrip('\r\n')
        e_line = e_line.rstrip()
        t_line_splitted_by_tab = t_line.split('\t')
        # Rebuild the full passage: second-to-last column, the expected word,
        # then the last column (presumably left/right context - confirm
        # against the dataset description).
        t_line_cleared = (
            t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]
        ).lower()
        # Keep runs of Unicode letters only; digits and punctuation are dropped.
        words.extend(re.findall(r'\p{L}+', t_line_cleared))
        if counter_lines % 100000 == 0:
            print(counter_lines)
        counter_lines += 1
        # Hard cap on the amount of training data held in memory.
        if counter_lines > 70000:  # 50000 12gb ram
            break
words_c = Counter(words)
# The vocabulary file lists every word frequent enough to escape '<UNK>' masking.
with open(f'vocab_{model_v}.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        word + '\n'
        for word, amount in words_c.items()
        if amount >= ignore_rare
    )
# Lazy generators; they are consumed later when the n-gram counts are built.
trigrams_ = ngrams(words, 3, words_c)
tetragrams_ = ngrams(words, 4, words_c)
def create_probabilities_bigrams(trigrams, tetragrams):
    """Turn raw 4-gram counts into a pair of conditional ratios per 4-gram.

    Args:
        trigrams: mapping 3-token tuple -> count.
        tetragrams: mapping 4-token tuple -> count.

    Returns:
        A dict keyed by 4-gram. Each value is a 2-tuple of strings:
        the 4-gram count divided by the count of its first three tokens,
        and the 4-gram count divided by the count of its last three tokens.
    """
    def _ratios(gram, count):
        # Denominators: the trigram without the last token, then the trigram
        # without the first token.
        given_prefix = count / trigrams[gram[:-1]]
        given_suffix = count / trigrams[gram[1:]]
        return str(given_prefix), str(given_suffix)

    return {gram: _ratios(gram, count) for gram, count in tetragrams.items()}
# Consume the lazy n-gram generators into frequency tables.
trigram_c = Counter(trigrams_)
word_ = ''  # not read anywhere in the visible script; kept so module state is unchanged
tetragrams_ = Counter(tetragrams_)
probabilities = create_probabilities_bigrams(trigram_c, tetragrams_)
items = probabilities.items()
# The ratios are stored as strings, so they are converted back to floats for
# ordering. Comparing the strings directly is lexicographic and would rank
# e.g. '1e-05' above '1.0'.
probabilities = OrderedDict(
    sorted(
        items,
        key=lambda entry: (float(entry[1][0]), float(entry[1][1])),
        reverse=True,
    )
)
# Drop the view so the unsorted dict it refers to can be garbage-collected.
items = ''
# One row per 4-gram: its four tokens followed by the two ratios, tab-separated.
with open(f'model_{model_v}.tsv', 'w', encoding='utf-8') as f:
    for tetragram, left_right_p in probabilities.items():
        f.write("\t".join(tetragram) + "\t" + "\t".join(left_right_p) + '\n')