tetragram

This commit is contained in:
Mikołaj Pokrywka 2023-04-12 20:56:08 +02:00
parent c7d96f1597
commit 65d889d652
5 changed files with 18144 additions and 18086 deletions

File diff suppressed because it is too large Load Diff

100
predict.py Normal file
View File

@@ -0,0 +1,100 @@
import lzma
import matplotlib.pyplot as plt
from math import log
from collections import OrderedDict
from collections import Counter
import regex as re
from itertools import islice
import json
import pdb

model_v = "4000"
PREFIX_VALID = 'test-A'

# Load the trained tetragram model: each tab-separated row holds four words
# followed by two probabilities (left-to-right and right-to-left).
probabilities = {}
with open(f'model_{model_v}.tsv', 'r') as f:
    for row in f:
        fields = row.rstrip().split('\t')
        probabilities[tuple(fields[:4])] = (float(fields[4]), float(fields[5]))

# Vocabulary of words frequent enough to be kept at training time.
with open(f"vocab_{model_v}.txt", 'r') as f:
    vocab = {entry.rstrip() for entry in f}
def count_probabilities(_probabilities, _chunk_left, _chunk_right, _vocab=None):
    """Rank candidate gap words using left-context tetragram probabilities.

    _probabilities: {(w1, w2, w3, w4): (P(w4|w1 w2 w3), P(w1|w2 w3 w4))}.
    _chunk_left / _chunk_right: up to three context words on each side of the
        gap; out-of-vocabulary words are replaced by '<UNK>' IN PLACE.
    _vocab: vocabulary set; defaults to the module-level ``vocab``.

    Returns an OrderedDict {candidate: score} sorted by score, descending.
    Only the left context contributes to the score; the 0.7 factor reserves
    probability mass for the ':0.x' fallback appended by the caller.
    """
    if _vocab is None:
        _vocab = vocab
    # Map each context list to <UNK> independently.  The original zipped the
    # two lists together, which silently skipped words whenever the contexts
    # had different lengths (e.g. near the start or end of a document).
    for chunk in (_chunk_left, _chunk_right):
        for index, word in enumerate(chunk):
            if word not in _vocab:
                chunk[index] = "<UNK>"
    best_ = {}
    for tetragram, probses in _probabilities.items():
        if tetragram[-1] == "<UNK>":
            # Never predict the <UNK> placeholder.  (The original returned
            # the unsorted dict here, aborting the whole scan.)
            continue
        if len(best_) > 2:
            # Three candidates are plenty.  (The original tested an always-
            # empty 'results_left' dict, so it never stopped early.)
            break
        if list(tetragram[:3]) == _chunk_left:
            if tetragram[-1] not in best_:
                best_[tetragram[-1]] = probses[0] * 0.7
    return OrderedDict(sorted(best_.items(), key=lambda t: t[1], reverse=True))
# Emit one prediction line per test row: up to five "word:prob" pairs plus a
# fixed left-over probability mass.  (Removed the 'a=1' debugger anchor and
# commented-out debug prints; constant strings no longer use f-prefixes.)
with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as train:
    for t_line in train:
        t_line = t_line.decode("utf-8")
        t_line = t_line.rstrip()
        t_line = t_line.lower()
        t_line_splitted_by_tab = t_line.split('\t')
        # Letters-only tokens from the contexts left and right of the gap.
        words_before = re.findall(r'\p{L}+', t_line_splitted_by_tab[-2])
        words_after = re.findall(r'\p{L}+', t_line_splitted_by_tab[-1])
        chunk_left = words_before[-3:]
        chunk_right = words_after[0:3]
        probs_ordered = count_probabilities(probabilities, chunk_left, chunk_right)
        if len(probs_ordered) == 0:
            # No tetragram matched: fall back to a fixed distribution.
            print("the:0.1 to:0.1 a:0.1 :0.7")
            continue
        result_string = ''
        counter_ = 0
        for word_, p in probs_ordered.items():
            if counter_ > 4:  # print at most five candidates
                break
            re_ = re.search(r'\p{L}+', word_)
            if re_:
                result_string += f"{re_.group(0)}:{str(p)} "
            else:
                if result_string == '':
                    result_string = "the:0.5 a:0.3 "
                continue
            counter_ += 1
        result_string += ':0.2'
        print(result_string)

153
run.py
View File

@@ -1,153 +0,0 @@
import lzma
import matplotlib.pyplot as plt
from math import log
from collections import OrderedDict
from collections import Counter
import regex as re
from itertools import islice
def freq_list(g, top=None):
    """Return an OrderedDict of item frequencies in *g*, most frequent first.

    When *top* is given, only the *top* most common items are kept.
    """
    counts = Counter(g)
    selected = counts.items() if top is None else counts.most_common(top)
    return OrderedDict(sorted(selected, key=lambda pair: -pair[1]))
def get_words(t):
    """Yield each token of letters, digits, '-' or '*' found in *t*."""
    for match in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield match.group(0)
def ngrams(iter, size):
    """Yield successive overlapping *size*-tuples from the iterable.

    NOTE: the parameter name 'iter' shadows the builtin; kept for
    interface compatibility with existing callers.
    """
    window = []
    for element in iter:
        window.append(element)
        if len(window) == size:
            yield tuple(window)
            del window[0]
PREFIX_TRAIN = 'train'
words = []
counter_lines = 0
# Build one flat word list from the first ~90k training rows, inserting the
# expected middle word between the left and right context columns of each row.
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        row = t_line.decode("utf-8").rstrip()
        gold = e_line.rstrip()
        columns = row.split('\t')
        words += (columns[-2] + ' ' + gold + ' ' + columns[-1]).split()
        counter_lines += 1
        if counter_lines > 90000:  # cap the slice to keep memory bounded
            break
ngrams_ = ngrams(words, 2)
def create_probabilities_bigrams(w_c, b_c):
    """Map each bigram seen more than twice to its conditional probabilities.

    w_c: word -> count; b_c: (w1, w2) -> count.
    Returns {(w1, w2): (count / count(w1), count / count(w2))}.
    """
    result = {}
    for pair, amount in b_c.items():
        if amount > 2:
            result[pair] = (amount / w_c[pair[0]], amount / w_c[pair[1]])
    return result
words_c = Counter(words)
word_ = ''
bigram_c = Counter(ngrams_)
ngrams_ = ''  # drop the exhausted generator reference to free memory
probabilities = create_probabilities_bigrams(words_c, bigram_c)
# Order bigrams by their (before, after) probability pair, highest first.
probabilities = OrderedDict(sorted(probabilities.items(), key=lambda t: t[1], reverse=True))
items = ''
PREFIX_VALID = 'test-A'
def count_probabilities(w_b, w_a, probs, w_c, b_c):
    """Score candidate middle words from bigram probabilities.

    w_b: word immediately before the gap; w_a: word immediately after.
    probs: {(w1, w2): (P(pair|w1), P(pair|w2))}, pre-sorted by value so the
        first matches found are the most probable ones.
    w_c, b_c: word/bigram counters — currently unused, kept for interface
        compatibility with existing call sites.

    Returns an OrderedDict {candidate: score} sorted by score, descending.
    (Removed the 'a=1' debugger anchors; logic is otherwise unchanged.)
    """
    results_before = {}
    results_after = {}
    # Collect up to ~20 bigrams starting with w_b and up to ~20 ending in w_a.
    for bigram, probses in probs.items():
        if len(results_before) > 20 or len(results_after) > 20:
            break
        if w_b == bigram[0]:
            results_before[bigram] = probses[0]
        if w_a == bigram[1]:
            results_after[bigram] = probses[1]
    best_ = {}
    # Cross left-side candidates with right-side ones.  NOTE(review): each
    # best_[key] keeps only the product with the LAST right-side entry —
    # presumably intentional approximation, verify against expected output.
    for bigram, probses in results_before.items():
        for bigram_2, probses_2 in results_after.items():
            best_[bigram[1]] = probses * probses_2
    for bigram, probses in results_after.items():
        for bigram_2, probses_2 in results_before.items():
            if bigram[0] in best_:
                # Keep the existing score when the new product is smaller
                # than the right-side probability alone.
                if probses * probses_2 < probses_2:
                    continue
            best_[bigram[0]] = probses * probses_2
    return OrderedDict(sorted(best_.items(), key=lambda t: t[1], reverse=True))
# Emit one prediction line per test row: up to five "word:prob" pairs plus a
# fixed left-over mass.  (Removed the 'a=1' debugger anchor; constant strings
# no longer carry pointless f-prefixes.)
with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as train:
    for t_line in train:
        t_line = t_line.decode("utf-8")
        t_line = t_line.rstrip()
        t_line = t_line.replace('\\n', ' ')
        t_line_splitted_by_tab = t_line.split('\t')
        words_pre = t_line_splitted_by_tab[-2].split()
        words_po = t_line_splitted_by_tab[-1].split()
        # NOTE(review): assumes both context columns are non-empty — an empty
        # column would raise IndexError here; confirm against the dataset.
        w_pre = words_pre[-1]
        w_po = words_po[0]
        probs_ordered = count_probabilities(w_pre, w_po, probabilities, words_c, bigram_c)
        if len(probs_ordered) == 0:
            print("the:0.5 a:0.3 :0.2")
            continue
        result_string = ''
        counter_ = 0
        for word_, p in probs_ordered.items():
            if counter_ > 4:  # print at most five candidates
                break
            re_ = re.search(r'\p{L}+', word_)
            if re_:
                result_string += f"{re_.group(0)}:{str(p)} "
            else:
                if result_string == '':
                    result_string = "the:0.5 a:0.3 "
                continue
            counter_ += 1
        result_string += ':0.1'
        print(result_string)

File diff suppressed because it is too large Load Diff

111
train.py Normal file
View File

@@ -0,0 +1,111 @@
import lzma
import matplotlib.pyplot as plt
from math import log
from collections import OrderedDict
from collections import Counter
import regex as re
from itertools import islice
import json
import tqdm
# Frequency threshold: tokens seen fewer than this many times in the training
# slice become '<UNK>' and are excluded from the saved vocabulary.
ignore_rare = 4000
# Version tag embedded in the vocab_*.txt / model_*.tsv output file names.
model_v = '4000'
def freq_list(g, top=None):
    """Frequency table of *g* as an OrderedDict, most frequent entries first.

    With *top* given, only the *top* most common entries are included.
    """
    tally = Counter(g)
    chosen = tally.most_common(top) if top is not None else tally.items()
    return OrderedDict(sorted(chosen, key=lambda kv: -kv[1]))
def get_words(t):
    """Yield tokens made of letters, digits, '-' or '*' appearing in *t*."""
    for hit in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield hit.group(0)
def ngrams(iter, size, w_counter, rare_threshold=None):
    """Yield overlapping *size*-tuples, replacing rare words with '<UNK>'.

    iter: iterable of tokens (parameter name shadows the builtin; kept for
        compatibility with existing callers).
    size: n-gram length.
    w_counter: token -> count mapping used for the rarity test.
    rare_threshold: tokens with count below this become '<UNK>'; defaults to
        the module-level ``ignore_rare`` (previously hard-coded), so existing
        call sites behave exactly as before.
    """
    if rare_threshold is None:
        rare_threshold = ignore_rare
    window = []
    for item in iter:
        window.append('<UNK>' if w_counter[item] < rare_threshold else item)
        if len(window) == size:
            yield tuple(window)
            window = window[1:]
PREFIX_TRAIN = 'train'
words = []
counter_lines = 0
# Stream training rows with their expected middle words and collect a flat
# token list: "<left context> <expected word> <right context>", lowercased,
# letters-only tokens.
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        row = t_line.decode("utf-8").rstrip()
        gold = e_line.rstrip()
        columns = row.split('\t')
        sentence = (columns[-2] + ' ' + gold + ' ' + columns[-1]).lower()
        words += re.findall(r'\p{L}+', sentence)
        if counter_lines % 100000 == 0:
            print(counter_lines)  # progress indicator
        counter_lines += 1
        if counter_lines > 70000:  # cap the slice to keep memory bounded
            break
words_c = Counter(words)
# Persist the vocabulary: every token seen at least `ignore_rare` times.
with open(f'vocab_{model_v}.txt', 'w') as f:
    for word, amount in words_c.items():
        if amount >= ignore_rare:
            f.write(word + '\n')
trigrams_ = ngrams(words, 3, words_c)
tetragrams_ = ngrams(words, 4, words_c)
def create_probabilities_bigrams(trigrams, tetragrams):
    """Compute conditional probabilities for every tetragram.

    Despite the legacy name, this works on tetragrams: for each 4-gram it
    returns (P(w4 | w1 w2 w3), P(w1 | w2 w3 w4)), both stringified for
    direct TSV output.

    trigrams: (w1, w2, w3) -> count; tetragrams: (w1, w2, w3, w4) -> count.
    """
    probabilities_grams = {}
    for tetragram, gram_amount in tetragrams.items():
        right = gram_amount / trigrams[tetragram[:-1]]
        left = gram_amount / trigrams[tetragram[1:]]
        probabilities_grams[tetragram] = (str(right), str(left))
    return probabilities_grams
trigram_c = Counter(trigrams_)
word_ = ''
tetragrams_ = Counter(tetragrams_)
probabilities = create_probabilities_bigrams(trigram_c, tetragrams_)
# Sort by the stringified probability pair, descending.  NOTE(review): this
# compares strings, not numbers — ordering is lexicographic; the model file
# is reloaded into a dict anyway, so only the file layout is affected.
probabilities = OrderedDict(sorted(probabilities.items(), key=lambda t: t[1], reverse=True))
items = ''
# One model row per tetragram: w1..w4 then both probabilities, tab-separated.
with open(f'model_{model_v}.tsv', 'w') as f:
    for tetragram, left_right_p in probabilities.items():
        f.write("\t".join(tetragram) + "\t" + "\t".join(left_right_p) + '\n')