Compare commits


3 Commits

Author SHA1 Message Date
Mikołaj Pokrywka
b647907ce4 fingers crossed for a good result 2023-04-13 21:33:24 +02:00
Mikołaj Pokrywka
9abd651db6 tetragram 2023-04-13 09:31:20 +02:00
Mikołaj Pokrywka
65d889d652 tetragram 2023-04-12 20:56:08 +02:00
5 changed files with 18248 additions and 18086 deletions

File diff suppressed because it is too large

predict.py Normal file (140 lines)

@@ -0,0 +1,140 @@
import lzma
import json
from collections import OrderedDict
import regex as re

model_v = "1"
PREFIX_VALID = 'test-A'

# Load the 4-gram model: each line holds a 3-word context followed by a
# JSON dictionary of next-word probabilities.
prob_4gram = {}
with open(f'4_gram_model_{model_v}.tsv', 'r') as f:
    for line in f:
        splitted_line = line.rstrip().split('\t')
        prob_4gram[tuple(splitted_line[:3])] = json.loads(splitted_line[-1])

# Lower-order models would be loaded the same way; back-off is currently disabled.
prob_3gram = {}
# with open(f'3_gram_model_{model_v}.tsv', 'r') as f:
#     for line in f:
#         splitted_line = line.rstrip().split('\t')
#         prob_3gram[tuple(splitted_line[:2])] = json.loads(splitted_line[-1])

prob_2gram = {}
# with open(f'2_gram_model_{model_v}.tsv', 'r') as f:
#     for line in f:
#         splitted_line = line.rstrip().split('\t')
#         prob_2gram[tuple(splitted_line[0])] = json.loads(splitted_line[-1])

# Vocabulary kept at training time; everything else maps to <UNK>.
vocab = set()
with open(f"vocab_{model_v}.txt", 'r') as f:
    for l in f:
        vocab.add(l.rstrip())

# probabilities_bi = {}
# with open(f'bigram_big_unk_20', 'r') as f:
#     for line in f:
#         splitted_line = line.rstrip().split('\t')
#         probabilities_bi[tuple(splitted_line[:2])] = (float(splitted_line[2]), float(splitted_line[3]))


def count_probabilities(prob_4gram_x, prob_3gram_x, prob_2gram_x, _chunk_left, _chunk_right):
    # Map out-of-vocabulary words to <UNK> on both sides of the gap.
    for index, (l, r) in enumerate(zip(_chunk_left, _chunk_right)):
        if l not in vocab:
            _chunk_left[index] = "<UNK>"
        if r not in vocab:
            _chunk_right[index] = "<UNK>"
    _chunk_left = tuple(_chunk_left)
    _chunk_right = tuple(_chunk_right)
    # Only the 4-gram model is consulted; the 3- and 2-gram back-off is commented out.
    hyps_4 = prob_4gram_x.get(_chunk_left)
    if hyps_4 is None:
        return {}
    return OrderedDict(sorted(hyps_4.items(), key=lambda t: t[1], reverse=True))


with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as valid:
    for t_line in valid:
        t_line = t_line.decode("utf-8").rstrip().lower()
        t_line = t_line.replace("\\\\n", ' ')
        t_line_splitted_by_tab = t_line.split('\t')
        words_before = re.findall(r'\p{L}+', t_line_splitted_by_tab[-2])
        words_after = re.findall(r'\p{L}+', t_line_splitted_by_tab[-1])
        chunk_left = words_before[-3:]
        chunk_right = words_after[0:3]
        probs_ordered = count_probabilities(prob_4gram, prob_3gram, prob_2gram, chunk_left, chunk_right)
        if len(probs_ordered) == 0:
            # No matching context: fall back to a fixed distribution.
            print("the:0.1 to:0.1 a:0.1 :0.7")
            continue
        result_string = ''
        counter_ = 0
        p_sum = 0
        # Emit up to ~30 hypotheses, discounting each probability by 0.9
        # and giving the remaining mass to the empty (unknown) token.
        for word_, p in probs_ordered.items():
            if counter_ > 30:
                break
            re_ = re.search(r'\p{L}+', word_)
            if re_:
                word_cleared = re_.group(0)
                p = p * 0.9
                p_sum += p
                result_string += f"{word_cleared}:{p} "
            else:
                if result_string == '':
                    result_string = "the:0.5 a:0.3 "
                continue
            counter_ += 1
        result_string += f':{1 - p_sum}'
        print(result_string)
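For reference, each line predict.py prints is one hypothesis distribution in the challenge's `word:prob` format, with a trailing bare `:prob` field carrying the leftover probability mass. A minimal sketch of reading such a line back; the helper name `parse_distribution` is illustrative, not part of this repo:

def parse_distribution(line):
    """Parse 'the:0.5 a:0.3 :0.2' into ({'the': 0.5, 'a': 0.3}, 0.2)."""
    probs, rest = {}, 0.0
    for field in line.split():
        word, _, p = field.rpartition(':')
        if word:
            probs[word] = float(p)
        else:
            rest = float(p)  # a bare ':p' field is the unassigned remainder
    return probs, rest

assert parse_distribution("the:0.5 a:0.3 :0.2") == ({'the': 0.5, 'a': 0.3}, 0.2)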

run.py (deleted, 153 lines)

@@ -1,153 +0,0 @@
import lzma
from collections import OrderedDict
from collections import Counter
import regex as re


def freq_list(g, top=None):
    c = Counter(g)
    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)
    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def get_words(t):
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


def ngrams(iter, size):
    # Slide a window of `size` words over the stream.
    ngram = []
    for item in iter:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


PREFIX_TRAIN = 'train'
words = []
counter_lines = 0
# Rebuild each training sentence by gluing the left context, the expected
# word and the right context, then collect words from the first ~90k lines.
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        t_line = t_line.decode("utf-8").rstrip()
        e_line = e_line.rstrip()
        t_line_splitted_by_tab = t_line.split('\t')
        t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]
        words += t_line_cleared.split()
        counter_lines += 1
        if counter_lines > 90000:
            break

ngrams_ = ngrams(words, 2)


def create_probabilities_bigrams(w_c, b_c):
    # For every bigram seen more than twice, store P(w2|w1) and P(w1|w2).
    probabilities_bigrams = {}
    for bigram, bigram_amount in b_c.items():
        if bigram_amount <= 2:
            continue
        p_word_before = bigram_amount / w_c[bigram[0]]
        p_word_after = bigram_amount / w_c[bigram[1]]
        probabilities_bigrams[bigram] = (p_word_before, p_word_after)
    return probabilities_bigrams


words_c = Counter(words)
bigram_c = Counter(ngrams_)
ngrams_ = ''  # free memory
probabilities = create_probabilities_bigrams(words_c, bigram_c)
probabilities = OrderedDict(sorted(probabilities.items(), key=lambda t: t[1], reverse=True))

PREFIX_VALID = 'test-A'


def count_probabilities(w_b, w_a, probs, w_c, b_c):
    # Collect up to ~20 bigrams that end at the gap (w_b, m) and start
    # after it (m, w_a), then score each middle word m by the product of
    # both directional probabilities.
    results_before = {}
    results_after = {}
    for bigram, probses in probs.items():
        if len(results_before) > 20 or len(results_after) > 20:
            break
        if w_b == bigram[0]:
            results_before[bigram] = probses[0]
        if w_a == bigram[1]:
            results_after[bigram] = probses[1]
    best_ = {}
    for bigram, probses in results_before.items():
        for bigram_2, probses_2 in results_after.items():
            best_[bigram[1]] = probses * probses_2
    for bigram, probses in results_after.items():
        for bigram_2, probses_2 in results_before.items():
            if bigram[0] in best_:
                # Keep the highest product seen so far.
                if probses * probses_2 < best_[bigram[0]]:
                    continue
            best_[bigram[0]] = probses * probses_2
    return OrderedDict(sorted(best_.items(), key=lambda t: t[1], reverse=True))


with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as valid:
    for t_line in valid:
        t_line = t_line.decode("utf-8").rstrip()
        t_line = t_line.replace('\\n', ' ')
        t_line_splitted_by_tab = t_line.split('\t')
        w_pre = t_line_splitted_by_tab[-2].split()[-1]
        w_po = t_line_splitted_by_tab[-1].split()[0]
        probs_ordered = count_probabilities(w_pre, w_po, probabilities, words_c, bigram_c)
        if len(probs_ordered) == 0:
            print("the:0.5 a:0.3 :0.2")
            continue
        result_string = ''
        counter_ = 0
        for word_, p in probs_ordered.items():
            if counter_ > 4:
                break
            re_ = re.search(r'\p{L}+', word_)
            if re_:
                word_cleared = re_.group(0)
                result_string += f"{word_cleared}:{p} "
            else:
                if result_string == '':
                    result_string = "the:0.5 a:0.3 "
                continue
            counter_ += 1
        result_string += ':0.1'
        print(result_string)
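The deleted run.py scored each candidate middle word m for a gap `left _ right` as P(m | left) * P(m | right), taking both factors from the bigram table. A toy worked example of that scoring rule; all counts and words here are made up for illustration:

from collections import Counter

# Hypothetical counts from a toy corpus.
word_c = Counter({'in': 10, 'the': 8, 'house': 3, 'garden': 2})
bigram_c = Counter({('in', 'the'): 6, ('the', 'house'): 3, ('the', 'garden'): 2})

# Gap: "in _ house" -> candidate middle word 'the'.
p_given_left = bigram_c[('in', 'the')] / word_c['in']         # P(the | in) = 0.6
p_given_right = bigram_c[('the', 'house')] / word_c['house']  # P(the | house) = 1.0
score = p_given_left * p_given_right                          # 0.6, as stored in run.py's best_
print(score)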

File diff suppressed because it is too large

train.py Normal file (175 lines)

@@ -0,0 +1,175 @@
import lzma
import json
from collections import OrderedDict
from collections import Counter
import regex as re

# Frequency threshold below which a word is replaced by <UNK>.
# Measured perplexity: 7500 -> 511.51, 9000 -> 505, 15000 -> 503.
ignore_rare = 15000
model_v = '1'


def freq_list(g, top=None):
    c = Counter(g)
    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)
    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def get_words(t):
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


def ngrams(iter, size, w_counter):
    # Slide a window of `size` words, mapping rare words to <UNK>.
    ngram = []
    for item in iter:
        if w_counter[item] < ignore_rare:
            ngram.append('<UNK>')
        else:
            ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


PREFIX_TRAIN = 'train'
words = []
counter_lines = 0
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        t_line = t_line.decode("utf-8").rstrip()
        e_line = e_line.rstrip()
        t_line_splitted_by_tab = t_line.split('\t')
        # Glue left context, expected word and right context back together.
        t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]
        t_line_cleared = t_line_cleared.lower()
        t_line_cleared = t_line_cleared.replace("\\\\n", ' ')
        words += re.findall(r'\p{L}+', t_line_cleared)
        if counter_lines % 100000 == 0:
            print(counter_lines)
        counter_lines += 1
        if counter_lines > 130000:  # 50000 lines already takes about 12 GB of RAM
            break

words_c = Counter(words)

# Save the vocabulary of words frequent enough to keep.
with open(f'vocab_{model_v}.txt', 'w') as f:
    for word, amount in words_c.items():
        if amount < ignore_rare:
            continue
        f.write(word + '\n')


def create_model(grams4, trigrams):
    # P(last word | preceding n-1 words) = count(n-gram) / count(prefix).
    model = {}
    for gram4, amount4 in grams4.items():
        trigram = gram4[:-1]
        last_word = gram4[-1]
        if last_word == "<UNK>":
            continue
        probability = amount4 / trigrams[trigram]
        if trigram in model:
            model[trigram][last_word] = probability
            continue
        model[trigram] = {last_word: probability}
    return model


def create_bigram_model(bigram_x, word_c_x):
    model = {}
    for bigram, amount in bigram_x.items():
        word_key = bigram[0]
        last_word = bigram[1]
        if last_word == "<UNK>" or word_key == "<UNK>":
            continue
        try:
            probability = amount / word_c_x[word_key]
        except KeyError:
            print(bigram)
            print(word_key)
            print(last_word)
            raise
        if word_key in model:
            model[word_key][last_word] = probability
            continue
        model[word_key] = {last_word: probability}
    return model


# ========= 4-gram
trigrams_ = ngrams(words, 3, words_c)
tetragrams_ = ngrams(words, 4, words_c)
trigram_c = Counter(trigrams_)
trigrams_ = ''  # free memory
tetragrams_c = Counter(tetragrams_)
tetragrams_ = ''
model = create_model(tetragrams_c, trigram_c)
with open(f'4_gram_model_{model_v}.tsv', 'w') as f:
    for trigram, hyps in model.items():
        f.write("\t".join(trigram) + "\t" + json.dumps(hyps) + '\n')

# ========= Trigram
model = ''
trigrams_ = ngrams(words, 3, words_c)
bigrams_ = ngrams(words, 2, words_c)
trigram_c = Counter(trigrams_)
trigrams_ = ''
bigram_c = Counter(bigrams_)
bigrams_ = ''
model = create_model(trigram_c, bigram_c)
trigram_c = ''
with open(f'3_gram_model_{model_v}.tsv', 'w') as f:
    for bigram, hyps in model.items():
        f.write("\t".join(bigram) + "\t" + json.dumps(hyps) + '\n')

# ========= Bigram
model = ''
bigrams_ = ngrams(words, 2, words_c)
bigram_c = Counter(bigrams_)
bigrams_ = ''
model = create_bigram_model(bigram_c, words_c)
with open(f'2_gram_model_{model_v}.tsv', 'w') as f:
    for word_key, hyps in model.items():
        f.write(word_key + "\t" + json.dumps(hyps) + '\n')
model = ''
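All three model files written by train.py share one TSV layout: tab-separated context words followed by a JSON dictionary of next-word probabilities, which predict.py splits on tabs and json-decodes. A tiny round-trip sketch; the example line content is invented:

import json

# One line of 4_gram_model_1.tsv as train.py would write it (made-up values).
line = "i\tlive\tin\t" + json.dumps({"the": 0.61, "a": 0.39})

# predict.py's loading logic, reduced to a single line.
fields = line.rstrip().split('\t')
context, hyps = tuple(fields[:3]), json.loads(fields[-1])
assert context == ("i", "live", "in") and hyps["the"] == 0.61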