second try

This commit is contained in:
Mikołaj Pokrywka 2023-04-04 23:33:33 +02:00
parent 9a1a5628a9
commit c7d96f1597
3 changed files with 4706 additions and 4697 deletions

File diff suppressed because it is too large Load Diff

33
run.py
View File

@ -28,9 +28,10 @@ def ngrams(iter, size):
yield tuple(ngram) yield tuple(ngram)
ngram = ngram[1:] ngram = ngram[1:]
PREFIX_TRAIN = 'dev-0' PREFIX_TRAIN = 'train'
trainset = '' words = []
counter_lines = 0
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected: with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
for t_line, e_line in zip(train, expected): for t_line, e_line in zip(train, expected):
t_line = t_line.decode("utf-8") t_line = t_line.decode("utf-8")
@ -42,15 +43,15 @@ with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}
t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1] t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]
words += t_line_cleared.split()
trainset += t_line_cleared + ' ' counter_lines+=1
if counter_lines > 90000:
break
trainset = trainset.replace('\\n', ' ')
# lzmaFile = lzma.open('dev-0/in.tsv.xz', 'rb') # lzmaFile = lzma.open('dev-0/in.tsv.xz', 'rb')
# content = lzmaFile.read().decode("utf-8") # content = lzmaFile.read().decode("utf-8")
words = trainset.split()
# words = get_words(trainset) # words = get_words(trainset)
ngrams_ = ngrams(words, 2) ngrams_ = ngrams(words, 2)
@ -59,7 +60,7 @@ ngrams_ = ngrams(words, 2)
def create_probabilities_bigrams(w_c, b_c): def create_probabilities_bigrams(w_c, b_c):
probabilities_bigrams = {} probabilities_bigrams = {}
for bigram, bigram_amount in b_c.items(): for bigram, bigram_amount in b_c.items():
if bigram_amount <=6: if bigram_amount <=2:
continue continue
p_word_before = bigram_amount / w_c[bigram[0]] p_word_before = bigram_amount / w_c[bigram[0]]
p_word_after = bigram_amount / w_c[bigram[1]] p_word_after = bigram_amount / w_c[bigram[1]]
@ -67,18 +68,17 @@ def create_probabilities_bigrams(w_c, b_c):
return probabilities_bigrams return probabilities_bigrams
words_c = Counter(words) words_c = Counter(words)
word_=''
bigram_c = Counter(ngrams_) bigram_c = Counter(ngrams_)
ngrams_=''
probabilities = create_probabilities_bigrams(words_c, bigram_c) probabilities = create_probabilities_bigrams(words_c, bigram_c)
items = probabilities.items() items = probabilities.items()
probabilities = OrderedDict(sorted(items, key=lambda t:t[1], reverse=True)) probabilities = OrderedDict(sorted(items, key=lambda t:t[1], reverse=True))
probabilities items=''
# sorted_by_freq = freq_list(ngrams) # sorted_by_freq = freq_list(ngrams)
# print(bigram_c)
PREFIX_VALID = 'test-A' PREFIX_VALID = 'test-A'
@ -137,7 +137,16 @@ with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as train:
for word_, p in probs_ordered.items(): for word_, p in probs_ordered.items():
if counter_>4: if counter_>4:
break break
result_string += f"{word_}:{str(p)} " re_ = re.search(r'\p{L}+', word_)
if re_:
word_cleared = re_.group(0)
result_string += f"{word_cleared}:{str(p)} "
else:
if result_string == '':
result_string = f"the:0.5 a:0.3 "
continue
counter_+=1 counter_+=1
result_string += ':0.1' result_string += ':0.1'
print(result_string) print(result_string)

File diff suppressed because it is too large Load Diff