upload script
This commit is contained in:
parent
84f20fa9ac
commit
767978c654
166
script.py
Normal file
166
script.py
Normal file
@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# TRIGRAM MODEL - the two preceding words are taken into account
|
||||
|
||||
# In[4]:
|
||||
|
||||
|
||||
import lzma
|
||||
import csv
|
||||
import re
|
||||
import math
|
||||
|
||||
|
||||
# In[5]:
|
||||
|
||||
|
||||
def read_data(folder_name):
    """Load context pairs and expected words from a dataset folder.

    Reads ``{folder_name}/in.tsv.xz`` (xz-compressed, UTF-8, tab-separated)
    and ``{folder_name}/expected.tsv``.

    Args:
        folder_name: Directory containing ``in.tsv.xz`` and ``expected.tsv``.

    Returns:
        A ``(data, words)`` tuple. ``data`` is a list of two-element lists
        built from columns 6 and 7 of every input row, with each literal
        backslash-plus-"n" sequence replaced by a space. ``words`` holds the
        first column of every row of ``expected.tsv`` (an empty string for
        an empty row).

    Raises:
        OSError: If either file cannot be opened.
        IndexError: If an input row has fewer than eight columns.
    """
    # Context manager so the compressed file handle is always released.
    with lzma.open(f'{folder_name}/in.tsv.xz') as archive:
        lines = archive.read().decode('UTF-8').split('\n')
    # Drop only the empty element produced by a trailing newline; a file
    # without a trailing newline keeps its last record.
    if lines and lines[-1] == '':
        lines.pop()

    data = []
    for line in lines:
        columns = line.split('\t')
        data.append([columns[6].replace('\\n', ' '),
                     columns[7].replace('\\n', ' ')])

    # QUOTE_NONE: the input file is split on raw tabs with no quote handling,
    # so the expected words are read the same way. With default quoting a
    # word consisting of a double quote would swallow the following lines.
    with open(f'{folder_name}/expected.tsv', encoding='utf-8', newline='') as file:
        tsv_file = csv.reader(file, delimiter="\t", quoting=csv.QUOTE_NONE)
        words = [row[0] if row else '' for row in tsv_file]

    return data, words
|
||||
|
||||
# Load the training contexts and their expected gap words from the 'train' folder.
train_data, train_words = read_data('train')
|
||||
|
||||
|
||||
# In[10]:
|
||||
|
||||
|
||||
def print_example(data, words, idx):
    """Print sample ``idx`` with its expected word upper-cased in the gap.

    Args:
        data: Sequence of ``[left_context, right_context]`` pairs.
        words: Sequence of expected words, parallel to ``data``.
        idx: Index of the sample to print.
    """
    left_context = data[idx][0]
    gap_word = words[idx].upper()
    right_context = data[idx][1]
    # The gap word is framed by five underscores on each side.
    print('{} _____{}_____ {}'.format(left_context, gap_word, right_context))
|
||||
|
||||
# Print training sample 13 as a quick visual check of the loaded data.
print_example(train_data, train_words, 13)
|
||||
|
||||
|
||||
# In[26]:
|
||||
|
||||
|
||||
def generate_N_grams(text, ngram=1, no_punctuation=True):
    """Split ``text`` into lower-cased word n-grams.

    Args:
        text: Input string.
        ngram: Number of consecutive words per n-gram.
        no_punctuation: When true, parentheses, periods, commas and hyphens
            are replaced by spaces before tokenizing.

    Returns:
        A list of strings, each made of ``ngram`` consecutive words joined by
        a single space. Empty when the text has fewer than ``ngram`` words.
    """
    # A hyphen followed by a space is removed first, which glues the two
    # surrounding fragments together.
    normalized = re.sub(r'[\-] ', '', text).lower()
    if no_punctuation:
        normalized = re.sub(r'[\)\(\.\,\-]', ' ', normalized)
    tokens = normalized.split()
    # zip over the token list shifted by 0..ngram-1 yields every window.
    shifted_views = (tokens[offset:] for offset in range(0, ngram))
    return [' '.join(window) for window in zip(*shifted_views)]
|
||||
|
||||
# Collect the bigrams and trigrams of every training sentence (left context,
# expected word, right context joined by spaces).
# TODO: fix - only the first 2000 training samples are used.
N_grams = []
for i, contexts in enumerate(train_data[:2000]):
    sentence = f'{contexts[0]} {train_words[i]} {contexts[1]}'
    for order in (2, 3):
        N_grams.extend(generate_N_grams(sentence, order))
|
||||
|
||||
|
||||
# In[27]:
|
||||
|
||||
|
||||
def check_prob(N_grams):
    """Estimate the probability of each last word given the words before it.

    Every n-gram string is split at its last whitespace into a context (all
    words but the last) and a continuation (the last word). Counts are then
    normalized per context so the continuations of one context sum to 1.

    Args:
        N_grams: Iterable of whitespace-separated n-gram strings.

    Returns:
        A dict mapping each context string to a dict of
        ``{continuation: relative frequency}``.
    """
    count = {}
    for gram in N_grams:
        parts = gram.rsplit(maxsplit=1)
        if len(parts) < 2:
            # A single word (or an empty string) has no context/continuation
            # pair to count; skip it instead of raising IndexError.
            continue
        context, continuation = parts
        followers = count.setdefault(context, {})
        followers[continuation] = followers.get(continuation, 0) + 1

    # Turn the raw counts into relative frequencies, context by context.
    for followers in count.values():
        total = sum(followers.values())
        for continuation in followers:
            followers[continuation] = followers[continuation] / total

    return count
|
||||
|
||||
# Continuation probabilities for every bigram/trigram context collected above;
# find_word reads this table as a module-level global.
probs = check_prob(N_grams)
|
||||
|
||||
|
||||
# In[28]:
|
||||
|
||||
|
||||
# Load the dev-0 contexts and their expected gap words.
dev_data, dev_words = read_data('dev-0')
|
||||
|
||||
|
||||
# In[29]:
|
||||
|
||||
|
||||
def find_word(word_1, word_2, model=None):
    """Pick the most likely next word and format it as an output line.

    Candidates are the continuations recorded for ``word_1``. When ``word_2``
    is also a known context, each candidate's probability is multiplied by
    its probability under ``word_2``; a candidate unseen after ``word_2`` is
    multiplied by one tenth of the smallest probability recorded for
    ``word_2`` instead. When ``word_2`` is unknown, the ``word_1``
    probabilities are used unchanged.

    Args:
        word_1: First context key (the caller passes the last word).
        word_2: Second context key (the caller passes the last two words).
        model: Optional ``{context: {word: probability}}`` table. Defaults to
            the module-level ``probs``.

    Returns:
        ``'<word>:<score> :<remainder>'`` for the best candidate, where
        ``remainder`` is ``1 - score`` (``0.01`` when that is exactly zero),
        or ``':1'`` when there is no candidate.
    """
    table = probs if model is None else model
    if word_1 not in table:
        return ':1'

    first = table[word_1]
    if word_2 in table:
        second = table[word_2]
        # Loop-invariant penalty factor, computed once rather than per
        # candidate.
        unseen_factor = min(second.values(), default=0.0) / 10
        candidates = {}
        for word, probability in first.items():
            if word in second:
                score = probability * second[word]
                # A product of exactly 1 is replaced by 0.1.
                if score == 1:
                    score = 0.1
            else:
                score = probability * unseen_factor
            candidates[word] = score
    else:
        candidates = first

    if not candidates:
        return ':1'

    # max() returns the first maximal item, the same pick as a stable
    # descending sort followed by taking the first element.
    best_word, best_score = max(candidates.items(), key=lambda item: item[1])
    remainder = 1 - best_score
    if remainder == 0:
        remainder = 0.01
    return best_word + ':' + str(best_score) + ' :' + str(remainder)
|
||||
|
||||
|
||||
# In[30]:
|
||||
|
||||
|
||||
# Predict the gap word of every dev sample from the last one and last two
# words of its left context.
dev_found_words = []

for i in dev_data:
    # generate_N_grams(..., 1) returns the left context as a list of single
    # words, normalized exactly like the training text.
    words = generate_N_grams(i[0], 1)
    if words:
        dev_found_words.append(find_word(words[-1], ' '.join(words[-2:])))
    else:
        # An empty left context has no word to condition on; emit the line
        # find_word returns when it has no candidate instead of raising
        # IndexError on words[-1].
        dev_found_words.append(':1')
|
||||
|
||||
|
||||
# In[31]:
|
||||
|
||||
|
||||
# Write one prediction per line; the context manager closes the file even if
# the write fails, and UTF-8 matches the encoding the inputs were decoded with.
with open("dev-0/out.tsv", "w", encoding="utf-8") as f:
    f.write('\n'.join(dev_found_words) + '\n')
|
||||
|
||||
|
||||
#
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# Load the test-A contexts directly from the compressed input file.
# read_data() takes a folder name and also opens expected.tsv, so it cannot
# be called with the path of in.tsv.xz.
# NOTE(review): presumably test-A ships no expected.tsv - confirm.
with lzma.open('test-A/in.tsv.xz') as archive:
    test_rows = archive.read().decode('UTF-8').split('\n')
# Drop the empty element produced by a trailing newline.
if test_rows and test_rows[-1] == '':
    test_rows.pop()
test_data = [row.split('\t') for row in test_rows]
test_data = [[i[6].replace('\\n', ' '), i[7].replace('\\n', ' ')] for i in test_data]

test_found_words = []

for i in test_data:
    # Same normalization as the training text (generate_N_grams with
    # ngram=1 returns the list of single words), so the contexts looked up
    # here match the keys stored in the model.
    words = generate_N_grams(i[0], 1)
    if words:
        test_found_words.append(find_word(words[-1], ' '.join(words[-2:])))
    else:
        # An empty left context has no word to condition on; emit the line
        # find_word returns when it has no candidate.
        test_found_words.append(':1')
|
||||
|
||||
# Write one prediction per line; the context manager closes the file even if
# the write fails, and UTF-8 matches the encoding the inputs were decoded with.
with open("test-A/out.tsv", "w", encoding="utf-8") as f:
    f.write('\n'.join(test_found_words) + '\n')
|
||||
|
Loading…
Reference in New Issue
Block a user