This commit is contained in:
Michal Maciaszek 2020-12-08 12:10:40 +01:00
parent a2b50f54aa
commit cb4a1f6a4f
4 changed files with 25813 additions and 33 deletions

11628
dev-1/out.tsv Normal file

File diff suppressed because it is too large Load Diff

14132
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -3,7 +3,7 @@ import sys
from math import log from math import log
import regex as re import regex as re
def get_prob(bigrams, unigrams): def count_prob(bigrams, unigrams):
prob = (bigrams + 1.0) / (unigrams + 1) prob = (bigrams + 1.0) / (unigrams + 1)
if prob > 1.0: if prob > 1.0:
return 1.0 return 1.0
@ -25,9 +25,9 @@ def main():
pre_ngram = tuple(left_word + [word]) pre_ngram = tuple(left_word + [word])
post_ngram = tuple([word] + right_word) post_ngram = tuple([word] + right_word)
#print(pre_ngram) #print(pre_ngram)
pre_ngram_prob = get_prob(ngrams[2].get(pre_ngram, 0), ngrams[1].get(word[0],0) + vocabulary_size * 1000) pre_ngram_prob = count_prob(ngrams[2].get(pre_ngram, 0), ngrams[1].get(word[0],0) + vocabulary_size * 1000)
#if pre_ngram_prob>0: #if pre_ngram_prob>0:
post_ngram_prob = get_prob(ngrams[2].get(post_ngram, 0), ngrams[1].get(word[0],0) + vocabulary_size * 1000) post_ngram_prob = count_prob(ngrams[2].get(post_ngram, 0), ngrams[1].get(word[0],0) + vocabulary_size * 1000)
probabilities.append((word, pre_ngram_prob * post_ngram_prob)) probabilities.append((word, pre_ngram_prob * post_ngram_prob))
probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50] probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50]

View File

@ -1,37 +1,57 @@
#!/usr/bin/python3
import sys
import regex as re
import pickle import pickle
import sys
from math import log
import regex as re
def count_prob(bigrams, unigrams):
def into_words(sentence): prob = (bigrams + 1.0) / (unigrams + 1)
return re.findall(r'\p{P}|[^\p{P}\s]+', sentence.lower()) if prob > 1.0:
return 1.0
else:
return prob
def main(): def main():
ngrams = {1: {}, 2: {}} ngrams = pickle.load(open('ngrams_2.pkl', 'rb'))
lowest_ngram = 1 vocabulary_size = len(ngrams[1])
highest_ngram = 2 for line in sys.stdin:
counter = 0 words = re.findall(r'.*\t.*\t.* (.*?)\t(.*?) ', line.lower())[0]
for line in sys.stdin: #dla kazdej linii z pliku #print(words)
line = line.split('\t')[4] # podziel na 4 left_word = [str(words[0])]
tokens = into_words(line) #na slowa right_word = [str(words[1])]
number_of_tokens = len(tokens) #ile slow?
for n in range(lowest_ngram, highest_ngram+1): #dla kazdego ngram probabilities = []
for i in range(0, number_of_tokens-n+1): #i tyle ile jest slow -n gram + 1 for word in ngrams[1].keys():
ngram = tuple(tokens[i:i+n]) word = str(word[0])
if ngram in ngrams[n]: pre_ngram = tuple(left_word + [word])
ngrams[n][ngram] += 1 post_ngram = tuple([word] + right_word)
else: #print(pre_ngram)
ngrams[n][ngram] = 1 pre_ngram_prob = count_prob(ngrams[2].get(pre_ngram, 0), ngrams[1].get(word[0],0) + vocabulary_size * 1000)
if counter % 1000 == 0: #if pre_ngram_prob>0:
print('counter = ', counter) post_ngram_prob = count_prob(ngrams[2].get(post_ngram, 0), ngrams[1].get(word[0],0) + vocabulary_size * 1000)
counter += 1
ngrams[1] = dict(sorted(ngrams[1].items(), key=lambda item: ngrams[1][item[0]], reverse=True)[:1000]) probabilities.append((word, pre_ngram_prob * post_ngram_prob))
ngrams[2] = dict(sorted(ngrams[2].items(), key=lambda item: ngrams[2][item[0]], reverse=True)[:100000]) probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50]
#ngrams[3] = dict(sorted(ngrams[3].items(), key=lambda item: ngrams[3][item[0]], reverse=True)[:120000]) probability = 1.0
pickle.dump(ngrams, open('ngrams_2.pkl', 'wb')) text = ''
counter = 0
has_log_prob0 = False
for p in probabilities:
word = p[0]
prob = p[1]
if counter == 0 and (probability - prob <= 0.0):
text = word + ':' + str(log(0.95)) + ' :' + str(log(0.05))
has_log_prob0 = True
break
if counter > 0 and (probability - prob <= 0.0):
text += ':' + str(log(probability))
has_log_prob0 = True
break
text += word + ':' + str(log(prob)) + ' '
probability -= prob
counter += 1
if not has_log_prob0:
text += ':' + str(log(0.0001))
print(text)
if __name__ == '__main__': if __name__ == '__main__':