60 lines
2.0 KiB
Python
60 lines
2.0 KiB
Python
|
import pickle
|
||
|
import sys
|
||
|
from math import log
|
||
|
|
||
|
import regex as re
|
||
|
|
||
|
|
||
|
def get_prob(count, total, classes):
    """Return an add-one smoothed probability estimate, capped at 1.0.

    The estimate is ``(count + 1) / (total + classes)``.

    Args:
        count: Number of times the event was observed.
        total: Number of times the conditioning context was observed.
        classes: Number of possible outcomes added to the denominator.

    Returns:
        The smoothed ratio as a float, never greater than 1.0.

    Raises:
        ZeroDivisionError: If ``total + classes`` equals zero.
    """
    smoothed = (count + 1.0) / (total + classes)
    # The ratio exceeds 1.0 whenever count + 1 > total + classes; clamp it.
    return min(smoothed, 1.0)
|
||
|
|
||
|
|
||
|
def _format_predictions(candidates):
    """Render ``(word, probability)`` pairs as one output line.

    The line has the form ``word:logprob word:logprob ... :logprob``; the
    final entry with an empty word carries the mass left for all other words.

    Args:
        candidates: Sequence of ``(word, probability)`` pairs, best first.

    Returns:
        The formatted line, without a trailing newline.
    """
    remaining = 1.0  # probability mass not yet assigned to a listed word
    parts = []
    for index, (word, prob) in enumerate(candidates):
        if remaining - prob <= 0.0:
            if index == 0:
                # The very first candidate already claims all the mass:
                # emit it alone with a fixed 0.95 / 0.05 split.
                return word + ':' + str(log(0.95)) + ' :' + str(log(0.05))
            # This candidate does not fit; drop it and close the line with
            # whatever mass is left (always > 0 at this point).
            parts.append(':' + str(log(remaining)))
            return ' '.join(parts)
        parts.append(word + ':' + str(log(prob)))
        remaining -= prob
    # Every candidate fit (or there were none): fixed wildcard weight.
    # NOTE(review): the mass actually left over is `remaining`, not 0.0001;
    # kept unchanged to preserve existing output -- confirm this is intended.
    parts.append(':' + str(log(0.0001)))
    return ' '.join(parts)


def main():
    """Read context lines from stdin and print gap-word predictions.

    Loads n-gram counts from ``ngrams.pkl`` in the current directory; the
    loaded object is indexed with ``1``, ``2`` and ``3`` and each entry is
    used as a mapping from token tuples to counts (presumably unigram,
    bigram and trigram counts -- verify against the script that writes the
    pickle).

    For every stdin line, the two tokens before the gap tab and the two
    tokens after it are extracted. Each vocabulary word is scored by the
    product of its smoothed trigram probability with the left context and
    with the right context, and the 50 best-scoring words are printed as
    ``word:logprob`` pairs followed by a ``:logprob`` wildcard entry.

    A line that does not match the expected layout produces the same
    wildcard-only output as a line with no candidates, so the output keeps
    exactly one line per input line instead of aborting with IndexError.
    """
    # SECURITY: pickle can execute arbitrary code while loading; only run
    # this against an ngrams.pkl file produced by a trusted source.
    with open('ngrams.pkl', 'rb') as model_file:
        ngrams = pickle.load(model_file)

    unigrams, bigrams, trigrams = ngrams[1], ngrams[2], ngrams[3]
    vocabulary_size = len(unigrams)

    # Compiled once instead of being looked up again for every input line.
    context_pattern = re.compile(r'.*\t.*\t.* (.*?) (.*?)\t(.*?) (.*?) ')

    for line in sys.stdin:
        match = context_pattern.search(line.lower())
        if match is None:
            # Blank or malformed line: no context to score against.
            print(_format_predictions([]))
            continue

        left_first, left_second, right_first, right_second = match.groups()
        left_context = (left_first, left_second)
        # Loop-invariant: the left bigram count is the same for every word.
        left_total = bigrams.get(left_context, 0)

        candidates = []
        for key in unigrams:
            # Vocabulary keys are indexed with [0] to obtain the word
            # (presumably 1-tuples).
            word = str(key[0])
            pre_prob = get_prob(
                trigrams.get(left_context + (word,), 0),
                left_total,
                vocabulary_size,
            )
            post_prob = get_prob(
                trigrams.get((word, right_first, right_second), 0),
                bigrams.get((word, right_first), 0),
                vocabulary_size,
            )
            candidates.append((word, pre_prob * post_prob))

        # Stable descending sort keeps vocabulary order among equal scores.
        candidates.sort(key=lambda item: item[1], reverse=True)
        print(_format_predictions(candidates[:50]))
|
||
|
|
||
|
|
||
|
# Run the predictor only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == '__main__':
    main()
|