retro-gap/predict.py

import pickle
import sys
from math import log

import regex as re


def get_prob(count, total, classes):
    """Return the add-one smoothed ratio ``(count + 1) / (total + classes)``.

    The result is capped at 1.0, which matters when ``count`` exceeds
    ``total + classes - 1``.
    """
    smoothed = (count + 1.0) / (total + classes)
    return 1.0 if smoothed > 1.0 else smoothed


def main():
    """Predict the missing word for every gap-filling query read from stdin.

    Loads n-gram counts from ``ngrams.pkl`` in the current directory. The
    loaded object is indexed by n-gram order (``ngrams[1]``, ``ngrams[2]``,
    ``ngrams[3]``) and each entry is used as a mapping from word tuples to
    counts (presumably produced by a separate training script -- verify
    against it).

    For each input line, two tokens on each side of a tab (intended to be the
    words around the gap) are extracted from the lower-cased line. Every
    vocabulary word is scored by the product of two add-one smoothed trigram
    estimates (left context + word, and word + right context). The 50 best
    candidates are printed on one line as space-separated ``word:logprob``
    entries, ending with an entry that has an empty word (``:logprob``),
    apparently standing for all words not listed.

    Raises:
        IndexError: if an input line does not match the expected layout.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only run this against a model file from a trusted source.
    # The context manager closes the handle that a bare open() would leak.
    with open('ngrams.pkl', 'rb') as model_file:
        ngrams = pickle.load(model_file)
    unigrams, bigrams, trigrams = ngrams[1], ngrams[2], ngrams[3]
    vocabulary_size = len(unigrams)
    # Compiled once instead of being re-parsed for every input line.
    context_pattern = re.compile(r'.*\t.*\t.* (.*?) (.*?)\t(.*?) (.*?) ')

    for line in sys.stdin:
        # findall returns one tuple of the four captured groups per match;
        # only the first match is used.
        words = context_pattern.findall(line.lower())[0]
        left_context = (str(words[0]), str(words[1]))
        right_context = (str(words[2]), str(words[3]))
        # The left-context bigram count is the same for every candidate word,
        # so look it up once per line rather than once per vocabulary entry.
        left_total = bigrams.get(left_context, 0)

        probabilities = []
        for key in unigrams.keys():
            # Unigram keys are indexed with [0] to obtain the word itself.
            word = str(key[0])
            pre_ngram = left_context + (word,)
            post_ngram = (word,) + right_context
            pre_ngram_prob = get_prob(trigrams.get(pre_ngram, 0), left_total,
                                      vocabulary_size)
            post_ngram_prob = get_prob(trigrams.get(post_ngram, 0), bigrams.get(post_ngram[:2], 0),
                                       vocabulary_size)
            probabilities.append((word, pre_ngram_prob * post_ngram_prob))
        # Keep only the 50 highest-scoring candidates, best first.
        probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50]

        remaining = 1.0  # probability mass not yet assigned to a listed word
        parts = []
        for rank, (word, prob) in enumerate(probabilities):
            if remaining - prob <= 0.0:
                if rank == 0:
                    # The best candidate alone would use up all the mass:
                    # emit a fixed 0.95 / 0.05 split between it and the rest.
                    parts = [word + ':' + str(log(0.95)), ':' + str(log(0.05))]
                else:
                    # This candidate no longer fits; the mass still left
                    # goes to the final empty-word entry.
                    parts.append(':' + str(log(remaining)))
                break
            parts.append(word + ':' + str(log(prob)))
            remaining -= prob
        else:
            # All candidates fit (or there were none): close the line with a
            # fixed small probability for the empty-word entry.
            parts.append(':' + str(log(0.0001)))
        print(' '.join(parts))


# Run the predictor only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
dev-0 2020-12-08 12:01:14 +01:00			`import pickle`
			`import sys`
			`from math import log`

			`import regex as re`


			`def get_prob(count, total, classes):`
			`prob = (count + 1.0) / (total + classes)`
			`if prob > 1.0:`
			`return 1.0`
			`else:`
			`return prob`


			`def main():`
			`ngrams = pickle.load(open('ngrams.pkl', 'rb'))`
			`vocabulary_size = len(ngrams[1])`
			`for line in sys.stdin:`
			`words = re.findall(r'.*\t.*\t.* (.*?) (.*?)\t(.*?) (.*?) ', line.lower())[0]`
			`left_words = [str(words[0]), str(words[1])]`
			`right_words = [str(words[2]), str(words[3])]`

			`probabilities = []`
			`for word in ngrams[1].keys():`
			`word = str(word[0])`
			`pre_ngram = tuple(left_words + [word])`
			`post_ngram = tuple([word] + right_words)`
			`pre_ngram_prob = get_prob(ngrams[3].get(pre_ngram, 0), ngrams[2].get(tuple(left_words), 0),`
			`vocabulary_size)`
			`post_ngram_prob = get_prob(ngrams[3].get(post_ngram, 0), ngrams[2].get(post_ngram[0:2], 0),`
			`vocabulary_size)`
			`probabilities.append((word, pre_ngram_prob * post_ngram_prob))`
			`probabilities = sorted(probabilities, key=lambda t: t[1], reverse=True)[:50]`
			`probability = 1.0`
			`text = ''`
			`counter = 0`
			`has_log_prob0 = False`
			`for p in probabilities:`
			`word = p[0]`
			`prob = p[1]`
			`if counter == 0 and (probability - prob <= 0.0):`
			`text = word + ':' + str(log(0.95)) + ' :' + str(log(0.05))`
			`has_log_prob0 = True`
			`break`
			`if counter > 0 and (probability - prob <= 0.0):`
			`text += ':' + str(log(probability))`
			`has_log_prob0 = True`
			`break`
			`text += word + ':' + str(log(prob)) + ' '`
			`probability -= prob`
			`counter += 1`
			`if not has_log_prob0:`
			`text += ':' + str(log(0.0001))`
			`print(text)`


			`if __name__ == '__main__':`
			`main()`