# Masked-word prediction from bigram statistics.
# Reads tab-separated lines on stdin (last two fields are the left/right
# context of a masked token) and prints top-k candidates as "word:logprob".
|
import pickle
|
||
|
import sys
|
||
|
from collections import Counter
|
||
|
from tqdm import tqdm
|
||
|
from math import log
|
||
|
|
||
|
from itertools import dropwhile
|
||
|
|
||
|
|
||
|
def get_bigram_prob(context, word, prefix: bool, word_stats, bigram_stats):
    """Log-probability estimate of a bigram from precomputed counts.

    When ``prefix`` is true the bigram is ``(context, word)`` (context
    precedes the candidate); otherwise it is ``(word, context)``.

    Returns ``log(count(bigram) / count(context))``, or ``0`` when either
    count is missing or zero — ``0`` doubles as the "no evidence" sentinel
    used by the caller.
    """
    pair = (context, word) if prefix else (word, context)
    pair_count = bigram_stats.get(pair)
    ctx_count = word_stats.get(context)
    if not pair_count or not ctx_count:
        return 0
    return log(pair_count / ctx_count)
def _load_counts(path):
    """Unpickle one statistics file (a Counter produced by the trainer).

    SECURITY NOTE: pickle.load executes arbitrary code if the file is
    untrusted; these stat files are assumed to be locally generated.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


word_stats = _load_counts('word_stats.pickle')
bigram_stats = _load_counts('bigram_stats.pickle')
# Trim the long tail of both counters: keep only entries observed at least
# 1000 times.  (most_common() is sorted descending, so the original
# dropwhile(count >= 1000) yielded exactly the count < 1000 entries —
# selecting them directly is equivalent.)
for _stats in (word_stats, bigram_stats):
    _rare = [key for key, count in _stats.items() if count < 1000]
    for key in _rare:
        del _stats[key]
def _top_k_formatted(probs, k):
    """Rank candidate->score descending and render the best k as 'word:score'."""
    ranked = sorted(probs.items(), key=lambda item: item[1], reverse=True)
    return [f"{word}:{prob}" for word, prob in ranked[:k]]


line_num = 1

# For each stdin line, score candidate fill-in words by summing the
# log-probs of the (prev_word, cand) and (cand, next_word) bigrams, then
# print the top-k candidates as "word:score" pairs.
for line in tqdm(sys.stdin):
    line_num += 1
    # Only the last two tab-separated fields (left/right context) are used.
    _, _, _, _, _, _, l_context, r_context = line.split("\t")
    # Contexts encode newlines as the literal two-character sequence "\n".
    l_context = l_context.replace(r"\n", " ")
    r_context = r_context.replace(r"\n", " ")
    prev_word = l_context.split()[-1]
    next_word = r_context.split()[0]

    # Candidates seen after prev_word / before next_word, with log-probs.
    l_probs = {}
    r_probs = {}
    for first, second in bigram_stats.keys():
        if first == prev_word:
            l_probs[second] = get_bigram_prob(prev_word, second, True, word_stats, bigram_stats)
        if second == next_word:
            r_probs[first] = get_bigram_prob(first, next_word, False, word_stats, bigram_stats)

    # Combined score: left log-prob plus right log-prob (0.0 when the right
    # bigram is unseen).  Keys are restricted to left candidates, matching
    # the original behaviour.
    mult_probs = {word: prob + r_probs.get(word, 0.0) for word, prob in l_probs.items()}

    k = 10
    # Fall back to one-sided evidence when the combined table is empty, and
    # to a fixed guess when there is no evidence at all.  This replaces
    # three duplicated copies of the sort/format code and drops the unused
    # `sum = 0.01` assignment that shadowed the builtin.  (When mult_probs
    # is empty l_probs is empty too; the l_probs step is kept only for
    # fidelity with the original control flow.)
    result = (
        _top_k_formatted(mult_probs, k)
        or _top_k_formatted(l_probs, k)
        or _top_k_formatted(r_probs, k)
        or ["the:-10.0"]
    )

    print(" ".join(result) + f" :{-0.01}")