81 lines
2.2 KiB
Python
81 lines
2.2 KiB
Python
|
import collections
|
||
|
import re
|
||
|
import random
|
||
|
import math
|
||
|
|
||
|
# Path to the tab-separated training input file.
input_file_path = "train/in.tsv"

# Nested bigram counts: bigrams[first_word][second_word] -> number of times
# `second_word` immediately followed `first_word` in the training data.
bigrams = collections.defaultdict(lambda: collections.defaultdict(int))
|
||
|
|
||
|
|
||
|
def clean_text(text: str):
    """Normalize raw text for tokenization.

    Newlines become spaces, every character that is not a letter, digit or
    whitespace is dropped, and the result is lowercased and trimmed.
    """
    without_newlines = text.replace('\n', ' ')
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_newlines)
    return alnum_only.lower().strip()
|
||
|
|
||
|
|
||
|
# Gold middle words, one per line (trailing newlines are kept; tokenization
# later strips them implicitly).
with open('train/expected.tsv', 'r', encoding="utf-8") as f:
    expected = list(f)

# Training rows: each TSV line split into its columns.
with open(input_file_path, 'r', encoding="utf-8") as f:
    data = [row.split('\t') for row in f]
|
||
|
|
||
|
#data = data[:200000] # total is over 400 000

# Rebuild each training sentence as: left context (col 6) + gold middle word
# + right context (col 7), all lowercased.
combined = [
    (clean_text(row[6]) + ' ' + expected[idx] + ' ' + clean_text(row[7])).lower()
    for idx, row in enumerate(data)
]

# Count every adjacent word pair across the reconstructed sentences.
for sentence in combined:
    words = re.findall(r"\b\w+\b", sentence)
    for first, second in zip(words, words[1:]):
        bigrams[first][second] += 1
|
||
|
|
||
|
|
||
|
|
||
|
# Canned fallback predictions in the expected "word:prob word:prob ... :rest"
# output format, used when the bigram model yields fewer than three
# candidates. The trailing ":0.2" assigns the leftover probability mass to
# any other (unlisted) word.
most_popular_words = [
    "be:0.5 and:0.2 of:0.1 :0.2",
    "a:0.5 in:0.2 to:0.1 :0.2",
    "have:0.5 too:0.2 it:0.1 :0.2",
    "I:0.5 that:0.2 for:0.1 :0.2",
    "you:0.5 he:0.2 with:0.1 :0.2",
    "on:0.5 do:0.2 say:0.1 :0.2",
    "this:0.5 they:0.2 at:0.1 :0.2",
    "but:0.5 we:0.2 his:0.1 :0.2"
]
|
||
|
|
||
|
|
||
|
# Predict the next word after the left context of each test row (column 6)
# and write a "word:prob ..." distribution line per row.
with open('test-A/in.tsv', "r", encoding="utf-8") as input_file, open('test-A/out.tsv', "w", encoding="utf-8") as output_file:

    lines = input_file.readlines()

    for idx, line in enumerate(lines):
        tokens = re.findall(r"\b\w+\b", clean_text(line.split("\t")[6]))

        # BUG FIX: an empty left context used to crash on tokens[-1].
        # Also use .get() instead of subscripting: indexing the defaultdict
        # would silently insert an empty entry for every unseen word,
        # mutating the model during inference.
        successors = bigrams.get(tokens[-1], {}) if tokens else {}

        # Relative frequency of each candidate successor. When `successors`
        # is empty the comprehension is empty too, so no division by zero.
        denominator = sum(successors.values())
        probabilities = [
            (word, count / denominator) for word, count in successors.items()
        ]
        probabilities.sort(key=lambda x: x[1], reverse=True)

        print(f'Line {idx} of {len(lines)}')

        if len(probabilities) >= 3:
            # Fixed-weight top-3 prediction; the trailing ":0.1" leaves
            # probability mass for any other word.
            out_line = (
                f"{probabilities[0][0]}:0.6 "
                f"{probabilities[1][0]}:0.2 "
                f"{probabilities[2][0]}:0.1 :0.1"
            )
            output_file.write(out_line + "\n")
        else:
            # Too few bigram candidates: emit a generic high-frequency guess.
            output_file.write(random.choice(most_popular_words) + "\n")
|
||
|
|