challenging-america-word-ga.../run.py

from ast import Mod
import pandas as pd
import csv
import regex as re
from nltk import bigrams, word_tokenize
from collections import Counter, defaultdict

    def read(self):
        """Load the tab-separated file at ``self._path`` into ``self.data``.

        The file is read without a header row and with quoting disabled, so
        quote characters inside the text are kept literally.
        """
        self.data = pd.read_csv(
            self._path,
            sep="\t",
            # ``error_bad_lines=False`` was deprecated in pandas 1.3 and removed
            # in 2.0; ``on_bad_lines="warn"`` is the same skip-and-warn behaviour.
            on_bad_lines="warn",
            header=None,
            quoting=csv.QUOTE_NONE
        )


class Model:
    """Bigram language model with additive smoothing.

    ``model[w1][w2]`` holds raw bigram counts while :meth:`train` is counting
    and the smoothed probability of ``w2`` following ``w1`` once it returns.
    ``vocab`` is the set of all tokens seen in a bigram during training.
    """

    # Distribution emitted when the model has no information about a context.
    _FALLBACK = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    def __init__(self, alpha):
        """Create an untrained model.

        Args:
            alpha: additive smoothing constant added to every observed bigram
                count (and ``alpha * len(vocab)`` to every denominator).
        """
        self.alpha = alpha
        self.model = defaultdict(lambda: defaultdict(int))
        self.vocab = set()

    def train(self, data):
        """Count bigrams in ``data["final"]`` and turn them into probabilities.

        Args:
            data: DataFrame with a ``"final"`` column of raw text. Each value
                is passed through ``clean`` (defined elsewhere in this file)
                and then tokenized.

        The counts in ``self.model`` are overwritten in place with
        probabilities, so this method is meant to be called once per model.
        """
        for text in data["final"]:
            words = word_tokenize(clean(text))
            for w1, w2 in bigrams(words, pad_left=True, pad_right=True):
                # The padding symbols are falsy; skip them (and empty tokens)
                # so only real word pairs are counted.
                if w1 and w2:
                    self.model[w1][w2] += 1
                    self.vocab.add(w1)
                    self.vocab.add(w2)

        # The vocabulary is complete at this point, so the smoothing mass
        # added to every denominator is the same for all contexts.
        smoothing_mass = self.alpha * len(self.vocab)
        for followers in self.model.values():
            denominator = float(sum(followers.values())) + smoothing_mass
            for w2 in followers:
                followers[w2] = (followers[w2] + self.alpha) / denominator

    def _predict(self, word):
        """Format the six most probable successors of ``word``.

        Args:
            word: the token immediately preceding the gap.

        Returns:
            A string of ``token:probability`` pairs separated by spaces,
            followed by ``:p`` where ``p`` is the probability mass left for
            all other words (never less than ``0.01``). A fixed fallback
            distribution is returned when ``word`` has no known successors.
        """
        # ``.get`` instead of indexing: indexing the defaultdict would insert
        # an empty entry for every unseen word and grow the model at
        # prediction time.
        followers = self.model.get(word)
        if not followers:
            return self._FALLBACK

        top = Counter(followers).most_common(6)
        total_prob = sum(prob for _, prob in top)
        if not total_prob:
            return self._FALLBACK

        parts = [f"{candidate}:{prob}" for candidate, prob in top]
        remaining = 1 - total_prob
        # Always leave at least 0.01 for the "any other word" bucket.
        parts.append(f":{remaining}" if remaining >= 0.01 else ":0.01")
        return " ".join(parts)

    def predict(self, read_path, save_path):
        """Write one prediction line per input row.

        Args:
            read_path: tab-separated input file; column 6 holds the text
                preceding the word to predict.
            save_path: output file, one prediction string per line.

        Rows whose cleaned context has fewer than three tokens get the
        fallback distribution; otherwise the last token is the context word.
        """
        data = pd.read_csv(
            read_path,
            sep="\t",
            # ``error_bad_lines=False`` was removed in pandas 2.0;
            # ``on_bad_lines="warn"`` is the same skip-and-warn behaviour.
            on_bad_lines="warn",
            header=None,
            quoting=csv.QUOTE_NONE,
        )
        # Explicit encoding: predicted tokens may be non-ASCII and the
        # platform default encoding is not guaranteed to handle them.
        with open(save_path, "w", encoding="utf-8") as file:
            for context in data[6]:
                words = word_tokenize(clean(context))
                if len(words) < 3:
                    prediction = self._FALLBACK
                else:
                    prediction = self._predict(words[-1])
                file.write(prediction + "\n")


if __name__ == '__main__':
    # Train a bigram model on the training split and write predictions for
    # the dev-0 and test-A splits.

    # ``error_bad_lines=False`` was removed in pandas 2.0;
    # ``on_bad_lines="warn"`` is the same skip-and-warn behaviour.
    data = pd.read_csv(
        "train/in.tsv.xz",
        sep="\t",
        on_bad_lines="warn",
        header=None,
        quoting=csv.QUOTE_NONE,
    )
    train_labels = pd.read_csv(
        "train/expected.tsv",
        sep="\t",
        on_bad_lines="warn",
        header=None,
        quoting=csv.QUOTE_NONE,
    )

    # Column 6 is the text before the gap (it is what ``Model.predict`` reads);
    # column 7 is presumably the text after it -- TODO confirm.
    # NOTE(review): concat pairs rows by index, so a malformed line skipped
    # from in.tsv.xz would shift every later row against its label.
    train_data = data[[6, 7]]
    train_data = pd.concat([train_data, train_labels], axis=1)

    # Join with explicit spaces so the expected word (column 0) cannot fuse
    # with the last/first token of the neighbouring text.
    train_data["final"] = (
        train_data[6] + " " + train_data[0] + " " + train_data[7]
    )

    model = Model(0.0001)
    model.train(train_data)
    model.predict("dev-0/in.tsv.xz", "dev-0/out.tsv")
    model.predict("test-A/in.tsv.xz", "test-A/out.tsv")
smoothing fix 2022-04-10 19:14:38 +02:00			`from ast import Mod`
solution 2022-04-03 11:32:58 +02:00			`import pandas as pd`
			`import csv`
			`import regex as re`
			`from nltk import bigrams, word_tokenize`
			`from collections import Counter, defaultdict`

smoothing 2022-04-10 18:24:05 +02:00			`def read(self):`
smoothing 2022-04-10 18:13:50 +02:00			`self.data = pd.read_csv(`
			`self._path,`
			`sep="\t",`
			`error_bad_lines=False,`
			`header=None,`
smoothing 2022-04-10 18:24:05 +02:00			`quoting=csv.QUOTE_NONE`
smoothing 2022-04-10 18:13:50 +02:00			`)`
solution 2022-04-03 11:32:58 +02:00

smoothing 2022-04-10 18:13:50 +02:00			`class Model:`
smoothing fix 2022-04-10 19:14:38 +02:00			`def __init__(self, alpha):`
smoothing 2022-04-10 18:13:50 +02:00			`self.alpha = alpha`
			`self.model = defaultdict(lambda: defaultdict(lambda: 0))`
			`self.vocab = set()`

			`def train(self, data):`
			`for _, row in data.iterrows():`
			`words = word_tokenize(clean(row["final"]))`
smoothing fix 2022-04-10 19:14:38 +02:00			`for w1, w2 in bigrams(words, pad_left=True, pad_right=True):`
			`if w1 and w2:`
			`self.model[w1][w2] += 1`
			`self.vocab.add(w1)`
			`self.vocab.add(w2)`

smoothing 2022-04-10 18:13:50 +02:00			`for w1 in self.model:`
			`total_count = float(sum(self.model[w1].values()))`
smoothing 2022-04-10 18:43:05 +02:00			`denominator = total_count + self.alpha * len(self.vocab)`
smoothing 2022-04-10 18:13:50 +02:00			`for w2 in self.model[w1]:`
smoothing 2022-04-10 18:43:05 +02:00			`nominator = self.model[w1][w2] + self.alpha`
			`self.model[w1][w2] = nominator / denominator`
solution 2022-04-03 11:32:58 +02:00
smoothing fix 2022-04-10 19:14:38 +02:00
smoothing 2022-04-10 18:13:50 +02:00			`def _predict(self, word):`
			`predictions = dict(self.model[word])`
smoothing 2022-04-10 18:43:05 +02:00			`most_common = dict(Counter(predictions).most_common(6))`
solution 2022-04-03 11:32:58 +02:00
smoothing 2022-04-10 18:13:50 +02:00			`total_prob = 0.0`
			`str_prediction = ""`
solution 2022-04-03 11:32:58 +02:00
smoothing 2022-04-10 18:13:50 +02:00			`for word, prob in most_common.items():`
			`total_prob += prob`
			`str_prediction += f"{word}:{prob} "`
solution 2022-04-03 11:32:58 +02:00
smoothing 2022-04-10 18:13:50 +02:00			`if not total_prob:`
			`return "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"`
solution 2022-04-03 11:32:58 +02:00
smoothing 2022-04-10 18:13:50 +02:00			`if 1 - total_prob >= 0.01:`
			`str_prediction += f":{1-total_prob}"`
			`else:`
			`str_prediction += f":0.01"`
solution 2022-04-03 11:32:58 +02:00
smoothing 2022-04-10 18:13:50 +02:00			`return str_prediction`
solution 2022-04-03 11:32:58 +02:00
smoothing fix 2022-04-10 19:14:38 +02:00
			`def predict(self, read_path, save_path):`
			`data = pd.read_csv(`
			`read_path, sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE`
			`)`
smoothing 2022-04-10 18:13:50 +02:00			`with open(save_path, "w") as file:`
			`for _, row in data.iterrows():`
			`words = word_tokenize(clean(row[6]))`
			`if len(words) < 3:`
			`prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"`
			`else:`
			`prediction = self._predict(words[-1])`
			`file.write(prediction + "\n")`


			`if __name__ == '__main__':`

solution 2022-04-03 11:32:58 +02:00			`data = pd.read_csv(`
smoothing fix 2022-04-10 19:14:38 +02:00			`"train/in.tsv.xz",`
			`sep="\t",`
			`error_bad_lines=False,`
			`header=None,`
			`quoting=csv.QUOTE_NONE,`
solution 2022-04-03 11:32:58 +02:00			`)`
smoothing fix 2022-04-10 19:14:38 +02:00			`train_labels = pd.read_csv(`
			`"train/expected.tsv",`
			`sep="\t",`
			`error_bad_lines=False,`
			`header=None,`
			`quoting=csv.QUOTE_NONE,`
			`)`

			`train_data = data[[6, 7]]`
			`train_data = pd.concat([train_data, train_labels], axis=1)`
smoothing 2022-04-10 18:13:50 +02:00
			`train_data["final"] = train_data[6] + train_data[0] + train_data[7]`
solution 2022-04-03 11:32:58 +02:00
smoothing fix 2022-04-10 19:14:38 +02:00			`model = Model(0.0001)`
smoothing 2022-04-10 18:13:50 +02:00			`model.train(train_data)`
			`model.predict("dev-0/in.tsv.xz", "dev-0/out.tsv")`
smoothing fix 2022-04-10 19:14:38 +02:00			`model.predict("test-A/in.tsv.xz", "test-A/out.tsv")`