challenging-america-word-ga.../testing.ipynb
2022-04-04 16:03:59 +02:00

14 KiB
Raw Permalink Blame History

import pandas as pd
import csv
import regex as re
import nltk
from collections import Counter, defaultdict
import string
import unicodedata

def clean_text(text):
    """Normalise a raw context string before tokenisation.

    The value is converted with ``str()`` and lower-cased, escaped line
    breaks are resolved, and every Unicode punctuation character is removed.

    The sample rows of the corpus show line breaks stored as the two
    characters backslash + "n". The previous implementation only matched
    that marker when it was preceded by *two* backslashes, so the usual
    single-backslash form fell through to the punctuation strip, which
    removed the backslash but left the "n" glued between the neighbouring
    words. Both spellings are handled here.

    Args:
        text: Any value; non-strings (e.g. NaN cells) are stringified.

    Returns:
        The cleaned, lower-cased string.
    """
    lowered = str(text).lower()
    # A hyphen directly before a line-break marker is a word split across
    # lines: drop both so the two halves are rejoined.
    rejoined = re.sub(r"-\\+n", "", lowered)
    # Any remaining line-break marker separates two words.
    spaced = re.sub(r"\\+n", " ", rejoined)
    # \p{P} (Unicode punctuation) relies on `re` being the `regex` package.
    return re.sub(r"\p{P}", "", spaced)

def train_model(data, model):
    """Fill ``model`` in place with backward bigram probabilities.

    For every row of ``data`` the text in column ``760`` is cleaned and
    tokenised, and each pair of adjacent tokens ``(previous, following)`` is
    counted under ``model[following][previous]``. Afterwards every inner
    table is divided by its own total, so ``model[following]`` maps each
    preceding token to its relative frequency.

    Args:
        data: DataFrame exposing the training text in column ``760``.
        model: Nested mapping that supplies missing entries on access
            (e.g. a defaultdict of defaultdicts); it is mutated in place.

    Returns:
        None.
    """
    for _, record in data.iterrows():
        tokens = nltk.word_tokenize(clean_text(record[760]))
        # Walk the adjacent token pairs; falsy tokens are ignored.
        for previous, following in zip(tokens, tokens[1:]):
            if not (previous and following):
                continue
            model[following][previous] += 1

    # Turn the raw counts of every inner table into relative frequencies.
    for predecessors in model.values():
        norm = float(sum(predecessors.values()))
        for token in predecessors:
            predecessors[token] = predecessors[token] / norm


def predict(word, model):
    """Format the five most probable candidates stored under ``word``.

    The result is a space-separated list of ``token:probability`` pairs
    followed by a final ``:probability`` entry (empty token) that carries
    the mass left for all other words. That entry is ``1 - sum(top five)``,
    but never less than ``0.01``.

    Args:
        word: Key to look up in ``model``.
        model: Mapping ``word -> {candidate: probability}``.

    Returns:
        The formatted prediction line, or a fixed default distribution when
        ``word`` is unknown or its candidates carry no probability mass.
    """
    fallback = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    # .get() instead of model[word]: indexing a defaultdict would insert an
    # empty entry for every unseen word (growing the model during
    # prediction), and indexing a plain dict would raise KeyError.
    candidates = model.get(word)
    if not candidates:
        return fallback

    total_prob = 0.0
    parts = []
    for candidate, prob in Counter(candidates).most_common(5):
        total_prob += prob
        parts.append(f"{candidate}:{prob}")

    if not total_prob:
        return fallback

    # Keep at least 0.01 for words outside the top five.
    if 1 - total_prob >= 0.01:
        parts.append(f":{1-total_prob}")
    else:
        parts.append(":0.01")

    return " ".join(parts)


def predict_data(read_path, save_path, model):
    """Write one prediction line per input row of ``read_path``.

    The input is read as a header-less, unquoted TSV. Column 7 of each row
    is cleaned and tokenised; rows with fewer than three tokens get a fixed
    default distribution, all others are predicted from the first token.

    Args:
        read_path: Path of the TSV file to predict for (may be compressed).
        save_path: Path of the output file; it is overwritten.
        model: Mapping ``following_word -> {previous_word: probability}``
            as filled by ``train_model``.

    Returns:
        None.
    """
    fallback = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    # on_bad_lines="skip" replaces the deprecated error_bad_lines=False
    # (same behaviour, and the keyword the rest of this file already uses).
    # NOTE(review): any skipped line leaves the output one line short of
    # the input - confirm no lines are actually dropped.
    data = pd.read_csv(
        read_path,
        sep="\t",
        on_bad_lines="skip",
        header=None,
        quoting=csv.QUOTE_NONE,
        encoding="utf-8"
    )
    with open(save_path, "w", encoding="utf-8") as f:
        for _, row in data.iterrows():
            words = nltk.word_tokenize(clean_text(row[7]))
            if len(words) < 3:
                prediction = fallback
            else:
                # Column 7 is the text that follows the gap and the model is
                # keyed by the *following* word, so the token adjacent to
                # the gap is the first one. The previous words[-1] looked up
                # the far end of the context instead.
                prediction = predict(words[0], model)
            f.write(prediction + "\n")
# Read the tab-separated column names of the input and expected-output files.
# The encoding is given explicitly, as for every other file opened in this
# notebook, so the result does not depend on the platform default.
with open("in-header.tsv", encoding="utf-8") as f:
    in_cols = f.read().strip().split("\t")

with open("out-header.tsv", encoding="utf-8") as f:
    out_cols = f.read().strip().split("\t")
in_cols
['FileId', 'Year', 'LeftContext', 'RightContext']
out_cols
['Word']
# Training inputs: header-less, unquoted TSV; malformed lines are skipped.
data = pd.read_csv(
    "train/in.tsv.xz",
    sep="\t",
    on_bad_lines='skip',
    header=None,
    # names=in_cols,
    quoting=csv.QUOTE_NONE,
    encoding="utf-8"
)

# Expected gap words, one per training row (ends up as column 0 below).
train_words = pd.read_csv(
    "train/expected.tsv",
    sep="\t",
    on_bad_lines='skip',
    header=None,
    # names=out_cols,
    quoting=csv.QUOTE_NONE,
    encoding="utf-8"
)

# Keep the two context columns and attach the expected word by row index.
train_data = data[[7, 6]]
train_data = pd.concat([train_data, train_words], axis=1)

# Column 760 is the text train_model learns from. Per the sample rows,
# column 7 is the text following the gap and column 6 the text before it, so
# the passage is rebuilt in reading order: left context, word, right context.
# The pieces are separated by spaces; plain concatenation fused the gap word
# with its neighbours into one token, so it never took part in a bigram.
train_data[760] = train_data[6] + " " + train_data[0] + " " + train_data[7]
train_data
7 6 0 760
0 said\nit's all squash. The best I could get\ni... came fiom the last place to this\nplace, and t... lie said\nit's all squash. The best I could get\ni...
1 \ninto a proper perspective with those\nminor ... MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... himself \ninto a proper perspective with those\nminor ...
2 all notU\nashore and afloat arc subjects for I... "Thera were in 1771 only aeventy-nine\n*ub*erl... of all notU\nashore and afloat arc subjects for I...
3 ceucju l< d no; <o waste it nud so\nsunk it in... A gixnl man y nitereRtiiiv dii-clos-\nur«s reg... ably ceucju l< d no; <o waste it nud so\nsunk it in...
4 ascertained w? OCt the COOltS of ibis\nletale ... Tin: 188UB TV THF BBABBT QABJE\nMr. Schiffs *t... j ascertained w? OCt the COOltS of ibis\nletale ...
... ... ... ... ...
432017 \nSam was arrested.\nThe case excited a great ... Sam Clendenin bad a fancy for Ui«\nscience of ... and \nSam was arrested.\nThe case excited a great ...
432018 through the alnp the »Uitors laapeeeed tia.»\n... Wita.htt halting the party ware dilven to the ... paasliic through the alnp the »Uitors laapeeeed tia.»\n...
432019 Agua Negra across the line.\nIt was a grim pla... It was the last thing that either of\nthem exp... for Agua Negra across the line.\nIt was a grim pla...
432020 \na note of Wood, Dialogue fc Co., for\nc27,im... settlement with the department.\nIt is also sh... for \na note of Wood, Dialogue fc Co., for\nc27,im...
432021 3214c;do White at 3614c: Mixed Western at\n331... Flour quotations—low extras at 1 R0®2 50;\ncit... at 3214c;do White at 3614c: Mixed Western at\n331...

432022 rows × 4 columns


# Two-level table: unseen outer keys get a fresh inner table, and unseen
# inner keys start at 0.
model = defaultdict(lambda: defaultdict(int))

# Fit on the training frame, then write predictions for the dev-0 split.
train_model(train_data, model)
predict_data("dev-0/in.tsv.xz", "dev-0/out.tsv", model)
C:\Users\Norbert\AppData\Local\Temp\ipykernel_15436\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.


  data = pd.read_csv(
# Write predictions for the test-A split with the same trained model.
predict_data("test-A/in.tsv.xz", "test-A/out.tsv", model)
C:\Users\Norbert\AppData\Local\Temp\ipykernel_15436\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.


  data = pd.read_csv(