challenging-america-word-gap-prediction/testing.ipynb at master

s440054/challenging-america-word-gap-prediction

2022-04-04 16:03:59 +02:00

14 KiB

Raw Permalink Blame History

import pandas as pd
import csv
import regex as re
import nltk
from collections import Counter, defaultdict
import string
import unicodedata

class _PunctuationStripper(dict):
    """Lazy ``str.translate`` table that deletes Unicode punctuation.

    Code points are classified on first use and cached, so translating
    long texts stays fast without precomputing a table for the whole
    Unicode range. A character is dropped when its general category
    starts with ``P`` (Pc, Pd, Ps, Pe, Pi, Pf, Po).
    """

    def __missing__(self, codepoint):
        is_punctuation = unicodedata.category(chr(codepoint)).startswith("P")
        # ``None`` tells str.translate to delete the character; returning
        # the code point itself keeps it unchanged.
        target = None if is_punctuation else codepoint
        self[codepoint] = target
        return target


_STRIP_PUNCTUATION = _PunctuationStripper()


def clean_text(text):
    """Lowercase ``text``, undo escaped line breaks and drop punctuation.

    The corpus stores line breaks as the two literal characters backslash
    and ``n``. A hyphen directly before such a marker is treated as a
    word split across lines and the halves are re-joined; any other
    marker becomes a single space. The markers must be handled *before*
    punctuation is stripped: the backslash itself is punctuation, so
    stripping first would leave a stray ``n`` gluing two words together.

    Args:
        text: Any value; it is converted with ``str()`` first, so
            ``None`` or NaN become the strings ``"none"`` / ``"nan"``.

    Returns:
        The lowercased text without line-break markers or Unicode
        punctuation characters.
    """
    cleaned = str(text).lower()
    # Doubly escaped markers (two backslashes) are handled first so that
    # input in that form is cleaned completely, then the single-backslash
    # form that the TSV data contains.
    for marker, replacement in (
        ("-\\\\n", ""),
        ("\\\\n", " "),
        ("-\\n", ""),
        ("\\n", " "),
    ):
        cleaned = cleaned.replace(marker, replacement)
    return cleaned.translate(_STRIP_PUNCTUATION)

def train_model(data, model, text_column=760):
    """Fill ``model`` with backward bigram probabilities, in place.

    For every adjacent token pair ``(w1, w2)`` in the cleaned, tokenized
    text, ``model[w2][w1]`` is incremented, so the outer key is the
    *following* word and the inner key the word that preceded it. After
    all rows are counted, each inner mapping is divided by its total so
    the values under one outer key sum to 1.

    Args:
        data: DataFrame whose ``text_column`` holds the training text.
        model: Nested mapping that creates missing entries on access
            (e.g. ``defaultdict(lambda: defaultdict(int))``).
        text_column: Label of the column to read; defaults to 760, the
            label this function previously had hard-coded.

    Returns:
        None. ``model`` is modified in place.

    Note:
        Normalization runs over the whole of ``model``, so calling this
        twice on the same mapping would mix fresh counts with values that
        were already turned into probabilities.
    """
    # Iterate the column directly; building a Series per row through
    # iterrows() is unnecessary when only one cell is read.
    for text in data[text_column]:
        words = nltk.word_tokenize(clean_text(text))
        # No padding: padded positions are None and would be discarded by
        # the truthiness check below anyway.
        for w1, w2 in nltk.bigrams(words):
            if w1 and w2:
                model[w2][w1] += 1

    # Turn raw counts into relative frequencies per following word.
    for predecessors in model.values():
        total_count = float(sum(predecessors.values()))
        for w1 in predecessors:
            predecessors[w1] /= total_count


def predict(word, model):
    """Format the five most probable candidates stored under ``word``.

    Args:
        word: Key to look up in ``model``.
        model: Mapping from a word to a mapping of candidate words to
            probabilities.

    Returns:
        A string of space-separated ``candidate:probability`` pairs for
        the (at most) five highest-probability candidates, followed by a
        final ``:p`` entry with an empty word that carries the remaining
        probability mass (at least 0.01). If ``word`` is unknown or its
        candidates sum to zero, a fixed default distribution is returned.
    """
    fallback = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    # .get() rather than model[word]: indexing a defaultdict inserts the
    # missing key, which would grow the model with every unseen word
    # looked up at prediction time.
    candidates = model.get(word)
    if not candidates:
        return fallback

    top = Counter(candidates).most_common(5)
    total_prob = sum(prob for _, prob in top)
    if not total_prob:
        return fallback

    parts = [f"{candidate}:{prob}" for candidate, prob in top]
    # Reserve mass for every word not listed; never report less than 0.01.
    remainder = 1 - total_prob
    parts.append(f":{remainder}" if remainder >= 0.01 else ":0.01")
    return " ".join(parts)


def predict_data(read_path, save_path, model):
    """Write one prediction line per input row of a TSV file.

    Args:
        read_path: Path of the tab-separated input file (no header row);
            column 7 is read as the context that follows the gap.
        save_path: Path of the output file; one prediction per line.
        model: Mapping ``model[next_word][previous_word] -> probability``
            as consumed by ``predict``.

    Returns:
        None. Results are written to ``save_path``.
    """
    fallback = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    data = pd.read_csv(
        read_path,
        sep="\t",
        # ``error_bad_lines`` is deprecated and removed in pandas 2.0;
        # ``on_bad_lines="skip"`` is the replacement and matches the other
        # read_csv calls in this file.
        # NOTE(review): a skipped row produces no output line, so the
        # output can end up shorter than the input - confirm that is
        # acceptable for the evaluation.
        on_bad_lines="skip",
        header=None,
        quoting=csv.QUOTE_NONE,
        encoding="utf-8"
    )
    with open(save_path, "w", encoding="utf-8") as f:
        for right_context in data[7]:
            words = nltk.word_tokenize(clean_text(right_context))
            if len(words) < 3:
                # Too little context: emit the default distribution.
                prediction = fallback
            else:
                # The model is keyed by the word that FOLLOWS the one being
                # predicted, and the gap sits immediately before the first
                # token of this context - so look up words[0], not the far
                # end of the context.
                prediction = predict(words[0], model)
            f.write(prediction + "\n")

# Column names of the input and expected-output files, one tab-separated
# line each. The encoding is given explicitly so the result does not depend
# on the platform default and matches the UTF-8 used for the data files.
with open("in-header.tsv", encoding="utf-8") as f:
    in_cols = f.read().strip().split("\t")

with open("out-header.tsv", encoding="utf-8") as f:
    out_cols = f.read().strip().split("\t")

in_cols

['FileId', 'Year', 'LeftContext', 'RightContext']

out_cols

['Word']

# Both training files are headerless, tab-separated UTF-8 with quoting
# disabled, so they share a single set of parser options.
# NOTE(review): malformed lines are skipped independently in each file; if
# either file loses a row, inputs and expected words stop lining up - verify
# that the two frames have the same length.
tsv_options = dict(
    sep="\t",
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
    encoding="utf-8",
)

# Input rows (contexts), columns addressed by position.
data = pd.read_csv("train/in.tsv.xz", **tsv_options)

# Expected gap words, a single column labelled 0.
train_words = pd.read_csv("train/expected.tsv", **tsv_options)

# Keep only the two context columns (6 = text before the gap, 7 = text
# after it) and append the expected gap word as column 0.
train_data = data[[7, 6]]
train_data = pd.concat([train_data, train_words], axis=1)

# Rebuild the running text in reading order: left context, gap word, right
# context. The explicit spaces keep the gap word a token of its own;
# without them it fuses with the neighbouring words and no bigram
# containing it can ever be counted.
train_data[760] = train_data[6] + " " + train_data[0] + " " + train_data[7]

train_data

	7	6	0	760
0	said\nit's all squash. The best I could get\ni...	came fiom the last place to this\nplace, and t...	lie	said\nit's all squash. The best I could get\ni...
1	\ninto a proper perspective with those\nminor ...	MB. BOOT'S POLITICAL OBEED\nAttempt to imagine...	himself	\ninto a proper perspective with those\nminor ...
2	all notU\nashore and afloat arc subjects for I...	"Thera were in 1771 only aeventy-nine\nuberl...	of	all notU\nashore and afloat arc subjects for I...
3	ceucju l< d no; <o waste it nud so\nsunk it in...	A gixnl man y nitereRtiiiv dii-clos-\nur«s reg...	ably	ceucju l< d no; <o waste it nud so\nsunk it in...
4	ascertained w? OCt the COOltS of ibis\nletale ...	Tin: 188UB TV THF BBABBT QABJE\nMr. Schiffs *t...	j	ascertained w? OCt the COOltS of ibis\nletale ...
...	...	...	...	...
432017	\nSam was arrested.\nThe case excited a great ...	Sam Clendenin bad a fancy for Ui«\nscience of ...	and	\nSam was arrested.\nThe case excited a great ...
432018	through the alnp the »Uitors laapeeeed tia.»\n...	Wita.htt halting the party ware dilven to the ...	paasliic	through the alnp the »Uitors laapeeeed tia.»\n...
432019	Agua Negra across the line.\nIt was a grim pla...	It was the last thing that either of\nthem exp...	for	Agua Negra across the line.\nIt was a grim pla...
432020	\na note of Wood, Dialogue fc Co., for\nc27,im...	settlement with the department.\nIt is also sh...	for	\na note of Wood, Dialogue fc Co., for\nc27,im...
432021	3214c;do White at 3614c: Mixed Western at\n331...	Flour quotations—low extras at 1 R0®2 50;\ncit...	at	3214c;do White at 3614c: Mixed Western at\n331...

432022 rows × 4 columns


# Two-level table whose missing entries start at zero, so counts can be
# incremented without initializing keys first.
model = defaultdict(lambda: defaultdict(int))

# Count and normalize bigrams from the training frame (fills `model` in place).
train_model(data=train_data, model=model)

# Write predictions for the development split.
predict_data(
    read_path="dev-0/in.tsv.xz",
    save_path="dev-0/out.tsv",
    model=model,
)

C:\Users\Norbert\AppData\Local\Temp\ipykernel_15436\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.


  data = pd.read_csv(

# Write predictions for the test-A split with the already trained model.
predict_data(
    read_path="test-A/in.tsv.xz",
    save_path="test-A/out.tsv",
    model=model,
)

C:\Users\Norbert\AppData\Local\Temp\ipykernel_15436\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.


  data = pd.read_csv(

14 KiB Raw Permalink Blame History Unescape Escape

14 KiB

Raw Permalink Blame History