aaaa

2022-04-05 19:08:22 +02:00 · 2022-04-05 19:08:22 +02:00 · 8a8e1a8307
commit 8a8e1a8307
parent a459bfbb6f
4 changed files with 17746 additions and 17735 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@ -36,28 +36,6 @@ def train_model(data, model):
            model[w2][w1] /= total_count
 def predict(word, model):
     """Format the most likely successors of *word* as a prediction line.

     Args:
         word: Context token to look up in ``model``.
         model: Mapping of token -> mapping of candidate token -> score
             (the scores are summed and treated as probabilities here).

     Returns:
         A space-separated string of ``candidate:score`` pairs for the five
         highest-scoring candidates, followed by ``:<remainder>`` where the
         remainder is ``1 - sum(scores)`` floored at 0.01. When ``word`` has
         no candidates (or their scores sum to zero) a fixed fallback
         distribution is returned instead.
     """
     fallback = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

     # .get() instead of model[word]: an unseen word must neither raise
     # KeyError on a plain dict nor insert an empty entry into a defaultdict.
     top_candidates = Counter(model.get(word, {})).most_common(5)

     total_prob = 0.0
     parts = []
     for candidate, prob in top_candidates:
         total_prob += prob
         parts.append(f"{candidate}:{prob}")

     if not total_prob:
         return fallback

     # Mass left for all other words; never report less than 0.01.
     remainder = 1 - total_prob
     parts.append(f":{remainder if remainder >= 0.01 else 0.01}")
     return " ".join(parts)
 def predict_data(read_path, save_path, model):
    data = get_csv(read_path)
@ -67,9 +45,32 @@ def predict_data(read_path, save_path, model):
            if len(words) < 3:
                prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
            else:
-                prediction = predict(words[-1], model)
+                prediction = predict(words[0], model)
            f.write(prediction + "\n")
 def predict(word, model):
     """Format the most likely successors of *word* as a prediction line.

     Args:
         word: Context token to look up in ``model``.
         model: Mapping of token -> mapping of candidate token -> score
             (the scores are summed and treated as probabilities here).

     Returns:
         A space-separated string of ``candidate:score`` pairs for the six
         highest-scoring candidates, followed by ``:<remainder>`` where the
         remainder is ``1 - sum(scores)`` floored at 0.01. When ``word`` has
         no candidates (or their scores sum to zero) a fixed fallback
         distribution is returned instead.
     """
     # .get() instead of model[word]: an unseen word must neither raise
     # KeyError on a plain dict nor insert an empty entry into a defaultdict.
     ranked = Counter(model.get(word, {})).most_common(6)

     total_prob = 0.0
     pieces = []
     for candidate, prob in ranked:
         total_prob += prob
         pieces.append(f"{candidate}:{prob}")

     if total_prob == 0.0:
         return "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

     # Mass left for all other words; never report less than 0.01.
     rem_prob = max(1 - total_prob, 0.01)
     pieces.append(f":{rem_prob}")
     return " ".join(pieces)
 # Script entry point: call main() only when this module is executed directly,
 # not when it is imported.
 if __name__ == "__main__":
    main()
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/utils.py
+++ b/utils.py
@ -5,9 +5,19 @@ from csv import QUOTE_NONE
 ENCODING = "utf-8"
 REP = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+")
 REM = re.compile(r"'s|[\-]\\n|\-\\n|\p{P}")
 def clean_text(text):
     """Normalise raw text: lowercase, expand contractions, strip punctuation.

     The steps are ordered so that every rewrite that depends on an
     apostrophe or on a literal backslash-n escape runs *before* ``REM``,
     because ``REM`` deletes ``'s`` and every ``\\p{P}`` punctuation
     character (which includes ``'`` and the backslash). Running the
     contraction replacements after ``REM`` would make them dead code, and
     a bare backslash-n escape would lose its backslash and leave a stray
     ``n`` glued to the neighbouring words.

     NOTE(review): ``\\p{P}`` in ``REM`` needs a regex engine with Unicode
     property support -- presumably ``re`` is the third-party ``regex``
     module in this file; confirm against the imports.

     Args:
         text: Any value; it is converted with ``str()`` first.

     Returns:
         The cleaned, lower-cased string.
     """
     res = str(text).lower().strip()
     res = res.replace("’", "'")

     # Literal "\n" escapes in the data: a hyphenated line break joins the
     # two halves of a word, a plain one separates two words.
     res = res.replace("-\\n", "").replace("\\n", " ")

     # Expand contractions while the apostrophes are still present.
     res = res.replace("won't", "will not")
     res = res.replace("'ll", " will")
     res = res.replace("'m", " am")

     # REM drops "'s" and all remaining punctuation; REP then turns
     # brackets, digits, tabs and newlines into spaces.
     res = REM.sub("", res)
     return REP.sub(" ", res)
 def get_csv(fname):