s430705

2022-04-03 19:43:11 +02:00 · 2022-04-03 19:43:11 +02:00 · 290a1f802c
commit 290a1f802c
parent 68537ae8d2
3 changed files with 16190 additions and 16197 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@@ -13,16 +13,9 @@ DEFAULT_PREDICTION = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
 def preprocess_text(text):
-    # remove punctuation
+    text = text.lower().replace("-\\n", "").replace("\\n", " ")
-    text = text.translate(str.maketrans(' ', ' ', string.punctuation))
+    text = re.sub(r"\p{P}", "", text)
-    # only alphabets and numerics
+
    text = re.sub('[^a-zA-Z]', ' ', text)
    # replace newline with space
    text = re.sub("\n", " ", text)
    # lower case
    text = text.lower()
    # split and join the words
    text = ' '.join(text.split())
    return text
@@ -102,7 +95,7 @@ with open("dev-0/out.tsv", "w") as file:
        text = preprocess_text(str(row[7]))
        words = word_tokenize(text)
        if len(words) < 3:
-            prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
+            prediction = DEFAULT_PREDICTION
        else:
            prediction = predict_probs(words[0], words[1])
        file.write(prediction + "\n")
@@ -112,7 +105,7 @@ with open("test-A/out.tsv", "w") as file:
        text = preprocess_text(str(row[7]))
        words = word_tokenize(text)
        if len(words) < 3:
-            prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
+            prediction = DEFAULT_PREDICTION
        else:
            prediction = predict_probs(words[0], words[1])
        file.write(prediction + "\n")
--- a/test-A/out.tsv
+++ b/test-A/out.tsv