s430705

2022-04-03 19:36:26 +02:00 · 2022-04-03 19:36:26 +02:00 · 68537ae8d2
commit 68537ae8d2
parent 206774da84
3 changed files with 13486 additions and 10379 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@@ -13,12 +13,6 @@ DEFAULT_PREDICTION = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'


 def preprocess_text(text):
-    # normalize text
-    text = (
-        unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode(
-            'utf-8', 'ignore'))
-    # replace html chars with ' '
-    text = re.sub('<.*?>', ' ', text)
    # remove punctuation
    text = text.translate(str.maketrans(' ', ' ', string.punctuation))
    # only alphabets and numerics
@@ -56,18 +50,6 @@ def predict_probs(word1, word2):
    return str_prediction


-def prepare_output(file_path):
-    with open(file_path, 'w') as file:
-        for index, row in test_data.iterrows():
-            text = preprocess_text(str(row[7]))
-            words = word_tokenize(text)
-            if len(words) < 4:
-                prediction = DEFAULT_PREDICTION
-            else:
-                prediction = predict_probs(words[0], words[1])
-            file.write(prediction + '\n')
-
-
 def train_model(training_data):
    for index, row in training_data.iterrows():
        text = preprocess_text(str(row["final"]))
@@ -90,15 +72,16 @@ data = pd.read_csv(
    warn_bad_lines=False,
    header=None,
    quoting=csv.QUOTE_NONE,
-    nrows=200000,
+    nrows=100000,
 )
+
 train_labels = pd.read_csv(
    "train/expected.tsv",
    sep="\t",
    error_bad_lines=False,
    header=None,
    quoting=csv.QUOTE_NONE,
-    nrows=200000,
+    nrows=100000,
 )

 train_data = data[[6, 7]]
@@ -113,5 +96,24 @@ test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', error_bad_lines=False, war


 train_model(train_data)
-prepare_output("dev-0/out.tsv")
-prepare_output("test-A/out.tsv")
+
+with open("dev-0/out.tsv", "w") as file:
+    for _, row in dev_data.iterrows():
+        text = preprocess_text(str(row[7]))
+        words = word_tokenize(text)
+        if len(words) < 3:
+            prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
+        else:
+            prediction = predict_probs(words[0], words[1])
+        file.write(prediction + "\n")
+
+with open("test-A/out.tsv", "w") as file:
+    for _, row in test_data.iterrows():
+        text = preprocess_text(str(row[7]))
+        words = word_tokenize(text)
+        if len(words) < 3:
+            prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
+        else:
+            prediction = predict_probs(words[0], words[1])
+        file.write(prediction + "\n")
+
--- a/test-A/out.tsv
+++ b/test-A/out.tsv