v2 20000 rows top 12

2022-04-02 17:35:49 +02:00 · 2022-04-02 17:35:49 +02:00 · e15e94a20c
commit e15e94a20c
parent 20f3c70aea
3 changed files with 17765 additions and 17758 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@ -1,8 +1,16 @@
 import pandas as pd
 import csv
+import regex as re
 from nltk import trigrams, word_tokenize
 from collections import Counter, defaultdict

+def clean_text(text):
+    text = text.lower().replace('-\\n', '').replace('\\n', ' ')
+    text = re.sub(r'\p{P}', '', text)
+
+    return text
+
+
 train_data = pd.read_csv('train/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
 train_labels = pd.read_csv('train/expected.tsv', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)

@ -14,13 +22,12 @@ train_data['final'] = train_data[6] + train_data[0] + train_data[7]
 model = defaultdict(lambda: defaultdict(lambda: 0))

 for index, row in train_data.iterrows():
-    text = str(row['final']).lower()
-    text = text.replace('-\\n', '')
-    text = text.replace('\\n', ' ')
+    text = clean_text(str(row['final']))
    words = word_tokenize(text)
    for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
+        if w1 and w2 and w3:
            model[(w2, w3)][w1] += 1
-    if index > 10000:
+    if index > 20000:
        break

 for w2_w3 in model:
@ -39,6 +46,9 @@ def predict_probs(word1, word2):
        total_prob += prob
        str_prediction += f'{word}:{prob} '

+    if total_prob == 0.0:
+        return 'the:0.3 be:0.2 to:0.2 of:0.2 :0.1'
+
    remaining_prob = 1 - total_prob

    if remaining_prob < 0.0001:
@ -48,29 +58,26 @@ def predict_probs(word1, word2):
    
    return str_prediction

+
 dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
 test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)

 with open('dev-0/out.tsv', 'w') as file:
    for index, row in dev_data.iterrows():
-        text = str(row[7]).lower()
-        text = text.replace('-\\n', '')
-        text = text.replace('\\n', ' ')
+        text = clean_text(str(row[7]))
        words = word_tokenize(text)
        if len(words) < 4:
-            prediction = 'and:0.01 :0.99'
+            prediction = 'the:0.3 be:0.2 to:0.2 of:0.2 :0.1'
        else:
            prediction = predict_probs(words[0], words[1])
        file.write(prediction + '\n')

 with open('test-A/out.tsv', 'w') as file:
    for index, row in test_data.iterrows():
-        text = str(row[7]).lower()
-        text = text.replace('-\\n', '')
-        text = text.replace('\\n', ' ')
+        text = clean_text(str(row[7]))
        words = word_tokenize(text)
        if len(words) < 4:
-            prediction = 'and:0.01 :0.99'
+            prediction = 'the:0.3 be:0.2 to:0.2 of:0.2 :0.1'
        else:
            prediction = predict_probs(words[0], words[1])
        file.write(prediction + '\n')
--- a/test-A/out.tsv
+++ b/test-A/out.tsv