s430705

2022-04-03 19:36:26 +02:00 · 2022-04-03 19:36:26 +02:00 · 68537ae8d2
commit 68537ae8d2
parent 206774da84
3 changed files with 13486 additions and 10379 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/run.py
+++ b/run.py
@ -13,12 +13,6 @@ DEFAULT_PREDICTION = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
 def preprocess_text(text):
    # normalize text
    text = (
        unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode(
            'utf-8', 'ignore'))
    # replace html chars with ' '
    text = re.sub('<.*?>', ' ', text)
    # remove punctuation
    text = text.translate(str.maketrans(' ', ' ', string.punctuation))
    # only alphabets and numerics
@ -56,18 +50,6 @@ def predict_probs(word1, word2):
    return str_prediction
 def prepare_output(file_path):
    """Write one prediction line per row of the module-level ``test_data``.

    For every row, column 7 is converted to ``str``, passed through
    ``preprocess_text`` and tokenized with ``word_tokenize``.  Rows that
    yield fewer than four tokens receive ``DEFAULT_PREDICTION``; all other
    rows receive ``predict_probs`` applied to their first two tokens.

    Args:
        file_path: Destination path; opened in text write mode, so any
            existing file at that path is overwritten.
    """
    with open(file_path, 'w') as out:
        for _, record in test_data.iterrows():
            tokens = word_tokenize(preprocess_text(str(record[7])))
            # Only rows with at least four tokens are sent to the model.
            if len(tokens) >= 4:
                line = predict_probs(tokens[0], tokens[1])
            else:
                line = DEFAULT_PREDICTION
            out.write(line + '\n')
 def train_model(training_data):
    for index, row in training_data.iterrows():
        text = preprocess_text(str(row["final"]))
@ -90,15 +72,16 @@ data = pd.read_csv(
    warn_bad_lines=False,
    header=None,
    quoting=csv.QUOTE_NONE,
-    nrows=200000,
+    nrows=100000,
 )
 train_labels = pd.read_csv(
    "train/expected.tsv",
    sep="\t",
    error_bad_lines=False,
    header=None,
    quoting=csv.QUOTE_NONE,
-    nrows=200000,
+    nrows=100000,
 )
 train_data = data[[6, 7]]
@ -113,5 +96,24 @@ test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', error_bad_lines=False, war
 train_model(train_data)
-prepare_output("dev-0/out.tsv")
+
-prepare_output("test-A/out.tsv")
+with open("dev-0/out.tsv", "w") as file:
    # Write one prediction line for each row of dev_data, in iteration order.
    for _, row in dev_data.iterrows():
        # Column 7 is stringified and passed through preprocess_text before
        # being tokenized.
        text = preprocess_text(str(row[7]))
        words = word_tokenize(text)
        # Fewer than three tokens: emit the fixed fallback distribution.
        if len(words) < 3:
            prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
        else:
            # Only the first two tokens are handed to predict_probs.
            prediction = predict_probs(words[0], words[1])
        file.write(prediction + "\n")
 with open("test-A/out.tsv", "w") as file:
    # Emit one prediction line for every row of test_data.
    for _, row in test_data.iterrows():
        text = preprocess_text(str(row[7]))
        words = word_tokenize(text)
        # With at least three tokens the first two are passed to
        # predict_probs; shorter rows get the fixed fallback distribution.
        prediction = (
            predict_probs(words[0], words[1])
            if len(words) >= 3
            else "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
        )
        file.write(prediction + "\n")
--- a/test-A/out.tsv
+++ b/test-A/out.tsv