test

2024-05-15 20:16:07 +02:00 · 2024-05-15 20:16:07 +02:00 · a3db769933
commit a3db769933
parent 7ddece883e
12 changed files with 125509 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+word2vec_100_3_polish.bin
+word2vec_100_3_polish.bin.syn0.npy
--- a/README.md
+++ b/README.md
@ -0,0 +1,25 @@
+
+Sport Texts Classification Challenge - Ball
+======================
+
+Given a Polish article about a sport, guess whether that sport is played with a ball. Evaluation metrics: Accuracy, Likelihood.
+
+Classes
+-------
+
+* `1` — ball
+* `0` — no-ball
+
+Directory structure
+-------------------
+
+* `README.md` — this file
+* `config.txt` — configuration file
+* `train/` — directory with training data
+* `train/train.tsv` — sample train set
+* `dev-0/` — directory with dev (test) data
+* `dev-0/in.tsv` — input data for the dev set
+* `dev-0/expected.tsv` — expected (reference) data for the dev set
+* `test-A/` — directory with test data
+* `test-A/in.tsv` — input data for the test set
+* `test-A/expected.tsv` — expected (reference) data for the test set
--- a/config.txt
+++ b/config.txt
@ -0,0 +1 @@
+--metric Likelihood --metric Accuracy --precision 5
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/main.py
+++ b/main.py
--- a/run.py
+++ b/run.py
@ -0,0 +1,101 @@
+import pandas as pd
+from gensim.models import KeyedVectors
+import tensorflow as tf
+import numpy as np
+
+
+def read_data():
+    """Load the train, dev-0 and test-A tables from their TSV files.
+
+    Returns:
+        A ``(train, dev_0, test_A)`` tuple of DataFrames. ``train`` has the
+        columns ``Class`` and ``Text``; the other two have only ``Text``.
+    """
+    # Options common to all three files: tab-separated, malformed lines dropped.
+    # NOTE(review): a dropped line in dev-0/test-A would leave fewer
+    # predictions than input lines -- confirm the inputs never trigger it.
+    shared_options = {"sep": "\t", "on_bad_lines": "skip"}
+
+    train_frame = pd.read_csv(
+        "train/train.tsv.gz",
+        compression="gzip",
+        names=["Class", "Text"],
+        **shared_options,
+    )
+    # The unlabeled inputs are read in the original order: dev-0, then test-A.
+    dev_frame, test_frame = (
+        pd.read_csv(path, names=["Text"], **shared_options)
+        for path in ("dev-0/in.tsv", "test-A/in.tsv")
+    )
+
+    return train_frame, dev_frame, test_frame
+
+
+def text_to_vector(text, word2vec, vector_size):
+    """Embed a text as the mean of its word vectors.
+
+    The text is split on whitespace. Vectors of tokens found in ``word2vec``
+    are summed and the sum is divided by the total number of tokens, so
+    tokens missing from ``word2vec`` count as zero vectors.
+
+    Args:
+        text: The text to embed. Anything that is not a ``str`` (for example
+            the float NaN pandas produces for an empty cell) is treated as
+            an empty text.
+        word2vec: Mapping-like object supporting ``word in word2vec`` and
+            ``word2vec[word]``.
+        vector_size: Length of the returned vector.
+
+    Returns:
+        A NumPy array of length ``vector_size``; all zeros when the text
+        contains no tokens.
+    """
+    words = text.split() if isinstance(text, str) else []
+    text_vector = np.zeros(vector_size)
+    if not words:
+        # Avoid 0 / 0, which would fill the vector with NaN.
+        return text_vector
+    for word in words:
+        if word in word2vec:
+            text_vector += word2vec[word]
+    return text_vector / len(words)
+
+
+def main():
+    """Train a small dense classifier on averaged word vectors and save predictions.
+
+    Reads the datasets, embeds every text with ``text_to_vector``, fits a
+    three-layer Keras network on the training set, and writes thresholded
+    0/1 predictions to ``test-A/out.tsv`` and ``dev-0/out.tsv``.
+    """
+    train_dataset, dev_0_dataset, test_A_dataset = read_data()
+
+    # Embedding width; also the network's input size.
+    # NOTE(review): presumably matches the vectors stored in the file below.
+    vector_size = 100
+
+    # Load the saved word vectors (nothing is trained at this step).
+    word2vec = KeyedVectors.load("word2vec_100_3_polish.bin")
+
+    def embed(frame):
+        # One averaged vector per row of the "Text" column.
+        return np.array(
+            [text_to_vector(text, word2vec, vector_size) for text in frame["Text"]]
+        )
+
+    train_vectors = embed(train_dataset)
+    dev_0_vectors = embed(dev_0_dataset)
+    test_A_vectors = embed(test_A_dataset)
+
+    # Simple feed-forward binary classifier with a sigmoid output.
+    layers = tf.keras.layers
+    model = tf.keras.Sequential(
+        [
+            layers.Input(shape=(vector_size,)),
+            layers.Dense(64, activation="relu"),
+            layers.Dense(32, activation="relu"),
+            layers.Dense(1, activation="sigmoid"),
+        ]
+    )
+    model.compile(
+        optimizer="adam",
+        loss="binary_crossentropy",
+        metrics=["accuracy"],
+    )
+    model.fit(
+        train_vectors,
+        train_dataset["Class"],
+        epochs=10,
+        batch_size=16,
+    )
+
+    # Predict and save for test-A first, then dev-0.
+    # NOTE(review): hard 0/1 labels are written although config.txt also lists
+    # the Likelihood metric -- confirm this is what the evaluator expects.
+    outputs = (
+        (test_A_vectors, "test-A/out.tsv"),
+        (dev_0_vectors, "dev-0/out.tsv"),
+    )
+    for vectors, out_path in outputs:
+        hard_labels = (model.predict(vectors) > 0.5).astype(int)
+        pd.DataFrame(hard_labels).to_csv(out_path, index=False, header=False)
+
+
+# Run the pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/train.tsv
+++ b/train/train.tsv
--- a/train/train.tsv.gz
+++ b/train/train.tsv.gz
				`@ -0,0 +1 @@`
				`--metric Likelihood --metric Accuracy --precision 5`