initial commit

2024-05-20 07:11:19 +02:00 · 2024-05-20 07:11:19 +02:00 · ed502c693f
commit ed502c693f
12 changed files with 125499 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,7 @@
+fasttext_100_3_polish.bin
+fasttext_100_3_polish.bin.trainables.syn1neg.npy
+fasttext_100_3_polish.bin.trainables.vectors_ngrams_lockf.npy
+fasttext_100_3_polish.bin.trainables.vectors_vocab_lockf.npy
+fasttext_100_3_polish.bin.wv.vectors_ngrams.npy
+fasttext_100_3_polish.bin.wv.vectors_vocab.npy
+fasttext_100_3_polish.bin.wv.vectors.npy
--- a/README.md
+++ b/README.md
@ -0,0 +1,25 @@
+
+Sport Texts Classification Challenge - Ball
+======================
+
+Guess whether a Polish article is about a sport played with a ball. Evaluation metrics: Accuracy, Likelihood.
+
+Classes
+-------
+
+* `1` — ball
+* `0` — no-ball
+
+Directory structure
+-------------------
+
+* `README.md` — this file
+* `config.txt` — configuration file
+* `train/` — directory with training data
+* `train/train.tsv` — sample train set
+* `dev-0/` — directory with dev (test) data
+* `dev-0/in.tsv` — input data for the dev set
+* `dev-0/expected.tsv` — expected (reference) data for the dev set
+* `test-A/` — directory with test data
+* `test-A/in.tsv` — input data for the test set
+* `test-A/expected.tsv` — expected (reference) data for the test set
--- a/config.txt
+++ b/config.txt
@ -0,0 +1 @@
+--metric Likelihood --metric Accuracy --precision 5
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/link.tsv
+++ b/link.tsv
@ -0,0 +1 @@
+Link: https://git.wmi.amu.edu.pl/s464861/Word2Vec
--- a/main.py
+++ b/main.py
@ -0,0 +1,83 @@
+import csv
+
+import gensim
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from gensim.models import KeyedVectors
+from keras.optimizers import Adam
+from sklearn.model_selection import train_test_split
+
+class TextClassifier:
+    def __init__(self, vector_size=100, model_path="fasttext_100_3_polish.bin"):
+        self.vector_size = vector_size
+        self.word2vec = KeyedVectors.load(model_path)
+        self.model = self._build_model()
+
+    def _build_model(self):
+        model = tf.keras.Sequential([
+            tf.keras.layers.Input(shape=(self.vector_size,)),
+            tf.keras.layers.Dense(64, activation="relu"),
+            tf.keras.layers.Dense(32, activation="relu"),
+            tf.keras.layers.Dense(1, activation="sigmoid")
+        ])
+        model.compile(
+            optimizer=Adam(learning_rate=0.001),
+            loss="binary_crossentropy",
+            metrics=["accuracy"]
+        )
+        return model
+
+    def load_data(self, train_path, dev_path, test_path):
+        self.train_data = pd.read_csv(train_path, sep="\t", names=["Class", "Text"])
+        self.dev_data = pd.read_csv(dev_path, sep="\t", names=["Text"])
+        self.test_data = pd.read_csv(test_path, sep="\t", names=["Text"])
+
+    def preprocess_text(self, text):
+        tokens = gensim.utils.simple_preprocess(text)
+        vector = np.zeros(self.vector_size)
+        count = 0
+        for token in tokens:
+            if token in self.word2vec.wv:
+                vector += self.word2vec.wv[token]
+                count += 1
+        if count > 0:
+            vector /= count
+        return vector
+
+    def prepare_datasets(self):
+        self.train_vectors = np.array(
+            [self.preprocess_text(text) for text in self.train_data["Text"]]
+        )
+        self.dev_vectors = np.array(
+            [self.preprocess_text(text) for text in self.dev_data["Text"]]
+        )
+        self.test_vectors = np.array(
+            [self.preprocess_text(text) for text in self.test_data["Text"]]
+        )
+
+        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
+            self.train_vectors, self.train_data["Class"], test_size=0.1, random_state=42
+        )
+
+    def train_model(self, epochs=30, batch_size=16):
+        self.model.fit(
+            self.X_train,
+            self.y_train,
+            validation_data=(self.X_val, self.y_val),
+            epochs=epochs,
+            batch_size=batch_size
+        )
+
+    def predict_and_save(self, data, output_path):
+        predictions = (self.model.predict(data) > 0.5).astype(int)
+        pd.DataFrame(predictions).to_csv(output_path, index=False, header=False)
+
+    def run(self):
+        self.load_data("train/train.tsv", "dev-0/in.tsv", "test-A/in.tsv")
+        self.prepare_datasets()
+        self.train_model()
+        self.predict_and_save(self.test_vectors, "test-A/out.tsv")
+        self.predict_and_save(self.dev_vectors, "dev-0/out.tsv")
+
+if __name__ == "__main__":
+    classifier = TextClassifier()
+    classifier.run()
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/train.tsv
+++ b/train/train.tsv
--- a/train/train.tsv.gz
+++ b/train/train.tsv.gz
				`@ -0,0 +1 @@`
				`--metric Likelihood --metric Accuracy --precision 5`
				`@ -0,0 +1 @@`
				`Link: https://git.wmi.amu.edu.pl/s464861/Word2Vec`