initial commit

2024-05-20 07:11:19 +02:00 · 2024-05-20 07:11:19 +02:00 · ed502c693f
commit ed502c693f
12 changed files with 125499 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,7 @@
 fasttext_100_3_polish.bin
 fasttext_100_3_polish.bin.trainables.syn1neg.npy
 fasttext_100_3_polish.bin.trainables.vectors_ngrams_lockf.npy
 fasttext_100_3_polish.bin.trainables.vectors_vocab_lockf.npy
 fasttext_100_3_polish.bin.wv.vectors_ngrams.npy
 fasttext_100_3_polish.bin.wv.vectors_vocab.npy
 fasttext_100_3_polish.bin.wv.vectors.npy
--- a/README.md
+++ b/README.md
@ -0,0 +1,25 @@
 Sport Texts Classification Challenge - Ball
 ======================
 Given a Polish sports article, guess whether the sport it describes involves a ball. Evaluation metrics: Accuracy, Likelihood.
 Classes
 -------
 * `1` — ball
 * `0` — no-ball
 Directory structure
 -------------------
 * `README.md` — this file
 * `config.txt` — configuration file
 * `train/` — directory with training data
 * `train/train.tsv` — sample train set
 * `dev-0/` — directory with dev (test) data
 * `dev-0/in.tsv` — input data for the dev set
 * `dev-0/expected.tsv` — expected (reference) data for the dev set
 * `test-A/` — directory with test data
 * `test-A/in.tsv` — input data for the test set
 * `test-A/expected.tsv` — expected (reference) data for the test set
--- a/config.txt
+++ b/config.txt
@ -0,0 +1 @@
 --metric Likelihood --metric Accuracy --precision 5
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/link.tsv
+++ b/link.tsv
@ -0,0 +1 @@
 Link: https://git.wmi.amu.edu.pl/s464861/Word2Vec
--- a/main.py
+++ b/main.py
@ -0,0 +1,83 @@
 import pandas as pd
 from gensim.models import KeyedVectors
 import gensim
 import numpy as np
 from sklearn.model_selection import train_test_split
 import tensorflow as tf
 from keras.optimizers import Adam
 class TextClassifier:
    def __init__(self, vector_size=100, model_path="fasttext_100_3_polish.bin"):
        """Load the embedding model and build the (still untrained) network.

        Args:
            vector_size: Length of the document vectors; used as the input
                width of the network and as the size of the vectors produced
                by ``preprocess_text``.
            model_path: Path passed to ``KeyedVectors.load``.
        """
        # Stored first because _build_model() reads it for the input shape.
        self.vector_size = vector_size

        # NOTE(review): preprocess_text accesses ``.wv`` on this object, so the
        # file is presumably a full gensim model rather than bare vectors -
        # confirm against the file that is actually shipped.
        embeddings = KeyedVectors.load(model_path)
        self.word2vec = embeddings

        network = self._build_model()
        self.model = network
    def _build_model(self):
        """Create and compile the feed-forward binary classifier.

        The network takes an input of length ``self.vector_size``, applies two
        ReLU dense layers (64 and 32 units) and ends in one sigmoid unit.

        Returns:
            The compiled ``tf.keras.Sequential`` model.
        """
        keras_layers = tf.keras.layers

        stack = [keras_layers.Input(shape=(self.vector_size,))]
        # Hidden layers, widest first.
        stack.extend(
            keras_layers.Dense(units, activation="relu") for units in (64, 32)
        )
        # Single sigmoid output for the binary decision.
        stack.append(keras_layers.Dense(1, activation="sigmoid"))

        network = tf.keras.Sequential(stack)
        network.compile(
            optimizer=Adam(learning_rate=0.001),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )
        return network
    def load_data(self, train_path, dev_path, test_path):
        self.train_data = pd.read_csv(train_path, sep="\t", names=["Class", "Text"])
        self.dev_data = pd.read_csv(dev_path, sep="\t", names=["Text"])
        self.test_data = pd.read_csv(test_path, sep="\t", names=["Text"])
    def preprocess_text(self, text):
        tokens = gensim.utils.simple_preprocess(text)
        vector = np.zeros(self.vector_size)
        count = 0
        for token in tokens:
            if token in self.word2vec.wv:
                vector += self.word2vec.wv[token]
                count += 1
        if count > 0:
            vector /= count
        return vector
    def prepare_datasets(self):
        """Vectorize all three datasets and split off a validation subset.

        Fills ``self.train_vectors``, ``self.dev_vectors`` and
        ``self.test_vectors`` with one ``preprocess_text`` vector per row of
        the corresponding ``Text`` column, then splits the training vectors
        and ``Class`` labels 90/10 into ``X_train``/``X_val`` and
        ``y_train``/``y_val`` with a fixed random seed.
        """

        def embed(frame):
            # One document vector per row of the frame's "Text" column.
            return np.array([self.preprocess_text(doc) for doc in frame["Text"]])

        self.train_vectors = embed(self.train_data)
        self.dev_vectors = embed(self.dev_data)
        self.test_vectors = embed(self.test_data)

        labels = self.train_data["Class"]
        split = train_test_split(
            self.train_vectors, labels, test_size=0.1, random_state=42
        )
        self.X_train, self.X_val, self.y_train, self.y_val = split
    def train_model(self, epochs=30, batch_size=16):
        self.model.fit(
            self.X_train,
            self.y_train,
            validation_data=(self.X_val, self.y_val),
            epochs=epochs,
            batch_size=batch_size
        )
    def predict_and_save(self, data, output_path):
        predictions = (self.model.predict(data) > 0.5).astype(int)
        pd.DataFrame(predictions).to_csv(output_path, index=False, header=False)
    def run(self):
        self.load_data("train/train.tsv", "dev-0/in.tsv", "test-A/in.tsv")
        self.prepare_datasets()
        self.train_model()
        self.predict_and_save(self.test_vectors, "test-A/out.tsv")
        self.predict_and_save(self.dev_vectors, "dev-0/out.tsv")
 if __name__ == "__main__":
    # Script entry point: build the classifier with its defaults and run the
    # full train-and-predict pipeline.
    TextClassifier().run()
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/train.tsv
+++ b/train/train.tsv
--- a/train/train.tsv.gz
+++ b/train/train.tsv.gz
		`@ -0,0 +1 @@`
							`--metric Likelihood --metric Accuracy --precision 5`
		`@ -0,0 +1 @@`
							`Link: https://git.wmi.amu.edu.pl/s464861/Word2Vec`