import pandas as pd
from gensim.models import KeyedVectors
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.optimizers import Adam


class TextClassifier:
    """Binary text classifier over averaged word embeddings.

    Each text is tokenized, its tokens are looked up in a pre-trained
    embedding table, and the mean vector is fed to a small dense network
    with a single sigmoid output.
    """

    def __init__(self, vector_size=100, model_path="fasttext_100_3_polish.bin"):
        """Load the embeddings from ``model_path`` and build the network.

        Args:
            vector_size: Length of the feature vector fed to the network.
                It has to match the dimensionality of the loaded embeddings,
                otherwise the accumulation in ``preprocess_text`` fails.
            model_path: File passed to ``KeyedVectors.load``.
        """
        self.vector_size = vector_size
        self.word2vec = KeyedVectors.load(model_path)
        # ``load`` can hand back either a full model (vectors live under
        # ``.wv``) or a bare KeyedVectors object, which has no ``.wv``
        # attribute in gensim >= 4. Resolve the lookup table once here so
        # both cases work.
        self._vectors = getattr(self.word2vec, "wv", self.word2vec)
        self.model = self._build_model()

    def _build_model(self):
        """Return a compiled dense network: 64 -> 32 -> 1 (sigmoid)."""
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(self.vector_size,)),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ])
        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )
        return model

    def load_data(self, train_path, dev_path, test_path):
        """Read the three tab-separated, header-less input files.

        The training file is read with columns ``Class`` and ``Text``; the
        dev and test files with a single ``Text`` column. Results are stored
        on ``self.train_data``, ``self.dev_data`` and ``self.test_data``.
        """
        # NOTE(review): with pandas' default quoting a stray '"' in a text
        # field can merge several rows into one. If the TSVs contain raw
        # quote characters, consider ``quoting=csv.QUOTE_NONE`` - confirm
        # against the actual data before changing.
        self.train_data = pd.read_csv(train_path, sep="\t", names=["Class", "Text"])
        self.dev_data = pd.read_csv(dev_path, sep="\t", names=["Text"])
        self.test_data = pd.read_csv(test_path, sep="\t", names=["Text"])

    def preprocess_text(self, text):
        """Return the mean embedding of the known tokens in ``text``.

        Args:
            text: The raw text. ``None``/NaN (what pandas yields for an
                empty row) is treated as an empty string; any other
                non-string value is converted with ``str``.

        Returns:
            A float64 array of length ``vector_size``; all zeros when no
            token of the text is present in the embedding table.
        """
        if text is None or (isinstance(text, float) and np.isnan(text)):
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        vectors = self._vectors
        total = np.zeros(self.vector_size)
        count = 0
        for token in gensim.utils.simple_preprocess(text):
            if token in vectors:
                total += vectors[token]
                count += 1
        if count > 0:
            total /= count
        return total

    def prepare_datasets(self):
        """Vectorize all loaded texts and split off a validation set.

        Requires ``load_data`` to have been called. 10% of the training
        rows are held out for validation (fixed ``random_state=42``).
        """
        self.train_vectors = np.array(
            [self.preprocess_text(text) for text in self.train_data["Text"]]
        )
        self.dev_vectors = np.array(
            [self.preprocess_text(text) for text in self.dev_data["Text"]]
        )
        self.test_vectors = np.array(
            [self.preprocess_text(text) for text in self.test_data["Text"]]
        )
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.train_vectors,
            self.train_data["Class"],
            test_size=0.1,
            random_state=42,
        )

    def train_model(self, epochs=30, batch_size=16):
        """Fit the network on the training split, validating on the held-out split."""
        self.model.fit(
            self.X_train,
            self.y_train,
            validation_data=(self.X_val, self.y_val),
            epochs=epochs,
            batch_size=batch_size,
        )

    def predict_and_save(self, data, output_path):
        """Predict labels for ``data`` and write them to ``output_path``.

        Sigmoid outputs above 0.5 become 1, everything else 0. The file is
        written without index or header.
        """
        predictions = (self.model.predict(data) > 0.5).astype(int)
        pd.DataFrame(predictions).to_csv(output_path, index=False, header=False)

    def run(
        self,
        train_path="train/train.tsv",
        dev_path="dev-0/in.tsv",
        test_path="test-A/in.tsv",
        dev_output_path="dev-0/out.tsv",
        test_output_path="test-A/out.tsv",
    ):
        """Run the whole pipeline: load, vectorize, train, predict, save.

        All paths default to the locations the script originally had
        hard-coded, so ``run()`` with no arguments behaves as before.
        """
        self.load_data(train_path, dev_path, test_path)
        self.prepare_datasets()
        self.train_model()
        self.predict_and_save(self.test_vectors, test_output_path)
        self.predict_and_save(self.dev_vectors, dev_output_path)


if __name__ == "__main__":
    classifier = TextClassifier()
    classifier.run()