"""Binary text classifier built on averaged word2vec embeddings.

Reads the train / dev-0 / test-A TSV files, represents every text as the mean
of the vectors of its in-vocabulary words, trains a small dense network on
the training split and writes 0/1 predictions to ``test-A/out.tsv`` and
``dev-0/out.tsv``.
"""

import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import KeyedVectors
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split


def read_data():
    """Load the three TSV datasets from their fixed relative paths.

    Returns:
        A ``(train, dev_0, test_A)`` tuple of DataFrames. ``train`` has the
        columns ``Class`` and ``Text``; the other two only have ``Text``.
    """
    # NOTE(review): pandas' default quoting and blank-line skipping apply
    # here. If the inputs can contain stray '"' characters or empty lines,
    # rows may be merged or dropped and the prediction files will no longer
    # line up with the inputs -- confirm against the data.
    train_dataset = pd.read_csv("train/train.tsv", sep="\t", names=["Class", "Text"])
    dev_0_dataset = pd.read_csv("dev-0/in.tsv", sep="\t", names=["Text"])
    test_A_dataset = pd.read_csv("test-A/in.tsv", sep="\t", names=["Text"])
    return train_dataset, dev_0_dataset, test_A_dataset


def text_to_vector(text, word2vec, vector_size):
    """Embed ``text`` as the mean of the vectors of its known words.

    Args:
        text: Whitespace-separated text. A missing value (``None`` / NaN, as
            pandas produces for an empty field) is treated as empty text;
            any other non-string value is converted with ``str``.
        word2vec: Either a model that exposes its vectors as ``.wv`` or the
            keyed-vectors object itself. The resolved object must support
            ``word in obj`` and ``obj[word]``.
        vector_size: Dimensionality of the word vectors.

    Returns:
        A float numpy array of shape ``(vector_size,)``. It is all zeros when
        no word of ``text`` is in the vocabulary.
    """
    # ``main`` passes the result of ``KeyedVectors.load``; depending on the
    # gensim version that object may not have a ``wv`` attribute, so fall
    # back to the object itself instead of failing with AttributeError.
    vectors = getattr(word2vec, "wv", word2vec)

    text_vector = np.zeros(vector_size)
    if not isinstance(text, str):
        if pd.isna(text):
            return text_vector
        text = str(text)

    word_count = 0
    for word in text.split():
        if word in vectors:
            text_vector += vectors[word]
            word_count += 1

    # Only average when at least one word was found (avoids dividing by zero).
    if word_count > 0:
        text_vector /= word_count
    return text_vector


def _embed_texts(texts, word2vec, vector_size):
    """Return an array with one ``text_to_vector`` row per entry of ``texts``."""
    return np.array([text_to_vector(text, word2vec, vector_size) for text in texts])


def _build_model(vector_size):
    """Build and compile the dense binary classifier over ``vector_size`` inputs."""
    model = tf.keras.Sequential(
        [
            tf.keras.layers.Input(shape=(vector_size,)),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ]
    )
    model.compile(
        optimizer=Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model


def _predict_and_save(model, vectors, out_path):
    """Predict on ``vectors``, threshold at 0.5 and write the labels to ``out_path``."""
    predictions = (model.predict(vectors) > 0.5).astype(int)
    pd.DataFrame(predictions).to_csv(out_path, index=False, header=False)


def main():
    """Train the classifier and write predictions for test-A and dev-0."""
    train_dataset, dev_0_dataset, test_A_dataset = read_data()

    # Word2Vec parameters
    vector_size = 300

    # Load the pre-trained word vectors (nothing is trained here)
    word2vec = KeyedVectors.load("word2vec_300_3_polish.bin")

    # Convert text to vectors
    train_vectors = _embed_texts(train_dataset["Text"], word2vec, vector_size)
    dev_0_vectors = _embed_texts(dev_0_dataset["Text"], word2vec, vector_size)
    test_A_vectors = _embed_texts(test_A_dataset["Text"], word2vec, vector_size)

    # Hold out 10% of the training data for validation
    train_vectors, val_vectors, train_labels, val_labels = train_test_split(
        train_vectors, train_dataset["Class"], test_size=0.1, random_state=42
    )

    # Train a simple neural network
    model = _build_model(vector_size)
    model.fit(
        train_vectors,
        train_labels,
        validation_data=(val_vectors, val_labels),
        epochs=30,
        batch_size=16,
    )

    # Predict on test set and save predictions
    _predict_and_save(model, test_A_vectors, "test-A/out.tsv")

    # Predict on dev-0 set and save predictions
    _predict_and_save(model, dev_0_vectors, "dev-0/out.tsv")


if __name__ == "__main__":
    main()