2024-05-15 20:16:07 +02:00
|
|
|
import pandas as pd
|
|
|
|
from gensim.models import KeyedVectors
|
2024-05-16 14:30:49 +02:00
|
|
|
from sklearn.model_selection import train_test_split
|
2024-05-15 20:16:07 +02:00
|
|
|
import tensorflow as tf
|
2024-05-16 14:30:49 +02:00
|
|
|
from keras.optimizers import Adam
|
2024-05-15 20:16:07 +02:00
|
|
|
import numpy as np
|
2024-05-19 22:42:01 +02:00
|
|
|
import gensim
|
2024-05-15 20:16:07 +02:00
|
|
|
|
|
|
|
|
|
|
|
def read_data():
    """Load the training, dev-0 and test-A datasets from their TSV files.

    The files are read as header-less, tab-separated text.  Quote handling
    is switched off (``csv.QUOTE_NONE``) so that a double quote at the start
    of a field is kept as a literal character.  With pandas' default
    quoting such a quote opens a quoted field that can swallow the following
    lines, producing fewer rows than the file has lines and therefore a
    prediction file that no longer lines up with the input.

    Returns:
        tuple: ``(train_dataset, dev_0_dataset, test_A_dataset)`` data
        frames.  The training frame has the columns ``Class`` and ``Text``;
        the other two only have ``Text``.
    """
    import csv  # local import: only needed for the quoting constant

    # Options shared by all three files.
    read_options = {"sep": "\t", "quoting": csv.QUOTE_NONE}

    train_dataset = pd.read_csv(
        "train/train.tsv", names=["Class", "Text"], **read_options
    )
    dev_0_dataset = pd.read_csv("dev-0/in.tsv", names=["Text"], **read_options)
    test_A_dataset = pd.read_csv("test-A/in.tsv", names=["Text"], **read_options)

    return train_dataset, dev_0_dataset, test_A_dataset
|
|
|
|
|
|
|
|
|
|
|
|
def text_to_vector(text, word2vec, vector_size):
    """Return the mean embedding of the known words in ``text``.

    The text is tokenised with ``gensim.utils.simple_preprocess``; the
    vectors of all tokens found in the embedding vocabulary are summed and
    divided by the number of such tokens.

    Args:
        text: Raw text to embed.  Non-string values (for example the NaN
            pandas produces for a missing field) yield the zero vector
            instead of raising inside the tokenizer.
        word2vec: Either a model that exposes its vectors through a ``wv``
            attribute, or the keyed-vectors object itself.
        vector_size: Length of the returned vector.  Presumably this has to
            match the dimensionality of the embeddings -- confirm against
            the loaded model.

    Returns:
        numpy.ndarray: Array of length ``vector_size``.  It is all zeros
        when no token of ``text`` is present in the vocabulary.
    """
    text_vector = np.zeros(vector_size)
    if not isinstance(text, str):
        return text_vector

    # Accept both a full model (``model.wv``) and bare keyed vectors, and
    # resolve the attribute once instead of on every loop iteration.
    vectors = getattr(word2vec, "wv", word2vec)

    word_count = 0
    for word in gensim.utils.simple_preprocess(text):
        if word in vectors:
            text_vector += vectors[word]
            word_count += 1

    # Only divide when at least one word was found, to avoid 0/0.
    if word_count > 0:
        text_vector /= word_count
    return text_vector
|
2024-05-15 20:16:07 +02:00
|
|
|
|
|
|
|
|
|
|
|
def _embed_texts(texts, word2vec, vector_size):
    """Build an array holding the averaged embedding of every text."""
    return np.array([text_to_vector(text, word2vec, vector_size) for text in texts])


def _predict_to_file(model, vectors, out_path):
    """Write the model's 0/1 predictions (threshold 0.5) to ``out_path``."""
    predictions = (model.predict(vectors) > 0.5).astype(int)
    pd.DataFrame(predictions).to_csv(out_path, index=False, header=False)


def main():
    """Train a small dense classifier on averaged embeddings and predict.

    Steps: read the three datasets, load the pre-trained embeddings, turn
    every text into an averaged vector, hold out 10% of the training data
    for validation, fit the network, and write the predictions for test-A
    and dev-0 to ``test-A/out.tsv`` and ``dev-0/out.tsv``.
    """
    train_dataset, dev_0_dataset, test_A_dataset = read_data()

    # Word2Vec parameters
    vector_size = 100

    # Loading the Word2Vec model
    word2vec = KeyedVectors.load("fasttext_100_3_polish.bin")

    # Convert text to vectors
    train_vectors = _embed_texts(train_dataset["Text"], word2vec, vector_size)
    dev_0_vectors = _embed_texts(dev_0_dataset["Text"], word2vec, vector_size)
    test_A_vectors = _embed_texts(test_A_dataset["Text"], word2vec, vector_size)

    # Hold out part of the training data; the fixed seed keeps the split
    # reproducible between runs.
    train_vectors, val_vectors, train_labels, val_labels = train_test_split(
        train_vectors, train_dataset["Class"], test_size=0.1, random_state=42
    )

    # Train a simple neural network
    model = tf.keras.Sequential(
        [
            tf.keras.layers.Input(shape=(vector_size,)),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            # Single sigmoid unit: the output is thresholded at 0.5 below.
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ]
    )

    model.compile(
        optimizer=Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    model.fit(
        train_vectors,
        train_labels,
        validation_data=(val_vectors, val_labels),
        epochs=30,
        batch_size=16,
    )

    # Predict on the test set and on the dev-0 set, saving each result
    _predict_to_file(model, test_A_vectors, "test-A/out.tsv")
    _predict_to_file(model, dev_0_vectors, "dev-0/out.tsv")
|
|
|
|
|
|
|
|
|
|
|
|
# Run the training/prediction pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|