In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
def load_train_data(file_path):
    labels = []
    texts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split('\t', 1)
            if len(parts) == 2:
                label, text = parts
                labels.append(int(label))
                texts.append(text.strip())
    return pd.DataFrame({'label': labels, 'text': texts})

In [3]:
def load_data(file_path):
    texts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            texts.append(line.strip())
    return pd.DataFrame({'text': texts})

In [4]:
def load_labels(file_path):
    labels = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            labels.append(int(line.strip()))
    return pd.DataFrame({'label': labels})

In [5]:
def get_average_word2vec(tokens_list, model, k=100):
    vec = np.zeros(k)
    count = 0
    for word in tokens_list:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    if count != 0:
        vec /= count
    return vec

In [6]:
def preprocess_data(file_path, word2vec_model):
    data = load_data(file_path)
    X = np.array([get_average_word2vec(text.split(), word2vec_model) for text in data['text']])
    return X

In [7]:
train_data = load_train_data('train/train.tsv')
sentences = [text.split() for text in train_data['text']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [8]:
X_train = np.array([get_average_word2vec(text.split(), word2vec_model) for text in train_data['text']])
y_train = np.array(train_data['label'])
X_dev = preprocess_data('dev-0/in.tsv', word2vec_model)
dev_labels = load_labels('dev-0/expected.tsv')

In [9]:
model = Sequential()
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_dev, dev_labels))

Epoch 1/100
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 880us/step - accuracy: 0.9494 - loss: 0.1326 - val_accuracy: 0.9718 - val_loss: 0.0791
Epoch 2/100
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 879us/step - accuracy: 0.9693 - loss: 0.0806 - val_accuracy: 0.9714 - val_loss: 0.0764
Epoch 3/100
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 874us/step - accuracy: 0.9710 - loss: 0.0749 - val_accuracy: 0.9727 - val_loss: 0.0743
Epoch 4/100
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 872us/step - accuracy: 0.9720 - loss: 0.0740 - val_accuracy: 0.9725 - val_loss: 0.0725
Epoch 5/100
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 854us/step - accuracy: 0.9723 - loss: 0.0718 - val_accuracy: 0.9732 - val_loss: 0.0709
Epoch 6/100
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 855us/step - accuracy: 0.9737 - loss: 0.0687 - val_accuracy: 0.9685 - val_loss: 0.092

<keras.src.callbacks.history.History at 0x2a5e3e7da50>

In [10]:
loss, accuracy = model.evaluate(X_dev, dev_labels)
print(f"Accuracy: {accuracy}")


[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 660us/step - accuracy: 0.9641 - loss: 0.1876
Accuracy: 0.9658840894699097


In [11]:
dev0_pred = model.predict(X_dev)
dev0_pred = (dev0_pred > 0.5).astype(int)

[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 794us/step


In [12]:
dev0_pred = pd.DataFrame(dev0_pred)
dev0_pred.to_csv("dev-0/out.tsv", index=False, header=False)

In [13]:
X_testA = preprocess_data('test-A/in.tsv', word2vec_model)
testA_pred = model.predict(X_testA)
testA_pred = (testA_pred > 0.5).astype(int)

[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 630us/step


In [14]:
testA_pred = pd.DataFrame(testA_pred)
testA_pred.to_csv("test-A/out.tsv", index=False, header=False)