In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [2]:
def build_corpus(file_list):
    documents = []
    for file in file_list:
        with open(file, 'r', encoding="utf8") as f:
            for line in f:
                processed_line = simple_preprocess(line)
                documents.append(processed_line)
    return documents

In [3]:
def text_to_vector(text, model):
    tokens = simple_preprocess(text)
    word_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [4]:
def read_text_file(filepath):
    lines = []
    with open(filepath, 'r', encoding="utf8") as file:
        for line in file:
            lines.append(line.strip())
    return lines

In [5]:
def write_predictions_to_file(predictions, filepath):
    with open(filepath, 'w', encoding="utf8") as file:
        for prediction in predictions:
            file.write(f"{prediction[0]}\n")

In [6]:
documents = build_corpus(['dev-0/in.tsv', 'test-A/in.tsv'])
word2vec_model = Word2Vec(sentences=documents, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.save("word2vec.model")

dev_texts = read_text_file('dev-0/in.tsv')
test_texts = read_text_file('test-A/in.tsv')

dev_features = np.array([text_to_vector(text, word2vec_model) for text in dev_texts])
test_features = np.array([text_to_vector(text, word2vec_model) for text in test_texts])

dev_labels = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None).values.flatten()
X_train, X_valid, y_train, y_valid = train_test_split(dev_features, dev_labels, test_size=0.2, random_state=42)

neural_network = Sequential([
    Dense(64, activation='relu', input_shape=(100,)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

neural_network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

training_history = neural_network.fit(X_train, y_train, epochs=1000, batch_size=32, validation_data=(X_valid, y_valid))

dev_predictions_raw = neural_network.predict(dev_features)
test_predictions_raw = neural_network.predict(test_features)

dev_predictions = (dev_predictions_raw > 0.5).astype(int)
test_predictions = (test_predictions_raw > 0.5).astype(int)

write_predictions_to_file(dev_predictions, 'dev-0/out.tsv')
write_predictions_to_file(test_predictions, 'test-A/out.tsv')


Epoch 1/1000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6695 - loss: 0.5541 - val_accuracy: 0.8258 - val_loss: 0.3704
Epoch 2/1000
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8517 - loss: 0.3327 - val_accuracy: 0.8433 - val_loss: 0.3428
Epoch 3/1000
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8496 - loss: 0.3206 - val_accuracy: 0.8313 - val_loss: 0.3492
Epoch 4/1000
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8634 - loss: 0.2961 - val_accuracy: 0.8414 - val_loss: 0.3361
Epoch 5/1000
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8571 - loss: 0.3020 - val_accuracy: 0.8442 - val_loss: 0.3439
Epoch 6/1000
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8655 - loss: 0.2932 - val_accuracy: 0.8368 - val_loss: 0.3387
Epoch 7/1000
[1m137/137[0m 

In [7]:
dev_pred_labels = pd.read_csv('dev-0/out.tsv', header=None).values.flatten()
expected_labels = dev_labels

accuracy = accuracy_score(expected_labels, dev_pred_labels)
report = classification_report(expected_labels, dev_pred_labels)

print("=== Evaluation Results ===")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n")
print(report)
print("==========================\n")

=== Evaluation Results ===
Accuracy: 0.9364

Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      1983
           1       0.94      0.96      0.95      3469

    accuracy                           0.94      5452
   macro avg       0.93      0.93      0.93      5452
weighted avg       0.94      0.94      0.94      5452


