### Importing libraries

In [4]:
# Data manipulation
import numpy as np
import pandas as pd

# Word2vec
from gensim.models import KeyedVectors

# NLP
import spacy

# Neural network
import torch
import torch.nn as nn
import torch.optim as optim

# Metrics
from sklearn.metrics import accuracy_score

## Load word2vec model (100 dimensions)

In [5]:
word2vec = KeyedVectors.load('word2vec/word2vec_100_3_polish.bin')

## Load spacy model

In [6]:
nlp = spacy.load('pl_core_news_sm')

## Neural network model

In [7]:
class NeuralNetwork(nn.Module):
 def __init__(self, input_size, hidden_size):
 super(NeuralNetwork, self).__init__()
 
 self.fc1 = nn.Linear(input_size, hidden_size)
 self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
 self.fc3 = nn.Linear(hidden_size // 2, hidden_size // 4)
 self.fc4 = nn.Linear(hidden_size // 4, hidden_size // 8)
 self.fc5 = nn.Linear(hidden_size // 8, 1)
 
 self.relu = nn.ReLU()
 self.sigmoid = nn.Sigmoid()
 
 def forward(self, x):
 x = self.fc1(x)
 x = self.relu(x)
 x = self.fc2(x)
 x = self.relu(x)
 x = self.fc3(x)
 x = self.relu(x)
 x = self.fc4(x)
 x = self.relu(x)
 x = self.fc5(x)
 x = self.sigmoid(x)
 return x

## Load and preprocess data

In [8]:
# Load data
df_train = pd.read_csv('train/train.tsv', delimiter='\t', header=None)
df_test = pd.read_csv('test-A/in.tsv', delimiter='\t', header=None)
df_dev = pd.read_csv('dev-0/in.tsv', delimiter='\t', header=None)
df_dev_expected = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None)

# Drop invalid columns
df_train.drop(columns=2, inplace=True)
df_test.drop(columns=1, inplace=True)

# Rename columns
df_train.columns = ['label', 'sentence']
df_test.columns = ['sentence']
df_dev.columns = ['sentence']
df_dev_expected.columns = ['label']

# Convert sentences to lowercase
df_train['sentence'] = df_train['sentence'].apply(lambda x: x.lower())
df_test['sentence'] = df_test['sentence'].apply(lambda x: x.lower())
df_dev['sentence'] = df_dev['sentence'].apply(lambda x: x.lower())

## Sentence representation with word2vec

In [9]:
def get_sentence_representation(sentence):
 doc = nlp(sentence.lower())
 return np.sum([word2vec[token.text] for token in doc if token.text in word2vec], axis=0)

In [None]:
# Train data
X_train = np.array([get_sentence_representation(sentence) for sentence in df_train['sentence']])
y_train = df_train['label'].values

# Dev data
X_dev = np.array([get_sentence_representation(sentence) for sentence in df_dev['sentence']])
y_dev = df_dev_expected['label'].values

# Test data
X_test = np.array([get_sentence_representation(sentence) for sentence in df_test['sentence']])

In [10]:
# Load previously saved data (word2vec representation of sentences)
X_train = np.load('X_train.npy')
y_train = np.load('y_train.npy')

X_dev = np.load('X_dev.npy')
y_dev = np.load('y_dev.npy')

X_test = np.load('X_test.npy')

In [16]:
model = NeuralNetwork(100, 256)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)

X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float().view(-1, 1)

X_dev_tensor = torch.from_numpy(X_dev).float()
y_dev_tensor = torch.from_numpy(y_dev).float().view(-1, 1)

model.train()

best_epoch = 0
max_accuracy = 0.0

for epoch in range(500):
 
 optimizer.zero_grad()
 y_pred = model(X_train_tensor)
 loss = criterion(y_pred, y_train_tensor)
 loss.backward()
 optimizer.step()
 
 # dev loss
 with torch.no_grad():
 y_pred = model(X_dev_tensor)
 loss = criterion(y_pred, y_dev_tensor)
 accuracy = accuracy_score(y_dev_tensor, np.where(y_pred > 0.5, 1, 0))

 if max_accuracy < accuracy:
 best_epoch = epoch
 max_accuracy = accuracy
 
 print(f"Epoch {epoch} - loss: {loss}")
 print(f"Epoch {epoch} - accuracy: {accuracy}")
 
model.eval()

with torch.no_grad():
 y_pred = model(X_dev_tensor)
 y_pred = np.where(y_pred > 0.5, 1, 0)
 accuracy = accuracy_score(y_dev_tensor, y_pred)
 print(accuracy)

print(f"Best epoch: {best_epoch}")
print(f"Max accuracy: {max_accuracy}")

Epoch 0 - loss: 0.9116247892379761
Epoch 0 - accuracy: 0.6362802641232576
Epoch 1 - loss: 0.7140407562255859
Epoch 1 - accuracy: 0.6362802641232576
Epoch 2 - loss: 0.618240237236023
Epoch 2 - accuracy: 0.6366471019809244
Epoch 3 - loss: 0.6239327788352966
Epoch 3 - accuracy: 0.6977256052824652
Epoch 4 - loss: 0.6335155367851257
Epoch 4 - accuracy: 0.7316581071166545
Epoch 5 - loss: 0.6156240701675415
Epoch 5 - accuracy: 0.7279897285399853
Epoch 6 - loss: 0.5953847169876099
Epoch 6 - accuracy: 0.694424064563463
Epoch 7 - loss: 0.5810463428497314
Epoch 7 - accuracy: 0.6766324284666178
Epoch 8 - loss: 0.5640420317649841
Epoch 8 - accuracy: 0.6856199559794571
Epoch 9 - loss: 0.5385629534721375
Epoch 9 - accuracy: 0.733125458547322
Epoch 10 - loss: 0.5121918320655823
Epoch 10 - accuracy: 0.7870506236243581
Epoch 11 - loss: 0.49032482504844666
Epoch 11 - accuracy: 0.80997798972854
Epoch 12 - loss: 0.45911359786987305
Epoch 12 - accuracy: 0.8105282465150404
Epoch 13 - loss: 0.4357451200485229

In [18]:
best_model = NeuralNetwork(100, 256)

criterion = nn.BCELoss()
optimizer = optim.Adam(best_model.parameters(), lr=0.001, weight_decay=0.001)

X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float().view(-1, 1)

X_dev_tensor = torch.from_numpy(X_dev).float()
y_dev_tensor = torch.from_numpy(y_dev).float().view(-1, 1)

X_test_tensor = torch.from_numpy(X_test).float()

best_model.train()

best_epoch = 471

for epoch in range(best_epoch + 1):
 
 optimizer.zero_grad()
 y_pred = best_model(X_train_tensor)
 loss = criterion(y_pred, y_train_tensor)
 loss.backward()
 optimizer.step()
 
 # dev loss
 with torch.no_grad():
 y_pred = best_model(X_dev_tensor)
 loss = criterion(y_pred, y_dev_tensor)
 accuracy = accuracy_score(y_dev_tensor, np.where(y_pred > 0.5, 1, 0))
 
 print(f"Epoch {epoch} - loss: {loss}")
 print(f"Epoch {epoch} - accuracy: {accuracy}")
 
model.eval()

Epoch 0 - loss: 0.6600890755653381
Epoch 0 - accuracy: 0.636463683052091
Epoch 1 - loss: 0.6251927614212036
Epoch 1 - accuracy: 0.7168011738811445
Epoch 2 - loss: 0.6044067740440369
Epoch 2 - accuracy: 0.7454145267791636
Epoch 3 - loss: 0.5796983242034912
Epoch 3 - accuracy: 0.6925898752751284
Epoch 4 - loss: 0.5561812520027161
Epoch 4 - accuracy: 0.7035950110051358
Epoch 5 - loss: 0.5200029015541077
Epoch 5 - accuracy: 0.8017241379310345
Epoch 6 - loss: 0.4880651533603668
Epoch 6 - accuracy: 0.8242846661775495
Epoch 7 - loss: 0.4609140455722809
Epoch 7 - accuracy: 0.7848495964783566
Epoch 8 - loss: 0.4216255843639374
Epoch 8 - accuracy: 0.8431768158473955
Epoch 9 - loss: 0.39770105481147766
Epoch 9 - accuracy: 0.8464783565663977
Epoch 10 - loss: 0.38234901428222656
Epoch 10 - accuracy: 0.8275862068965517
Epoch 11 - loss: 0.3460651636123657
Epoch 11 - accuracy: 0.8617021276595744
Epoch 12 - loss: 0.3165454864501953
Epoch 12 - accuracy: 0.8769258987527513
Epoch 13 - loss: 0.310476511716

In [30]:
with torch.no_grad():
 y_pred_dev = best_model(X_dev_tensor)
 y_pred_dev = np.where(y_pred_dev > 0.5, 1, 0)
 df_dev_out = pd.DataFrame(y_pred_dev)
 df_dev_out.to_csv('dev-0/out.tsv', sep = '\t', index = None)
 accuracy_dev = accuracy_score(y_dev_tensor, y_pred_dev)
 print(f"Dev accuracy: {accuracy_dev}")

 y_pred_test = best_model(X_test_tensor)
 y_pred_test = np.where(y_pred_test > 0.5, 1, 0)
 df_test_out = pd.DataFrame(y_pred_test)
 df_test_out.to_csv('test-A/out.tsv', sep = '\t', index = None)

Dev accuracy: 0.9512105649303008
