import csv

import gensim.downloader
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from nltk import word_tokenize

# word_tokenize requires the punkt tokenizer models
nltk.download("punkt", quiet=True)


# Feed-forward neural network model
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function 1: input_dim --> hidden_dim
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # Non-linearity 1
        self.relu1 = nn.ReLU()
        # Linear function 2: hidden_dim --> hidden_dim
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        # Non-linearity 2
        self.relu2 = nn.ReLU()
        # Linear function 3 (readout): hidden_dim --> output_dim
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Linear function 1
        out = self.fc1(x)
        # Non-linearity 1
        out = self.relu1(out)
        # Linear function 2
        out = self.fc2(out)
        # Non-linearity 2
        out = self.relu2(out)
        # Linear function 3 (readout)
        out = self.fc3(out)
        # Sigmoid squashes the readout to a probability for BCELoss
        return torch.sigmoid(out)


col_names = ["content", "id", "label"]

# Loading dataset (on_bad_lines="skip" skips malformed rows; pandas >= 1.3)
train_set_features = pd.read_table(
    "train/in.tsv.xz",
    on_bad_lines="skip",
    quoting=csv.QUOTE_NONE,
    header=None,
    names=col_names[:2],
)
train_set_labels = pd.read_table(
    "train/expected.tsv",
    on_bad_lines="skip",
    quoting=csv.QUOTE_NONE,
    header=None,
    names=col_names[2:],
)
dev_set = pd.read_table(
    "dev-0/in.tsv.xz",
    on_bad_lines="skip",
    quoting=csv.QUOTE_NONE,
    header=None,
    names=col_names[:2],
)
test_set = pd.read_table(
    "test-A/in.tsv.xz",
    on_bad_lines="skip",
    quoting=csv.QUOTE_NONE,
    header=None,
    names=col_names[:2],
)

# Lowercase text
X_train = train_set_features["content"].str.lower()
y_train = train_set_labels["label"]
X_dev = dev_set["content"].str.lower()
X_test = test_set["content"].str.lower()

# Tokenize text with nltk
X_train = [word_tokenize(content) for content in X_train]
X_dev = [word_tokenize(content) for content in X_dev]
X_test = [word_tokenize(content) for content in X_test]

# Vectorize text: each document becomes the mean of the word2vec vectors
# of its in-vocabulary tokens (a zero vector if none are in the vocabulary)
word2vec = gensim.downloader.load("word2vec-google-news-300")


def document_vector(tokens):
    vectors = [word2vec[word] for word in tokens if word in word2vec]
    return np.mean(vectors or [np.zeros(300)], axis=0)


X_train = [document_vector(content) for content in X_train]
X_dev = [document_vector(content) for content in X_dev]
X_test = [document_vector(content) for content in X_test]

# Model config
input_dim = 300
hidden_dim = 600
output_dim = 1
batch_size = 10
epochs = 10

# Model init
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.BCELoss()

# Training loop
for epoch in range(epochs):
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = torch.tensor(np.stack(X_train[i : i + batch_size])).float()
        y = torch.tensor(
            y_train[i : i + batch_size].astype(np.float32).to_numpy()
        ).reshape(-1, 1)
        outputs = model(X)
        loss = criterion(outputs, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Making predictions for dev-0 and test-A
test_prediction = []
dev_prediction = []
model.eval()
with torch.no_grad():
    for i in range(0, len(X_test), batch_size):
        X = torch.tensor(np.stack(X_test[i : i + batch_size])).float()
        outputs = model(X)
        test_prediction += (outputs > 0.5).tolist()
    for i in range(0, len(X_dev), batch_size):
        X = torch.tensor(np.stack(X_dev[i : i + batch_size])).float()
        outputs = model(X)
        dev_prediction += (outputs > 0.5).tolist()

test_prediction = np.asarray(test_prediction, dtype=np.int32)
dev_prediction = np.asarray(dev_prediction, dtype=np.int32)
test_prediction.tofile("./test-A/out.tsv", sep="\n")
dev_prediction.tofile("./dev-0/out.tsv", sep="\n")
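
# Optional sanity check, a minimal sketch under one assumption: that the gold
# dev-0 labels live in "dev-0/expected.tsv" (mirroring train/expected.tsv
# above). That file may not be distributed with the dev set, hence the
# FileNotFoundError guard. It compares the predictions just written against
# the gold labels and prints a simple accuracy figure.
try:
    dev_expected = pd.read_table(
        "dev-0/expected.tsv",
        on_bad_lines="skip",
        quoting=csv.QUOTE_NONE,
        header=None,
        names=["label"],
    )
    accuracy = (
        dev_prediction.reshape(-1) == dev_expected["label"].to_numpy()
    ).mean()
    print(f"dev-0 accuracy: {accuracy:.4f}")
except FileNotFoundError:
    pass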