In [1]:
import numpy as np
import gensim
import torch
import pandas as pd
from gensim.models import Word2Vec
from gensim import downloader
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Number of samples per mini-batch; read by both the training and prediction loops.
BATCH_SIZE = 10
# Number of full passes over the training data made by train_model.
EPOCHS = 100
# Length of each document feature vector: used as the input width of the first
# Linear layer and as the size of the zero vector for documents with no known words.
# NOTE(review): the name is a misspelling of FEATURES; it is left unchanged here
# because other cells reference it by this exact name.
FEAUTERES = 200


In [3]:
class NeuralNetworkModel(torch.nn.Module):
    """Feed-forward binary classifier: Linear -> ReLU -> Linear -> sigmoid.

    Args:
        in_features: Length of each input vector. ``None`` (the default) falls
            back to the notebook-level ``FEAUTERES`` constant, which is the
            value the layer was previously hard-coded to.
        hidden_size: Width of the single hidden layer. Defaults to 500.

    Calling ``NeuralNetworkModel()`` with no arguments builds exactly the same
    architecture as before.
    """

    def __init__(self, in_features=None, hidden_size=500):
        super().__init__()
        if in_features is None:
            # Looked up at construction time (not at class-definition time) so
            # the class can be defined before, or independently of, the config cell.
            in_features = FEAUTERES
        self.fc1 = torch.nn.Linear(in_features, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one sigmoid-squashed score per input row.

        Args:
            x: Float tensor whose last dimension equals ``in_features``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1; values are sigmoid outputs.
        """
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))

In [4]:
# Fetch the pretrained "glove-twitter-200" vectors through gensim's downloader.
# NOTE(review): despite the variable name, the requested model id is a GloVe one;
# presumably its vectors are 200-dimensional to match FEAUTERES -- TODO confirm.
word2vec = downloader.load("glove-twitter-200")

In [5]:
def readData(fileName):
    """Load the documents and labels stored in the directory ``fileName``.

    Args:
        fileName: Path of a directory containing ``in.tsv`` (one document per
            line) and ``expected.tsv`` (one integer label per line).

    Returns:
        A pair ``(X, y)`` of numpy arrays: ``X`` holds each line of ``in.tsv``
        stripped of surrounding whitespace and lower-cased; ``y`` holds each
        line of ``expected.tsv`` parsed as ``int``.

    Raises:
        ValueError: if a line of ``expected.tsv`` is not an integer.
    """
    with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as documents_file:
        documents = [line.strip().lower() for line in documents_file]
    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as labels_file:
        labels = [int(line.strip()) for line in labels_file]
    return np.array(documents), np.array(labels)

In [6]:
# Load the dev-0 split: X_file holds the lower-cased lines of dev-0/in.tsv and
# y_file the integer labels from dev-0/expected.tsv.
X_file,y_file = readData('dev-0')

In [7]:
# One feature vector per dev-0 document: the element-wise mean of the embeddings
# of its whitespace-separated, lower-cased tokens that exist in the vocabulary.
# A document with no known token falls back to a single all-zero vector.
x_train_w2v = [
    np.mean(
        [word2vec[token] for token in map(str.lower, doc.split()) if token in word2vec]
        or [np.zeros(FEAUTERES)],
        axis=0,
    )
    for doc in X_file
]

In [8]:
def train_model(X_file, y_file, epochs=None, batch_size=None, lr=0.05, model=None):
    """Train a binary classifier with mini-batch ASGD and binary cross-entropy.

    Batches are taken in order (no shuffling). After every epoch the epoch
    index is printed, followed by a line with the sample-weighted mean loss and
    the accuracy at a 0.5 threshold, both accumulated over that epoch's batches.

    Args:
        X_file: Sequence of equal-length feature vectors (list of arrays or a
            2-D array); converted to float32.
        y_file: Sequence of 0/1 labels, one per row of ``X_file``.
        epochs: Number of passes over the data. ``None`` uses the notebook
            constant ``EPOCHS``.
        batch_size: Samples per optimisation step. ``None`` uses the notebook
            constant ``BATCH_SIZE``.
        lr: Learning rate passed to ``torch.optim.ASGD``.
        model: Module to train in place; it must map a float tensor of feature
            rows to one probability per row. ``None`` creates a fresh
            ``NeuralNetworkModel()``.

    Returns:
        The trained model (the same object as ``model`` when one was given).

    Raises:
        ValueError: if the training set is empty or ``X_file`` and ``y_file``
            have different lengths.
    """
    if epochs is None:
        epochs = EPOCHS
    if batch_size is None:
        batch_size = BATCH_SIZE

    # Convert the whole data set once; the loops below only slice these tensors
    # instead of rebuilding an ndarray and a tensor for every batch of every epoch.
    features = torch.tensor(np.asarray(X_file, dtype=np.float32))
    targets = torch.tensor(np.asarray(y_file, dtype=np.float32)).reshape(-1, 1)

    n_samples = targets.shape[0]
    if n_samples == 0:
        # Without this guard the per-epoch averages below divide by zero.
        raise ValueError("train_model needs at least one training sample")
    if features.shape[0] != n_samples:
        raise ValueError(
            f"X_file has {features.shape[0]} rows but y_file has {n_samples} labels"
        )

    if model is None:
        model = NeuralNetworkModel()

    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.ASGD(model.parameters(), lr=lr)
    for epoch in range(epochs):
        print(epoch)
        loss_score = 0
        acc_score = 0
        items_total = 0
        for start in range(0, n_samples, batch_size):
            x = features[start:start + batch_size]
            y = targets[start:start + batch_size]
            y_pred = model(x)
            # Accuracy is measured on the predictions made before this step's update.
            acc_score += torch.sum((y_pred > 0.5) == y).item()
            items_total += y.shape[0]

            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            # BCELoss returns the batch mean; weight it by batch size so a
            # shorter final batch does not skew the epoch average.
            loss_score += loss.item() * y.shape[0]

        print((loss_score / items_total), (acc_score / items_total))
    return model

In [9]:
def predict(model,x_file):
    """Run ``model`` over ``x_file`` in batches and threshold its outputs at 0.5.

    Args:
        model: Callable module mapping a float32 tensor of feature rows to
            scores.
        x_file: Sequence of feature vectors; sliced ``BATCH_SIZE`` rows at a
            time.

    Returns:
        A flat list built by extending with each batch's boolean
        ``outputs > 0.5`` tensor, i.e. one tensor per row of the model output.
        With a model whose output has a trailing dimension of 1, each entry is
        a one-element boolean tensor.
    """
    predictions = []
    # Inference only: no gradients are recorded.
    with torch.no_grad():
        for offset in range(0, len(x_file), BATCH_SIZE):
            batch = np.array(x_file[offset:offset + BATCH_SIZE]).astype(np.float32)
            scores = model(torch.tensor(batch))
            # Extending with a tensor appends its rows one by one.
            predictions.extend(scores > 0.5)
    return predictions
 

In [10]:
def wrtieToFile(fileName,y_file):
    """Write one 0/1 label per line to ``<fileName>/out.tsv``.

    Args:
        fileName: Path of an existing directory; ``out.tsv`` inside it is
            created or overwritten.
        y_file: Iterable of indexable predictions (one-element tensors, arrays
            or lists). The first element of each entry is interpreted as a
            truth value.

    The label is read with ``bool()`` rather than by parsing the text form of
    the tensor: string parsing silently produced 0 for any representation that
    was not exactly ``tensor(True)`` and failed on non-tensor values.
    """
    # Convert everything before opening the file so a bad entry cannot leave a
    # truncated out.tsv behind.
    labels = [int(bool(y[0])) for y in y_file]
    with open(f'{fileName}/out.tsv','w',encoding='utf8') as f:
        f.writelines(f'{label}\n' for label in labels)

In [11]:
# Train the classifier on the averaged-embedding vectors and labels of dev-0.
# train_model prints the epoch index and that epoch's mean loss and accuracy.
model = train_model(x_train_w2v,y_file)

0
0.6414709375416563 0.6464339908952959
1
0.6118579905971953 0.6589529590288316
2
0.5930351529140393 0.677731411229135
3
0.5807589731138194 0.6936646433990895
4
0.5711128521026628 0.7031487101669196
5
0.5637358135638451 0.7065629742033384
6
0.5573145605239321 0.710546282245827
7
0.5521481898931252 0.715288315629742
8
0.5475104518053836 0.7181335356600911
9
0.5430893454028008 0.7202200303490136
10
0.5395108298066443 0.7236342943854325
11
0.5361589408495177 0.7257207890743551
12
0.53314527610885 0.7270485584218513
13
0.5298747769267226 0.7297040971168437
14
0.5269876997833096 0.7319802731411229
15
0.5245049590914763 0.7336874051593323
16
0.5220209190930057 0.7363429438543247
17
0.5203242429527871 0.7365326251896813
18
0.5182899421417297 0.737670713201821
19
0.5155506848000069 0.7401365705614568
20
0.5131794015095429 0.7403262518968133
21
0.5113656374375719 0.7412746585735963
22
0.5092821710139558 0.7420333839150227
23
0.5067137854063547 0.7441198786039454
24
0.5047900934558085 0.74525796

In [12]:
# Predict on the dev-0 vectors.
# NOTE(review): assuming the cells ran in order, these are the same vectors the
# model was just trained on, so the predictions are not a held-out estimate.
y_dev=predict(model,x_train_w2v)

In [13]:
# Persist the dev-0 predictions as dev-0/out.tsv (one 0/1 label per line).
wrtieToFile("dev-0",y_dev)

In [15]:
# Read the test-A documents: each line of test-A/in.tsv, stripped and lower-cased.
# Only in.tsv is read here; no labels are loaded for this split.
with open(f'test-A/in.tsv', 'r', encoding='utf8') as f:
 X = np.array([x.strip().lower() for x in f.readlines()])

In [16]:
# One feature vector per test-A document: the element-wise mean of the embeddings
# of its whitespace-separated, lower-cased tokens that exist in the vocabulary,
# or a single all-zero vector when none of them is known.
# NOTE(review): this rebinds x_train_w2v, which from here on holds test-A
# features rather than training features.
x_train_w2v = [
    np.mean(
        [word2vec[token] for token in map(str.lower, doc.split()) if token in word2vec]
        or [np.zeros(FEAUTERES)],
        axis=0,
    )
    for doc in X
]

In [17]:
# Predict on the test-A vectors.
# NOTE(review): y_dev is rebound here and now holds test-A predictions, not dev-0 ones.
y_dev=predict(model,x_train_w2v)

In [18]:
# Persist the test-A predictions as test-A/out.tsv (one 0/1 label per line).
wrtieToFile("test-A",y_dev)