# %%
# Binary text classifier: documents are represented as the mean of their
# pre-trained word vectors and fed to a small feed-forward network.
import numpy as np
import gensim
import torch
import pandas as pd
from gensim.models import Word2Vec
from gensim import downloader
from sklearn.feature_extraction.text import TfidfVectorizer

# %%
BATCH_SIZE = 10   # documents per optimisation / inference step
EPOCHS = 100      # full passes over the training data
# Input width of the network and length of the zero fallback vector.
# NOTE(review): presumably has to equal the dimensionality of the loaded
# embeddings ("glove-twitter-200") - confirm if the embedding model changes.
# The misspelled name is kept because other cells may reference it.
FEAUTERES = 200


# %%
class NeuralNetworkModel(torch.nn.Module):
    """Feed-forward binary classifier: FEAUTERES -> 500 -> 1 with sigmoid output."""

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(FEAUTERES, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        """Return sigmoid outputs in (0, 1), one column per input row."""
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x


# %%
word2vec = downloader.load("glove-twitter-200")


# %%
def _read_documents(path):
    """Read a text file into an array of stripped, lower-cased lines."""
    with open(path, 'r', encoding='utf8') as f:
        return np.array([line.strip().lower() for line in f])


def readData(fileName):
    """Load one dataset directory.

    Args:
        fileName: Directory containing ``in.tsv`` (one document per line) and
            ``expected.tsv`` (one integer label per line).

    Returns:
        Tuple ``(X, y)``: array of stripped, lower-cased documents and array
        of integer labels.

    Raises:
        ValueError: If a line of ``expected.tsv`` is not an integer.
    """
    X = _read_documents(f'{fileName}/in.tsv')
    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:
        y = np.array([int(line.strip()) for line in f])
    return X, y


def embed_documents(docs, embeddings, dim=FEAUTERES):
    """Represent each document as the mean of its known word vectors.

    Args:
        docs: Iterable of whitespace-tokenisable strings.
        embeddings: Mapping-like object supporting ``token in embeddings`` and
            ``embeddings[token]``.
        dim: Length of the zero vector used for documents in which no token
            is found in ``embeddings``.

    Returns:
        List with one 1-D vector per document.
    """
    vectors = []
    for doc in docs:
        tokens = (token.lower() for token in doc.split())
        known = [embeddings[token] for token in tokens if token in embeddings]
        # A document without any known token falls back to an all-zero vector
        # so that every document still yields a fixed-size input.
        vectors.append(np.mean(known, axis=0) if known else np.zeros(dim))
    return vectors


# %%
X_file, y_file = readData('dev-0')

# %%
x_train_w2v = embed_documents(X_file, word2vec)


# %%
def train_model(X_file, y_file):
    """Train a fresh NeuralNetworkModel with BCE loss and the ASGD optimiser.

    Prints the epoch index, then the mean loss and accuracy of that epoch.

    Args:
        X_file: Sequence of equally sized feature vectors, one per sample.
        y_file: Sequence of 0/1 labels aligned with ``X_file``.

    Returns:
        The trained model.
    """
    model = NeuralNetworkModel()
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)

    # Convert once up front instead of rebuilding arrays/tensors for every
    # batch of every epoch; batches below are plain slices of these tensors.
    features = torch.tensor(np.asarray(X_file, dtype=np.float32))
    targets = torch.tensor(np.asarray(y_file, dtype=np.float32)).reshape(-1, 1)
    n_samples = targets.shape[0]

    for epoch in range(EPOCHS):
        print(epoch)
        loss_score = 0
        acc_score = 0
        items_total = 0
        for start in range(0, n_samples, BATCH_SIZE):
            x = features[start:start + BATCH_SIZE]
            y = targets[start:start + BATCH_SIZE]
            y_pred = model(x)
            # Accuracy is measured with the pre-update weights of this step.
            acc_score += torch.sum((y_pred > 0.5) == y).item()
            items_total += y.shape[0]
            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            # BCELoss averages over the batch; re-weight by batch size so the
            # epoch figure is a per-sample mean even with a short last batch.
            loss_score += loss.item() * y.shape[0]
        print((loss_score / items_total), (acc_score / items_total))
    return model


# %%
def predict(model, x_file):
    """Classify feature vectors in batches without tracking gradients.

    Args:
        model: Callable mapping a float32 tensor of feature rows to outputs
            that are thresholded at 0.5.
        x_file: Sequence of equally sized feature vectors.

    Returns:
        List with one single-element boolean tensor per input row
        (``True`` where the model output exceeds 0.5).
    """
    features = torch.tensor(np.asarray(x_file, dtype=np.float32))
    y_dev = []
    with torch.no_grad():
        for start in range(0, features.shape[0], BATCH_SIZE):
            outputs = model(features[start:start + BATCH_SIZE])
            # Iterating the (batch, 1) mask yields one 1-element row per item.
            y_dev.extend(outputs > 0.5)
    return y_dev


# %%
def wrtieToFile(fileName, y_file):
    """Write predictions as 0/1, one per line, to ``<fileName>/out.tsv``.

    Args:
        fileName: Existing directory that receives ``out.tsv``.
        y_file: Iterable of single-element boolean containers (for example
            the tensors returned by ``predict``); element ``[0]`` is used.
    """
    # Convert the value itself rather than parsing the tensor's repr, which
    # breaks as soon as the repr carries extra fields (e.g. a device tag).
    # All labels are converted before the file is opened, so a bad value
    # does not leave a truncated output file behind.
    labels = [int(bool(y[0])) for y in y_file]
    with open(f'{fileName}/out.tsv', 'w', encoding='utf8') as f:
        f.writelines(f'{label}\n' for label in labels)


# %%
model = train_model(x_train_w2v, y_file)

# %%
y_dev = predict(model, x_train_w2v)

# %%
wrtieToFile("dev-0", y_dev)

# %%
X = _read_documents('test-A/in.tsv')

# %%
x_train_w2v = embed_documents(X, word2vec)

# %%
y_dev = predict(model, x_train_w2v)

# %%
wrtieToFile("test-A", y_dev)