from gensim.models import KeyedVectors
import csv
import nltk
import numpy as np
import pandas as pd
import torch

# Load the data files (on_bad_lines replaces the deprecated error_bad_lines)
train = pd.read_table('train/train.tsv', on_bad_lines='skip',
                      sep='\t', quoting=csv.QUOTE_NONE, header=None)
x_dev = pd.read_table('dev-0/in.tsv', on_bad_lines='skip',
                      sep='\t', quoting=csv.QUOTE_NONE, header=None)
y_dev = pd.read_table('dev-0/expected.tsv', on_bad_lines='skip',
                      sep='\t', quoting=csv.QUOTE_NONE, header=None)
x_test = pd.read_table('test-A/in.tsv', on_bad_lines='skip',
                       sep='\t', quoting=csv.QUOTE_NONE, header=None)

# Split the data into x and y
x_train = train[1].values
y_train = train[0].values
x_dev = x_dev[0].values
x_test = x_test[0].values

# Needed only once
# nltk.download('punkt')

# Tokenization: lowercase each token and keep only the alphabetic ones
def tokenize(data):
    return [[token.lower() for token in nltk.word_tokenize(doc) if token.isalpha()]
            for doc in data]


x_train_tokenized = tokenize(x_train)
x_dev_tokenized = tokenize(x_dev)
x_test_tokenized = tokenize(x_test)

# Pretrained model from wiki-forms-all-100-skipg-ns; run the conversion only on
# the first try:
# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
# word2vec = KeyedVectors.load_word2vec_format(
#     'wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
# word2vec.save("word2vec.bin")
word2vec = KeyedVectors.load("word2vec.bin")


# Represent each document as the mean of its in-vocabulary word vectors,
# falling back to a zero vector when no token is in the vocabulary.
# Note: this must run on the tokenized documents; iterating over the raw
# strings would yield single characters instead of words.
def document_vector(doc):
    vectors = [word2vec[word] for word in doc if word in word2vec]
    return np.mean(vectors or [np.zeros(100)], axis=0)


x_train_vec = np.array([document_vector(doc) for doc in x_train_tokenized],
                       dtype=np.float32)
x_dev_vec = np.array([document_vector(doc) for doc in x_dev_tokenized],
                     dtype=np.float32)
x_test_vec = np.array([document_vector(doc) for doc in x_test_tokenized],
                      dtype=np.float32)


# A small feed-forward classifier: 100-dim document vector -> 200 hidden
# units -> a single sigmoid output
class NNModel(torch.nn.Module):
    def __init__(self):
        super(NNModel, self).__init__()
        self.fc1 = torch.nn.Linear(100, 200)
        self.fc2 = torch.nn.Linear(200, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x


model = NNModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
batch_size = 12

for epoch in range(10):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = torch.tensor(x_train_vec[i:i + batch_size])
        Y = torch.tensor(y_train[i:i + batch_size].astype(np.float32)).reshape(-1, 1)
        Y_predictions = model(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]
        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()
        loss_score += loss.item() * Y.shape[0]
    print(f'epoch {epoch}: loss {loss_score / items_total:.4f}, '
          f'accuracy {acc_score / items_total:.4f}')

# Generate predictions
model.eval()
with torch.no_grad():
    y_pred_dev = model(torch.tensor(x_dev_vec))
y_pred_dev = y_pred_dev.cpu().numpy()
y_pred_dev = (y_pred_dev > 0.5)
y_pred_dev = np.asarray(y_pred_dev, dtype=np.int32)
y_pred_dev.tofile('dev-0/out.tsv', sep='\n')
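# Sanity check (added sketch): dev-0/expected.tsv is loaded into y_dev above
# but never used, so we can score the dev predictions against it. This assumes
# the expected labels are 0/1 integers in column 0; sklearn's accuracy_score
# is one convenient way to compare them.
from sklearn.metrics import accuracy_score
print('dev accuracy:', accuracy_score(y_dev[0].values, y_pred_dev.ravel()))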
with torch.no_grad():
    y_pred_test = model(torch.tensor(x_test_vec))  # was x_dev_vec: a bug
y_pred_test = y_pred_test.cpu().numpy()
y_pred_test = (y_pred_test > 0.5)
y_pred_test = np.asarray(y_pred_test, dtype=np.int32)
y_pred_test.tofile('test-A/out.tsv', sep='\n')
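# Optional (added sketch): persist the trained weights so predictions can be
# regenerated without retraining. The filename 'model.pt' is an assumption,
# not part of the original pipeline.
torch.save(model.state_dict(), 'model.pt')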