import numpy as np
import pandas as pd
import torch
import gensim  # used by the word2vec alternative sketched at the end of the file
import gensim.downloader as api
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import accuracy_score


def load_train_data():
    data = pd.read_csv("train/in.tsv.xz", sep='\t', names=['text', 'id'], nrows=30000)
    data = data.drop(columns=['id'])
    labels_df = pd.read_csv("train/expected.tsv", sep='\t', names=['label'], nrows=30000)
    labels = labels_df['label'].values
    return data, labels


def load_test_data():
    data = pd.read_csv("test-A/in.tsv.xz", sep='\t', names=['text', 'id'])
    data = data.drop(columns=['id'])
    return data


def load_dev_data():
    data = pd.read_csv("dev-0/in.tsv.xz", sep='\t', names=['text', 'id'])
    data = data.drop(columns=['id'])
    labels_df = pd.read_csv("dev-0/expected.tsv", sep='\t', names=['label'])
    labels = labels_df['label'].values
    return data, labels


class NeuralNetworkModel(torch.nn.Module):
    """Single-hidden-layer binary classifier with a sigmoid output."""

    def __init__(self, features):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(features, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x


# Alternative feature extraction: average word vectors per document
# (a runnable, self-contained sketch appears at the end of this file).
# def tokenize(doc):
#     return list(gensim.utils.tokenize(doc, lowercase=True))
#
# def document_vector(doc):
#     """Average the word vectors of a document, skipping out-of-vocabulary words."""
#     doc = [word for word in doc if word in w2v.key_to_index]
#     return np.mean(w2v[doc], axis=0)


def train_model(model, X, Y, criterion, optimizer, batch_size=5, epoch_amount=5):
    """Train with mini-batch SGD; X is a scipy sparse matrix, Y a 0/1 label array."""
    for epoch in range(epoch_amount):
        loss_score = 0
        acc_score = 0
        items_total = 0
        model.train()
        for i in range(0, Y.shape[0], batch_size):
            # Densify only the current mini-batch to keep memory usage low.
            X_step = torch.tensor(X[i:i + batch_size].astype(np.float32).toarray())
            Y_step = torch.tensor(Y[i:i + batch_size].astype(np.float32)).reshape(-1, 1)

            Y_predictions = model(X_step)
            acc_score += torch.sum((Y_predictions > 0.5).float() == Y_step).item()
            items_total += Y_step.shape[0]

            optimizer.zero_grad()
            loss = criterion(Y_predictions, Y_step)
            loss.backward()
            optimizer.step()

            loss_score += loss.item() * Y_step.shape[0]
        print(f"epoch {epoch + 1}/{epoch_amount}: "
              f"loss {loss_score / items_total:.4f}, "
              f"accuracy {acc_score / items_total:.4f}")
    return loss_score / items_total, acc_score / items_total


def test_model(model, X):
    """Return hard 0/1 predictions for a sparse feature matrix X."""
    model.eval()
    X = torch.tensor(X.astype(np.float32).toarray())
    with torch.no_grad():
        Y_raw = model(X)
    return [1 if y > 0.5 else 0 for y in Y_raw.numpy()]


if __name__ == "__main__":
    # Load and prepare the training data.
    print("Loading data...")
    data, Y = load_train_data()

    FEATURES = 20000
    BATCH = 5
    EPOCHS = 5

    # Vectorize the text. HashingVectorizer is stateless, so the same
    # transform applies unchanged to the dev and test data below.
    print("Vectorizing text data...")
    vectorizer = HashingVectorizer(n_features=FEATURES)
    X = vectorizer.fit_transform(data['text'].values)

    # Alternative: word2vec/fastText document embeddings
    # (see the sketch at the end of the file).
    # w2v = api.load('fasttext-wiki-news-subwords-300')
    # X = np.asarray([document_vector(tokenize(doc)) for doc in data['text'].values])

    # Train the model.
    print("Training model...")
    nn_model = NeuralNetworkModel(FEATURES)
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)
    train_model(nn_model, X, Y, criterion, optimizer, BATCH, EPOCHS)

    # Evaluate on the dev set.
    print("Testing model...")
    data_dev, Y_dev_exp = load_dev_data()
    X_dev = vectorizer.transform(data_dev['text'].values)
    Y_dev_pred = test_model(nn_model, X_dev)
    acc = accuracy_score(Y_dev_exp, Y_dev_pred)
    print("dev accuracy: ", acc)
    np.savetxt("dev-0/out.tsv", Y_dev_pred, fmt='%i', delimiter="\t")

    # Predict on the test set and write predictions.
    data_test = load_test_data()
    X_test = vectorizer.transform(data_test['text'].values)
    Y_test_pred = test_model(nn_model, X_test)
    np.savetxt("test-A/out.tsv", Y_test_pred, fmt='%i', delimiter="\t")
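

# ---------------------------------------------------------------------------
# A minimal sketch of the word2vec/fastText alternative left commented out
# above: embed each document as the mean of its in-vocabulary word vectors.
# Assumptions: gensim 4.x, where api.load() returns a KeyedVectors object
# exposing key_to_index and vector_size. The model name
# 'fasttext-wiki-news-subwords-300' comes from the commented code above; the
# helper name w2v_document_vectors is hypothetical, not part of any library.
# Not used by the pipeline above.
# ---------------------------------------------------------------------------
def w2v_document_vectors(texts, w2v):
    """Embed each document as the mean of its in-vocabulary word vectors."""
    vectors = []
    for text in texts:
        tokens = [t for t in gensim.utils.tokenize(text, lowercase=True)
                  if t in w2v.key_to_index]
        if tokens:
            vectors.append(np.mean(w2v[tokens], axis=0))
        else:
            # No known words: fall back to a zero vector of the right size.
            vectors.append(np.zeros(w2v.vector_size, dtype=np.float32))
    return np.asarray(vectors)

# Example usage (replaces the HashingVectorizer step; X is dense here, so the
# .toarray() calls in train_model/test_model would have to be dropped):
# w2v = api.load('fasttext-wiki-news-subwords-300')
# X = w2v_document_vectors(data['text'].values, w2v)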