"""Train a small feed-forward classifier on averaged word2vec document vectors.

The script reads the train/dev/test TSV files, lowercases and tokenizes the
text column, embeds every document as the mean of its known word vectors,
trains a one-hidden-layer network with a sigmoid output, and writes the
thresholded (0/1) predictions for the dev and test sets, one per line.
"""
import csv

import gensim.downloader
import numpy as np
import pandas as pd
import torch
from gensim.models import Word2Vec  # unused in this script; kept for importers
from nltk.tokenize import word_tokenize

# Column names assigned to the header-less TSV files.
CONTENT = 'content'
ID = 'id'
LABEL = 'label'
col_names = [CONTENT, ID, LABEL]

BATCH_SIZE = 10

TRAIN_IN_PATH = 'train/in.tsv.xz'
TRAIN_EXP_PATH = 'train/expected.tsv'
DEV_PATH = 'dev-0/in.tsv.xz'
TEST_PATH = 'test-A/in.tsv.xz'
DEV_OUT_PATH = './dev-0/out.tsv'
TEST_OUT_PATH = './test-A/out.tsv'

INPUT_SIZE = 300
HIDDEN_SIZE = 600
NUM_CLASSES = 1

# Pre-trained embeddings, fetched through the gensim downloader at import time.
word2vec = gensim.downloader.load('word2vec-google-news-300')


class NeuralNetwork(torch.nn.Module):
    """Binary classifier: Linear -> ReLU -> Linear -> sigmoid."""

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the two linear layers.

        Args:
            input_size: Number of input features per sample.
            hidden_size: Width of the hidden layer.
            num_classes: Number of output units.
        """
        super().__init__()
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the sigmoid activations of the output layer for batch ``x``."""
        x = self.l1(x)
        x = torch.relu(x)
        x = self.l2(x)
        x = torch.sigmoid(x)
        return x


def load_set(path, col_n):
    """Read a header-less, unquoted tab-separated file into a DataFrame.

    Malformed lines are skipped with a warning instead of aborting the load.

    Args:
        path: Path of the file to read.
        col_n: Column names to assign to the loaded table.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    read_kwargs = dict(quoting=csv.QUOTE_NONE, header=None, names=col_n)
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # 'warn' reproduces its skip-and-warn behaviour.
        return pd.read_table(path, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword.
        return pd.read_table(path, error_bad_lines=False, **read_kwargs)


def to_lower(t_set, header):
    """Return column ``header`` of ``t_set`` converted to lowercase."""
    a_set = t_set[header].str.lower()
    return a_set


def tokenize(t_set):
    """Return a list with the NLTK word tokens of every text in ``t_set``."""
    tokenized_set = [word_tokenize(content) for content in t_set]
    return tokenized_set


def word_2_vec(t_set, w2v):
    """Embed each tokenized document as the mean of its known word vectors.

    Args:
        t_set: Iterable of token lists.
        w2v: Word-to-vector mapping supporting ``in`` and ``[]``.

    Returns:
        A list with one vector per document. A document without any known
        word is represented by a zero vector.
    """
    # Prefer the embedding's own dimensionality when it exposes one, so the
    # zero fallback always has the same length as real document vectors.
    dim = getattr(w2v, 'vector_size', INPUT_SIZE)
    c_set = []
    for content in t_set:
        known = [w2v[word] for word in content if word in w2v]
        if known:
            c_set.append(np.mean(known, axis=0))
        else:
            c_set.append(np.zeros(dim, dtype=np.float32))
    return c_set


def calc_prediction(x_t_set, batch_len, t_model):
    """Run ``t_model`` over ``x_t_set`` in batches and threshold at 0.5.

    Args:
        x_t_set: Sequence of feature vectors.
        batch_len: Number of samples per forward pass.
        t_model: Model mapping a float tensor batch to activations.

    Returns:
        A list with one entry per sample, each a list of booleans (one per
        model output) that is True where the activation exceeds 0.5.
    """
    pred = []
    for start in range(0, len(x_t_set), batch_len):
        # Stack into one float32 array first: building a tensor straight from
        # a list of ndarrays is the slow path in torch.
        batch = np.asarray(x_t_set[start:start + batch_len], dtype=np.float32)
        out = t_model(torch.tensor(batch))
        pred.extend((out > 0.5).tolist())
    return pred


def predict(p_model, batch_len, x_t_test):
    """Return thresholded predictions of ``p_model`` for ``x_t_test``.

    The model is switched to evaluation mode and gradients are disabled.
    """
    p_model.eval()
    with torch.no_grad():
        return calc_prediction(x_t_test, batch_len, p_model)


def train_model(model_to_train, y_t_train, x_t_train, *, epochs=6, lr=0.01,
                batch_size=BATCH_SIZE):
    """Train ``model_to_train`` with mini-batch SGD and binary cross-entropy.

    Args:
        model_to_train: Model whose output is a probability per sample.
        y_t_train: pandas Series of labels.
        x_t_train: Sequence of feature vectors, aligned with ``y_t_train``.
        epochs: Number of passes over the training data.
        lr: SGD learning rate.
        batch_size: Number of samples per optimisation step.

    Returns:
        The same model instance, after training.

    Raises:
        ValueError: If features and labels differ in number of rows.
    """
    # Convert once up front instead of re-building tensors every batch of
    # every epoch.
    features = torch.tensor(np.asarray(x_t_train, dtype=np.float32))
    targets = torch.tensor(
        y_t_train.astype(np.float32).to_numpy()).reshape(-1, 1)
    if features.shape[0] != targets.shape[0]:
        # Skipped bad input lines can leave the two files misaligned.
        raise ValueError(
            'features and labels differ in length: '
            f'{features.shape[0]} != {targets.shape[0]}')

    cri = torch.nn.BCELoss()
    opt = torch.optim.SGD(model_to_train.parameters(), lr=lr)
    for _ in range(epochs):
        model_to_train.train()
        for start in range(0, targets.shape[0], batch_size):
            out = model_to_train(features[start:start + batch_size])
            loss = cri(out, targets[start:start + batch_size])
            opt.zero_grad()
            loss.backward()
            opt.step()
    return model_to_train


# --- Load data -------------------------------------------------------------
t_set_features = load_set(TRAIN_IN_PATH, col_names[:2])
t_set_labels = load_set(TRAIN_EXP_PATH, col_names[2:])
dev_set = load_set(DEV_PATH, col_names[:2])
test_set = load_set(TEST_PATH, col_names[:2])

# --- Preprocess: lowercase -> tokenize -> averaged word vectors -------------
x_train = to_lower(t_set_features, CONTENT)
y_train = t_set_labels[LABEL]
x_dev = to_lower(dev_set, CONTENT)
x_test = to_lower(test_set, CONTENT)

x_train = tokenize(x_train)
x_dev = tokenize(x_dev)
x_test = tokenize(x_test)

x_train = word_2_vec(x_train, word2vec)
x_dev = word_2_vec(x_dev, word2vec)
x_test = word_2_vec(x_test, word2vec)

# --- Train and predict -------------------------------------------------------
model = NeuralNetwork(INPUT_SIZE, HIDDEN_SIZE, NUM_CLASSES)
trained_model = train_model(model, y_train, x_train)

dev_prediction = predict(trained_model, BATCH_SIZE, x_dev)
test_prediction = predict(trained_model, BATCH_SIZE, x_test)

# --- Write predictions as integers, one per line -----------------------------
dev_prediction = np.asarray(dev_prediction, dtype=np.int32)
test_prediction = np.asarray(test_prediction, dtype=np.int32)

dev_prediction.tofile(DEV_OUT_PATH, sep='\n')
test_prediction.tofile(TEST_OUT_PATH, sep='\n')