"""Binary text classifier: averaged Word2Vec features + a small neural network.

Pipeline, as a flat script:

1. Tokenize ``in.tsv`` / ``dev_in.tsv`` / ``test_in.tsv`` line by line.
2. Train a Word2Vec model on the training lines.
3. Represent every line as the mean of its in-vocabulary word vectors
   (an all-zero vector when no token is in the vocabulary).
4. Train a one-hidden-layer network with a sigmoid output against
   ``expected.tsv``.
5. Write "1"/"0" predictions for the dev and test inputs to
   ``dev_out.tsv`` and ``test_out.tsv``.
"""
from gensim import downloader
from gensim.utils import simple_preprocess
import gensim
import numpy as np
import torch
import pandas as pd

# Code pasted from a Jupyter notebook.

# Dimensionality of the Word2Vec vectors; also the network's input width.
FEATURES = 100
BATCH_SIZE = 32
EPOCHS = 50
# NOTE(review): the original training loop stops 42 rows before the end of
# the training set. The reason is not visible in this file; the behavior is
# kept unchanged -- confirm whether it is intentional.
TRAIN_TAIL_SKIP = 42


def _read_tokenized_lines(path):
    """Return the lines of *path*, each tokenized with ``simple_preprocess``."""
    with open(path, "r") as handle:
        return [simple_preprocess(line) for line in handle]


def _read_labels(path):
    """Load a header-less label file into a 2-D NumPy array (one row per line)."""
    return pd.read_csv(path, header=None).values


def _average_vectors(documents, keyed_vectors, vocabulary, size=FEATURES):
    """Turn tokenized documents into one averaged word vector per document.

    Tokens missing from *vocabulary* are ignored. A document with no known
    token is represented by a zero vector of length *size*.

    The per-document matrices are averaged immediately instead of being
    collected into one ragged ``np.array`` first: NumPy refuses to build
    ragged arrays implicitly in recent versions.
    """
    averaged = []
    for tokens in documents:
        known = [keyed_vectors[token] for token in tokens if token in vocabulary]
        if known:
            averaged.append(np.array(known).mean(axis=0))
        else:
            averaged.append(np.zeros(size, dtype=float))
    return np.array(averaged)


X_train = _read_tokenized_lines("in.tsv")
y_train = _read_labels("expected.tsv")

X_test = _read_tokenized_lines("dev_in.tsv")
y_test = _read_labels("dev_expected.tsv")

w2v_model = gensim.models.Word2Vec(
    X_train, vector_size=FEATURES, window=5, min_count=2
)
# Set of known words for O(1) membership tests while vectorizing.
words = set(w2v_model.wv.index_to_key)

X_train_vector_average = _average_vectors(X_train, w2v_model.wv, words)
X_test_vector_average = _average_vectors(X_test, w2v_model.wv, words)


class NeuralNetworkModel(torch.nn.Module):
    """Feed-forward net: FEATURES -> 500 (ReLU) -> 1 (sigmoid)."""

    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(FEATURES, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for input *x*."""
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x


nn_model = NeuralNetworkModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)


def get_loss_acc(model, X_dataset, Y_dataset):
    """Evaluate *model* on a dataset in batches of ``BATCH_SIZE``.

    Args:
        model: network to evaluate; it is switched to eval mode.
        X_dataset: NumPy array of feature rows.
        Y_dataset: NumPy array of labels, one per feature row.

    Returns:
        Tuple ``(mean_loss, accuracy)``, both averaged over all items.
        Predictions above 0.5 count as the positive class.
    """
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for i in range(0, Y_dataset.shape[0], BATCH_SIZE):
            X = torch.tensor(X_dataset[i:i + BATCH_SIZE].astype(np.float32))
            Y = torch.tensor(
                Y_dataset[i:i + BATCH_SIZE].astype(np.float32)
            ).reshape(-1, 1)
            Y_predictions = model(X)
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]
            loss = criterion(Y_predictions, Y)
            # criterion returns the batch mean; weight it by the batch size.
            loss_score += loss.item() * Y.shape[0]
    return (loss_score / items_total), (acc_score / items_total)


def _predict_lines(model, features):
    """Return one ``"1\\n"`` / ``"0\\n"`` output line per row of *features*.

    A row is labelled "1" when the model output is above 0.5.
    """
    lines = []
    model.eval()
    with torch.no_grad():
        for start in range(0, len(features), BATCH_SIZE):
            batch = torch.tensor(
                features[start:start + BATCH_SIZE].astype(np.float32)
            )
            probabilities = model(batch).reshape(-1).tolist()
            lines.extend("1\n" if p > 0.5 else "0\n" for p in probabilities)
    return lines


for epoch in range(EPOCHS):
    nn_model.train()
    for i in range(0, y_train.shape[0] - TRAIN_TAIL_SKIP, BATCH_SIZE):
        X = torch.tensor(X_train_vector_average[i:i + BATCH_SIZE].astype(np.float32))
        Y = torch.tensor(y_train[i:i + BATCH_SIZE].astype(np.float32)).reshape(-1, 1)
        Y_predictions = nn_model(X)
        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()
    # `display` only exists inside IPython/Jupyter; `print` works everywhere.
    print(epoch)
    print(get_loss_acc(nn_model, X_train_vector_average, y_train))

X_real_test = _read_tokenized_lines("test_in.tsv")
X_real_test_vector_average = _average_vectors(X_real_test, w2v_model.wv, words)

dev_output = _predict_lines(nn_model, X_test_vector_average)
test_output = _predict_lines(nn_model, X_real_test_vector_average)

with open("dev_out.tsv", "w") as dev_file:
    dev_file.writelines(dev_output)
with open("test_out.tsv", "w") as test_file:
    test_file.writelines(test_output)