import lzma

import gensim
import numpy as np
import torch

FEATURES = 100


def predict(data):
    """Predict a "0"/"1" label for each document vector in `data`."""
    nn_model.eval()
    predictions = []
    with torch.no_grad():  # inference only, no gradients needed
        for i in range(len(data)):
            X = torch.tensor(data[i].astype(np.float32))
            Y_predictions = nn_model(X)
            if Y_predictions[0] > 0.5:
                predictions.append("1")
            else:
                predictions.append("0")
    return predictions


def vectorize(data):
    """Represent each tokenized document as the mean of its word vectors."""
    # Keep a plain list here: wrapping ragged per-document arrays in np.array
    # raises an error on recent NumPy versions.
    vectorized_data = [np.array([word2vec_model.wv[i] for i in ls if i in words]) for ls in data]
    average_vector = []
    for vector in vectorized_data:
        if vector.size:
            average_vector.append(vector.mean(axis=0))
        else:
            # No in-vocabulary words: fall back to a zero vector.
            average_vector.append(np.zeros(FEATURES, dtype=float))
    return average_vector


def generate_out(folder_path):
    print('Generating out')
    X_dev = []
    with lzma.open(f"{folder_path}/in.tsv.xz", 'r') as file:
        for line in file:
            line = line.strip().decode("utf-8")
            content = line.split('\t')[0]
            X_dev.append(gensim.utils.simple_preprocess(content))
    print("step 5")
    X_dev = vectorize(X_dev)
    prediction = predict(X_dev)
    print("step 6")
    # "w" rather than "a": rerunning the script should not append duplicate rows.
    with open(f"{folder_path}/out.tsv", "w") as f:
        for p in prediction:
            f.write(str(p) + '\n')


def get_loss_acc(model, X_dataset, Y_dataset):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    with torch.no_grad():
        for i in range(0, Y_dataset.shape[0], BATCH_SIZE):
            X = torch.tensor(X_dataset[i:i + BATCH_SIZE].astype(np.float32))
            Y = torch.tensor(Y_dataset[i:i + BATCH_SIZE].astype(np.float32)).reshape(-1, 1)
            Y_predictions = model(X)
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]
            loss = criterion(Y_predictions, Y)
            loss_score += loss.item() * Y.shape[0]
    return (loss_score / items_total), (acc_score / items_total)


class NeuralNetworkModel(torch.nn.Module):
    """Single-hidden-layer binary classifier over averaged word vectors."""

    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(FEATURES, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x


if __name__ == "__main__":
    X = []
    Y = []
    with lzma.open('train/in.tsv.xz', 'r') as file:
        for line in file:
            line = line.strip().decode("utf-8")
            content = line.split('\t')[0]
            X.append(gensim.utils.simple_preprocess(content))
    print("step 1")
    with open('train/expected.tsv', 'r') as file:
        for line in file:
            Y.append(int(line.strip()))
    X_train = X
    Y_train = Y
    print("step 2")
    print('Word to vec start')
    word2vec_model = gensim.models.Word2Vec(X_train, vector_size=FEATURES, window=5, min_count=2)
    print('Created model')
    words = set(word2vec_model.wv.index_to_key)
    print('Created set of words')
    X_train = np.array(vectorize(X_train))
    Y_train = np.array(Y_train)
    print('Vectorized data')
    print('Word to vec ended')
    print("step 3")
    # model = LogisticRegression()
    # model.fit(X_vectorized, Y)
    nn_model = NeuralNetworkModel()
    BATCH_SIZE = 5
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)
    for epoch in range(7):
        loss_score = 0
        acc_score = 0
        items_total = 0
        nn_model.train()
        for i in range(0, Y_train.shape[0], BATCH_SIZE):
            X = torch.tensor(X_train[i:i + BATCH_SIZE].astype(np.float32))
            Y = torch.tensor(Y_train[i:i + BATCH_SIZE].astype(np.float32)).reshape(-1, 1)
            Y_predictions = nn_model(X)
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]
            optimizer.zero_grad()
            loss = criterion(Y_predictions, Y)
            loss.backward()
            optimizer.step()
            loss_score += loss.item() * Y.shape[0]
        print(f'Epoch {epoch}:')
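        # Report loss/accuracy over the full training set at the end of each epoch.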
        print(get_loss_acc(nn_model, X_train, Y_train))
        # display(get_loss_acc(nn_model, X_dev, Y_dev))
    print("step 4")
    generate_out('dev-0')
    # generate_out('dev-1')
    generate_out('test-A')
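# Assumed data layout, inferred from the paths used above: train/in.tsv.xz holds the
# input text in the first tab-separated column and train/expected.tsv one 0/1 label
# per line; dev-0/ and test-A/ each contain an in.tsv.xz and receive an out.tsv with
# one predicted label per line.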