"""Train a small feed-forward binary classifier on averaged Word2Vec vectors.

The script reads whitespace-tokenised documents and their labels, fits a
Word2Vec model on the training documents, represents every document as the
mean of its word vectors, trains ``ScepticNetwork`` on those vectors and
writes one predicted label per line next to each input file in ``PATHS[1:]``.
"""
import lzma
import os

import numpy as np
import pandas as pd
import torch

PATHS = ['train/in.tsv', 'dev-0/in.tsv', 'test-A/in.tsv']

# Dimensionality shared by the Word2Vec model and the network input layer.
EMBEDDING_DIM = 100


class ScepticNetwork(torch.nn.Module):
    """Two-layer perceptron that outputs one sigmoid probability per row.

    Args:
        features: Size of each input vector.
        hidden: Width of the hidden layer.
    """

    def __init__(self, features=100, hidden=500):
        super().__init__()
        self.lin_1 = torch.nn.Linear(features, hidden)
        self.lin_2 = torch.nn.Linear(hidden, 1)

    def forward(self, x):
        """Return sigmoid(lin_2(relu(lin_1(x))))."""
        return torch.sigmoid(self.lin_2(torch.relu(self.lin_1(x))))


def _as_float_tensor(values):
    """Convert an array-like (numeric or numeric strings) to a float32 tensor."""
    # ``astype`` copies, so the tensor never aliases the caller's array.
    return torch.from_numpy(np.asarray(values).astype(np.float32))


def evaluate(model, X, Y, criterion, batch_size):
    """Compute the mean loss and accuracy of ``model`` on ``(X, Y)``.

    Args:
        model: Module mapping a float batch to probabilities of shape (n, 1).
        X: Array-like of feature rows.
        Y: Array-like of labels convertible to float (0/1).
        criterion: Loss callable taking ``(predictions, targets)``.
        batch_size: Number of rows processed per forward pass.

    Returns:
        Tuple ``(mean_loss, accuracy)`` averaged over all rows of ``Y``.

    Raises:
        ValueError: If ``Y`` is empty.
    """
    features = _as_float_tensor(X)
    targets = _as_float_tensor(Y).reshape(-1, 1)
    total = targets.shape[0]
    if total == 0:
        raise ValueError("cannot evaluate on an empty data set")

    loss_sum = 0.0
    correct = 0
    model.eval()
    # No gradients are needed for scoring; skipping them saves memory.
    with torch.no_grad():
        for start in range(0, total, batch_size):
            batch_x = features[start:start + batch_size]
            batch_y = targets[start:start + batch_size]
            predictions = model(batch_x)
            correct += torch.sum(
                (predictions > 0.5).float() == batch_y).item()
            # The criterion averages over the batch, so re-weight by its size.
            loss_sum += criterion(predictions, batch_y).item() * len(batch_y)
    return loss_sum / total, correct / total


def train(model,
          x_train,
          y_train,
          optimizer,
          criterion=None,
          epochs=5,
          batch_size=256):
    """Train ``model`` with mini-batches and report train metrics per epoch.

    Args:
        model: Module to optimise in place.
        x_train: Array-like of feature rows.
        y_train: Array-like of labels convertible to float (0/1).
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss callable; a fresh ``torch.nn.BCELoss`` when ``None``.
        epochs: Number of passes over the training data.
        batch_size: Number of rows per optimisation step.
    """
    if criterion is None:
        criterion = torch.nn.BCELoss()

    # Convert once instead of re-casting every batch in every epoch.
    features = _as_float_tensor(x_train)
    targets = _as_float_tensor(y_train).reshape(-1, 1)
    total = targets.shape[0]

    for epoch in range(epochs):
        model.train()
        for start in range(0, total, batch_size):
            batch_x = features[start:start + batch_size]
            batch_y = targets[start:start + batch_size]

            optimizer.zero_grad()
            loss = criterion(model(batch_x), batch_y)
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}/{epochs}')
        # Metrics come from a separate pass with the end-of-epoch weights.
        loss, accuracy = evaluate(model, features, targets, criterion,
                                  batch_size)
        print(f'Train set\nloss = {loss}, accuracy = {accuracy}')


def flatten(t):
    """Flatten a list of lists of numbers into a list of integer strings."""
    return [str(int(item)) for sublist in t for item in sublist]


def predict(model, data):
    """Return the rounded model output for every row of ``data`` as strings.

    Args:
        model: Module mapping a float batch to probabilities of shape (n, 1).
        data: NumPy array of feature rows.

    Returns:
        List with one integer string per row (``'0'`` or ``'1'`` for
        probabilities in [0, 1]).
    """
    inputs = torch.tensor(data.astype(np.float32))
    model.eval()
    with torch.no_grad():
        return flatten(model(inputs).round().tolist())


def read_data(path, train=True):
    """Read a UTF-8 text file line by line.

    Args:
        path: File to read.
        train: When true each line is split on whitespace into tokens;
            otherwise the stripped line is returned as a single string.

    Returns:
        List of token lists (``train=True``) or list of strings.
    """
    print(f"I am reading the data from {path}...")
    with open(path, 'r', encoding='utf-8') as f:
        if train:
            data = [line.strip().split() for line in f]
        else:
            data = [line.strip() for line in f]
    print("Data loaded")
    return data


def save_predictions(path, preds):
    """Write ``preds`` one per line to ``out.pt.tsv`` beside ``path``.

    Args:
        path: Input file whose directory receives the output file.
        preds: Iterable of values to write, one per line.
    """
    # dirname (rather than the first path component) keeps nested and
    # absolute input paths pointing at their own directory.
    new_path = os.path.join(os.path.dirname(path), 'out.pt.tsv')
    print(f"Saving predictions to {new_path}")
    with open(new_path, 'w', encoding='utf-8') as f:
        f.writelines(f'{line}\n' for line in preds)


def vectorize_data(data, vectorizer):
    """Represent every document as the mean of its word vectors.

    Args:
        data: Sequence of documents, each a sequence of tokens.
        vectorizer: Object exposing ``wv`` (supports ``word in wv`` and
            ``wv[word]``) and ``vector_size``, e.g. a trained Word2Vec model.

    Returns:
        float32 array of shape ``(len(data), vectorizer.vector_size)``.
        Unknown words count as zero vectors; empty documents yield an
        all-zero row.
    """
    dim = vectorizer.vector_size
    keyed_vectors = vectorizer.wv
    unknown = np.zeros(dim, dtype=np.float32)
    result = np.zeros((len(data), dim), dtype=np.float32)
    for row, doc in enumerate(data):
        if len(doc) == 0:
            # The mean of nothing is NaN; keep the pre-filled zero row.
            continue
        result[row] = np.mean(
            [keyed_vectors[word] if word in keyed_vectors else unknown
             for word in doc],
            axis=0)
    return result


def main():
    """Train on ``PATHS[0]`` and write predictions for the remaining paths."""
    # Imported here so the helpers above stay importable without gensim.
    from gensim.models import Word2Vec

    # * Load training data
    # Documents have different lengths, so they stay a list of token lists
    # (a ragged ``np.array`` is rejected by recent NumPy releases).
    x_train = read_data(PATHS[0])
    y_train = np.array(read_data('train/expected.tsv', False))
    print(
        f"X_data: {x_train[:5]} {type(x_train)}, y_data: {y_train[:5]} {type(y_train)}\nx shape:{(len(x_train),)}\ty shape: {y_train.shape}"
    )

    # * Vectorize data
    w2v = Word2Vec(x_train, vector_size=EMBEDDING_DIM, min_count=2)
    x_train_vec = vectorize_data(x_train, w2v)

    # * Loading & training model
    model = ScepticNetwork(features=EMBEDDING_DIM)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.15)
    print("Now I will train the model...")
    train(model, x_train_vec, y_train, epochs=50, optimizer=optimizer)
    print("Training completed!\n\n")

    # * Making predictions
    for path in PATHS[1:]:
        X = vectorize_data(read_data(path), w2v)
        print(f"I will make predictions for {path}")
        predictions = predict(model, X)
        print(f'Saving predictions for {path}')
        save_predictions(path, predictions)


if __name__ == '__main__':
    main()