Script upload

2022-05-27 17:45:00 +02:00 · 2022-05-27 17:45:00 +02:00 · f26ab9bdbe
commit f26ab9bdbe
parent 81bd23dbcb
1 changed files with 149 additions and 0 deletions
--- a/run_pytorch.py
+++ b/run_pytorch.py
@ -0,0 +1,149 @@
+import torch
+import numpy as np
+from gensim.models import Word2Vec
+import lzma
+import pandas as pd
+
+
+class ScepticNetwork(torch.nn.Module):
+    """Binary classifier with one hidden layer.
+
+    The forward pass is ``Linear -> ReLU -> Linear -> sigmoid``, producing a
+    single value in ``[0, 1]`` for every input row.
+
+    Args:
+        features: Size of each input vector.
+        hidden: Width of the hidden layer. Defaults to 500, the value that
+            was previously hard-coded.
+    """
+
+    def __init__(self, features=100, hidden=500):
+        super().__init__()
+        # Attribute names are kept so state-dict keys remain unchanged.
+        self.lin_1 = torch.nn.Linear(features, hidden)
+        self.lin_2 = torch.nn.Linear(hidden, 1)
+
+    def forward(self, x):
+        """Return the sigmoid activation of the network for input ``x``."""
+        x = self.lin_1(x)
+        x = torch.relu(x)
+        x = self.lin_2(x)
+        x = torch.sigmoid(x)
+        return x
+
+
+
+def evaluate(model, X, Y, criterion, batch_size):
+    """Compute the mean loss and accuracy of ``model`` over a dataset.
+
+    The data is processed in consecutive slices of ``batch_size`` rows. The
+    model is switched to evaluation mode and is left in that mode on return.
+
+    Args:
+        model: Callable module mapping a float32 tensor to predictions that
+            can be compared against an ``(n, 1)`` target tensor.
+        X: Feature array supporting slicing and ``.astype`` (presumably a
+            NumPy array with one row per sample -- confirm against callers).
+        Y: Label array exposing ``.shape[0]``, slicing and ``.astype``.
+        criterion: Loss callable invoked as ``criterion(predictions, targets)``
+            and returning a scalar tensor.
+        batch_size: Number of rows per forward pass.
+
+    Returns:
+        A ``(loss, accuracy)`` tuple: the sample-weighted average of the
+        per-batch losses, and the fraction of predictions whose ``> 0.5``
+        decision equals the label.
+
+    Raises:
+        ZeroDivisionError: If ``Y`` is empty.
+    """
+    loss_score = 0
+    acc_score = 0
+    items_total = 0
+    model.eval()
+    # Nothing is back-propagated here, so skip building the autograd graph.
+    with torch.no_grad():
+        for i in range(0, Y.shape[0], batch_size):
+            X_tens = torch.tensor(X[i:i + batch_size].astype(np.float32))
+            Y_tens = torch.tensor(
+                Y[i:i + batch_size].astype(np.float32)).reshape(-1, 1)
+            Y_predictions = model(X_tens)
+            acc_score += torch.sum((Y_predictions > 0.5) == Y_tens).item()
+            items_total += Y_tens.shape[0]
+
+            loss = criterion(Y_predictions, Y_tens)
+
+            # Weight by batch length so a short final batch is not over-counted.
+            loss_score += loss.item() * Y_tens.shape[0]
+    return (loss_score / items_total), (acc_score / items_total)
+
+
+def train(model,
+          x_train,
+          y_train,
+          optimizer,
+          criterion=None,
+          epochs=5,
+          batch_size=256):
+    """Train ``model`` with mini-batch updates and report metrics per epoch.
+
+    Batches are taken as consecutive slices of ``batch_size`` rows, in the
+    same order every epoch. After each epoch the full training set is scored
+    with ``evaluate`` and the loss and accuracy are printed.
+
+    Args:
+        model: Module to optimise; its parameters are updated in place.
+        x_train: Feature array supporting slicing and ``.astype``.
+        y_train: Label array supporting ``len``, slicing and ``.astype``.
+        optimizer: Optimiser already bound to the parameters of ``model``.
+        criterion: Loss callable invoked as ``criterion(predictions, targets)``.
+            ``None`` (the default) selects ``torch.nn.BCELoss()``.
+        epochs: Number of passes over the training data.
+        batch_size: Number of rows per optimisation step.
+    """
+    # Build the default loss per call instead of once at definition time.
+    if criterion is None:
+        criterion = torch.nn.BCELoss()
+
+    for epoch in range(epochs):
+        # evaluate() leaves the model in eval mode, so re-enable training
+        # mode at the start of every epoch.
+        model.train()
+        for i in range(0, len(y_train), batch_size):
+            X_tens = torch.tensor(x_train[i:i + batch_size].astype(np.float32))
+            Y_tens = torch.tensor(y_train[i:i + batch_size].astype(
+                np.float32)).reshape(-1, 1)
+
+            optimizer.zero_grad()
+            loss = criterion(model(X_tens), Y_tens)
+            loss.backward()
+            optimizer.step()
+
+        print(f'Epoch {epoch+1}/{epochs}')
+        loss, accuracy = evaluate(model, x_train, y_train, criterion,
+                                  batch_size)
+        print(f'Train set\nloss = {loss}, accuracy = {accuracy}')
+
+
+def flatten(t):
+    """Flatten a sequence of sequences into a list of integer strings.
+
+    Every inner item is truncated with ``int`` and then rendered with ``str``.
+    """
+    flat = []
+    for row in t:
+        flat.extend(str(int(value)) for value in row)
+    return flat
+
+
+def predict(model, data):
+    """Return the rounded outputs of ``model`` for ``data`` as a flat list.
+
+    ``data`` is cast to float32 and passed through the model with gradient
+    tracking disabled; the outputs are rounded and handed to ``flatten``,
+    which turns them into integer strings.
+    """
+    inputs = torch.tensor(data.astype(np.float32))
+    with torch.no_grad():
+        rounded = model(inputs).round()
+    return flatten(rounded.tolist())
+
+
+# Input files: the first entry is the training set, the remaining entries
+# are the sets for which predictions are generated.
+PATHS = ['train/in.tsv', 'dev-0/in.tsv', 'test-A/in.tsv']
+
+
+def read_data(path, train=True):
+    """Read a UTF-8 text file and return one entry per line.
+
+    Args:
+        path: Location of the file to read.
+        train: When true, each line is split on whitespace into a list of
+            tokens; otherwise each line is returned as a stripped string.
+
+    Returns:
+        A list with one element per line of the file.
+    """
+    print(f"I am reading the data from {path}...")
+    with open(path, 'r', encoding='utf-8') as handle:
+        stripped = [raw.strip() for raw in handle]
+    data = [entry.split() for entry in stripped] if train else stripped
+    print("Data loaded")
+    return data
+
+
+def save_predictions(path, preds):
+    """Write predictions to ``out.pt.tsv`` next to the given input file.
+
+    Args:
+        path: ``/``-separated path of the input file the predictions belong
+            to. The output is placed in the same directory; a bare file name
+            places it in the current working directory.
+        preds: Iterable of values, written one per line.
+    """
+    # Keep the whole directory part, not just the first path component, so
+    # nested and absolute input paths resolve to the right directory.
+    directory, separator, _ = path.rpartition('/')
+    new_path = f"{directory}{separator}out.pt.tsv"
+    print(f"Saving predictions to {new_path}")
+    with open(new_path, 'w', encoding='utf-8') as f:
+        f.writelines(f'{line}\n' for line in preds)
+
+
+def vectorize_data(data, vectorizer):
+    """Represent every document as the mean of its word vectors.
+
+    Args:
+        data: Iterable of documents, each a sequence of tokens.
+        vectorizer: Object exposing ``wv``, a mapping from token to vector
+            that supports ``in`` and has a ``vector_size`` attribute (as the
+            keyed vectors of a gensim ``Word2Vec`` model do).
+
+    Returns:
+        An array of shape ``(len(data), vector_size)``. Tokens missing from
+        ``wv`` contribute a zero vector to the mean, and a document without
+        any tokens is represented by the zero vector.
+    """
+    word_vectors = vectorizer.wv
+    # Read the dimensionality from the model instead of assuming 100.
+    size = word_vectors.vector_size
+    zero = np.zeros(size, dtype=float)
+
+    result = []
+    for doc in data:
+        if len(doc) == 0:
+            # np.mean of an empty list is a NaN scalar, which would also make
+            # the stacked result ragged; fall back to the zero vector.
+            result.append(zero)
+            continue
+        vectors = [
+            word_vectors[word] if word in word_vectors else zero
+            for word in doc
+        ]
+        result.append(np.mean(vectors, axis=0))
+
+    if not result:
+        # Preserve the feature axis even when there are no documents.
+        return np.empty((0, size), dtype=float)
+    return np.array(result)
+
+
+
+
+if __name__ == '__main__':
+    # Script entry point: train on the first entry of PATHS, then write
+    # predictions for every remaining entry.
+
+    # * Load training data
+    data = read_data(PATHS[0])
+    # Tokenised documents differ in length. Recent NumPy releases refuse to
+    # build such a ragged array implicitly, so request an object array.
+    x_train = np.array(data, dtype=object)
+    # Labels are read as one stripped string per line; they are cast to
+    # float32 inside train().
+    y_train = np.array(read_data('train/expected.tsv', False))
+    print(
+        f"X_data: {x_train[:5]} {type(x_train)}, y_data: {y_train[:5]} {type(y_train)}\nx shape:{x_train.shape}\ty shape: {y_train.shape}"
+    )
+
+    # * Vectorize data
+    w2v = Word2Vec(x_train, vector_size=100, min_count=2)
+    x_train_vec = vectorize_data(x_train, w2v)
+
+    # * Loading & training model
+    model = ScepticNetwork()
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.15)
+    print("Now I will train the model...")
+    train(model, x_train_vec, y_train, epochs=50, optimizer=optimizer)
+    print("Training completed!\n\n")
+
+    # * Making predictions
+    for path in PATHS[1:]:
+        X = vectorize_data(read_data(path), w2v)
+        print(f"I will make predictions for {path}")
+        predictions = predict(model, X)
+        print(f'Saving predictions for {path}')
+        save_predictions(path, predictions)