task 9 uploaded

2021-05-31 21:01:02 +02:00 · 2021-05-31 21:01:02 +02:00 · 93adc7c664
commit 93adc7c664
parent eb6976bcd9
3 changed files with 2530 additions and 2396 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/solution_2.py
+++ b/solution_2.py
@ -0,0 +1,134 @@
+import pandas as pd
+import numpy as np
+import torch
+import csv
+from nltk.tokenize import word_tokenize
+from gensim.models import Word2Vec
+import gensim.downloader
+
+# Column names assigned to the header-less TSV files when they are loaded.
+CONTENT, ID, LABEL = 'content', 'id', 'label'
+col_names = [CONTENT, ID, LABEL]
+
+# Input locations (xz-compressed TSV) and the expected-label file for training.
+TRAIN_IN_PATH = 'train/in.tsv.xz'
+TRAIN_EXP_PATH = 'train/expected.tsv'
+DEV_PATH = 'dev-0/in.tsv.xz'
+TEST_PATH = 'test-A/in.tsv.xz'
+
+# Where the predictions for each evaluation split are written.
+DEV_OUT_PATH = './dev-0/out.tsv'
+TEST_OUT_PATH = './test-A/out.tsv'
+
+# Network shape and mini-batch size used during training.
+INPUT_SIZE = 300
+HIDDEN_SIZE = 600
+NUM_CLASSES = 1
+BATCH_SIZE = 10
+
+# Pre-trained word vectors fetched through gensim's downloader.
+# NOTE(review): this runs at import time and may trigger a large download.
+word2vec = gensim.downloader.load('word2vec-google-news-300')
+
+
+class NeuralNetwork(torch.nn.Module):
+    """Feed-forward classifier: Linear -> ReLU -> Linear -> sigmoid.
+
+    Args:
+        input_size: number of input features fed to the first linear layer.
+        hidden_size: width of the hidden layer.
+        num_classes: number of output units; each is squashed by a sigmoid.
+    """
+
+    def __init__(self, input_size, hidden_size, num_classes):
+        super().__init__()
+        # Layers are created in this order and under these names on purpose:
+        # both affect parameter initialisation order and state-dict keys.
+        self.l1 = torch.nn.Linear(input_size, hidden_size)
+        self.l2 = torch.nn.Linear(hidden_size, num_classes)
+
+    def forward(self, x):
+        """Return the sigmoid activations of the output layer for `x`."""
+        hidden = torch.relu(self.l1(x))
+        return torch.sigmoid(self.l2(hidden))
+
+
+def load_set(path, col_n):
+    """Load a header-less, tab-separated file into a DataFrame.
+
+    Quoting is disabled (`csv.QUOTE_NONE`) so quote characters in the text are
+    kept verbatim. Malformed rows are skipped with a warning instead of
+    aborting the load.
+
+    Args:
+        path: location of the TSV file, passed straight to `pd.read_table`.
+        col_n: list of column names to assign to the file's columns.
+
+    Returns:
+        The parsed `pandas.DataFrame`.
+    """
+    read_options = dict(quoting=csv.QUOTE_NONE, header=None, names=col_n)
+    try:
+        # pandas >= 1.3: 'warn' skips the bad row and reports it, which is what
+        # error_bad_lines=False (with its default warning) used to do.
+        return pd.read_table(path, on_bad_lines='warn', **read_options)
+    except TypeError:
+        # Older pandas does not know on_bad_lines; use the original keyword.
+        return pd.read_table(path, error_bad_lines=False, **read_options)
+
+
+def to_lower(t_set, header):
+    """Return column `header` of `t_set` lower-cased through its `.str` accessor."""
+    column = t_set[header]
+    return column.str.lower()
+
+
+def tokenize(t_set):
+    """Tokenise every text in `t_set` with NLTK's `word_tokenize`.
+
+    Returns a list with one token list per input text, in input order.
+    """
+    return list(map(word_tokenize, t_set))
+
+
+def word_2_vec(t_set, w2v):
+    """Represent each tokenised document as the mean of its known word vectors.
+
+    Args:
+        t_set: iterable of token lists, one per document.
+        w2v: embedding lookup supporting `word in w2v` and `w2v[word]`.
+
+    Returns:
+        A list with one vector per document. Documents in which no token is
+        known to `w2v` get an all-zero vector.
+    """
+    # Take the width from the model when it exposes one so the zero fallback
+    # always matches the real vectors; otherwise use the network input width.
+    dim = getattr(w2v, 'vector_size', INPUT_SIZE)
+    c_set = []
+    for content in t_set:
+        known = [w2v[word] for word in content if word in w2v]
+        c_set.append(np.mean(known, axis=0) if known else np.zeros(dim))
+    return c_set
+
+
+def calc_prediction(x_t_set, batch_len, t_model):
+    """Run `t_model` over `x_t_set` in batches and threshold its output at 0.5.
+
+    Args:
+        x_t_set: sequence of feature vectors (sliceable, with a length).
+        batch_len: number of vectors passed to the model per call.
+        t_model: callable taking a float tensor and returning a tensor of scores.
+
+    Returns:
+        A list built from `(out > 0.5).tolist()` of every batch, so each entry
+        keeps the nesting of one model output row.
+    """
+    pred = []
+    for start in range(0, len(x_t_set), batch_len):
+        # Stack the batch into one float32 array first: building a tensor
+        # straight from a list of ndarrays is very slow.
+        batch = np.asarray(x_t_set[start:start + batch_len], dtype=np.float32)
+        out = t_model(torch.tensor(batch))
+        # extend() grows the list in place instead of copying it every batch.
+        pred.extend((out > 0.5).tolist())
+    return pred
+
+
+def predict(p_model, batch_len, x_t_test):
+    """Return thresholded predictions of `p_model` for `x_t_test`.
+
+    The model is switched to evaluation mode and gradient tracking is disabled
+    while `calc_prediction` runs over the data in batches of `batch_len`.
+    """
+    p_model.eval()
+    with torch.no_grad():
+        return calc_prediction(x_t_test, batch_len, p_model)
+
+
+def train_model(model_to_train, y_t_train, x_t_train,
+                epochs=6, lr=0.01, batch_size=None):
+    """Train `model_to_train` with plain SGD on a binary cross-entropy loss.
+
+    Batches are taken in input order; the data is not shuffled.
+
+    Args:
+        model_to_train: torch module whose output has one column per example.
+        y_t_train: 1-D labels (pandas Series, list or array) convertible to
+            float32.
+        x_t_train: sequence of feature vectors aligned with `y_t_train`.
+        epochs: number of passes over the data (default 6).
+        lr: SGD learning rate (default 0.01).
+        batch_size: examples per optimisation step; `None` uses `BATCH_SIZE`.
+
+    Returns:
+        The same model object, updated in place.
+    """
+    if batch_size is None:
+        batch_size = BATCH_SIZE
+    cri = torch.nn.BCELoss()
+    opt = torch.optim.SGD(model_to_train.parameters(), lr=lr)
+    # Convert the labels once, as a column, so they line up with the model
+    # output; the original re-converted them on every batch.
+    targets = torch.tensor(np.asarray(y_t_train, dtype=np.float32)).reshape(-1, 1)
+    for _ in range(epochs):
+        model_to_train.train()
+        for index in range(0, targets.shape[0], batch_size):
+            # Stack the batch into a single float32 array before building the
+            # tensor; torch.tensor on a list of ndarrays is very slow.
+            features = np.asarray(x_t_train[index:index + batch_size],
+                                  dtype=np.float32)
+            out = model_to_train(torch.tensor(features))
+            loss = cri(out, targets[index:index + batch_size])
+
+            opt.zero_grad()
+            loss.backward()
+            opt.step()
+    return model_to_train
+
+
+# Load the raw tables: input files provide (content, id), the expected file
+# provides the label column.
+t_set_features = load_set(TRAIN_IN_PATH, col_names[:2])
+t_set_labels = load_set(TRAIN_EXP_PATH, col_names[2:])
+dev_set = load_set(DEV_PATH, col_names[:2])
+test_set = load_set(TEST_PATH, col_names[:2])
+
+# Lower-case the text through the shared helper rather than repeating
+# `.str.lower()` for every split.
+x_train = to_lower(t_set_features, CONTENT)
+y_train = t_set_labels[LABEL]
+x_dev = to_lower(dev_set, CONTENT)
+x_test = to_lower(test_set, CONTENT)
+
+x_train = tokenize(x_train)
+x_dev = tokenize(x_dev)
+x_test = tokenize(x_test)
+
+# Average the word vectors of each document.
+x_train = word_2_vec(x_train, word2vec)
+x_dev = word_2_vec(x_dev, word2vec)
+x_test = word_2_vec(x_test, word2vec)
+
+model = NeuralNetwork(INPUT_SIZE, HIDDEN_SIZE, NUM_CLASSES)
+trained_model = train_model(model, y_train, x_train)
+
+# predict() already switches the model to eval mode, so no separate eval()
+# call is needed; BATCH_SIZE replaces the former literal 10.
+dev_prediction = predict(trained_model, BATCH_SIZE, x_dev)
+test_prediction = predict(trained_model, BATCH_SIZE, x_test)
+
+# Booleans become 0/1 integers, written one value per line.
+dev_prediction = np.asarray(dev_prediction, dtype=np.int32)
+test_prediction = np.asarray(test_prediction, dtype=np.int32)
+
+dev_prediction.tofile(DEV_OUT_PATH, sep='\n')
+test_prediction.tofile(TEST_OUT_PATH, sep='\n')
--- a/test-A/out.tsv
+++ b/test-A/out.tsv