paranormal-or-skeptic/run.ipynb

import re

import numpy as np
import torch
from gensim import downloader

# Hyperparameters: batch size, training epochs, and embedding dimensionality
# (glove-twitter-200 vectors are 200-dimensional).
BATCH_SIZE = 64
EPOCHS = 100
FEATURES = 200
# Read the raw splits: in.tsv holds the text (plus a trailing tab-separated
# numeric field), expected.tsv holds the 0/1 labels.
with open('train/in.tsv', 'r', encoding='utf8') as f:
    X_train = f.readlines()
with open('train/expected.tsv', 'r', encoding='utf8') as f:
    y_train = f.readlines()

with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
    X_dev = f.readlines()
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
    y_dev = f.readlines()

with open('test-A/in.tsv', 'r', encoding='utf8') as f:
    X_test = f.readlines()
# Strip the trailing numeric field (and newline) from each input line,
# and the newline from each label.
for i, line in enumerate(X_train):
    X_train[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(X_dev):
    X_dev[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(X_test):
    X_test[i] = re.sub(r'\t[0-9]+\n', '', line)

for i, line in enumerate(y_train):
    y_train[i] = re.sub(r'\n', '', line)

for i, line in enumerate(y_dev):
    y_dev[i] = re.sub(r'\n', '', line)
def readData(fileName):
    """Load one split: lowercased input lines and integer labels.

    Note: unlike the regex clean-up above, this keeps the trailing
    numeric field of in.tsv in place.
    """
    with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as f:
        X = np.array([x.strip().lower() for x in f.readlines()])
    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:
        y = np.array([int(x.strip()) for x in f.readlines()])
    return X, y

X_file, y_file = readData('dev-0')
class NeuralNetworkModel(torch.nn.Module):
    """Two-layer MLP: 200-d document embedding -> 500 hidden units -> 1 sigmoid output."""

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(FEATURES, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)  # probability of the positive class
        return x
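
A quick shape check (an illustrative snippet, not part of the original run) confirms that a batch of FEATURES-dimensional embeddings maps to one probability per row:

# Illustrative sanity check: 4 random embeddings -> 4 values in (0, 1)
probe = NeuralNetworkModel()(torch.randn(4, FEATURES))
print(probe.shape)  # torch.Size([4, 1])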
# Pretrained 200-d GloVe embeddings trained on Twitter, fetched via gensim.
word2vec = downloader.load("glove-twitter-200")
# Each document becomes the mean of its in-vocabulary word vectors;
# documents with no known words fall back to the zero vector.
X_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
               or [np.zeros(FEATURES)], axis=0) for doc in X_train]
y_train = np.array(y_train)
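
To make the pooling concrete, here is the same expression applied to a single made-up document (the example text is hypothetical):

# One document: mean of the GloVe vectors of its known words
doc_vec = np.mean([word2vec[w] for w in "i saw a ghost last night".split() if w in word2vec]
                  or [np.zeros(FEATURES)], axis=0)
print(doc_vec.shape)  # (200,)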
def train_model(X_train, y_train):
    model = NeuralNetworkModel()

    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)

    for epoch in range(EPOCHS):
        print(epoch)
        loss_score = 0
        acc_score = 0
        items_total = 0

        # Mini-batch pass over the training set (in file order, no shuffling)
        for i in range(0, y_train.shape[0], BATCH_SIZE):
            x = X_train[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            y = y_train[i:i+BATCH_SIZE]
            y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)
            y_pred = model(x)
            # Count batch items whose thresholded prediction matches the label
            acc_score += torch.sum((y_pred > 0.5) == y).item()
            items_total += y.shape[0]

            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            loss_score += loss.item() * y.shape[0]

        # Mean loss and accuracy over the epoch
        print((loss_score / items_total), (acc_score / items_total))

    return model
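
The loop above visits batches in file order every epoch. A common variant (a sketch only, not what this run used) reshuffles the examples once per epoch before batching:

# Illustrative per-epoch shuffle before batching
perm = np.random.permutation(len(y_train))
X_shuffled = [X_train_w2v[j] for j in perm]
y_shuffled = y_train[perm]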
def predict(model, x_test):
    # Threshold the sigmoid outputs at 0.5; returns a list of 1-element bool tensors
    predictions = []

    with torch.no_grad():
        for i in range(0, len(x_test), BATCH_SIZE):
            x = x_test[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            outputs = model(x)
            y = (outputs > 0.5)
            predictions.extend(y)

    return predictions
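
The cells below convert these bool tensors to '0'/'1' strings inline; a small helper (hypothetical, introduced here only for illustration) would do the same in one place:

def to_labels(preds):
    # Map each 1-element bool tensor from predict() to a '0'/'1' string
    return ['1' if bool(t) else '0' for t in preds]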
model = train_model(X_train_w2v, y_train)
0
0.5714333134919922 0.6966561801788113
1
0.5395073619374668 0.7242514132585581
2
0.5322582519146749 0.7296247310751125
3
0.5277940251241121 0.7327292379626976
4
0.5243827499623 0.7345525745996775
5
0.521483356086283 0.7361825270478868
6
0.5188610753636298 0.7376052821509848
7
0.5164497832484463 0.7390211306759122
8
0.5142272224311959 0.7402332351448137
9
0.5121725415654607 0.7413451942302446
10
0.510225843876934 0.742412260557568
11
0.5084293723366556 0.7430476657492429
12
0.5067300511753501 0.7440560261621181
13
0.5051866206455035 0.7450609332859082
14
0.503752063642534 0.7458586430645868
15
0.5024285955103476 0.7466943390232027
16
0.5011020173689057 0.7476439935216297
17
0.49986460605734995 0.7483691842295194
18
0.498722317965918 0.7489527900849163
19
0.4976401474074949 0.749584741987506
20
0.4966364578740479 0.7502788530936291
21
0.4956408892432799 0.7507208740965332
22
0.4946911594690806 0.7513459194209525
23
0.4938261365074296 0.7519433384326902
24
0.49291108882053136 0.7526996087423466
25
0.49207683927175633 0.752979325158247
26
0.4912937934254017 0.7534524257629179
27
0.49052768458365964 0.7539186197894184
28
0.48980189713607974 0.7542535888306818
29
0.48902049401931186 0.7547819420607157
30
0.48832297395034846 0.7553828143615386
31
0.48764632061179475 0.7556832505119501
32
0.4869866096390585 0.7563359221490509
33
0.48635514366306837 0.7567813964410403
34
0.48572428783405186 0.7574616943908226
35
0.4851059672855987 0.7577897568539155
36
0.4844747067054167 0.7581350857624344
37
0.4838937349887044 0.7585080409836349
38
0.48333403454228063 0.7584769613818682
39
0.4827657912931136 0.7590916468390319
40
0.48225590195293194 0.7592435915587802
41
0.48163791058193006 0.7597857579451549
42
0.4811314198011156 0.7601414467209293
43
0.4806143895582873 0.7607181459981559
44
0.4800953709221985 0.7609598762341192
45
0.47956847999038854 0.7612913919862974
46
0.4790844480555675 0.7616470807620719
47
0.47860829903493235 0.761795572192735
48
0.4781695258369003 0.762089101764976
49
0.4776893918277479 0.7624827767206876
50
0.47722041533606274 0.7628246523401213
51
0.4767699545351635 0.7631596213813847
52
0.47637271544187293 0.7633253792574738
53
0.47592309171862696 0.7635705627825222
54
0.47549356202221993 0.7638744522220189
55
0.47508612961542673 0.7642370475759638
56
0.47468646391106234 0.764351006115775
57
0.4742474519497854 0.7646790685788679
58
0.4737666401496256 0.7650623836673239
59
0.47335995538274667 0.7652972073251169
60
0.4729701449600526 0.7654422454666947
61
0.4725969795466422 0.7656252697882098
62
0.47221369839845356 0.7661121835492215
63
0.4718388513139844 0.7663021144489068
64
0.47147053143633466 0.7664575124577404
65
0.4711233925314738 0.7666543499355961
66
0.47074752713287643 0.7669340663514965
67
0.4703749315941604 0.7673242880181229
68
0.470022628463849 0.7672828485491006
69
0.4696828857076031 0.7677559491537715
70
0.4693190624670805 0.7678491879590716
71
0.4689852795644025 0.7683257418528278
72
0.46865665018555414 0.7687194168085393
73
0.468258934943202 0.7687297766757949
74
0.46797715189850664 0.7687608562775615
75
0.46764439033620286 0.7690716522952286
76
0.46732620352289256 0.769351368711129
77
0.4670077633846447 0.769700150908733
78
0.4667117469477995 0.7697692166904369
79
0.4664313273439932 0.7700420265281668
80
0.4661624620708029 0.7704426080620487
81
0.46585000600566223 0.7703148363658967
82
0.4655422194174101 0.7706739784307564
83
0.4652497145337105 0.7708708159086122
84
0.46495632112782237 0.7708673626195269
85
0.46467082155335016 0.7712023316607903
86
0.46439953297526376 0.7715269408347981
87
0.4640616501378699 0.7718032039616133
88
0.46377603995408073 0.7721139999792803
89
0.46352646427627725 0.7722072387845804
90
0.46323162764281506 0.7723971696842657
91
0.4629823635760337 0.7724765953332251
92
0.46268333841052883 0.7727770314836366
93
0.4624373474653466 0.7728978966016182
94
0.4621637105605031 0.7731396268375814
95
0.4618823675153035 0.7730463880322813
96
0.4615598618066211 0.7733571840499484
97
0.4613917053205442 0.7734089833862262
98
0.4610787309787952 0.7734642360115892
99
0.4608159763176817 0.7737197794038932
# Embed the dev set the same way as the training data, then write thresholded predictions
X_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
             or [np.zeros(FEATURES)], axis=0) for doc in X_dev]
y_dev = predict(model, X_dev_w2v)
y_dev = ['1' if bool(item) else '0' for item in y_dev]
with open('dev-0/out.tsv', 'wt') as f:
    for pred in y_dev:
        f.write(pred + '\n')
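
Since dev-0 ships with gold labels, a quick check (an illustrative snippet, not in the original notebook) can score the freshly written predictions:

# Illustrative: dev-0 accuracy of the predictions just written out
with open('dev-0/expected.tsv', encoding='utf8') as f:
    gold = [line.strip() for line in f]
print(sum(p == g for p, g in zip(y_dev, gold)) / len(gold))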
# Same pipeline for the unlabeled test split
X_test_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
              or [np.zeros(FEATURES)], axis=0) for doc in X_test]
y_test = predict(model, X_test_w2v)
y_test = ['1' if bool(item) else '0' for item in y_test]
with open('test-A/out.tsv', 'wt') as f:
    for pred in y_test:
        f.write(pred + '\n')
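
test-A has no expected.tsv, so the only check available locally (again illustrative) is that the output lines up one-to-one with the input:

with open('test-A/out.tsv', encoding='utf8') as f:
    assert len(f.readlines()) == len(X_test)  # one prediction per test row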