paranormal-or-skeptic-ISI-p.../run.ipynb

21 KiB
Raw Permalink Blame History

import numpy as np
import pandas as pd
import torch
import csv
import lzma
import gensim.downloader
from nltk import word_tokenize
#def read_file(filename):
#    result = []
#    with open(filename, 'r', encoding="utf-8") as file:
#        for line in file:
#            text = line.split("\t")[0].strip()
#            result.append(text)
#    return result
# Training inputs: header-less, tab-separated. QUOTE_NONE makes the parser
# treat quote characters in the text as ordinary characters.
# Only the first 200000 rows are kept.
x_train = pd.read_csv(
    'train/in.tsv',
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
).iloc[:200000]
x_train
0 1
0 have you had an medical issues recently? 1335187994
1 It's supposedly aluminum, barium, and strontiu... 1346187161
2 Nobel prizes don't make you rich. 1337160218
3 I came for the article, I stayed for the doctor. 1277674344
4 you resorted to insults AND got owned directly... 1348538535
... ... ...
199995 It's really sad. My sister used to believe tha... 1334111989
199996 I don't mean it in a dickish way, I'm being se... 1322700456
199997 Fair enough, I stand corrected. 1354646212
199998 Right. Scientists tend to think and conclude l... 1348777201
199999 Because they are illiterate 1249579722

200000 rows × 2 columns

# Training labels: a single header-less, tab-separated column.
with open('train/expected.tsv', 'r', encoding='utf8') as labels_file:
    y_train = pd.read_table(labels_file, header=None)
# Keep the same first 200000 rows as the training inputs.
y_train = y_train.head(200000)
y_train
0
0 1
1 0
2 0
3 0
4 0
... ...
199995 0
199996 0
199997 1
199998 1
199999 0

200000 rows × 1 columns

# Dev-set inputs: header-less, tab-separated.
# quoting=csv.QUOTE_NONE matches how train/in.tsv is parsed (quoting=3), so a
# text field that starts with a double quote is read literally instead of
# being treated as a quoted CSV field, which could swallow tabs or newlines
# and shift rows relative to the expected labels.
with open('dev-0/in.tsv', 'r', encoding='utf8') as file:
    x_dev = pd.read_csv(file, sep='\t', header=None, quoting=csv.QUOTE_NONE)
x_dev
0 1
0 In which case, tell them I'm in work, or dead,... 1328302967
1 Put me down as another for Mysterious Universe... 1347836881
2 The military of any country would never admit ... 1331905826
3 An example would have been more productive tha... 1315584834
4 sorry, but the authors of this article admit t... 1347389166
... ... ...
5267 Your fault for going at all. That's how we get... 1308176634
5268 EVP....that's a shot in the GH drinking game. 1354408646
5269 i think a good hard massage is good for you. t... 1305726318
5270 Interesting theory. Makes my imagination run w... 1339839088
5271 Tampering of candy? More like cooking somethin... 1320262659

5272 rows × 2 columns

# Test-set inputs: header-less, tab-separated.
# quoting=csv.QUOTE_NONE matches how train/in.tsv is parsed (quoting=3), so a
# text field that starts with a double quote is read literally instead of
# being treated as a quoted CSV field, which could swallow tabs or newlines
# and change the number of rows written to the output file.
with open('test-A/in.tsv', 'r', encoding='utf8') as file:
    x_test = pd.read_csv(file, sep='\t', header=None, quoting=csv.QUOTE_NONE)
x_test
0 1
0 Gentleman, I believe we can agree that this is... 1304170330
1 The problem is that it will just turn it r/nos... 1353763204
2 Well, according to some Christian apologists, ... 1336314173
3 Don't know if this is what you are looking for... 1348860314
4 I respect what you're saying completely. I jus... 1341285952
... ... ...
5147 GAMBIT 1326441107
5148 >Joe Rogan is no snake oil salesman.\n\nHe ... 1319464245
5149 Reading further, Sagan does seem to agree with... 1322126150
5150 Notice that they never invoke god, or any othe... 1307679295
5151 They might co-ordinate an anniversary attack o... 1342409261

5152 rows × 2 columns

class NeuralNetworkModel(torch.nn.Module):
    """Feed-forward binary classifier with one hidden ReLU layer.

    The network maps a feature vector to a single sigmoid-activated value,
    so its output is suitable for ``torch.nn.BCELoss``.

    Args:
        input_size: Number of input features per example (default 300).
        hidden_size: Width of the hidden layer (default 300).
    """

    def __init__(self, input_size=300, hidden_size=300):
        super().__init__()
        self.l01 = torch.nn.Linear(input_size, hidden_size)
        self.l02 = torch.nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return sigmoid scores for a batch.

        Args:
            x: Float tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1, holding values produced by ``torch.sigmoid``.
        """
        hidden = torch.relu(self.l01(x))
        return torch.sigmoid(self.l02(hidden))
def _lowercased_text(frame):
    """Return column 0 of *frame* lower-cased, with missing values as ''.

    pandas parses cells such as ``NA`` or ``null`` (and empty cells) as NaN
    by default; ``word_tokenize`` cannot handle NaN, so such rows are turned
    into empty strings, which tokenize to an empty token list.
    """
    return frame[0].fillna('').str.lower()


x_train = _lowercased_text(x_train)
# Labels live in the only column of the expected-values frame.
y_train = y_train[0]
x_dev = _lowercased_text(x_dev)
x_test = _lowercased_text(x_test)

# Replace each text by its list of NLTK tokens.
x_train = [word_tokenize(x) for x in x_train]
x_dev = [word_tokenize(x) for x in x_dev]
x_test = [word_tokenize(x) for x in x_test]
#x_test
len(x_test)
5152
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

word2vec = gensim.downloader.load('word2vec-google-news-300')


def _document_vectors(documents):
    """Average the word2vec embeddings of each tokenized document.

    Args:
        documents: Sequence of token lists.

    Returns:
        A float32 array of shape ``(len(documents), word2vec.vector_size)``.
        Row ``i`` is the mean embedding of the tokens of ``documents[i]``
        that are present in ``word2vec``; documents with no known token get
        an all-zero row.
    """
    vectors = np.zeros((len(documents), word2vec.vector_size), dtype=np.float32)
    for row, tokens in enumerate(documents):
        known = [word2vec[token] for token in tokens if token in word2vec]
        if known:
            vectors[row] = np.mean(known, axis=0)
    return vectors


# One contiguous matrix per split: slicing it yields batches that convert to
# tensors directly, instead of a Python list of separate arrays.
x_train = _document_vectors(x_train)
x_dev = _document_vectors(x_dev)
x_test = _document_vectors(x_test)
len(x_test)
5152
model = NeuralNetworkModel()
BATCH_SIZE = 5
# Number of passes over the training data. Previously the epoch loop reused
# BATCH_SIZE as its bound, so changing the batch size silently changed the
# number of epochs as well. The value is kept at 5 to preserve behaviour.
EPOCHS = 5
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(EPOCHS):
    model.train()
    for i in range(0, y_train.shape[0], BATCH_SIZE):
        # Build one contiguous float32 array first; creating a tensor from a
        # list of separate numpy arrays is much slower.
        X = torch.tensor(np.asarray(x_train[i:i + BATCH_SIZE], dtype=np.float32))
        y = y_train[i:i + BATCH_SIZE]
        # BCELoss needs float targets shaped like the model output (batch, 1).
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
def _predict(features):
    """Run the trained model over *features* in batches of BATCH_SIZE.

    Args:
        features: Sequence of equal-length feature vectors.

    Returns:
        A list with one single-element list ``[bool]`` per input row, True
        when the model's sigmoid output is at least 0.5.
    """
    predictions = []
    for start in range(0, len(features), BATCH_SIZE):
        batch = np.asarray(features[start:start + BATCH_SIZE], dtype=np.float32)
        outputs = model(torch.tensor(batch))
        predictions += (outputs >= 0.5).tolist()
    return predictions


model.eval()

# The dev and test splits now share one decision rule (>= 0.5). Previously
# dev used a strict > 0.5 while test used >= 0.5, so the dev predictions did
# not exactly mirror what is submitted for the test set.
with torch.no_grad():
    y_dev = _predict(x_dev)
    y_test = _predict(x_test)
len(y_test)
5152
# Turn the boolean predictions of both splits into int32 arrays (0/1).
y_dev, y_test = (
    np.asarray(labels, dtype=np.int32) for labels in (y_dev, y_test)
)
len(y_test)
2062
# Write each split's predictions as text, one value per line.
for predictions, out_path in (
    (y_dev, './dev-0/out.tsv'),
    (y_test, './test-A/out.tsv'),
):
    predictions.tofile(out_path, sep='\n')
!jupyter nbconvert --to script run.ipynb