paranormal-or-skeptic-ISI-public/run.ipynb at master

import numpy as np
import pandas as pd
import torch
import csv
import lzma
import gensim.downloader
from nltk import word_tokenize

#def read_file(filename):
#    result = []
#    with open(filename, 'r', encoding="utf-8") as file:
#        for line in file:
#            text = line.split("\t")[0].strip()
#            result.append(text)
#    return result

# Load the training inputs: tab-separated, no header row; column 0 is the text
# used later on.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside the text are kept
# literally instead of being interpreted as CSV quoting.
# nrows stops parsing after the first 200000 rows rather than reading the
# whole file and slicing afterwards; the resulting rows are the same.
x_train = pd.read_table('train/in.tsv', sep='\t', header=None, quoting=3, nrows=200000)
x_train

	0	1
0	have you had an medical issues recently?	1335187994
1	It's supposedly aluminum, barium, and strontiu...	1346187161
2	Nobel prizes don't make you rich.	1337160218
3	I came for the article, I stayed for the doctor.	1277674344
4	you resorted to insults AND got owned directly...	1348538535
...	...	...
199995	It's really sad. My sister used to believe tha...	1334111989
199996	I don't mean it in a dickish way, I'm being se...	1322700456
199997	Fair enough, I stand corrected.	1354646212
199998	Right. Scientists tend to think and conclude l...	1348777201
199999	Because they are illiterate	1249579722

200000 rows × 2 columns

# Load the training labels (one label per line, no header row).
# Only the first 200000 rows are parsed so the labels cover the same rows as
# the first 200000 training inputs. pandas opens and closes the file itself,
# so no explicit open() is needed.
y_train = pd.read_csv('train/expected.tsv', sep='\t', header=None, encoding='utf8', nrows=200000)
y_train

	0
0	1
1	0
2	0
3	0
4	0
...	...
199995	0
199996	0
199997	1
199998	1
199999	0

200000 rows × 1 columns

# Load the dev inputs with the same parsing rules as the training inputs.
# quoting=3 is csv.QUOTE_NONE: without it a '"' inside a comment is treated as
# CSV quoting, which can strip characters or merge several lines into one row,
# so the number of rows would no longer match the number of input lines.
x_dev = pd.read_csv('dev-0/in.tsv', sep='\t', header=None, quoting=3, encoding='utf8')
x_dev

	0	1
0	In which case, tell them I'm in work, or dead,...	1328302967
1	Put me down as another for Mysterious Universe...	1347836881
2	The military of any country would never admit ...	1331905826
3	An example would have been more productive tha...	1315584834
4	sorry, but the authors of this article admit t...	1347389166
...	...	...
5267	Your fault for going at all. That's how we get...	1308176634
5268	EVP....that's a shot in the GH drinking game.	1354408646
5269	i think a good hard massage is good for you. t...	1305726318
5270	Interesting theory. Makes my imagination run w...	1339839088
5271	Tampering of candy? More like cooking somethin...	1320262659

5272 rows × 2 columns

# Load the test inputs with the same parsing rules as the training inputs.
# quoting=3 is csv.QUOTE_NONE: without it a '"' inside a comment is treated as
# CSV quoting, which can strip characters or merge several lines into one row,
# so the number of rows would no longer match the number of input lines.
x_test = pd.read_csv('test-A/in.tsv', sep='\t', header=None, quoting=3, encoding='utf8')
x_test

	0	1
0	Gentleman, I believe we can agree that this is...	1304170330
1	The problem is that it will just turn it r/nos...	1353763204
2	Well, according to some Christian apologists, ...	1336314173
3	Don't know if this is what you are looking for...	1348860314
4	I respect what you're saying completely. I jus...	1341285952
...	...	...
5147	GAMBIT	1326441107
5148	>Joe Rogan is no snake oil salesman.\n\nHe ...	1319464245
5149	Reading further, Sagan does seem to agree with...	1322126150
5150	Notice that they never invoke god, or any othe...	1307679295
5151	They might co-ordinate an anniversary attack o...	1342409261

5152 rows × 2 columns

class NeuralNetworkModel(torch.nn.Module):
    """Two-layer feed-forward binary classifier over 300-dimensional inputs.

    A 300 -> 300 linear layer followed by ReLU feeds a 300 -> 1 linear layer
    whose output is squashed by a sigmoid, giving one score per input row.
    """

    def __init__(self):
        super().__init__()
        self.l01 = torch.nn.Linear(in_features=300, out_features=300)
        self.l02 = torch.nn.Linear(in_features=300, out_features=1)

    def forward(self, x):
        """Return ``sigmoid(l02(relu(l01(x))))`` for the input batch ``x``."""
        hidden = torch.relu(self.l01(x))
        return torch.sigmoid(self.l02(hidden))

# Keep only the text column (column 0) and lower-case it.
# pandas parses empty fields and tokens such as "NA" or "null" as NaN; those
# would survive .str.lower() as NaN and make word_tokenize raise, so they are
# replaced with empty strings first (an empty string tokenizes to []).
x_train = x_train[0].fillna('').str.lower()
y_train = y_train[0]
x_dev = x_dev[0].fillna('').str.lower()
x_test = x_test[0].fillna('').str.lower()

# Split every document into a list of tokens.
x_train = [word_tokenize(text) for text in x_train]
x_dev = [word_tokenize(text) for text in x_dev]
x_test = [word_tokenize(text) for text in x_test]

len(x_test)

from gensim.test.utils import common_texts
from gensim.models import Word2Vec

word2vec = gensim.downloader.load('word2vec-google-news-300')


def _embed_documents(documents, vectors):
    """Return a (len(documents), vector_size) float32 array of mean word vectors.

    Each document is a list of tokens. Tokens that are not in ``vectors`` are
    skipped; a document without any known token maps to the all-zero vector.
    """
    # Pre-filled with zeros so documents without known tokens need no special
    # handling, and every row shares one dtype.
    embedded = np.zeros((len(documents), vectors.vector_size), dtype=np.float32)
    for row, tokens in enumerate(documents):
        known = [vectors[token] for token in tokens if token in vectors]
        if known:
            embedded[row] = np.mean(known, axis=0)
    return embedded


# One 2-D array per split: it still supports len() and row slicing like the
# former list of vectors, but converts to a tensor without per-row overhead.
x_train = _embed_documents(x_train, word2vec)
x_dev = _embed_documents(x_dev, word2vec)
x_test = _embed_documents(x_test, word2vec)
len(x_test)

model = NeuralNetworkModel()
BATCH_SIZE = 5
# Number of passes over the training data. The loop used to iterate over
# range(BATCH_SIZE), which coupled the epoch count to the batch size.
EPOCHS = 5
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Convert the training data to float32 tensors once, instead of rebuilding a
# tensor from Python/NumPy objects for every mini-batch.
train_features = torch.as_tensor(np.asarray(x_train, dtype=np.float32))
# BCELoss expects targets shaped like the model output: (batch, 1) floats.
train_targets = torch.as_tensor(np.asarray(y_train, dtype=np.float32)).reshape(-1, 1)

model.train()
for epoch in range(EPOCHS):
    for start in range(0, train_targets.shape[0], BATCH_SIZE):
        batch_features = train_features[start:start + BATCH_SIZE]
        batch_targets = train_targets[start:start + BATCH_SIZE]
        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

def _predict_labels(network, features, batch_size, threshold=0.5):
    """Return a flat list of bools, True where the network output >= threshold.

    ``features`` is a sequence of equally sized vectors (a list of arrays or a
    2-D array); it is processed in chunks of ``batch_size`` rows with gradient
    tracking disabled.
    """
    labels = []
    with torch.no_grad():
        for start in range(0, len(features), batch_size):
            chunk = np.asarray(features[start:start + batch_size], dtype=np.float32)
            outputs = network(torch.as_tensor(chunk))
            # flatten() turns the (batch, 1) output into one bool per row.
            labels.extend((outputs >= threshold).flatten().tolist())
    return labels


model.eval()

# Both splits go through the same helper so they share one decision rule; the
# dev split previously used "> 0.5" while the test split used ">= 0.5".
y_dev = _predict_labels(model, x_dev, BATCH_SIZE)
y_test = _predict_labels(model, x_test, BATCH_SIZE)
len(y_test)

# Turn the boolean predictions into 0/1 integers.
y_dev = np.array(y_dev).astype(np.int32)
y_test = np.array(y_test).astype(np.int32)
len(y_test)

# Write each prediction array as text, with items separated by newlines.
for predictions, out_path in ((y_dev, './dev-0/out.tsv'), (y_test, './test-A/out.tsv')):
    predictions.tofile(out_path, sep='\n')

# IPython shell escape (not plain Python): exports run.ipynb as a script.
!jupyter nbconvert --to script run.ipynb

21 KiB Raw Permalink Blame History Unescape Escape

21 KiB

Raw Permalink Blame History