# paranormal-or-skeptic/run.py
# %%
import re

import numpy as np
import torch
from gensim import downloader
# %%
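# Hyperparameters. FEATURES must match the dimensionality of the word
# embeddings loaded below (glove-twitter-200 gives 200-dimensional vectors).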
BATCH_SIZE = 64
EPOCHS = 100
FEATURES = 200
# %%
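# Load the raw TSV data: post text (in.tsv) and binary labels (expected.tsv).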
with open('train/in.tsv', 'r', encoding='utf8') as f:
    X_train = f.readlines()
with open('train/expected.tsv', 'r', encoding='utf8') as f:
    y_train = f.readlines()
with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
    X_dev = f.readlines()
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
    y_dev = f.readlines()
with open('test-A/in.tsv', 'r', encoding='utf8') as f:
    X_test = f.readlines()
# %%
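# Strip the trailing tab-separated numeric column and newline from the input
# lines, and strip newlines from the labels.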
for i, line in enumerate(X_train):
    X_train[i] = re.sub(r'\t[0-9]+\n', '', line)
for i, line in enumerate(X_dev):
    X_dev[i] = re.sub(r'\t[0-9]+\n', '', line)
for i, line in enumerate(X_test):
    X_test[i] = re.sub(r'\t[0-9]+\n', '', line)
for i, line in enumerate(y_train):
    y_train[i] = re.sub(r'\n', '', line)
for i, line in enumerate(y_dev):
    y_dev[i] = re.sub(r'\n', '', line)
# %%
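# Alternative loader that returns numpy arrays; called for dev-0 here, but its
# output is not used later in the script.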
def readData(fileName):
    with open(f'{fileName}/in.tsv', 'r', encoding='utf8') as f:
        X = np.array([x.strip().lower() for x in f.readlines()])
    with open(f'{fileName}/expected.tsv', 'r', encoding='utf8') as f:
        y = np.array([int(x.strip()) for x in f.readlines()])
    return X, y

X_file, y_file = readData('dev-0')
# %%
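# Two-layer feed-forward classifier: FEATURES-dimensional document embedding
# -> 500 hidden units (ReLU) -> single sigmoid output for binary prediction.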
class NeuralNetworkModel(torch.nn.Module):
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(FEATURES, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x
# %%
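# Load pre-trained 200-dimensional GloVe Twitter embeddings via the gensim
# downloader (downloads the model on first use).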
word2vec = downloader.load("glove-twitter-200")
# %%
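# Embed each training document as the mean of its word vectors; documents with
# no in-vocabulary words fall back to a zero vector.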
X_train_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
                       or [np.zeros(FEATURES)], axis=0) for doc in X_train]
# %%
y_train = np.array(y_train)
# %%
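# Train the classifier with binary cross-entropy loss and the ASGD optimizer,
# reporting average loss and accuracy once per epoch.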
def train_model(X_train, y_train):
    model = NeuralNetworkModel()
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.ASGD(model.parameters(), lr=0.05)
    for epoch in range(EPOCHS):
        print(epoch)
        loss_score = 0
        acc_score = 0
        items_total = 0
        for i in range(0, y_train.shape[0], BATCH_SIZE):
            # Build one mini-batch of document embeddings and labels.
            x = X_train[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            y = y_train[i:i+BATCH_SIZE]
            y = torch.tensor(y.astype(np.float32)).reshape(-1, 1)
            y_pred = model(x)
            acc_score += torch.sum((y_pred > 0.5) == y).item()
            items_total += y.shape[0]
            optimizer.zero_grad()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            loss_score += loss.item() * y.shape[0]
        # Per-epoch average loss and accuracy.
        print((loss_score / items_total), (acc_score / items_total))
    return model
# %%
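# Run the trained model over a list of document embeddings and threshold the
# sigmoid outputs at 0.5 to obtain boolean predictions.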
def predict(model, x_test):
    y_dev = []
    with torch.no_grad():
        for i in range(0, len(x_test), BATCH_SIZE):
            x = x_test[i:i+BATCH_SIZE]
            x = torch.tensor(np.array(x).astype(np.float32))
            outputs = model(x)
            y = (outputs > 0.5)
            y_dev.extend(y)
    return y_dev
# %%
model = train_model(X_train_w2v, y_train)
# %%
X_dev_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
                     or [np.zeros(FEATURES)], axis=0) for doc in X_dev]
# %%
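# Predict labels for the dev set; note that this overwrites the gold dev
# labels loaded into y_dev above.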
y_dev = predict(model, X_dev_w2v)
y_dev = ['1' if bool(item) else '0' for item in y_dev]
# %%
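# Write dev-0 predictions, one label per line.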
with open('dev-0/out.tsv', 'wt') as f:
    for pred in y_dev:
        f.write(str(pred) + '\n')
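# %%
# Optional sanity check (a minimal sketch, not part of the original pipeline):
# since y_dev now holds predictions, re-read the gold dev labels and compute
# simple accuracy against them.
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
    gold_dev = [line.strip() for line in f]
accuracy = sum(pred == gold for pred, gold in zip(y_dev, gold_dev)) / len(gold_dev)
print(f'dev-0 accuracy: {accuracy:.4f}')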
# %%
X_test_w2v = [np.mean([word2vec[word.lower()] for word in doc.split() if word.lower() in word2vec]
                      or [np.zeros(FEATURES)], axis=0) for doc in X_test]
# %%
y_test = predict(model, X_test_w2v)
y_test = ['1' if bool(item) else '0' for item in y_test]
# %%
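# Write test-A predictions, one label per line.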
with open('test-A/out.tsv', 'wt') as f:
    for pred in y_test:
        f.write(str(pred) + '\n')