paranormal-or-skeptic-ISI-p.../.ipynb_checkpoints/run-checkpoint.ipynb
2022-06-14 23:36:56 +02:00

8.0 KiB

import numpy as np
import gensim
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
def predict_year(x, path_out, model):
    """Write the predictions of *model* for *x* to a text file.

    Calls ``model.predict(x)`` and writes ``str()`` of every returned item
    to *path_out*, one item per line. Returns ``None``.
    """
    predictions = model.predict(x)
    with open(path_out, 'wt') as sink:
        sink.writelines(str(value) + '\n' for value in predictions)
def read_file(filename):
    """Return the first tab-separated field of every line in *filename*.

    The file is read as UTF-8. Each returned field has its surrounding
    whitespace stripped; a blank line yields an empty string.
    """
    with open(filename, 'r', encoding="utf-8") as handle:
        return [row.partition("\t")[0].strip() for row in handle]
# --- Load the first 10,000 training examples and their labels ---------------
with open('train/in.tsv', 'r', encoding='utf8') as file:
    train = pd.read_csv(file, sep='\t', header=None)
with open('train/expected.tsv', 'r', encoding='utf8') as file:
    train_y = pd.read_csv(file, sep='\t', header=None)

# Keep only the first 10,000 rows, then take column 0 of each frame
# (column 0 of in.tsv is the text, column 0 of expected.tsv is the label).
train_y = train_y.iloc[:10000][0]
train = train.iloc[:10000]
train_x = train[0]

# Tokenise every document into a list of lower-cased tokens.
train_x = [gensim.utils.simple_preprocess(x) for x in train_x]

from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# --- Train word embeddings on the training documents ------------------------
model = Word2Vec(sentences=train_x, vector_size=100, window=5, min_count=1, workers=4)
words = set(model.wv.index_to_key)


def document_vector(tokens, keyed_vectors):
    """Return one fixed-size vector for a tokenised document.

    The vector is the mean of the embeddings of all tokens present in
    *keyed_vectors*. A document without any known token (for example an
    empty text) is mapped to a zero vector so every document has the same
    shape.
    """
    known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not known:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(known, axis=0)


# One row per document, shape (n_documents, vector_size). Stacking the raw
# per-token matrices instead produces a ragged object array (documents differ
# in length) that cannot be cast to float32 or fed to a Linear layer.
train_x_vec = np.array([document_vector(tokens, model.wv) for tokens in train_x])
# [notebook stderr captured when the cell above was last executed]
# C:\Users\korne\AppData\Local\Temp\ipykernel_3520\3800840358.py:2: VisibleDeprecationWarning:
# Creating an ndarray from ragged nested sequences (which is a list-or-tuple of
# lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated.
# If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
#   train_x_vec = np.array([np.array([model.wv[i] for i in x if i in words]) for x in train_x])
# Number of input features expected by the network's first layer.
FEATURES = 100


class NeuralNetworkModel(torch.nn.Module):
    """Two-layer feed-forward network with a single sigmoid output.

    Architecture: Linear(FEATURES -> 500) -> ReLU -> Linear(500 -> 1) -> sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(in_features=FEATURES, out_features=500)
        self.fc2 = torch.nn.Linear(in_features=500, out_features=1)

    def forward(self, x):
        """Map a batch of FEATURES-sized rows to one value in (0, 1) per row."""
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))

# Objects used by the first training run: mini-batch size, the network,
# a binary cross-entropy loss and a plain SGD optimiser over its parameters.
BATCH_SIZE = 40
nn_model = NeuralNetworkModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)

def get_loss_acc(model, data_x, data_y, batch_size=None, loss_fn=None):
    """Evaluate *model* on a dataset and return ``(mean_loss, accuracy)``.

    Args:
        model: Callable torch module producing one probability per row;
            it is switched to evaluation mode via ``model.eval()``.
        data_x: Array-like of numeric features, one row per example.
        data_y: Array-like (NumPy array, pandas Series, list) of 0/1 labels.
        batch_size: Number of rows per forward pass. Defaults to the
            module-level ``BATCH_SIZE``.
        loss_fn: Loss taking ``(predictions, targets)``. Defaults to the
            module-level ``criterion``.

    Returns:
        A tuple ``(loss, accuracy)`` where the loss is the per-example mean
        over the whole dataset and accuracy is the fraction of examples whose
        prediction, thresholded at 0.5, equals the label.

    Raises:
        ZeroDivisionError: If *data_y* is empty.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if loss_fn is None:
        loss_fn = criterion

    # Convert once up front. np.asarray accepts pandas objects regardless of
    # their index labels, so sliced Series work the same as plain arrays.
    features = np.asarray(data_x, dtype=np.float32)
    targets = np.asarray(data_y, dtype=np.float32).reshape(-1, 1)

    loss_score = 0.0
    acc_score = 0
    items_total = 0
    model.eval()
    # Evaluation only: no gradients are needed, so skip building the graph.
    with torch.no_grad():
        for start in range(0, targets.shape[0], batch_size):
            X = torch.tensor(features[start:start + batch_size])
            Y = torch.tensor(targets[start:start + batch_size])
            Y_predictions = model(X)

            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]

            # The loss is a batch mean; weight it by the batch size so the
            # final value is a true per-example mean even if the last batch
            # is smaller.
            loss_score += loss_fn(Y_predictions, Y).item() * Y.shape[0]

    return (loss_score / items_total), (acc_score / items_total)


# Train nn_model with mini-batch SGD for 5 epochs and report the loss and
# accuracy on the training data after every epoch.
# Labels are converted to a float32 NumPy array once, so batch slicing is
# purely positional and does not depend on the pandas index.
train_targets = np.asarray(train_y, dtype=np.float32)

for epoch in range(5):
    nn_model.train()
    # Iterate over every example, including a final batch that may be
    # smaller than BATCH_SIZE.
    for i in range(0, train_targets.shape[0], BATCH_SIZE):
        # assumes train_x_vec is a 2-D numeric array with FEATURES columns
        X = torch.tensor(np.asarray(train_x_vec[i:i+BATCH_SIZE], dtype=np.float32))
        Y = torch.tensor(train_targets[i:i+BATCH_SIZE]).reshape(-1,1)

        optimizer.zero_grad()
        Y_predictions = nn_model(X)
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()

    # `display` is provided by the IPython/Jupyter runtime.
    display(epoch)
    # Evaluate the network being trained (nn_model) on the training features.
    display(get_loss_acc(nn_model, train_x_vec, train_targets))
# Train the model.
model = NeuralNetworkModel()
BATCH_SIZE = 5
# Number of passes over the training data; kept separate from BATCH_SIZE so
# changing the batch size does not silently change the training length.
EPOCHS = 5
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# NOTE(review): x_train and y_train are not defined in the visible part of
# this file; presumably they come from another notebook cell. Confirm before
# running. The label conversion below expects y_train to be a pandas Series
# (it calls .to_numpy()).
for epoch in range(EPOCHS):
    model.train()
    for i in range(0, y_train.shape[0], BATCH_SIZE):
        X = torch.tensor(x_train[i:i + BATCH_SIZE]).float()
        y = y_train[i:i + BATCH_SIZE]
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()


# Predict the results.
def _predict_labels(net, features, batch_size):
    """Return thresholded predictions of *net* for every row of *features*.

    Rows are processed in chunks of *batch_size* without tracking gradients.
    Each prediction is ``output > 0.5``; the result is the concatenation of
    ``tensor.tolist()`` for every batch, so its nesting follows the shape of
    the network output.
    """
    labels = []
    with torch.no_grad():
        for start in range(0, len(features), batch_size):
            batch = torch.tensor(features[start:start + batch_size]).float()
            labels += (net(batch) > 0.5).tolist()
    return labels


model.eval()

# NOTE(review): x_dev and x_test are not defined in the visible part of this
# file; presumably they are feature matrices built in another cell.
# Both splits go through the same helper so the test predictions are computed
# from the test batches with the same 0.5 threshold as the dev predictions.
y_dev = _predict_labels(model, x_dev, BATCH_SIZE)
y_test = _predict_labels(model, x_test, BATCH_SIZE)