lab10/neural.ipynb

import gensim
import nltk
import pandas as pd 
import numpy as np 
import os
import io
import gzip
import torch

# Polish CBOW word embeddings (~900 MB), loaded below via gensim:
# wget http://publications.it.p.lodz.pl/2016/word_embeddings/pl-embeddings-cbow.txt

def read_data_gz(baseUrl):
    # Decompress the gzip archive and load it as a tab-separated table,
    # skipping malformed rows (e.g. lines with an extra field).
    with gzip.open(baseUrl, 'r') as f:
        data_unzip = f.read()
    data = pd.read_table(io.StringIO(data_unzip.decode('utf-8')),
                         error_bad_lines=False, header=None)
    return data
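
Equivalently, pandas can decompress the archive itself; a minimal alternative sketch (hypothetical name read_data_gz_alt, relying on compression being inferred from the .gz suffix):

def read_data_gz_alt(path):
    # pandas infers gzip compression from the '.gz' extension,
    # so no manual gzip.open/decode step is needed.
    return pd.read_table(path, error_bad_lines=False, header=None)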

def preprocess(data):
    # Tokenize each document, then keep only alphabetic tokens, lowercased.
    # (The original deleted tokens from the list while manually advancing the
    # index, which skips the token that slides into a deleted slot; filtering
    # into a new list avoids that bug.)
    data_tokenize = [nltk.word_tokenize(x) for x in data]
    return [[token.lower() for token in doc if token.isalpha()]
            for doc in data_tokenize]
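
For instance (illustrative input), tokenization plus the alphabetic filter drops punctuation and digits and lowercases the rest:

preprocess(['Ala ma kota, 123!'])   # [['ala', 'ma', 'kota']]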

class NeuralNetworkModel(torch.nn.Module):
    # Binary classifier over 100-dim averaged word embeddings:
    # 100 -> 200 hidden units (ReLU) -> 1 sigmoid output.

    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(100, 200)
        self.fc2 = torch.nn.Linear(200, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x
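
As a quick sanity check (illustrative, using a dummy batch of zero vectors), the network maps a batch of 100-dim vectors to one probability per row:

dummy = torch.zeros(4, 100)
print(NeuralNetworkModel()(dummy).shape)   # torch.Size([4, 1])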

data_train = read_data_gz('./train/train.tsv.gz')
# Malformed rows are skipped; pandas reports each one (see the warning output below).
data_dev = pd.read_table('./dev-0/in.tsv', error_bad_lines=False, header=None)


y_train = data_train[0].values
x_train = data_train[1].values
x_dev  =  data_dev[0].values


b'Skipping line 25706: expected 2 fields, saw 3\nSkipping line 58881: expected 2 fields, saw 3\nSkipping line 73761: expected 2 fields, saw 3\n'
model = gensim.models.KeyedVectors.load_word2vec_format('pl-embeddings-cbow.txt', binary=False)
x_train_tokenize = preprocess(x_train)
x_dev_tokenize = preprocess(x_dev)
# Represent each document as the mean of its word vectors (100-dim CBOW);
# a document with no in-vocabulary tokens falls back to a zero vector.
x_train_vectors = [np.mean([model[word] for word in content if word in model] or [np.zeros(100)], axis=0) for content in x_train_tokenize]
x_train_vectors = np.array(x_train_vectors, dtype=np.float32)

x_dev_vectors = [np.mean([model[word] for word in content if word in model] or [np.zeros(100)], axis=0) for content in x_dev_tokenize]
x_dev_vectors = np.array(x_dev_vectors, dtype=np.float32)
x_dev_tensor = torch.tensor(x_dev_vectors)
# -------------------------------------------------------------------------------------------------------------------------------------------
model_nn = NeuralNetworkModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model_nn.parameters(), lr=0.01)
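
A side note, not applied here: with BCELoss the sigmoid must stay inside forward(); the numerically more stable PyTorch idiom is BCEWithLogitsLoss on raw logits:

# criterion = torch.nn.BCEWithLogitsLoss()   # alternative: pairs with a model that returns raw logits (no sigmoid)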
 
batch_size = 10
print('Training the model...')
 
for epoch in range(6):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model_nn.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = x_train_vectors[i:i+batch_size]
        X = torch.tensor(X)   # already float32 after the conversion above
        Y = y_train[i:i+batch_size]
        Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)

        Y_predictions = model_nn(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]

        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()

        loss_score += loss.item() * Y.shape[0]

    # Report the running averages accumulated above, once per epoch
    # (the original collected these totals but never printed them).
    print(f'epoch {epoch}: loss {loss_score / items_total:.4f}, accuracy {acc_score / items_total:.4f}')
Training the model...
# -------------------------------------------------------------------------------------------------------------------------------------------
# Predict on the dev set in evaluation mode, without tracking gradients.
model_nn.eval()
with torch.no_grad():
    ypred = model_nn(x_dev_tensor)
ypred = ypred.cpu().numpy()
ypred = (ypred > 0.5)
ypred = np.asarray(ypred, dtype=np.int32)

y_expected = pd.read_table('./dev-0/expected.tsv', header=None)
y_expected = y_expected.values

from sklearn.metrics import accuracy_score, classification_report

print("Score = ", accuracy_score(y_expected, ypred))

print('-' * 60)
print(classification_report(y_expected, ypred))
Score =  0.973037417461482
------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1983
           1       0.97      0.98      0.98      3469

    accuracy                           0.97      5452
   macro avg       0.97      0.97      0.97      5452
weighted avg       0.97      0.97      0.97      5452
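
The directory layout (train/train.tsv.gz, dev-0/in.tsv, dev-0/expected.tsv) matches the Gonito challenge convention, under which predictions would be written back to dev-0/out.tsv; a minimal sketch assuming that convention:

# Hypothetical final step, assuming the out.tsv convention for this layout.
pd.DataFrame(ypred).to_csv('./dev-0/out.tsv', sep='\t', header=False, index=False)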