Uczenie_maszynowe_zadanie_10/logistyczna.ipynb
2021-05-27 18:38:28 +02:00

5.9 KiB

from sklearn.naive_bayes import GaussianNB
import pandas as pd
import torch
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import KeyedVectors
import gensim.downloader
import nltk
import csv

import numpy as np

import numpy as np
def tokenize_data(data):
    """Tokenize documents and keep only lower-cased alphabetic tokens.

    Args:
        data: Iterable of strings, one document per item.

    Returns:
        A list with one list of tokens per input document. Every token for
        which ``str.isalpha()`` is false (punctuation, numbers, mixed
        alphanumerics) is dropped; the remaining tokens are lower-cased.
    """
    # Build each filtered list in a single pass. The previous version deleted
    # items from the list while advancing an index over it, which skipped the
    # token following every deletion: consecutive non-alphabetic tokens
    # survived and the token after a deleted one was never lower-cased.
    return [
        [token.lower() for token in nltk.word_tokenize(document) if token.isalpha()]
        for document in data
    ]

class NeuralNetwork(torch.nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear -> sigmoid.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output units; each one is passed through a
            sigmoid, so every output lies in (0, 1).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the sigmoid activations of the output layer for ``x``."""
        hidden = torch.relu(self.l1(x))
        return torch.sigmoid(self.l2(hidden))
# Input locations: the training file and the dev-0 input file.
r_in = './train/train.tsv'
r_ind_ev = './dev-0/in.tsv'

# Both files are parsed identically: tab-separated, no header row, quote
# characters treated as ordinary text, and rows with an unexpected number of
# fields skipped instead of raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0 in favour of `on_bad_lines='skip'` - confirm the pandas version in use.
tsv_options = dict(
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    error_bad_lines=False,
)
tsv_read = pd.read_table(r_in, **tsv_options)
tsv_read_dev = pd.read_table(r_ind_ev, **tsv_options)

# Training file: column 0 is used as the target, column 1 as the text.
# Dev file: column 0 is used as the text.
y_train, X_train = tsv_read[0].values, tsv_read[1].values
X_dev = tsv_read_dev[0].values

# Replace the raw strings with lists of lower-cased alphabetic tokens.
X_train = tokenize_data(X_train)
X_dev = tokenize_data(X_dev)
C:\Users\micha\anaconda3\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
b'Skipping line 25706: expected 2 fields, saw 3\nSkipping line 58881: expected 2 fields, saw 3\nSkipping line 73761: expected 2 fields, saw 3\n'

# Pre-trained word vectors; at this point `model` is the embedding lookup.
model = KeyedVectors.load("word2vec_100_3_polish.bin")


def _mean_embedding(tokens, vectors):
    """Average the vectors of the tokens present in ``vectors``.

    Falls back to a 100-dimensional zero vector when none of the tokens is
    in the vocabulary, so every document still gets a representation.
    """
    known = [vectors[token] for token in tokens if token in vectors]
    if not known:
        known = [np.zeros(100)]
    return np.mean(known, axis=0)


# One averaged vector per training document, plus float32 array/tensor copies.
X_train = [_mean_embedding(content, model) for content in X_train]
x_train_vectors = np.array(X_train, dtype=np.float32)
x_train_tensor = torch.tensor(x_train_vectors, dtype=torch.float32)

# Same representation for the dev documents.
X_dev = [_mean_embedding(content, model) for content in X_dev]
x_dev_vectors = np.array(X_dev, dtype=np.float32)
# NOTE: `model` is rebound here from the word-vector lookup to the classifier;
# the embeddings are no longer reachable under that name after this line.
model = NeuralNetwork(100, 200, 1)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

batch_size = 12

# Convert the targets to a float32 column tensor once, instead of re-converting
# a numpy slice for every batch of every epoch. The features are taken from the
# already-built `x_train_tensor` for the same reason.
y_train_tensor = torch.tensor(y_train.astype(np.float32)).reshape(-1, 1)

for epoch in range(6):
    # Running totals for this epoch, weighted by the number of items per batch
    # so that a shorter final batch is accounted for correctly.
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = x_train_tensor[i:i + batch_size]
        Y = y_train_tensor[i:i + batch_size]

        Y_predictions = model(X)
        # A prediction counts as correct when thresholding at 0.5 matches Y.
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]

        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; scale back to a per-item sum.
        loss_score += loss.item() * Y.shape[0]

    # Report the epoch averages; previously they were accumulated and dropped.
    if items_total:
        print(
            f'epoch {epoch}: '
            f'loss={loss_score / items_total:.4f} '
            f'accuracy={acc_score / items_total:.4f}'
        )
 
C:\Users\micha\anaconda3\lib\site-packages\torch\autograd\__init__.py:145: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 9010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at  ..\c10\cuda\CUDAFunctions.cpp:109.)
  Variable._execution_engine.run_backward(
# Inference only: switch the network to evaluation mode and disable autograd
# so no computation graph is built for the whole dev set.
model.eval()
with torch.no_grad():
    predictions = model(torch.tensor(x_dev_vectors.astype(np.float32)))
predictions = predictions.cpu().numpy()
# Threshold the sigmoid outputs at 0.5 and store the decisions as 0/1 integers.
predictions = (predictions > 0.5)
predictions = np.asarray(predictions, dtype=np.int32)
# Written as text, values separated by newlines.
predictions.tofile('dev-0/out.tsv', sep='\n')