sport-text-classification-b.../regresja-logistyczna.ipynb
patrycjalazna 54b82a7411 done
2021-05-26 01:35:32 +02:00

8.7 KiB

import numpy as np
import os
import gensim
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import torch
import csv
# Load the train/dev/test splits (tab-separated, no header row).
# Column layout (from usage below): train has label in col 0, text in col 1;
# dev/test inputs have only the text column.
# NOTE(review): `error_bad_lines=False` is deprecated in pandas >= 1.3
# (replaced by on_bad_lines='skip'); kept as-is because the captured cell
# output shows this ran under an older pandas that skips malformed rows.
train = pd.read_table('train/train.tsv', error_bad_lines=False, sep='\t', quoting=csv.QUOTE_NONE, header=None)
x_dev = pd.read_table('dev-0/in.tsv', error_bad_lines=False, sep='\t', header=None, quoting=csv.QUOTE_NONE)
y_dev = pd.read_table('dev-0/expected.tsv', error_bad_lines=False, sep='\t', header=None, quoting=csv.QUOTE_NONE)
x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False, sep='\t', header=None, quoting=csv.QUOTE_NONE)

# Split into features (text, column 1) and labels (column 0), and flatten
# the single-column dev/test frames into plain numpy arrays.
x_train = train[1].values
y_train = train[0].values
x_dev = x_dev[0].values
x_test = x_test[0].values

# Sanity check: feature and label counts must line up per split.
for _part in (x_train, y_train, x_dev, y_dev):
    print(len(_part))
b'Skipping line 25706: expected 2 fields, saw 3\nSkipping line 58881: expected 2 fields, saw 3\nSkipping line 73761: expected 2 fields, saw 3\n'
98129
98129
5452
5452
b'Skipping line 1983: expected 1 fields, saw 2\nSkipping line 5199: expected 1 fields, saw 2\n'
import nltk
#nltk.download('punkt')


# tokenizacja 
def tokenize_data(data):
    data_tokenize = [nltk.word_tokenize(x) for x in data]
 
    for doc in data_tokenize:
        i = 0
        while i < len(doc):
            if doc[i].isalpha():
                doc[i] = doc[i].lower()
            else:
                del doc[i]
            i += 1
    return data_tokenize

# Tokenize every split once up front; downstream vectorization should use
# these *_tokenized lists rather than the raw strings.
x_train_tokenized = tokenize_data(x_train)
x_dev_tokenized = tokenize_data(x_dev)
x_test_tokenized = tokenize_data(x_test)
from gensim.models import KeyedVectors

# Load the pre-trained word2vec embeddings from disk.
word2vec_model = KeyedVectors.load("word2vec.bin")

# Smoke test: nearest neighbours of a known word should look sensible.
neighbours = word2vec_model.similar_by_word("kwiat")
print(neighbours)
[('róż', 0.8955456018447876), ('kwiatek', 0.8504886031150818), ('fiołek', 0.831953763961792), ('chryzantema', 0.8315931558609009), ('bukiet', 0.8306410908699036), ('wiśnia', 0.8005671501159668), ('żonkil', 0.8005172610282898), ('liść', 0.7998315095901489), ('lilia', 0.7931062579154968), ('peonia', 0.7918344140052795)]
/usr/local/Cellar/jupyterlab/3.0.14/libexec/lib/python3.9/site-packages/gensim/models/keyedvectors.py:772: RuntimeWarning: invalid value encountered in true_divide
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
def _doc_vector(tokens, embeddings, dim=100):
    """Average the embedding vectors of the tokens known to *embeddings*.

    Falls back to a zero vector of length *dim* when no token is known,
    so every document maps to a fixed-size dense vector.
    """
    known = [embeddings[tok] for tok in tokens if tok in embeddings]
    return np.mean(known, axis=0) if known else np.zeros(dim)


# Vectorize the *tokenized* documents.
# BUG FIX: the original iterated the raw strings (x_train / x_dev / x_test),
# which walks each document character by character — the *_tokenized lists
# built above are what was intended.
x_train = [_doc_vector(doc, word2vec_model) for doc in x_train_tokenized]
x_train_vec = np.array(x_train, dtype=np.float32)
x_train_tensor = torch.tensor(x_train_vec)

x_dev = [_doc_vector(doc, word2vec_model) for doc in x_dev_tokenized]
x_dev_vec = np.array(x_dev, dtype=np.float32)

x_test = [_doc_vector(doc, word2vec_model) for doc in x_test_tokenized]
x_test_vec = np.array(x_test, dtype=np.float32)
class NeuralNetworkModel(torch.nn.Module):
    """Two-layer feed-forward binary classifier: 100 -> 200 -> 1.

    The sigmoid output is a probability in [0, 1], intended to be paired
    with BCELoss during training.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(100, 200)
        self.fc2 = torch.nn.Linear(200, 1)

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))
        
nn_model = NeuralNetworkModel()
criterion = torch.nn.BCELoss()
# BUG FIX: the original passed `model.parameters()`, but no `model` variable
# is defined in this notebook — the freshly constructed `nn_model` is what
# must be optimized (otherwise this cell raises NameError, or silently
# trains a stale model left over from an earlier session).
optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.01)
batch_size = 12

for epoch in range(6):
    loss_score = 0.0
    acc_score = 0
    items_total = 0
    nn_model.train()

    # Mini-batch SGD over the training set.
    for i in range(0, y_train.shape[0], batch_size):
        X = torch.tensor(x_train_vec[i:i + batch_size].astype(np.float32))
        Y = torch.tensor(y_train[i:i + batch_size].astype(np.float32)).reshape(-1, 1)

        Y_predictions = nn_model(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]

        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()

        loss_score += loss.item() * Y.shape[0]

    # Report progress — the original accumulated loss/accuracy but only
    # displayed the bare epoch number.
    display(epoch)
    print(f'loss: {loss_score / items_total:.4f}, acc: {acc_score / items_total:.4f}')
0
1
2
3
4
5
# Predictions
def _write_predictions(vectors, path):
    """Run the trained model on *vectors*, threshold probabilities at 0.5,
    and write the resulting 0/1 labels to *path*, one per line."""
    with torch.no_grad():
        probs = nn_model(torch.tensor(vectors.astype(np.float32)))
    labels = np.asarray(probs.cpu().numpy() > 0.5, dtype=np.int32)
    labels.tofile(path, sep='\n')


_write_predictions(x_dev_vec, 'dev-0/out2.tsv')
# BUG FIX: the original passed x_dev_vec here too, so test-A/out2.tsv
# contained the dev-split predictions instead of the test-split ones.
_write_predictions(x_test_vec, 'test-A/out2.tsv')