import gensim
import nltk
import pandas as pd
import numpy as np
import os
import io
import gzip
import torch
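# NOTE: nltk.word_tokenize needs the 'punkt' tokenizer data; run nltk.download('punkt') once if it is missing.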
def read_data_gz(baseUrl):
    # Read a gzip-compressed TSV file into a DataFrame, skipping malformed lines.
    f = gzip.open(baseUrl, 'r')
    data_unzip = f.read()
    data = pd.read_table(io.StringIO(data_unzip.decode('utf-8')), error_bad_lines=False, header=None)
    return data
def preprocess(data):
    # Tokenize each document, lowercase purely alphabetic tokens and drop the rest
    # (numbers, punctuation). The index is advanced only when a token is kept,
    # so tokens following a deletion are not skipped.
    data_tokenize = [nltk.word_tokenize(x) for x in data]
    for doc in data_tokenize:
        i = 0
        while i < len(doc):
            if doc[i].isalpha():
                doc[i] = doc[i].lower()
                i += 1
            else:
                del doc[i]
    return data_tokenize
class NeuralNetworkModel(torch.nn.Module):
    # Feed-forward binary classifier: 100-dim document vector -> 200 hidden units -> 1 sigmoid output.
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(100, 200)
        self.fc2 = torch.nn.Linear(200, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x
data_train = read_data_gz('train/train.tsv.gz')
data_dev = pd.read_table('dev-0/in.tsv', error_bad_lines=False, header=None)
data_test = pd.read_table('test-A/in.tsv', error_bad_lines=False, header=None)
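# Pretrained Polish CBOW word2vec embeddings; assumed to be 100-dimensional,
# matching the zero-vector fallback and the input size of fc1.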
model = gensim.models.KeyedVectors.load_word2vec_format('pl-embeddings-cbow.txt', binary=False)
y_train = data_train[0].values
x_train = data_train[1].values
x_dev = data_dev[0].values
x_test = data_test[0].values
x_train_tokenize = preprocess(x_train)
x_dev_tokenize = preprocess(x_dev)
x_test_tokenize = preprocess(x_test)
# -------------------------------------------------------------------------------------------------------------------------------------------
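# Represent each document as the mean of its word vectors; documents with no
# in-vocabulary words fall back to a zero vector.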
x_train_vectors = [np.mean([model[word] for word in content if word in model] or [np.zeros(100)], axis=0) for content in x_train_tokenize]
x_train_vectors = np.array(x_train_vectors)
# -------------------------------------------------------------------------------------------------------------------------------------------
x_dev_vectors = [np.mean([model[word] for word in content if word in model] or [np.zeros(100)], axis=0) for content in x_dev_tokenize]
x_dev_vectors = np.array(x_dev_vectors, dtype=np.float32)
x_dev_tensor = torch.tensor(x_dev_vectors)
# -------------------------------------------------------------------------------------------------------------------------------------------
x_test_vectors = [np.mean([model[word] for word in content if word in model] or [np.zeros(100)], axis=0) for content in x_test_tokenize]
x_test_vectors = np.array(x_test_vectors, dtype=np.float32)
x_test_tensor = torch.tensor(x_test_vectors)
# -------------------------------------------------------------------------------------------------------------------------------------------
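# Binary classification setup: sigmoid output paired with binary cross-entropy loss, optimized with plain SGD.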
model_nn = NeuralNetworkModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model_nn.parameters(), lr=0.01)
batch_size = 10
print('Training the model...')
for epoch in range(6):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model_nn.train()
    for i in range(0, y_train.shape[0], batch_size):
        # Mini-batch of averaged document vectors and their binary labels.
        X = x_train_vectors[i:i+batch_size]
        X = torch.tensor(X.astype(np.float32))
        Y = y_train[i:i+batch_size]
        Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)
        Y_predictions = model_nn(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]
        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()
        loss_score += loss.item() * Y.shape[0]
    # Report the metrics accumulated over the epoch.
    print(f'epoch {epoch}: loss = {loss_score / items_total:.4f}, accuracy = {acc_score / items_total:.4f}')
# -------------------------------------------------------------------------------------------------------------------------------------------
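# Threshold the dev-set sigmoid outputs at 0.5 and write one 0/1 label per line.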
ypred = model_nn(x_dev_tensor)
ypred = ypred.cpu().detach().numpy()
ypred = (ypred > 0.5)
ypred = np.asarray(ypred, dtype=np.int32)
ypred.tofile('dev-0/out.tsv', sep='\n')
# -------------------------------------------------------------------------------------------------------------------------------------------
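# Same thresholding procedure for the test set.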
ypredtest = model_nn(x_test_tensor)
ypredtest = ypredtest.cpu().detach().numpy()
ypredtest = (ypredtest > 0.5)
ypredtest = np.asarray(ypredtest, dtype=np.int32)
ypredtest.tofile('test-A/out.tsv', sep='\n')