sport-text-classification-b.../Regresja_Logistyczna.py
2021-05-25 15:51:03 +02:00

108 lines
3.3 KiB
Python

import pandas as pd
import torch
from gensim.models import KeyedVectors
import nltk
import csv
import numpy as np
def tokenize_data(data):
    """Tokenize every text and keep only its alphabetic tokens, lowercased.

    Args:
        data: Iterable of strings; each one is split with
            ``nltk.word_tokenize``.

    Returns:
        A list with one list of tokens per input text. Tokens for which
        ``str.isalpha()`` is false are dropped; the rest are lowercased.
    """
    # Build a fresh list per document instead of deleting in place: the
    # previous index-based loop advanced the index after each deletion, so
    # the token following a removed one was never filtered or lowercased.
    return [
        [token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]
        for text in data
    ]
class NeuralNetwork(torch.nn.Module):
    """Feed-forward network: linear -> ReLU -> linear -> sigmoid."""

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the two linear layers.

        Args:
            input_size: Number of input features of the first layer.
            hidden_size: Width of the hidden layer.
            num_classes: Number of outputs of the second layer.
        """
        super().__init__()
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the sigmoid-activated outputs for the input batch ``x``."""
        hidden = torch.relu(self.l1(x))
        return torch.sigmoid(self.l2(hidden))
# Input locations for the training and dev sets.
r_in = './train/train.tsv'
r_ind_ev = './dev-0/in.tsv'

# Reader options shared by all three files. `sep='\t'` is spelled out for
# every file; it is also the default separator of `pd.read_table`.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# pandas 2.0 (replacement: `on_bad_lines='skip'`). It is kept unchanged here
# so the script behaves the same on the pandas version it was written for -
# confirm the target version before switching.
_READ_OPTIONS = dict(
    error_bad_lines=False,
    quoting=csv.QUOTE_NONE,
    sep='\t',
    header=None,
)

tsv_read = pd.read_table(r_in, **_READ_OPTIONS)
tsv_read_dev = pd.read_table(r_ind_ev, **_READ_OPTIONS)
tsv_read_test_in = pd.read_table('./test-A/in.tsv', **_READ_OPTIONS)

# Training file: column 0 is used as the target, column 1 as the text.
# Dev and test files: only column 0 (the text) is read.
y_train = tsv_read[0].values
X_train = tokenize_data(tsv_read[1].values)
X_dev = tokenize_data(tsv_read_dev[0].values)
X_test = tokenize_data(tsv_read_test_in[0].values)
def _mean_word_vectors(documents, vectors):
    """Average the embeddings of each document's in-vocabulary tokens.

    A document with no token present in ``vectors`` is represented by a
    100-element zero vector instead.
    """
    averaged = []
    for tokens in documents:
        known = [vectors[token] for token in tokens if token in vectors]
        averaged.append(np.mean(known or [np.zeros(100)], axis=0))
    return averaged


# Word vectors loaded from disk; used below to embed all three data splits.
model = KeyedVectors.load("./word2vec/word2vec_100_3_polish.bin")

X_train = _mean_word_vectors(X_train, model)
x_train_vectors = np.array(X_train, dtype=np.float32)
x_train_tensor = torch.tensor(x_train_vectors.astype(np.float32))

X_dev = _mean_word_vectors(X_dev, model)
x_dev_vectors = np.array(X_dev, dtype=np.float32)

X_test = _mean_word_vectors(X_test, model)
x_test_vectors = np.array(X_test, dtype=np.float32)
# Classifier over the 100-element document vectors, with a single sigmoid
# output trained against binary cross-entropy.
model = NeuralNetwork(100, 200, 1)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
batch_size = 12

for epoch in range(6):
    # Per-epoch running totals; they are accumulated but not reported here.
    loss_score = 0
    acc_score = 0
    items_total = 0

    model.train()
    for start in range(0, y_train.shape[0], batch_size):
        stop = start + batch_size
        features = torch.tensor(x_train_vectors[start:stop].astype(np.float32))
        # Targets become a column so they line up with the (batch, 1) outputs.
        targets = torch.tensor(y_train[start:stop].astype(np.float32)).reshape(-1, 1)
        batch_len = targets.shape[0]

        outputs = model(features)

        # Count outputs that land on the same side of 0.5 as the target.
        acc_score += torch.sum((outputs > 0.5) == targets).item()
        items_total += batch_len

        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so short final batches count
        # proportionally.
        loss_score += loss.item() * batch_len
def _predict_labels(net, features):
    """Return int32 0/1 labels from ``net`` for a NumPy feature matrix.

    An output strictly greater than 0.5 maps to 1, anything else to 0.
    """
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        probabilities = net(torch.tensor(features.astype(np.float32)))
    return np.asarray(probabilities.cpu().numpy() > 0.5, dtype=np.int32)


# Switch to evaluation mode before producing the output files.
model.eval()

predictions = _predict_labels(model, x_dev_vectors)
predictions.tofile('dev-0/out.tsv', sep='\n')

# Fix: the test-A output was previously computed from the dev-set vectors,
# so test-A/out.tsv contained dev predictions. Use the test-set vectors.
predictions = _predict_labels(model, x_test_vectors)
predictions.tofile('test-A/out.tsv', sep='\n')