# sport-text-classification-b.../rlrozwiazanie.py
import gzip
import re

import numpy as np
import pandas as pd
import torch
from gensim.models import KeyedVectors
from sklearn import metrics
from torch.utils.data import TensorDataset, DataLoader
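
# Lowercase the text, collapse repeated spaces and strip punctuation.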
def get_str_cleaned(str_dirty):
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    new_str = str_dirty.lower()
    new_str = re.sub(' +', ' ', new_str)
    for char in punctuation:
        new_str = new_str.replace(char, '')
    return new_str
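
# Read the gzipped training set: the first column is the label, the rest is the text.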
train_X = []
train_y = []
with gzip.open('train/train.tsv.gz', 'r') as fin:
    for line in fin:
        sline = line.decode('UTF-8').replace("\n", "").split("\t")
        cleared = get_str_cleaned(''.join(sline[1:]))
        if len(cleared) > 0:
            train_y.append(int(sline[0]))
            train_X.append(cleared)
train_X_data = pd.DataFrame(train_X)
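
# Load pre-trained 100-dimensional word2vec embeddings.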
# The corpora can be downloaded from:
# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-cbow-hs.txt.gz
# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
# w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/nkjp+wiki-forms-all-100-cbow-hs.txt.gz', binary=False)
w2v = KeyedVectors.load_word2vec_format('../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
# w2v.save("word2vec.wordvectors")
# w2v = KeyedVectors.load("word2vec.wordvectors")
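
# Represent a document as the mean of the word2vec vectors of its in-vocabulary words.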
def document_vector(doc):
    known_words = [word for word in doc.split(' ') if word in w2v]
    if not known_words:
        # None of the words is in the embedding vocabulary: fall back to a zero vector.
        return np.zeros(100)
    return np.mean(w2v[known_words], axis=0)
train_X_data = train_X_data[train_X_data.columns[0]].apply(document_vector)
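
# Load and vectorize the dev set together with its expected labels.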
dev_X = []
with open('dev-0/in.tsv', 'r') as dev_in_file:
    for line in dev_in_file:
        dev_X.append(get_str_cleaned(line.rstrip('\n')))
dev_y = []
with open('dev-0/expected.tsv', 'r') as dev_expected_file:
    for line in dev_expected_file:
        dev_y.append(int(line.rstrip('\n')))
dev_X_data = pd.DataFrame(dev_X)
dev_X_data = dev_X_data[dev_X_data.columns[0]].apply(document_vector)
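
# Logistic regression: a single linear layer (100 -> 1) followed by a sigmoid.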
class LogisticRegressionModel(torch.nn.Module):
    def __init__(self):
        super(LogisticRegressionModel, self).__init__()
        self.fc = torch.nn.Linear(100, 1)

    def forward(self, x):
        x = self.fc(x)
        x = torch.sigmoid(x)
        return x
lr_model = LogisticRegressionModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(lr_model.parameters(), lr = 0.1)
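
# Wrap the document vectors and labels in TensorDatasets; the DataLoaders use the default batch size of 1.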
# Stack the per-document vectors into a single (N, 100) float matrix before building the tensors.
train_x_tensor = torch.tensor(np.stack(train_X_data.values)).float()
train_y_tensor = torch.tensor(train_y).float()
train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
train_loader = DataLoader(dataset=train_dataset)
dev_x_tensor = torch.tensor(np.stack(dev_X_data.values)).float()
dev_y_tensor = torch.tensor(dev_y).float()
dev_dataset = TensorDataset(dev_x_tensor, dev_y_tensor)
dev_loader = DataLoader(dataset=dev_dataset)
n_epochs = 2
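
# Returns a closure that runs a single optimization step: forward pass, BCE loss, backpropagation, parameter update.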
def make_train_step(model, loss_fn, optimizer):
    def train_step(x, y):
        model.train()
        yhat = model(x)
        loss = loss_fn(yhat, y.unsqueeze(1))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        return loss.item()
    return train_step
train_step = make_train_step(lr_model, criterion, optimizer)
training_losses = []
validation_losses = []
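
# Training loop: one pass over the training data per epoch, followed by evaluation on the dev set;
# y_pred ends up holding the dev predictions of the final epoch.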
for epoch in range(n_epochs):
    y_pred = []
    losses = []
    for x_batch, y_batch in train_loader:
        loss = train_step(x_batch, y_batch)
        losses.append(loss)
    training_loss = np.mean(losses)
    training_losses.append(training_loss)
    # Evaluation
    with torch.no_grad():
        val_losses = []
        for x_val, y_val in dev_loader:
            lr_model.eval()
            yhat = lr_model(x_val)
            y_pred.append(int(yhat.item() > 0.5))
            val_loss = criterion(yhat, y_val.unsqueeze(1))
            val_losses.append(val_loss.item())
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)
    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")
score1 = metrics.accuracy_score(dev_y, y_pred)
print("accuracy: %0.5f" % score1)
with open('dev-0/out.tsv', 'w') as out_file:
    for i in y_pred:
        out_file.write("{}\n".format(i))
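
# Load and vectorize the test set (no labels available).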
test_X = []
with open('test-A/in.tsv', 'r') as test_in_file:
    for line in test_in_file:
        test_X.append(get_str_cleaned(line.rstrip('\n')))
test_X_data = pd.DataFrame(test_X)
test_X_data = test_X_data[test_X_data.columns[0]].apply(document_vector)
test_x_tensor = torch.tensor(np.stack(test_X_data.values)).float()
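
# Predict test-set labels with the trained model and write them to test-A/out.tsv.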
val_y_pred = []
with torch.no_grad():
    lr_model.eval()
    for x_val in test_x_tensor:
        yhat = lr_model(x_val)
        val_y_pred.append(int(yhat.item() > 0.5))
with open('test-A/out.tsv', 'w') as out_file:
    for i in val_y_pred:
        out_file.write("{}\n".format(i))