sport-text-classification-b.../rlrozwiazanie.py

199 lines
6.0 KiB
Python
Raw Normal View History

2021-05-25 20:16:00 +02:00
#import numpy as np
import gzip
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import metrics
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
from sklearn.linear_model import LogisticRegression
import re
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
# ASCII punctuation removed from every document before embedding lookup.
_PUNCTUATION = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
# Built once: a single C-level translate pass replaces ~30 chained .replace calls.
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION)
_SPACE_RUN = re.compile(' +')


def get_str_cleaned(str_dirty):
    """Normalize a raw text line for word-vector lookup.

    The text is lower-cased, ASCII punctuation is deleted, and every run of
    spaces is collapsed to a single space.  Spaces are collapsed *after* the
    punctuation is stripped, so deleting a free-standing mark (``"a , b"``)
    cannot leave a double space (and therefore an empty token) behind.

    Args:
        str_dirty: Raw text to clean.

    Returns:
        The cleaned string.  Leading and trailing spaces are not stripped;
        other whitespace characters (tabs, newlines) are left untouched.
    """
    lowered = str_dirty.lower()
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)
    return _SPACE_RUN.sub(' ', without_punctuation)
#df = pd.read_csv('sport-text-classification-ball-ISI-public/train/train.tsv.gz', compression='gzip', header=None, sep='\t', error_bad_lines=False)
# --- Training data ----------------------------------------------------------
# Every row of the gzipped TSV is parsed as "<integer label>\t<text...>".
# Rows whose cleaned text is empty are skipped, which keeps train_X and
# train_y aligned index by index.
train_X = []
train_y = []
with gzip.open('train/train.tsv.gz', 'r') as fin:
    for line in fin:
        sline = line.decode('UTF-8').rstrip("\n").split("\t")
        # The text may itself contain tabs; re-join the pieces with a space so
        # the words on either side of a tab are not fused into a single
        # (out-of-vocabulary) token.
        cleared = get_str_cleaned(' '.join(sline[1:]))
        if cleared:
            train_y.append(int(sline[0]))
            train_X.append(cleared)
train_X_data = pd.DataFrame(train_X)

# --- Word vectors -----------------------------------------------------------
# Load the locally saved KeyedVectors file.
# NOTE(review): earlier experiments loaded the vectors directly with
#   KeyedVectors.load_word2vec_format(
#       '../../../ncexclude/wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
# presumably "word2vec2.wordvectors" is a saved copy of those vectors -- confirm.
w2v = KeyedVectors.load("word2vec2.wordvectors")
def document_vector(doc):
    """Embed a document as the mean of its in-vocabulary word vectors.

    The document is split on single spaces; tokens that are not present in
    the module-level ``w2v`` vectors are dropped and the remaining vectors
    are averaged component-wise.

    Args:
        doc: Cleaned document text.  Non-string values (for example a missing
            value coming out of pandas) are tolerated.

    Returns:
        A 1-D NumPy array of length ``w2v.vector_size``.  When ``doc`` is not
        a string, or none of its tokens is in the vocabulary, a zero vector
        is returned and the offending input is printed as a diagnostic.
    """
    if not isinstance(doc, str):
        # Best-effort fallback instead of crashing on a malformed row.
        print(doc)
        return np.zeros(w2v.vector_size)

    tokens = doc.split(' ')
    known_tokens = [token for token in tokens if token in w2v]
    if not known_tokens:
        # Nothing to average: handle this case explicitly rather than relying
        # on an exception from the vector lookup being swallowed.
        print(tokens)
        return np.zeros(w2v.vector_size)

    return np.mean(w2v[known_tokens], axis=0)
# Replace the single text column of the training frame with one document
# vector per row (the result is a Series of NumPy arrays).
train_X_data = train_X_data[train_X_data.columns[0]].apply(document_vector)

# --- Development split ------------------------------------------------------
# The encoding is given explicitly so decoding does not depend on the
# platform's locale; the training file is decoded as UTF-8 as well.
with open('dev-0/in.tsv', 'r', encoding='utf-8') as dev_in_file:
    dev_X = [get_str_cleaned(line.rstrip('\n')) for line in dev_in_file]

# One integer label per line, in the same order as dev-0/in.tsv.
with open('dev-0/expected.tsv', 'r', encoding='utf-8') as dev_expected_file:
    dev_y = [int(line.rstrip('\n')) for line in dev_expected_file]

dev_X_data = pd.DataFrame(dev_X)
dev_X_data = dev_X_data[dev_X_data.columns[0]].apply(document_vector)
# X_train_vec = list(train_X_data['doc_vector'])
# X_dev_vec = list(dev_X_data['doc_vector'])
class LogisticRegressionModel(torch.nn.Module):
    """Binary logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_dim: Number of input features.  Defaults to 100, the width that
            was previously hard-coded, so ``LogisticRegressionModel()`` keeps
            building the same network.
    """

    def __init__(self, input_dim=100):
        super().__init__()
        # Single output unit; the sigmoid in forward() maps it into (0, 1).
        self.fc = torch.nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return ``sigmoid(fc(x))``; the last dimension of the result is 1."""
        return torch.sigmoid(self.fc(x))
# Number of full passes over the training set.
n_epochs = 2

# Model, binary cross-entropy loss (the model already outputs probabilities)
# and plain SGD over the model's parameters.
lr_model = LogisticRegressionModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(lr_model.parameters(), lr=0.1)

# Features and labels are both converted to float tensors.
train_x_tensor, train_y_tensor = (
    torch.tensor(part).float() for part in (train_X_data, train_y)
)
dev_x_tensor, dev_y_tensor = (
    torch.tensor(part).float() for part in (dev_X_data, dev_y)
)

# No batch_size/shuffle is passed, so the DataLoader defaults apply
# (per the PyTorch docs: one sample per batch, no shuffling).
train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
train_loader = DataLoader(train_dataset)
dev_dataset = TensorDataset(dev_x_tensor, dev_y_tensor)
dev_loader = DataLoader(dev_dataset)
# loss_score = 0
# acc_score = 0
# items_total = 0
# for x_batch, y_batch in train_loader:
# lr_model.train()
# # Makes predictions
# yhat = lr_model(x_batch)
# # Computes loss
# loss = criterion(yhat, y_batch.unsqueeze(1))
# # Computes gradients
# loss.backward()
# # Updates parameters and zeroes gradients
# optimizer.step()
# optimizer.zero_grad()
# loss_score += loss.item() * yhat.shape[0]
# print(loss_score)
def make_train_step(model, loss_fn, optimizer):
    """Build a closure that performs one optimisation step on ``model``.

    Args:
        model: Module to train.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        A function ``train_step(x, y)`` that returns the batch loss as a
        Python float.
    """

    def train_step(x, y):
        """Run forward, backward and parameter update for one batch."""
        model.train()
        # Targets arrive as shape (batch,); add a trailing axis so they line
        # up with the model's (batch, 1) output.
        targets = y.unsqueeze(1)
        batch_loss = loss_fn(model(x), targets)
        batch_loss.backward()
        optimizer.step()
        # Clear the gradients so the next call starts from a clean slate.
        optimizer.zero_grad()
        return batch_loss.item()

    return train_step
train_step = make_train_step(lr_model, criterion, optimizer)
training_losses = []
validation_losses = []

for epoch in range(n_epochs):
    # --- Training pass: one optimisation step per batch of train_loader ---
    losses = [train_step(x_batch, y_batch) for x_batch, y_batch in train_loader]
    training_loss = np.mean(losses)
    training_losses.append(training_loss)

    # --- Evaluation on the dev split ---
    # y_pred is rebuilt every epoch, so after the loop it holds the
    # predictions of the final epoch only.
    y_pred = []
    val_losses = []
    # Switching to eval mode is loop-invariant, so do it once per epoch
    # instead of once per sample (train_step switches back to train mode).
    lr_model.eval()
    with torch.no_grad():
        for x_val, y_val in dev_loader:
            yhat = lr_model(x_val)
            # .item() requires a single-element output, i.e. dev_loader must
            # yield one sample per batch.
            y_pred.append(int(yhat.item() > 0.5))
            val_loss = criterion(yhat, y_val.unsqueeze(1))
            val_losses.append(val_loss.item())
    validation_loss = np.mean(val_losses)
    validation_losses.append(validation_loss)

    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")
# Accuracy of the dev predictions currently held in y_pred.
score1 = metrics.accuracy_score(dev_y, y_pred)
print("accuracy: %0.5f" % score1)

# Write one predicted label per line.  The context manager closes the file
# even if a write fails, and writelines() now receives an iterable of lines
# rather than a single string.
with open('dev-0/out.tsv', "w") as dev_out_file:
    dev_out_file.writelines("{}\n".format(i) for i in y_pred)
# --- Test split: clean, embed, predict, write -------------------------------
# The encoding is given explicitly so decoding does not depend on the
# platform's locale; the training file is decoded as UTF-8 as well.
with open('test-A/in.tsv', 'r', encoding='utf-8') as test_in_file:
    test_X = [get_str_cleaned(line.rstrip('\n')) for line in test_in_file]

test_X_data = pd.DataFrame(test_X)
test_X_data = test_X_data[test_X_data.columns[0]].apply(document_vector)
test_x_tensor = torch.tensor(test_X_data).float()

# Iterating the tensor yields one feature vector per document; each is
# scored on its own and thresholded at 0.5.
lr_model.eval()
with torch.no_grad():
    val_y_pred = [int(lr_model(x_val).item() > 0.5) for x_val in test_x_tensor]

# One predicted label per line; the context manager guarantees the file is
# closed even if a write fails.
with open('test-A/out.tsv', "w") as test_out_file:
    test_out_file.writelines("{}\n".format(i) for i in val_y_pred)