add machine learning scripts + corpus (WIP)
parent bc20d5dbab
commit 131b8f71c0
46 classifier/arglstm.py Normal file
@@ -0,0 +1,46 @@
import torch
import torch.nn as nn


class ArgLSTM(nn.Module):
    """
    The LSTM model used to classify argument paragraphs.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, glove_weights, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # debug: confirm the pretrained matrix matches the embedding shape
        print(self.embedding.weight.shape)
        print(torch.from_numpy(glove_weights).shape)
        self.embedding.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embedding.weight.requires_grad = False  # freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.2)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """
        Perform a forward pass of the model on a batch of padded index sequences.
        """
        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, (ht, ct) = self.lstm(embeds)

        # dropout and fully-connected layer on the last hidden state
        out = self.dropout(ht[-1])
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)

        return sig_out
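A minimal smoke test for the module (a sketch; the random matrix here only stands in for the GloVe weight matrix that make_model.py builds):

import numpy as np
import torch
from arglstm import ArgLSTM

vocab_size, emb_dim = 1000, 100
fake_glove = np.random.uniform(-0.25, 0.25, (vocab_size, emb_dim)).astype("float32")
net = ArgLSTM(vocab_size, output_size=5, embedding_dim=emb_dim, hidden_dim=256,
              glove_weights=fake_glove, n_layers=2)
batch = torch.randint(0, vocab_size, (4, 200))  # 4 padded sequences of length 200
out = net(batch)
assert out.shape == (4, 5)  # one sigmoid score per class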
251 classifier/make_model.py Normal file
@@ -0,0 +1,251 @@
#!/usr/bin/env python3

import argparse
import os
import string
from collections import Counter
from random import shuffle

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from arglstm import ArgLSTM

def normalize(text):
    punctuation = string.punctuation + "„“”«»‚’-–…"
    return ''.join([c for c in text.lower() if c not in punctuation])

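For example, lowercasing plus punctuation stripping (including the Polish quote characters added above) leaves only words and whitespace:

normalize('Teza: „Szkoła uczy życia”.')  # -> 'teza szkoła uczy życia'
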
def pad_features(text_ints, seq_len):
    '''Return features of text_ints, where each text is left-padded with 0's
    or truncated to the input seq_len.
    '''
    features = np.zeros((len(text_ints), seq_len), dtype=int)

    for i, text in enumerate(text_ints):
        text_len = len(text)

        if text_len <= seq_len:
            zeroes = list(np.zeros(seq_len - text_len))
            new = zeroes + text
        else:
            new = text[0:seq_len]

        features[i, :] = np.array(new)

    return features

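Short sequences are padded on the left, so the real tokens are the last ones the LSTM sees before its final hidden state:

pad_features([[5, 6, 7]], 5)           # -> [[0, 0, 5, 6, 7]]
pad_features([[1, 2, 3, 4, 5, 6]], 5)  # -> [[1, 2, 3, 4, 5]]
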
# arguments
parser = argparse.ArgumentParser(description="Train model on a given corpus.")
parser.add_argument('filename')
args = parser.parse_args()

# force CPU training by hiding all GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# load data: TSV with a header row; each line has three tab-separated fields (ignored, paragraph, tag)
with open(args.filename, 'r') as f:
    corpus = normalize(f.read()).split('\n')[1:]
shuffle(corpus)
paragraphs = []
tags = []
for _, p, t in [e.split('\t') for e in corpus if e.strip()]:
    paragraphs.append(p.strip())
    tags.append(t.strip())

# collect the corpus tokens
words = ' '.join(paragraphs).split()

# GloVe embeddings
def load_glove_vectors(glove_file="glove_100_3_polish.txt"):
    """Load the GloVe word vectors"""
    word_vectors = {}
    with open(glove_file) as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors


def get_emb_matrix(pretrained, word_counts, emb_size=100):
    """Create the embedding matrix from the pretrained word vectors"""
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32')  # row 0: padding vector
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)  # row 1: unknown words
    vocab_to_idx["UNK"] = 1
    i = 2
    for word in word_counts:
        if word in pretrained:
            W[i] = pretrained[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
        i += 1
    return W, np.array(vocab), vocab_to_idx


word_vecs = load_glove_vectors()
# Counter yields each distinct word once, so the matrix gets one row per vocabulary entry
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, Counter(words))

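The indexing loop below can use plain dictionary lookups because every corpus word is in the vocabulary; if the model were later applied to unseen text, a guarded lookup would be needed to route out-of-vocabulary words to the UNK row (a sketch):

def encode(paragraph):
    return [vocab2index.get(w, vocab2index["UNK"]) for w in paragraph.split()]
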
pars_int = []
for par in paragraphs:
    pars_int.append([vocab2index[w] for w in par.split()])

# argument types: hipoteza = hypothesis, rzeczowe = factual,
# logiczne = logical, emocjonalne = emotional, inne = other
label_dict = {
    'hipoteza': 0, 'rzeczowe': 1, 'logiczne': 2, 'emocjonalne': 3, 'inne': 4}
# debug: check the label encoding
print(torch.arange(0, 5))
print(torch.Tensor([label_dict[t] for t in tags]))
labels = np.array(F.one_hot(torch.Tensor([label_dict[t] for t in tags]).to(torch.long), num_classes=5))

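F.one_hot expands each class index into the 5-dimensional target row that BCELoss consumes below, e.g.:

F.one_hot(torch.tensor([2, 0]), num_classes=5)
# tensor([[0, 0, 1, 0, 0],
#         [1, 0, 0, 0, 0]])
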
# pad sequences
features = pad_features(pars_int, 200)

# split into train/dev/test sets
ratio = 0.8
size = len(features)
train_par = features[:int(ratio * size)]
remaining = features[int(ratio * size):]
dev_par = remaining[:int(len(remaining) * 0.5)]
test_par = remaining[int(len(remaining) * 0.5):]

train_lab = labels[:int(ratio * size)]
remaining = labels[int(ratio * size):]
dev_lab = remaining[:int(len(remaining) * 0.5)]
test_lab = remaining[int(len(remaining) * 0.5):]

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_par), torch.from_numpy(train_lab))
dev_data = TensorDataset(torch.from_numpy(dev_par), torch.from_numpy(dev_lab))
test_data = TensorDataset(torch.from_numpy(test_par), torch.from_numpy(test_lab))

# dataloaders
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size, drop_last=True)
# keep every test example so the accuracy denominator below is exact
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=False)

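Each batch then pairs an index matrix with its one-hot targets:

xb, yb = next(iter(train_loader))
xb.shape, yb.shape  # torch.Size([50, 200]), torch.Size([50, 5])
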
# Instantiate the model w/ hyperparams
output_size = 5
embedding_dim = 100
hidden_dim = 256
n_layers = 2
net = ArgLSTM(len(vocab), output_size, embedding_dim, hidden_dim, pretrained_weights, n_layers)
print(net)

# loss and optimization functions
lr = 0.001

criterion = nn.BCELoss()  # BCE over the sigmoid outputs, with one-hot targets
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# training params
epochs = 10  # 3-4 is approximately where the validation loss stopped decreasing

counter = 0
print_every = 1
clip = 5  # gradient clipping threshold

train_on_gpu = False
# move model to GPU, if available
# if train_on_gpu:
#     net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        # if train_on_gpu:
        #     inputs, labels = inputs.cuda(), labels.cuda()

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        inputs = inputs.type(torch.LongTensor)
        output = net(inputs)

        # calculate the loss and perform backprop
        loss = criterion(output, labels.float())
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # get validation loss without tracking gradients
            val_losses = []
            net.eval()
            with torch.no_grad():
                for inputs, labels in dev_loader:

                    # if train_on_gpu:
                    #     inputs, labels = inputs.cuda(), labels.cuda()

                    inputs = inputs.type(torch.LongTensor)
                    output = net(inputs)
                    val_loss = criterion(output, labels.float())

                    val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e + 1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

# Get test data loss and accuracy

test_losses = []  # track loss
num_correct = 0

net.eval()
# iterate over test data
with torch.no_grad():
    for inputs, labels in test_loader:

        # if train_on_gpu:
        #     inputs, labels = inputs.cuda(), labels.cuda()

        # get predicted outputs
        inputs = inputs.type(torch.LongTensor)
        output = net(inputs)

        # calculate loss
        test_loss = criterion(output, labels.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to a predicted class index (0-4)
        pred = torch.argmax(output, dim=1)
        print(pred)
        print(torch.argmax(labels.float(), dim=1).view_as(pred))

        # compare predictions to the true labels
        correct_tensor = pred.eq(torch.argmax(labels.float(), dim=1).view_as(pred))
        correct = np.squeeze(correct_tensor.numpy())
        num_correct += np.sum(correct)


# -- stats! --
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct / len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))
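With five classes of uneven frequency, overall accuracy can hide weak classes; a flat confusion tally is cheap to add inside the test loop (a sketch; confusion would be initialised as collections.Counter() before the loop):

true = torch.argmax(labels.float(), dim=1)
for t, p in zip(true.tolist(), pred.tolist()):
    confusion[(t, p)] += 1  # keyed by (true class, predicted class)

Printing confusion after the loop gives counts per (true, predicted) pair; the indices map back to tag names through label_dict.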
2039 classifier/tagged_corpus.tsv Normal file
File diff suppressed because it is too large