add machine learning scripts + corpus (WIP)

Karolin 2021-01-05 07:39:37 +01:00
parent bc20d5dbab
commit 131b8f71c0
3 changed files with 2336 additions and 0 deletions

classifier/arglstm.py (new file, 46 lines added)

@@ -0,0 +1,46 @@
import torch.nn as nn
import torch
class ArgLSTM(nn.Module):
"""
The LSTM model used to classify argumentative paragraphs.
"""
def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, glove_weights, n_layers, drop_prob=0.5):
"""
Initialize the model by setting up the layers.
"""
super().__init__()
# embedding and LSTM layers
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
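# debug output: both shapes must match before the pretrained GloVe weights are copied in below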
print(self.embedding.weight.shape)
print(torch.from_numpy(glove_weights).shape)
self.embedding.weight.data.copy_(torch.from_numpy(glove_weights))
self.embedding.weight.requires_grad = False ## freeze embeddings
self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
# dropout layer
self.dropout = nn.Dropout(0.2)
# linear and sigmoid layers
self.fc = nn.Linear(hidden_dim, output_size)
self.sig = nn.Sigmoid()
def forward(self, x):
"""
Perform a forward pass of the model on a batch of token-index sequences.
"""
# embeddings and lstm_out
embeds = self.embedding(x)
lstm_out, (ht, ct) = self.lstm(embeds)
# dropout and fully-connected layer
out = self.dropout(ht[-1])
out = self.fc(out)
# sigmoid function
sig_out = self.sig(out)
return sig_out
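
A minimal usage sketch of the class above, for orientation only: the hyperparameter values and dummy_weights are illustrative stand-ins, not taken from this commit (the real values and the GloVe matrix are built in make_model.py below).

import numpy as np
import torch
from arglstm import ArgLSTM

vocab_size, embedding_dim, hidden_dim, n_layers, output_size = 1000, 100, 256, 2, 5
dummy_weights = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_dim)).astype("float32")
model = ArgLSTM(vocab_size, output_size, embedding_dim, hidden_dim, dummy_weights, n_layers)
batch = torch.randint(0, vocab_size, (8, 200))  # a batch of 8 paragraphs, each 200 token ids long
probs = model(batch)  # shape (8, 5): one sigmoid score per label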

classifier/make_model.py (new file, 251 lines added)

@@ -0,0 +1,251 @@
#!/usr/bin/env python3
import argparse
import pdb
import string
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from random import shuffle
from arglstm import ArgLSTM
import os
def normalize(text):
punctuation = string.punctuation + "„“”«»‚’-–…"
return ''.join([c for c in text.lower() if c not in punctuation])
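# illustrative example (not from the corpus): normalize("Ala ma KOTA!") returns "ala ma kota"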
def pad_features(text_ints, seq_len):
''' Return features of text_ints, where each text is padded with 0's
or truncated to the input seq_len
'''
features = np.zeros((len(text_ints), seq_len), dtype = int)
for i, text in enumerate(text_ints):
text_len = len(text)
if text_len <= seq_len:
zeroes = list(np.zeros(seq_len-text_len))
new = zeroes+text
else:
new = text[0:seq_len]
features[i,:] = np.array(new)
return features
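# illustrative example: with seq_len=5, pad_features([[4, 8, 2]], 5) yields array([[0, 0, 4, 8, 2]]),
# i.e. short texts are left-padded with zeros and longer ones are cut to their first seq_len indices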
# arguments
parser = argparse.ArgumentParser(description="Train model on a given corpus.")
parser.add_argument('filename')
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide CUDA devices so training runs on the CPU
# load data
with open(args.filename, 'r') as f:
corpus = [line for line in normalize(f.read()).split('\n')[1:] if line.strip()]  # drop the header row and empty lines
shuffle(corpus)
paragraphs = []
tags = []
for _,p,t in [e.split('\t') for e in corpus]:
paragraphs.append(p.strip())
tags.append(t.strip())
# translate to numbers
words = ' '.join(paragraphs).split()
# Glove embeddings
def load_glove_vectors(glove_file="glove_100_3_polish.txt"):
"""Load the glove word vectors"""
word_vectors = {}
with open(glove_file) as f:
for line in f:
split = line.split()
word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
return word_vectors
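# note: glove_100_3_polish.txt is not part of this commit; each of its lines is expected to hold
# a word followed by its vector components (presumably 100 of them, matching embedding_dim below)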
def get_emb_matrix(pretrained, word_counts, emb_size=100):
""" Creates embedding matrix from word vectors"""
vocab_size = len(word_counts) + 2  # one row per distinct word, plus padding and UNK
vocab_to_idx = {}
vocab = ["", "UNK"]
W = np.zeros((vocab_size, emb_size), dtype="float32")
W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words
vocab_to_idx["UNK"] = 1
i = 2
for word in word_counts:
if word in pretrained:
W[i] = pretrained[word]
else:
W[i] = np.random.uniform(-0.25,0.25, emb_size)
vocab_to_idx[word] = i
vocab.append(word)
i += 1
return W, np.array(vocab), vocab_to_idx
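# resulting index convention: 0 is reserved for padding, 1 for "UNK"; every corpus word gets its
# GloVe vector if available, otherwise a small random vector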
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, Counter(words))  # distinct words only
pars_int = []
for par in paragraphs:
pars_int.append([vocab2index[w] for w in par.split()])
label_dict = {'hipoteza': 0, 'rzeczowe': 1, 'logiczne': 2, 'emocjonalne': 3, 'inne': 4}
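# the Polish label names translate to: hypothesis, factual, logical, emotional, other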
print(torch.arange(0, 5))
print(torch.Tensor([label_dict[t] for t in tags]))
labels = np.array(F.one_hot(torch.Tensor([label_dict[t] for t in tags]).to(torch.long), num_classes=5))
# pad sequences
features = pad_features(pars_int, 200)
# split into sets
ratio = 0.8
size = len(features)
train_par = features[:int(ratio*size)]
remaining = features[int(ratio*size):]
dev_par = remaining[:int(len(remaining)*0.5)]
test_par = remaining[int(len(remaining)*0.5):]
train_lab = labels[:int(ratio*size)]
remaining = labels[int(ratio*size):]
dev_lab = remaining[:int(len(remaining)*0.5)]
test_lab = remaining[int(len(remaining)*0.5):]
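# i.e. roughly 80% train, 10% dev, 10% test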
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_par), torch.from_numpy(train_lab))
dev_data = TensorDataset(torch.from_numpy(dev_par), torch.from_numpy(dev_lab))
test_data = TensorDataset(torch.from_numpy(test_par), torch.from_numpy(test_lab))
# dataloaders
batch_size = 50
# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)  # keep every test sample so the accuracy denominator below stays correct
# Instantiate the model w/ hyperparams
output_size = 5
embedding_dim = 100
hidden_dim = 256
n_layers = 2
net = ArgLSTM(len(vocab), output_size, embedding_dim, hidden_dim, pretrained_weights, n_layers)
print(net)
# loss and optimization functions
lr=0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
# training params
epochs = 10 # 3-4 is approx where I noticed the validation loss stop decreasing
counter = 0
print_every = 1
clip=5 # gradient clipping
train_on_gpu = False
# move model to GPU, if available
#if(train_on_gpu):
# net.cuda()
net.train()
# train for some number of epochs
for e in range(epochs):
# batch loop (the LSTM re-initializes its hidden state on every forward pass, so no manual handling is needed)
for inputs, labels in train_loader:
counter += 1
#if(train_on_gpu):
# inputs, labels = inputs.cuda(), labels.cuda()
# zero accumulated gradients
net.zero_grad()
# get the output from the model
inputs = inputs.type(torch.LongTensor)
output = net(inputs)
# calculate the loss and perform backprop
loss = criterion(output, labels.float())
loss.backward()
# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
nn.utils.clip_grad_norm_(net.parameters(), clip)
optimizer.step()
# loss stats
if counter % print_every == 0:
# Get validation loss
val_losses = []
net.eval()
for inputs, labels in dev_loader:
# validation pass: only the loss is recorded here, no parameter updates
#if(train_on_gpu):
# inputs, labels = inputs.cuda(), labels.cuda()
inputs = inputs.type(torch.LongTensor)
output = net(inputs)
val_loss = criterion(output, labels.float())
val_losses.append(val_loss.item())
net.train()
print("Epoch: {}/{}...".format(e+1, epochs),
"Step: {}...".format(counter),
"Loss: {:.6f}...".format(loss.item()),
"Val Loss: {:.6f}".format(np.mean(val_losses)))
# Get test data loss and accuracy
test_losses = [] # track loss
num_correct = 0
net.eval()
# iterate over test data
for inputs, labels in test_loader:
#if(train_on_gpu):
# inputs, labels = inputs.cuda(), labels.cuda()
# get predicted outputs
inputs = inputs.type(torch.LongTensor)
output = net(inputs)
# calculate loss
test_loss = criterion(output, labels.float())
test_losses.append(test_loss.item())
# convert output probabilities to a predicted class index (argmax over the 5 labels)
pred = torch.argmax(output, dim=1)
print(pred)
print(torch.argmax(labels.float(), dim=1).view_as(pred))
# compare predictions to true label
correct_tensor = pred.eq(torch.argmax(labels.float(), dim=1).view_as(pred))
correct = np.squeeze(correct_tensor.numpy()) #if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
num_correct += np.sum(correct)
# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))
# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

classifier/tagged_corpus.tsv (new file, 2039 lines added)

File diff suppressed because it is too large