add machine learning scripts + corpus (WIP)

Karolin 2021-01-05 07:39:37 +01:00
parent bc20d5dbab
commit 131b8f71c0
3 changed files with 2336 additions and 0 deletions

classifier/arglstm.py (new file, 46 lines added)

@@ -0,0 +1,46 @@
import torch.nn as nn
import torch
class ArgLSTM(nn.Module):
"""
The LSTM model used to classify argumentative paragraphs.
"""
def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, glove_weights, n_layers, drop_prob=0.5):
"""
Initialize the model by setting up the layers.
"""
super().__init__()
# embedding and LSTM layers
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
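# debug output: both shapes must match before the pretrained GloVe weights are copied in below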
print(self.embedding.weight.shape)
print(torch.from_numpy(glove_weights).shape)
self.embedding.weight.data.copy_(torch.from_numpy(glove_weights))
self.embedding.weight.requires_grad = False ## freeze embeddings
self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
# dropout layer
self.dropout = nn.Dropout(0.2)
# linear and sigmoid layers
self.fc = nn.Linear(hidden_dim, output_size)
self.sig = nn.Sigmoid()
def forward(self, x):
"""
Perform a forward pass of the model on a batch of token-index sequences.
"""
# embeddings and lstm_out
embeds = self.embedding(x)
lstm_out, (ht, ct) = self.lstm(embeds)
# dropout and fully-connected layer
out = self.dropout(ht[-1])
out = self.fc(out)
# sigmoid function
sig_out = self.sig(out)
return sig_out
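
A minimal usage sketch of the class above, for orientation only: the hyperparameter values and dummy_weights are illustrative stand-ins, not taken from this commit (the real values and the GloVe matrix are built in make_model.py below).

import numpy as np
import torch
from arglstm import ArgLSTM

vocab_size, embedding_dim, hidden_dim, n_layers, output_size = 1000, 100, 256, 2, 5
dummy_weights = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_dim)).astype("float32")
model = ArgLSTM(vocab_size, output_size, embedding_dim, hidden_dim, dummy_weights, n_layers)
batch = torch.randint(0, vocab_size, (8, 200))  # a batch of 8 paragraphs, each 200 token ids long
probs = model(batch)  # shape (8, 5): one sigmoid score per label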

classifier/make_model.py (new file, 251 lines added)

@@ -0,0 +1,251 @@
#!/usr/bin/env python3
import argparse
import pdb
import string
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from random import shuffle
from arglstm import ArgLSTM
import os
def normalize(text):
punctuation = string.punctuation + "„“”«»‚’-–…"
return ''.join([c for c in text.lower() if c not in punctuation])
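# illustrative example (not from the corpus): normalize("Ala ma KOTA!") returns "ala ma kota"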
def pad_features(text_ints, seq_len):
''' Return features of text_ints, where each text is padded with 0's
or truncated to the input seq_len
'''
features = np.zeros((len(text_ints), seq_len), dtype = int)
for i, text in enumerate(text_ints):
text_len = len(text)
if text_len <= seq_len:
zeroes = list(np.zeros(seq_len-text_len))
new = zeroes+text
else:
new = text[0:seq_len]
features[i,:] = np.array(new)
return features
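# illustrative example: with seq_len=5, pad_features([[4, 8, 2]], 5) yields array([[0, 0, 4, 8, 2]]),
# i.e. short texts are left-padded with zeros and longer ones are cut to their first seq_len indices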
# arguments
parser = argparse.ArgumentParser(description="Train model on a given corpus.")
parser.add_argument('filename')
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide CUDA devices so training runs on the CPU
# load data
with open(args.filename, 'r') as f:
corpus = [line for line in normalize(f.read()).split('\n')[1:] if line.strip()]  # drop the header row and empty lines
shuffle(corpus)
paragraphs = []
tags = []
for _,p,t in [e.split('\t') for e in corpus]:
paragraphs.append(p.strip())
tags.append(t.strip())
# translate to numbers
words = ' '.join(paragraphs).split()
# Glove embeddings
def load_glove_vectors(glove_file="glove_100_3_polish.txt"):
"""Load the glove word vectors"""
word_vectors = {}
with open(glove_file) as f:
for line in f:
split = line.split()
word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
return word_vectors
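# note: glove_100_3_polish.txt is not part of this commit; each of its lines is expected to hold
# a word followed by its vector components (presumably 100 of them, matching embedding_dim below)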
def get_emb_matrix(pretrained, word_counts, emb_size=100):
""" Creates embedding matrix from word vectors"""
vocab_size = len(word_counts) + 2  # one row per distinct word, plus padding and UNK
vocab_to_idx = {}
vocab = ["", "UNK"]
W = np.zeros((vocab_size, emb_size), dtype="float32")
W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words
vocab_to_idx["UNK"] = 1
i = 2
for word in word_counts:
if word in pretrained:
W[i] = pretrained[word]
else:
W[i] = np.random.uniform(-0.25,0.25, emb_size)
vocab_to_idx[word] = i
vocab.append(word)
i += 1
return W, np.array(vocab), vocab_to_idx
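# resulting index convention: 0 is reserved for padding, 1 for "UNK"; every corpus word gets its
# GloVe vector if available, otherwise a small random vector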
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, Counter(words))  # distinct words only
pars_int = []
for par in paragraphs:
pars_int.append([vocab2index[w] for w in par.split()])
label_dict = {'hipoteza': 0, 'rzeczowe': 1, 'logiczne': 2, 'emocjonalne': 3, 'inne': 4}
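# the Polish label names translate to: hypothesis, factual, logical, emotional, other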
print(torch.arange(0, 5))
print(torch.Tensor([label_dict[t] for t in tags]))
labels = np.array(F.one_hot(torch.Tensor([label_dict[t] for t in tags]).to(torch.long), num_classes=5))
# pad sequences
features = pad_features(pars_int, 200)
# split into sets
ratio = 0.8
size = len(features)
train_par = features[:int(ratio*size)]
remaining = features[int(ratio*size):]
dev_par = remaining[:int(len(remaining)*0.5)]
test_par = remaining[int(len(remaining)*0.5):]
train_lab = labels[:int(ratio*size)]
remaining = labels[int(ratio*size):]
dev_lab = remaining[:int(len(remaining)*0.5)]
test_lab = remaining[int(len(remaining)*0.5):]
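# i.e. roughly 80% train, 10% dev, 10% test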
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_par), torch.from_numpy(train_lab))
dev_data = TensorDataset(torch.from_numpy(dev_par), torch.from_numpy(dev_lab))
test_data = TensorDataset(torch.from_numpy(test_par), torch.from_numpy(test_lab))
# dataloaders
batch_size = 50
# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)  # keep every test sample so the accuracy denominator below stays correct
# Instantiate the model w/ hyperparams
output_size = 5
embedding_dim = 100
hidden_dim = 256
n_layers = 2
net = ArgLSTM(len(vocab), output_size, embedding_dim, hidden_dim, pretrained_weights, n_layers)
print(net)
# loss and optimization functions
lr=0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
# training params
epochs = 10 # 3-4 is approx where I noticed the validation loss stop decreasing
counter = 0
print_every = 1
clip=5 # gradient clipping
train_on_gpu = False
# move model to GPU, if available
#if(train_on_gpu):
# net.cuda()
net.train()
# train for some number of epochs
for e in range(epochs):
# batch loop (the LSTM re-initializes its hidden state on every forward pass, so no manual handling is needed)
for inputs, labels in train_loader:
counter += 1
#if(train_on_gpu):
# inputs, labels = inputs.cuda(), labels.cuda()
# zero accumulated gradients
net.zero_grad()
# get the output from the model
inputs = inputs.type(torch.LongTensor)
output = net(inputs)
# calculate the loss and perform backprop
loss = criterion(output, labels.float())
loss.backward()
# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
nn.utils.clip_grad_norm_(net.parameters(), clip)
optimizer.step()
# loss stats
if counter % print_every == 0:
# Get validation loss
val_losses = []
net.eval()
for inputs, labels in dev_loader:
# validation pass: only the loss is recorded here, no parameter updates
#if(train_on_gpu):
# inputs, labels = inputs.cuda(), labels.cuda()
inputs = inputs.type(torch.LongTensor)
output = net(inputs)
val_loss = criterion(output, labels.float())
val_losses.append(val_loss.item())
net.train()
print("Epoch: {}/{}...".format(e+1, epochs),
"Step: {}...".format(counter),
"Loss: {:.6f}...".format(loss.item()),
"Val Loss: {:.6f}".format(np.mean(val_losses)))
# Get test data loss and accuracy
test_losses = [] # track loss
num_correct = 0
net.eval()
# iterate over test data
for inputs, labels in test_loader:
#if(train_on_gpu):
# inputs, labels = inputs.cuda(), labels.cuda()
# get predicted outputs
inputs = inputs.type(torch.LongTensor)
output = net(inputs)
# calculate loss
test_loss = criterion(output, labels.float())
test_losses.append(test_loss.item())
# convert output probabilities to a predicted class index (argmax over the 5 labels)
pred = torch.argmax(output, dim=1)
print(pred)
print(torch.argmax(labels.float(), dim=1).view_as(pred))
# compare predictions to true label
correct_tensor = pred.eq(torch.argmax(labels.float(), dim=1).view_as(pred))
correct = np.squeeze(correct_tensor.numpy()) #if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
num_correct += np.sum(correct)
# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))
# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

classifier/tagged_corpus.tsv (new file, 2039 lines added)

File diff suppressed because it is too large