Uczenie_Glebokie/3. RNN/rnn.ipynb
!unzip -q /content/en-ner-conll-2003.zip -d /content/
import os
import pandas as pd
import gensim
from gensim.models import KeyedVectors
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from keras.regularizers import l2
from collections import Counter
from torchtext.vocab import vocab
from tqdm.notebook import tqdm
import torch
Declare paths

# Locations of the dataset splits, the prediction output files and the
# word2vec binary. Each split lives in its own sub-directory of the dataset.
data_dir_path = 'en-ner-conll-2003'
_train_dir = os.path.join(data_dir_path, 'train')
_dev_dir = os.path.join(data_dir_path, 'dev-0')
_test_dir = os.path.join(data_dir_path, 'test-A')

train_path = os.path.join(_train_dir, 'train.tsv')

dev_texts_path = os.path.join(_dev_dir, 'in.tsv')
dev_labels_path = os.path.join(_dev_dir, 'expected.tsv')
dev_predicted_path = os.path.join(_dev_dir, 'out.tsv')

test_texts_path = os.path.join(_test_dir, 'in.tsv')
test_predicted_path = os.path.join(_test_dir, 'out.tsv')

word2vec_file_path = 'word2vec_100_3_polish.bin'

Load files

def _read_tsv(path, names):
    """Read the first ``len(names)`` columns of a header-less tab-separated file."""
    return pd.read_csv(
        path,
        sep='\t',
        usecols=list(range(len(names))),
        header=None,
        names=names,
    )


# The training file carries labels and text side by side; the dev and test
# splits keep texts and labels in separate single-column files.
train_data = _read_tsv(train_path, ['label', 'text'])
dev_texts_data = _read_tsv(dev_texts_path, ['text'])
dev_labels_data = _read_tsv(dev_labels_path, ['label'])
test_texts_data = _read_tsv(test_texts_path, ['text'])

Build Vocab

def build_vocab(dataset):
    """Build a torchtext vocabulary from tokenised documents.

    Args:
        dataset: Iterable of documents, each an iterable of string tokens.

    Returns:
        The vocabulary produced by ``torchtext.vocab.vocab`` containing the
        special tokens ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>`` plus every
        token seen in ``dataset``. Lookups of unseen tokens resolve to
        ``<unk>``.
    """
    counter = Counter()
    for document in dataset:
        # Count each token of the document (the loop previously had no body).
        counter.update(document)
    vocabulary = vocab(counter, specials=["<unk>", "<pad>", "<bos>", "<eos>"])
    # Without a default index, indexing the vocabulary with an unseen token
    # raises instead of falling back to <unk>.
    vocabulary.set_default_index(vocabulary["<unk>"])
    return vocabulary
# Tokenise the training texts with gensim's simple_preprocess, build the
# vocabulary from those tokens and keep its index-to-string table.
train_X = train_data['text'].apply(gensim.utils.simple_preprocess)
v = build_vocab(train_X)
itos = v.get_itos()
Preprocess data

# Maps each NER tag to its integer class id; the id is the tag's position in
# the list below, so 'O' is 0.
reformat_ner_dict = {
    tag: index
    for index, tag in enumerate(
        ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    )
}
def fit_data_Y(column):
    """Convert whitespace-separated tag strings into label tensors.

    Args:
        column: Iterable of strings, each holding the NER tags of one
            document separated by whitespace.

    Returns:
        A list with one ``torch.long`` tensor per document: the tag ids from
        ``reformat_ner_dict`` with a 0 prepended and appended.

    Raises:
        KeyError: If a tag is not a key of ``reformat_ner_dict``.
    """
    encoded = [
        [reformat_ner_dict[token] for token in row.split()] for row in column
    ]
    # The extra 0 ('O') on each side lines the labels up with the <bos>/<eos>
    # ids that fit_data_X adds around every document.
    return [torch.tensor([0] + document + [0], dtype=torch.long) for document in encoded]

def fit_data_X(dt):
    """Convert raw text documents into tensors of vocabulary ids.

    Args:
        dt: Iterable of strings; each document is split on whitespace.

    Returns:
        A list with one ``torch.long`` tensor per document holding the ids
        ``<bos>``, the document's tokens, ``<eos>`` as looked up in the
        module-level vocabulary ``v``.
    """
    # NOTE(review): the vocabulary is built from gensim.utils.simple_preprocess
    # tokens while this function uses str.split(); the two tokenisations may
    # not produce the same tokens -- confirm this is intended.
    return [
        torch.tensor(
            [v["<bos>"]] + [v[token] for token in document.split()] + [v["<eos>"]],
            dtype=torch.long,
        )
        for document in dt
    ]

# Encode every split: texts become id tensors, tag strings become label
# tensors. The test split has no labels.
train_X, train_Y = fit_data_X(train_data['text']), fit_data_Y(train_data['label'])
dev_X, dev_Y = fit_data_X(dev_texts_data['text']), fit_data_Y(dev_labels_data['label'])
test_X = fit_data_X(test_texts_data['text'])

Create model

num_tags = len(reformat_ner_dict)


class LSTM(torch.nn.Module):
    """Token tagger: embedding -> ReLU -> single-layer LSTM -> linear tag scores."""

    def __init__(self):
        super().__init__()
        vocab_size = len(v.get_itos())
        embedding_dim, hidden_dim = 100, 256
        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rec = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_dim, num_tags)

    def forward(self, x):
        """Return unnormalised tag scores for every position of ``x``.

        ``x`` holds token ids; callers in this file pass a single document
        with a leading batch dimension, i.e. shape ``(1, seq_len)``.
        """
        embedded = torch.relu(self.emb(x))
        # Only the per-step outputs are needed; the final (h_n, c_n) state is
        # discarded.
        hidden_states, _ = self.rec(embedded)
        return self.fc1(hidden_states)
# Instantiate the tagger together with its optimiser (Adam, library defaults)
# and the cross-entropy loss.
lstm = LSTM()
optimizer = torch.optim.Adam(params=lstm.parameters())
criterion = torch.nn.CrossEntropyLoss()

Evaluate model

def get_accuracy(y_true, y_pred):
    """Return the fraction of positions where prediction and truth agree.

    Args:
        y_true: Sequence of reference labels.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.

    Returns:
        ``hits / compared`` as a float, where the pairs are taken with
        ``zip`` (so the shorter sequence decides how many are compared);
        0.0 when there is nothing to compare.
    """
    hit = 0
    missed = 0

    for p, t in zip(y_pred, y_true):
        if p == t:
            hit += 1
        else:
            # Misses must be counted separately from hits; counting them in
            # the same branch made the result a constant 0.5.
            missed += 1

    total = hit + missed
    if total == 0:
        # Avoid ZeroDivisionError on empty input.
        return 0.0
    return hit / total
def eval_model(dataset_tokens, dataset_labels, model):
    """Compute token-level accuracy of ``model`` on a labelled dataset.

    Args:
        dataset_tokens: Sequence of 1-D token-id tensors, one per document.
        dataset_labels: Sequence of 1-D label tensors aligned with
            ``dataset_tokens``.
        model: Callable mapping a ``(1, seq_len)`` id tensor to per-position
            tag scores with a leading batch dimension.

    Returns:
        The accuracy reported by ``get_accuracy`` over all positions of all
        documents (including the padded first and last position of each).
    """
    Y_true = []
    Y_pred = []
    # Gradients are not needed for evaluation; disabling autograd avoids
    # building a graph for every document.
    with torch.no_grad():
        for tokens, labels in tqdm(
            zip(dataset_tokens, dataset_labels), total=len(dataset_labels)
        ):
            Y_true += labels.tolist()

            # Drop the batch dimension, then pick the best-scoring tag per
            # position.
            Y_batch_pred_weights = model(tokens.unsqueeze(0)).squeeze(0)
            Y_pred += torch.argmax(Y_batch_pred_weights, 1).tolist()

    return get_accuracy(Y_true, Y_pred)

Train model

# Train one document at a time and record the dev accuracy after every epoch.
dev_accuracy_history = list()
# NOTE(review): NUM_EPOCHS is not defined anywhere in the visible source; it
# must be assigned before this loop runs.
for epoch in range(NUM_EPOCHS):
    lstm.train()
    for tokens, tags in tqdm(zip(train_X, train_Y), total=len(train_Y)):
        batch_tokens = tokens.unsqueeze(0)

        # Clear gradients left over from the previous document.
        optimizer.zero_grad()
        predicted_tags = lstm(batch_tokens)

        # Drop the batch dimension so the scores are (seq_len, num_tags)
        # against (seq_len,) targets.
        loss = criterion(predicted_tags.squeeze(0), tags)
        # Without the backward pass and optimiser step the weights never
        # change and the model does not learn.
        loss.backward()
        optimizer.step()

    dev_accuracy = eval_model(dev_X, dev_Y, lstm)
    # Keep the per-epoch score so it can be plotted afterwards.
    dev_accuracy_history.append(dev_accuracy)
    print("dev_accuracy:", dev_accuracy)
dev_accuracy: 0.8261289675666806
dev_accuracy: 0.8233395927148092
dev_accuracy: 0.8254361816557583
dev_accuracy: 0.8291188857085559
dev_accuracy: 0.8282255565076297
dev_accuracy: 0.8284260997976336
dev_accuracy: 0.8291735793331024
dev_accuracy: 0.8304133014894897
dev_accuracy: 0.8287724927530947
dev_accuracy: 0.8272228400576106
dev_accuracy: 0.8259284242766768
dev_accuracy: 0.8301033709503929
dev_accuracy: 0.8318900293522452
dev_accuracy: 0.8309602377349546
dev_accuracy: 0.8275145393885253
dev_accuracy: 0.8292829665821954
dev_accuracy: 0.8277333138867112
dev_accuracy: 0.8284990246303623
dev_accuracy: 0.8304315326976719
dev_accuracy: 0.8267488286448743
dev_accuracy: 0.8274051521394323
dev_accuracy: 0.8274233833476144
dev_accuracy: 0.8282984813403584
dev_accuracy: 0.8308690816940438
dev_accuracy: 0.8321634974749776
dev_accuracy: 0.8278609323439864
dev_accuracy: 0.8294288162476527
dev_accuracy: 0.826384204481231
dev_accuracy: 0.8301945269913037
dev_accuracy: 0.8305226887385827
dev_accuracy: 0.8314707115640554
dev_accuracy: 0.8291006545003737
dev_accuracy: 0.8289183424185521
dev_accuracy: 0.8299392900767534
dev_accuracy: 0.8310513937758656
dev_accuracy: 0.8308873129022261
dev_accuracy: 0.8304862263222184
dev_accuracy: 0.831653023645877
dev_accuracy: 0.830559151154947
dev_accuracy: 0.8280250132176259
dev_accuracy: 0.8291188857085559
dev_accuracy: 0.831981185393156
dev_accuracy: 0.8316894860622414
dev_accuracy: 0.8291371169167381
dev_accuracy: 0.8292647353740132
dev_accuracy: 0.8296840531622031
dev_accuracy: 0.830121602158575
dev_accuracy: 0.8324551968058923
dev_accuracy: 0.8317259484786057
dev_accuracy: 0.8326557400958962
dev_accuracy: 0.8298481340358426
dev_accuracy: 0.8332573699659076
dev_accuracy: 0.8299757524931177
dev_accuracy: 0.834825253869574
dev_accuracy: 0.8307596944449509
dev_accuracy: 0.8290641920840094
dev_accuracy: 0.8288454175858234
dev_accuracy: 0.8292647353740132
dev_accuracy: 0.8293194289985597
dev_accuracy: 0.8281344004667189
dev_accuracy: 0.8313977867313267
dev_accuracy: 0.8272410712657928
dev_accuracy: 0.8286448742958196
dev_accuracy: 0.8293011977903776
dev_accuracy: 0.8289365736267342
dev_accuracy: 0.8287177991285483
dev_accuracy: 0.8273504585148858
dev_accuracy: 0.8285901806712731
dev_accuracy: 0.8298663652440247
dev_accuracy: 0.8284078685894514
dev_accuracy: 0.8279338571767151
dev_accuracy: 0.830230989407668
dev_accuracy: 0.8311425498167764
dev_accuracy: 0.8319082605604273
dev_accuracy: 0.8324369655977102
dev_accuracy: 0.8309237753185904
dev_accuracy: 0.8289912672512807
dev_accuracy: 0.8305773823631292
dev_accuracy: 0.8278427011358043
dev_accuracy: 0.8313613243149623
dev_accuracy: 0.8255638001130335
dev_accuracy: 0.8272775336821571
dev_accuracy: 0.8278427011358043
dev_accuracy: 0.8237042168784525
dev_accuracy: 0.8291371169167381
dev_accuracy: 0.8268764471021495
dev_accuracy: 0.8277515450948935
dev_accuracy: 0.8281161692585368
dev_accuracy: 0.8263659732730488
dev_accuracy: 0.8299392900767534
dev_accuracy: 0.8283896373812693
dev_accuracy: 0.8274416145557966
dev_accuracy: 0.8283714061730871
dev_accuracy: 0.8262930484403201
dev_accuracy: 0.8298116716194782
# Plot the dev accuracy recorded after each epoch.
plt.plot(dev_accuracy_history, label='Validation Accuracy')
plt.title('Model Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
# The label passed to plot() is only rendered once a legend is requested.
plt.legend()
plt.show()

Predict and save results

# Inverse of reformat_ner_dict: integer class id -> NER tag string.
reversed_ner_dict = {index: tag for tag, index in reformat_ner_dict.items()}
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
def predict_and_save(X, filename):
    """Tag every document in ``X`` with the global ``lstm`` and write the result.

    Args:
        X: Sequence of 1-D token-id tensors, one per document.
        filename: Path of the tab-separated output file; one line of
            space-separated tags is written per document, without header or
            index.

    Returns:
        The ``pandas.DataFrame`` (single column ``predicted_label``) that was
        written to ``filename``.
    """
    Y_predicted = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for tokens in tqdm(X):
            Y_batch_pred_weights = lstm(tokens.unsqueeze(0)).squeeze(0)
            Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1).tolist()
            # Skip the first and last position: they belong to the
            # <bos>/<eos> markers, not to real tokens.
            Y_processed_pred = " ".join(reversed_ner_dict[item] for item in Y_batch_pred[1:-1])
            # Collect the line; previously nothing was appended and an empty
            # file was written.
            Y_predicted.append(Y_processed_pred)
    Y_predicted_df = pd.DataFrame(Y_predicted, columns=['predicted_label'])
    Y_predicted_df.to_csv(filename, sep='\t', index=False, header=False)
    return Y_predicted_df
# Produce and store predictions for the dev split first, then the test split.
dev_predicted, test_predicted = (
    predict_and_save(dev_X, dev_predicted_path),
    predict_and_save(test_X, test_predicted_path),
)
