RNN/RNN.ipynb
2024-05-28 16:48:34 +02:00

22 KiB
Raw Permalink Blame History

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
2024-05-28 15:20:02.976189: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-28 15:20:04.436596: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
def load_data(file_path):
    """Read a UTF-8 text file and return its lines stripped of surrounding whitespace."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [row.strip() for row in handle]
# Training data ships as a TSV with one (label-sequence, sentence) pair per row.
train_data = pd.read_csv('en-ner-conll-2003/train/train.tsv', sep='\t', header=None, names=['label', 'sentence'], encoding='utf-8')
# Dev/test inputs are plain text, one sentence per line; dev-0 also has gold labels.
dev_sentences = load_data('en-ner-conll-2003/dev-0/in.tsv')
dev_labels = load_data('en-ner-conll-2003/dev-0/expected.tsv')
test_sentences = load_data('en-ner-conll-2003/test-A/in.tsv')
def preprocess_data(sentences, labels=None):
    """Whitespace-tokenize each sentence string.

    When `labels` is given, the label strings are tokenized the same way and
    a (sentences, labels) pair is returned; otherwise only the sentences.
    """
    tokens = [text.split() for text in sentences]
    if labels is None:
        return tokens
    tag_tokens = [tags.split() for tags in labels]
    return tokens, tag_tokens
# Tokenize all three splits; dev_sentences/dev_labels are rebound from raw
# strings to their tokenized (list-of-tokens) forms.
train_sentences, train_labels = preprocess_data(train_data['sentence'].values, train_data['label'].values)
dev_sentences, dev_labels = preprocess_data(dev_sentences, dev_labels)
test_sentences = preprocess_data(test_sentences)
# Vocabulary built from the training split only: special tokens take ids 0-3,
# real words start at len(special_tokens).
# NOTE(review): iterating a set makes word ids nondeterministic across runs —
# persist word2idx if reproducible encodings matter.
special_tokens = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']
word2idx = {w: i + len(special_tokens) for i, w in enumerate(set(word for sentence in train_sentences for word in sentence))}
for i, token in enumerate(special_tokens):
    word2idx[token] = i

# Reverse lookup for decoding ids back to words.
idx2word = {i: w for w, i in word2idx.items()}

# CoNLL-2003 IOB tag set. Note 'O' shares id 0 with the <PAD> token, so padded
# label positions are indistinguishable from genuine 'O' tags downstream.
label2idx = {
    'O': 0,
    'B-PER': 1, 'I-PER': 2,
    'B-ORG': 3, 'I-ORG': 4,
    'B-LOC': 5, 'I-LOC': 6,
    'B-MISC': 7, 'I-MISC': 8
}
idx2label = {i: l for l, i in label2idx.items()}
def encode_data(sentences, labels=None):
    """Map tokenized sentences to vocabulary ids (OOV words become <UNK>).

    When `labels` is given, label tokens are mapped through label2idx as well
    and an (encoded_sentences, encoded_labels) pair is returned.
    """
    unk_id = word2idx['<UNK>']
    sentence_ids = [[word2idx.get(token, unk_id) for token in sent] for sent in sentences]
    if labels is None:
        return sentence_ids
    label_ids = [[label2idx[tag] for tag in tag_seq] for tag_seq in labels]
    return sentence_ids, label_ids
# Integer-encode every split; only the train/dev splits have labels.
X_train, y_train = encode_data(train_sentences, train_labels)
X_dev, y_dev = encode_data(dev_sentences, dev_labels)
X_test = encode_data(test_sentences)
# Pad/truncate every sequence to a fixed length ('post' = pad/truncate at the end).
max_len = 1000  
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
# NOTE(review): the default padding value 0 is also the id of the 'O' tag, so
# the model is effectively trained to emit 'O' on padding positions — a
# Masking layer or sample weights would avoid skewing the loss/accuracy.
y_train = pad_sequences(y_train, padding='post', maxlen=max_len)

X_dev = pad_sequences(X_dev, padding='post', maxlen=max_len)
y_dev = pad_sequences(y_dev, padding='post', maxlen=max_len)

X_test = pad_sequences(X_test, padding='post', maxlen=max_len)

# One-hot encode label ids per timestep for the categorical_crossentropy loss.
y_train = [to_categorical(i, num_classes=len(label2idx)) for i in y_train]
y_dev = [to_categorical(i, num_classes=len(label2idx)) for i in y_dev]
# Bi-LSTM sequence tagger: embedding -> bidirectional LSTM -> per-timestep
# softmax over the len(label2idx) tag classes.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word2idx), output_dim=64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=32, return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(label2idx), activation='softmax'))
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# y_train/y_dev are lists of (max_len, n_classes) arrays; np.array stacks them
# into the (batch, max_len, n_classes) tensors Keras expects.
history = model.fit(X_train, np.array(y_train), validation_data=(X_dev, np.array(y_dev)), epochs=25, batch_size=14)

# Greedy decoding: argmax over the class axis yields one tag id per position.
y_pred = model.predict(X_dev)
y_pred = np.argmax(y_pred, axis=-1)
y_true = np.argmax(np.array(y_dev), axis=-1)
Epoch 1/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 160s 2s/step - accuracy: 0.8954 - loss: 0.5933 - val_accuracy: 0.9606 - val_loss: 0.1730
Epoch 2/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 135s 2s/step - accuracy: 0.9642 - loss: 0.1531 - val_accuracy: 0.9606 - val_loss: 0.1563
Epoch 3/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 104s 2s/step - accuracy: 0.9623 - loss: 0.1432 - val_accuracy: 0.9607 - val_loss: 0.1358
Epoch 4/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 103s 2s/step - accuracy: 0.9630 - loss: 0.1177 - val_accuracy: 0.9648 - val_loss: 0.1104
Epoch 5/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 104s 2s/step - accuracy: 0.9707 - loss: 0.0885 - val_accuracy: 0.9727 - val_loss: 0.0901
Epoch 6/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 141s 2s/step - accuracy: 0.9790 - loss: 0.0684 - val_accuracy: 0.9779 - val_loss: 0.0751
Epoch 7/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 141s 2s/step - accuracy: 0.9871 - loss: 0.0510 - val_accuracy: 0.9831 - val_loss: 0.0625
Epoch 8/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 104s 2s/step - accuracy: 0.9919 - loss: 0.0377 - val_accuracy: 0.9857 - val_loss: 0.0540
Epoch 9/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 104s 2s/step - accuracy: 0.9947 - loss: 0.0265 - val_accuracy: 0.9874 - val_loss: 0.0472
Epoch 10/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 104s 2s/step - accuracy: 0.9963 - loss: 0.0209 - val_accuracy: 0.9885 - val_loss: 0.0431
Epoch 11/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 104s 2s/step - accuracy: 0.9976 - loss: 0.0148 - val_accuracy: 0.9891 - val_loss: 0.0401
Epoch 12/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 104s 2s/step - accuracy: 0.9983 - loss: 0.0121 - val_accuracy: 0.9895 - val_loss: 0.0386
Epoch 13/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 104s 2s/step - accuracy: 0.9986 - loss: 0.0093 - val_accuracy: 0.9897 - val_loss: 0.0376
Epoch 14/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 142s 2s/step - accuracy: 0.9989 - loss: 0.0077 - val_accuracy: 0.9896 - val_loss: 0.0385
Epoch 15/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 108s 2s/step - accuracy: 0.9991 - loss: 0.0067 - val_accuracy: 0.9896 - val_loss: 0.0385
Epoch 16/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 142s 2s/step - accuracy: 0.9992 - loss: 0.0057 - val_accuracy: 0.9899 - val_loss: 0.0371
Epoch 17/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 142s 2s/step - accuracy: 0.9995 - loss: 0.0045 - val_accuracy: 0.9897 - val_loss: 0.0392
Epoch 18/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 113s 2s/step - accuracy: 0.9995 - loss: 0.0040 - val_accuracy: 0.9899 - val_loss: 0.0385
Epoch 19/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 113s 2s/step - accuracy: 0.9996 - loss: 0.0035 - val_accuracy: 0.9896 - val_loss: 0.0404
Epoch 20/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 114s 2s/step - accuracy: 0.9997 - loss: 0.0030 - val_accuracy: 0.9898 - val_loss: 0.0391
Epoch 21/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 141s 2s/step - accuracy: 0.9997 - loss: 0.0028 - val_accuracy: 0.9898 - val_loss: 0.0406
Epoch 22/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 142s 2s/step - accuracy: 0.9998 - loss: 0.0022 - val_accuracy: 0.9896 - val_loss: 0.0421
Epoch 23/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 142s 2s/step - accuracy: 0.9998 - loss: 0.0021 - val_accuracy: 0.9897 - val_loss: 0.0417
Epoch 24/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 113s 2s/step - accuracy: 0.9998 - loss: 0.0019 - val_accuracy: 0.9898 - val_loss: 0.0415
Epoch 25/25
68/68 ━━━━━━━━━━━━━━━━━━━━ 113s 2s/step - accuracy: 0.9998 - loss: 0.0019 - val_accuracy: 0.9897 - val_loss: 0.0434
7/7 ━━━━━━━━━━━━━━━━━━━━ 20s 1s/step
# Decode predicted/true tag ids back to IOB strings, one list per sentence.
y_pred_tags = [[idx2label[i] for i in row] for row in y_pred]
y_true_tags = [[idx2label[i] for i in row] for row in y_true]

# Flatten per-sentence sequences into single token-level lists for scoring.
flat_true = [tag for sent in y_true_tags for tag in sent]
flat_pred = [tag for sent in y_pred_tags for tag in sent]

# BUG FIX: flat_true/flat_pred contain *string* tags ('B-PER', ...), so
# `labels` must be the string tag names — the original passed the integer ids
# from label2idx.values(), which never match a string, making every row of the
# report show zero support. 'O' is still excluded to keep the majority class
# from dominating the averages; zero_division=0 silences the ill-defined
# metric warnings for classes absent from dev.
entity_labels = [tag for tag in label2idx if tag != 'O']
print(classification_report(
    flat_true,
    flat_pred,
    labels=entity_labels,
    target_names=entity_labels,
    zero_division=0,
))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 due to no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
              precision    recall  f1-score   support

       B-PER       0.00      0.00      0.00         0
       I-PER       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         0
       I-ORG       0.00      0.00      0.00         0
       B-LOC       0.00      0.00      0.00         0
       I-LOC       0.00      0.00      0.00         0
      B-MISC       0.00      0.00      0.00         0
      I-MISC       0.00      0.00      0.00         0

   micro avg       0.00      0.00      0.00         0
   macro avg       0.00      0.00      0.00         0
weighted avg       0.00      0.00      0.00         0

/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/home/michal/.local/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1509: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
def correct_iob_labels(predictions):
    """Repair invalid IOB transitions in predicted tag sequences.

    An 'I-X' tag that does not continue an entity of the same type X (i.e. it
    follows 'O' or a tag of a different type) is rewritten as 'B-X'. Input and
    output are lists of per-sentence tag lists.
    """
    fixed = []
    for sentence in predictions:
        repaired = []
        previous = 'O'
        for tag in sentence:
            if tag.startswith('I-') and (previous == 'O' or previous[2:] != tag[2:]):
                tag = 'B-' + tag[2:]
            repaired.append(tag)
            previous = tag
        fixed.append(repaired)
    return fixed
# Predict tag ids for the test split and decode them to IOB strings.
y_test_pred = model.predict(X_test)
y_test_pred = np.argmax(y_test_pred, axis=-1)
y_test_pred_tags = [[idx2label[i] for i in row] for row in y_test_pred]

# Enforce valid IOB transitions (I-X after O or a different type becomes B-X)
# before the sequences are written out.
y_pred_tags_corrected = correct_iob_labels(y_pred_tags)
y_test_pred_tags_corrected = correct_iob_labels(y_test_pred_tags)
8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 95ms/step
# Serialize the corrected dev predictions: one space-joined tag sequence per line.
dev_predictions = [' '.join(tags) for tags in y_pred_tags_corrected]
with open('en-ner-conll-2003/dev-0/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(f"{line}\n" for line in dev_predictions)
# Same format for the test split.
test_predictions = [' '.join(tags) for tags in y_test_pred_tags_corrected]
with open('en-ner-conll-2003/test-A/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(f"{line}\n" for line in test_predictions)