22 KiB
22 KiB
import csv

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
def load_data(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line of the file, each passed through
        ``str.strip()``. Blank lines are kept as empty strings so that the
        list stays aligned with the file's line numbers.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        # Iterate the file object directly; there is no need to hold the raw
        # lines and the stripped lines in memory at the same time.
        return [line.strip() for line in fh]
# Training data: a header-less TSV with the tag sequence in the first column
# and the text in the second.
# quoting=csv.QUOTE_NONE makes pandas treat '"' as an ordinary character.
# With the default quoting, a field that starts with a double quote is parsed
# as a quoted field (and may swallow following tabs/newlines), whereas the
# dev/test files below are read verbatim by load_data().
train_data = pd.read_csv(
    'en-ner-conll-2003-master/en-ner-conll-2003/train/train.tsv/train.tsv',
    sep='\t',
    header=None,
    names=['label', 'sentence'],
    encoding='utf-8',
    quoting=csv.QUOTE_NONE,
)

# Dev split: inputs and gold tags live in two parallel line-oriented files.
dev_sentences = load_data('en-ner-conll-2003-master/en-ner-conll-2003/dev-0/in.tsv')
dev_labels = load_data('en-ner-conll-2003-master/en-ner-conll-2003/dev-0/expected.tsv')

# Test split: inputs only.
test_sentences = load_data('en-ner-conll-2003-master/en-ner-conll-2003/test-A/in.tsv')
def preprocess_data(sentences, labels=None):
    """Split sentences (and optionally their tag strings) on whitespace.

    Args:
        sentences: Iterable of strings, one per input line.
        labels: Optional iterable of strings holding whitespace-separated
            tags, parallel to ``sentences``.

    Returns:
        The list of token lists when ``labels`` is ``None``; otherwise a
        ``(tokenized_sentences, tokenized_labels)`` tuple.
    """
    tokenized_sentences = [sentence.split() for sentence in sentences]
    if labels is None:
        return tokenized_sentences
    tokenized_labels = [label.split() for label in labels]
    return tokenized_sentences, tokenized_labels
# Tokenize every split. Train and dev carry gold tags, so they come back as
# (tokens, tags) pairs; the test split has inputs only.
train_sentences, train_labels = preprocess_data(
    sentences=train_data['sentence'].to_numpy(),
    labels=train_data['label'].to_numpy(),
)
dev_sentences, dev_labels = preprocess_data(sentences=dev_sentences, labels=dev_labels)
test_sentences = preprocess_data(sentences=test_sentences)
# Reserved tokens occupy the lowest indices (0..3).
special_tokens = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']

# Sort the training vocabulary before numbering it. Iterating a raw set of
# strings yields a different order on every interpreter run (string hash
# randomization), which would make the word ids non-reproducible between runs.
vocabulary = sorted({word for sentence in train_sentences for word in sentence})

# Corpus words are numbered after the reserved range.
word2idx = {
    word: index
    for index, word in enumerate(vocabulary, start=len(special_tokens))
}
# Reserved tokens are written last so they keep their fixed ids even if the
# same string happens to occur as a corpus token.
word2idx.update({token: index for index, token in enumerate(special_tokens)})

# Reverse lookup: id -> word.
idx2word = {index: word for word, index in word2idx.items()}
# Tag inventory in id order: 'O' first, then a B-/I- pair per entity type.
# The position of each tag in this tuple is its integer id.
label2idx = {
    tag: position
    for position, tag in enumerate((
        'O',
        'B-PER', 'I-PER',
        'B-ORG', 'I-ORG',
        'B-LOC', 'I-LOC',
        'B-MISC', 'I-MISC',
    ))
}
# Reverse lookup: id -> tag. The ids are 0..n-1 in insertion order, so
# enumerating the keys reproduces the inverse mapping.
idx2label = dict(enumerate(label2idx))
def encode_data(sentences, labels=None):
    """Map tokens (and optionally tags) to their integer ids.

    Uses the module-level ``word2idx`` and ``label2idx`` tables.

    Args:
        sentences: Iterable of token lists.
        labels: Optional iterable of tag lists, parallel to ``sentences``.

    Returns:
        The list of id lists when ``labels`` is ``None``; otherwise an
        ``(encoded_sentences, encoded_labels)`` tuple.

    Raises:
        KeyError: If a tag is not present in ``label2idx``.
    """
    # Look the fallback id up once instead of once per token.
    unk_index = word2idx['<UNK>']
    encoded_sentences = [
        [word2idx.get(word, unk_index) for word in sentence]
        for sentence in sentences
    ]
    if labels is None:
        return encoded_sentences
    encoded_labels = [
        [label2idx[label] for label in label_list]
        for label_list in labels
    ]
    return encoded_sentences, encoded_labels
# Integer-encode every split.
X_train, y_train = encode_data(train_sentences, train_labels)
X_dev, y_dev = encode_data(dev_sentences, dev_labels)
X_test = encode_data(test_sentences)

# Every sequence is brought to exactly max_len positions. Shorter sequences
# are right-padded (padding='post') with pad_sequences' default value 0, which
# is '<PAD>' for words and 'O' for tags. Longer sequences are cut by
# pad_sequences' default truncation.
max_len = 1000
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
y_train = pad_sequences(y_train, padding='post', maxlen=max_len)
X_dev = pad_sequences(X_dev, padding='post', maxlen=max_len)
y_dev = pad_sequences(y_dev, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)

# One-hot encode the whole (n_sequences, max_len) tag matrix in one call,
# giving an array of shape (n_sequences, max_len, n_labels). This replaces a
# Python list of per-row arrays; np.array(...) on the result still works.
y_train = to_categorical(y_train, num_classes=len(label2idx))
y_dev = to_categorical(y_dev, num_classes=len(label2idx))
# Token tagger: embedding -> bidirectional LSTM -> per-position softmax over
# the tag set.
model = tf.keras.models.Sequential([
    # `input_length` is deprecated in current Keras (it only triggers a
    # UserWarning asking to remove it), so it is no longer passed.
    tf.keras.layers.Embedding(input_dim=len(word2idx), output_dim=64),
    # return_sequences=True keeps one output vector per position.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=32, return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(label2idx), activation='softmax')),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): no masking is configured, so the reported loss/accuracy are
# computed over padded positions as well as real tokens.
history = model.fit(
    X_train,
    np.array(y_train),
    validation_data=(X_dev, np.array(y_dev)),
    epochs=25,
    batch_size=16,
)

# Collapse the per-position class probabilities / one-hot rows to tag ids.
y_pred = model.predict(X_dev)
y_pred = np.argmax(y_pred, axis=-1)
y_true = np.argmax(np.array(y_dev), axis=-1)
Epoch 1/25
C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\keras\src\layers\core\embedding.py:90: UserWarning: Argument `input_length` is deprecated. Just remove it. warnings.warn(
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 773ms/step - accuracy: 0.8902 - loss: 0.6219 - val_accuracy: 0.9606 - val_loss: 0.1716 Epoch 2/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 825ms/step - accuracy: 0.9656 - loss: 0.1477 - val_accuracy: 0.9606 - val_loss: 0.1553 Epoch 3/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 788ms/step - accuracy: 0.9650 - loss: 0.1328 - val_accuracy: 0.9607 - val_loss: 0.1352 Epoch 4/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 780ms/step - accuracy: 0.9646 - loss: 0.1136 - val_accuracy: 0.9648 - val_loss: 0.1134 Epoch 5/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 772ms/step - accuracy: 0.9705 - loss: 0.0910 - val_accuracy: 0.9716 - val_loss: 0.0963 Epoch 6/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 746ms/step - accuracy: 0.9776 - loss: 0.0753 - val_accuracy: 0.9761 - val_loss: 0.0827 Epoch 7/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 778ms/step - accuracy: 0.9837 - loss: 0.0590 - val_accuracy: 0.9804 - val_loss: 0.0707 Epoch 8/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 777ms/step - accuracy: 0.9880 - loss: 0.0484 - val_accuracy: 0.9833 - val_loss: 0.0614 Epoch 9/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 751ms/step - accuracy: 0.9914 - loss: 0.0391 - val_accuracy: 0.9852 - val_loss: 0.0538 Epoch 10/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 772ms/step - accuracy: 0.9937 - loss: 0.0292 - val_accuracy: 0.9865 - val_loss: 0.0486 Epoch 11/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 766ms/step - accuracy: 0.9950 - loss: 0.0234 - val_accuracy: 0.9878 - val_loss: 0.0447 Epoch 12/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 750ms/step - accuracy: 0.9960 - loss: 0.0198 - val_accuracy: 0.9885 - val_loss: 0.0420 Epoch 13/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 743ms/step - accuracy: 0.9971 - loss: 0.0160 - val_accuracy: 0.9890 - 
val_loss: 0.0397 Epoch 14/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 754ms/step - accuracy: 0.9975 - loss: 0.0137 - val_accuracy: 0.9892 - val_loss: 0.0386 Epoch 15/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 745ms/step - accuracy: 0.9981 - loss: 0.0110 - val_accuracy: 0.9896 - val_loss: 0.0369 Epoch 16/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 759ms/step - accuracy: 0.9983 - loss: 0.0097 - val_accuracy: 0.9897 - val_loss: 0.0365 Epoch 17/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 731ms/step - accuracy: 0.9986 - loss: 0.0082 - val_accuracy: 0.9897 - val_loss: 0.0365 Epoch 18/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 714ms/step - accuracy: 0.9989 - loss: 0.0070 - val_accuracy: 0.9900 - val_loss: 0.0355 Epoch 19/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 711ms/step - accuracy: 0.9991 - loss: 0.0059 - val_accuracy: 0.9899 - val_loss: 0.0351 Epoch 20/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 729ms/step - accuracy: 0.9992 - loss: 0.0054 - val_accuracy: 0.9900 - val_loss: 0.0353 Epoch 21/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 731ms/step - accuracy: 0.9993 - loss: 0.0048 - val_accuracy: 0.9900 - val_loss: 0.0350 Epoch 22/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 714ms/step - accuracy: 0.9993 - loss: 0.0044 - val_accuracy: 0.9901 - val_loss: 0.0353 Epoch 23/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 738ms/step - accuracy: 0.9994 - loss: 0.0040 - val_accuracy: 0.9900 - val_loss: 0.0359 Epoch 24/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 725ms/step - accuracy: 0.9995 - loss: 0.0036 - val_accuracy: 0.9900 - val_loss: 0.0353 Epoch 25/25 [1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 722ms/step - accuracy: 0.9995 - loss: 0.0033 - val_accuracy: 0.9899 - val_loss: 0.0366 [1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1s/step
# Convert tag ids back to tag strings (still padded to the full sequence width).
y_pred_tags = [[idx2label[i] for i in row] for row in y_pred]
y_true_tags = [[idx2label[i] for i in row] for row in y_true]

# Score only real positions. Padding sits at the end of each row, so the
# first min(gold length, row width) positions are the ones that carry tokens.
flat_true_tags = []
flat_pred_tags = []
for true_row, pred_row, gold_tags in zip(y_true_tags, y_pred_tags, dev_labels):
    n_real = min(len(gold_tags), len(true_row))
    flat_true_tags.extend(true_row[:n_real])
    flat_pred_tags.extend(pred_row[:n_real])

# The sequences above hold tag *strings*, so `labels` must be tag strings too.
# Passing the integer ids matched nothing and produced a report with support 0
# for every class. 'O' is left out so the report covers entity tags only.
entity_labels = [label for label in label2idx if label != 'O']
print(classification_report(
    flat_true_tags,
    flat_pred_tags,
    labels=entity_labels,
))
C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 due to no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. 
Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
precision recall f1-score support B-PER 0.00 0.00 0.00 0 I-PER 0.00 0.00 0.00 0 B-ORG 0.00 0.00 0.00 0 I-ORG 0.00 0.00 0.00 0 B-LOC 0.00 0.00 0.00 0 I-LOC 0.00 0.00 0.00 0 B-MISC 0.00 0.00 0.00 0 I-MISC 0.00 0.00 0.00 0 micro avg 0.00 0.00 0.00 0 macro avg 0.00 0.00 0.00 0 weighted avg 0.00 0.00 0.00 0
C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) C:\Users\Kosmitos\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_classification.py:1517: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
def correct_iob_labels(predictions):
    """Repair IOB tag sequences so that no entity starts with an ``I-`` tag.

    An ``I-X`` tag is rewritten to ``B-X`` when it is the first tag of a
    sequence, follows ``O``, or follows a tag of a different entity type.
    All other tags are copied unchanged.

    Args:
        predictions: Iterable of tag sequences (iterables of strings).

    Returns:
        A new list of lists with the corrected tags; the input is not modified.
    """
    corrected = []
    for sequence in predictions:
        fixed = []
        # The (already corrected) previous tag; a sequence starts as if
        # preceded by 'O'.
        previous = 'O'
        for tag in sequence:
            continues_entity = previous != 'O' and previous[2:] == tag[2:]
            if tag.startswith('I-') and not continues_entity:
                tag = 'B-' + tag[2:]
            fixed.append(tag)
            previous = tag
        corrected.append(fixed)
    return corrected
# Predict the test split and reduce the class probabilities to tag ids.
y_test_pred = np.argmax(model.predict(X_test), axis=-1)
# Tag ids -> tag strings, row by row.
y_test_pred_tags = [list(map(idx2label.__getitem__, row)) for row in y_test_pred]

# Repair ill-formed IOB sequences in both the dev and the test predictions.
y_pred_tags_corrected, y_test_pred_tags_corrected = (
    correct_iob_labels(tag_rows) for tag_rows in (y_pred_tags, y_test_pred_tags)
)
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step
def _align_tags(tags, n_tokens):
    """Return exactly ``n_tokens`` tags for one input line.

    The model emits one tag per padded position, so a row is normally longer
    than the line it belongs to; the trailing padding tags are dropped. If the
    line had more tokens than the row has tags, the missing leading positions
    are filled with 'O' (Keras' default truncation in pad_sequences removes
    tokens from the front, so those are the positions without a prediction).
    """
    tags = list(tags)
    n_missing = n_tokens - len(tags)
    if n_missing > 0:
        return ['O'] * n_missing + tags
    return tags[:n_tokens]


# One output line per input line, one tag per input token.
dev_predictions = [
    ' '.join(_align_tags(tags, len(tokens)))
    for tags, tokens in zip(y_pred_tags_corrected, dev_sentences)
]
with open('en-ner-conll-2003-master/en-ner-conll-2003/dev-0/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(f"{prediction}\n" for prediction in dev_predictions)

test_predictions = [
    ' '.join(_align_tags(tags, len(tokens)))
    for tags, tokens in zip(y_test_pred_tags_corrected, test_sentences)
]
with open('en-ner-conll-2003-master/en-ner-conll-2003/test-A/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(f"{prediction}\n" for prediction in test_predictions)