116 lines
4.8 KiB
Python
116 lines
4.8 KiB
Python
import csv
import sys
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
|
|
|
|
# Set the stdout encoding to UTF-8
|
|
# Switch the text encoding of standard output to UTF-8 for everything printed
# from here on. NOTE(review): presumably needed so the report printed later
# does not fail on consoles whose default encoding is not UTF-8 — confirm.
sys.stdout.reconfigure(encoding='utf-8')
|
|
|
|
# Load data
|
|
def load_data(file_path: str) -> list:
    """Read a UTF-8 text file and return its lines, stripped of whitespace.

    Args:
        file_path: Path of the file to read.

    Returns:
        One string per line, in file order, with leading and trailing
        whitespace (including the newline) removed. Blank lines are kept as
        empty strings; nothing is filtered out.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the handle directly so the raw lines and their stripped
        # copies are never both held in memory.
        return [line.strip() for line in file]
|
|
|
|
# The training file has no header row; its two tab-separated columns are
# named 'label' and 'sentence' here.
train_data = pd.read_csv(
    './train/train.tsv',
    sep='\t',
    header=None,
    names=['label', 'sentence'],
    encoding='utf-8',
    # Take the text literally: with the default quoting a stray '"' inside a
    # sentence opens a quoted field that can swallow the following rows. The
    # dev/test files below are read as raw lines, so this keeps all splits
    # parsed the same way.
    quoting=csv.QUOTE_NONE,
    # Keep every field a string: an empty field or a field spelling "NA" must
    # not become NaN, which has no .split().
    dtype=str,
    keep_default_na=False,
)
dev_sentences = load_data('./dev0/in.tsv')
dev_labels = load_data('./dev0/expected.tsv')
test_sentences = load_data('./testA/in.tsv')
|
|
|
|
# Preprocess data
|
|
def _split_tokens(text):
    """Whitespace-tokenize one field; a missing field (None/NaN) has no tokens."""
    if isinstance(text, str):
        return text.split()
    # NaN is the only float that is unequal to itself; pandas uses it for
    # missing fields, and calling .split() on it would raise AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return str(text).split()


def preprocess_data(sentences, labels=None):
    """Split sentences, and optionally label lines, on whitespace.

    Args:
        sentences: Iterable of sentence strings.
        labels: Optional iterable of label strings, one per sentence.

    Returns:
        The list of token lists when ``labels`` is None; otherwise the tuple
        ``(tokenized_sentences, tokenized_labels)``.
    """
    tokenized_sentences = [_split_tokens(sentence) for sentence in sentences]
    if labels is None:
        return tokenized_sentences
    tokenized_labels = [_split_tokens(label) for label in labels]
    return tokenized_sentences, tokenized_labels
|
|
|
|
# Tokenize all three splits. Train and dev come with labels and therefore
# yield a (sentences, labels) pair; the test split yields sentences only.
train_sentences, train_labels = preprocess_data(
    sentences=train_data['sentence'].values,
    labels=train_data['label'].values,
)
dev_sentences, dev_labels = preprocess_data(sentences=dev_sentences, labels=dev_labels)
test_sentences = preprocess_data(sentences=test_sentences)
|
|
|
|
# Create a word index and label index
|
|
# Word ids. 0 ('<PAD>') and 1 ('<UNK>') are reserved and entered first; the
# training vocabulary follows from id 2. The vocabulary is sorted because the
# iteration order of a set of strings depends on the hash seed, which would
# give every run a different mapping. The reserved spellings are removed from
# the vocabulary so the ids stay contiguous in 0..len(word2idx)-1, which the
# embedding layer relies on when it is sized with len(word2idx).
vocabulary = sorted(
    {word for sentence in train_sentences for word in sentence} - {'<PAD>', '<UNK>'}
)
word2idx = {
    '<PAD>': 0,
    '<UNK>': 1,
    **{word: index for index, word in enumerate(vocabulary, start=2)},
}
idx2word = {index: word for word, index in word2idx.items()}

# Label ids, built the same way: 0 is the padding label, real labels start at 1.
tagset = sorted(
    {label for label_list in train_labels for label in label_list} - {'<PAD>'}
)
label2idx = {
    '<PAD>': 0,
    **{label: index for index, label in enumerate(tagset, start=1)},
}
idx2label = {index: label for label, index in label2idx.items()}
|
|
|
|
# Convert words and labels to integers
|
|
def encode_data(sentences, labels=None):
    """Replace tokens, and optionally labels, by their integer ids.

    Args:
        sentences: Iterable of token lists.
        labels: Optional iterable of label lists.

    Returns:
        The list of id lists for ``sentences`` when ``labels`` is None;
        otherwise the tuple ``(encoded_sentences, encoded_labels)``.

    Raises:
        KeyError: If a label is not a key of ``label2idx``.
    """
    # Look the fallback id up once instead of once per token.
    unknown_id = word2idx['<UNK>']
    encoded_sentences = [
        [word2idx.get(word, unknown_id) for word in sentence]
        for sentence in sentences
    ]
    if labels is None:
        return encoded_sentences
    encoded_labels = [
        [label2idx[label] for label in label_list]
        for label_list in labels
    ]
    return encoded_sentences, encoded_labels
|
|
|
|
# Integer-encode the three splits; the test split has no labels to encode.
X_train, y_train = encode_data(sentences=train_sentences, labels=train_labels)
X_dev, y_dev = encode_data(sentences=dev_sentences, labels=dev_labels)
X_test = encode_data(sentences=test_sentences)
|
|
|
|
# Limit sequence length to avoid excessive memory usage
|
|
max_len = 100  # You can adjust this value to a reasonable limit based on your data and memory

# Bring every id sequence to exactly max_len positions. Shorter sequences are
# filled with 0 (the '<PAD>' id) at the end; longer ones are cut according to
# pad_sequences' default `truncating` mode, which is not overridden here.
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
y_train = pad_sequences(y_train, maxlen=max_len, padding='post')

X_dev = pad_sequences(X_dev, maxlen=max_len, padding='post')
y_dev = pad_sequences(y_dev, maxlen=max_len, padding='post')

X_test = pad_sequences(X_test, maxlen=max_len, padding='post')

# One-hot encode the label ids row by row; each row becomes a
# (max_len, number of labels) matrix.
num_classes = len(label2idx)
y_train = [to_categorical(row, num_classes=num_classes) for row in y_train]
y_dev = [to_categorical(row, num_classes=num_classes) for row in y_dev]
|
|
|
|
# Define the model with reduced complexity
|
|
# Layer sizes derived from the two index dictionaries.
vocab_size = len(word2idx)
num_tags = len(label2idx)

# Token ids -> 64-dim embeddings -> bidirectional LSTM that emits one vector
# per position -> softmax over the labels at every position.
# NOTE(review): `input_length` is reported as deprecated by newer Keras
# releases — confirm against the installed version.
embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len)
encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=32, return_sequences=True))
tagger = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_tags, activation='softmax'))

model = tf.keras.models.Sequential([embedding, encoder, tagger])

# Targets are one-hot, hence categorical (not sparse) cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
|
|
|
|
# Train the model with a smaller batch size
|
|
# The one-hot targets are lists of per-sentence matrices; stack each list
# into a single array before handing it to Keras.
train_targets = np.array(y_train)
dev_targets = np.array(y_dev)

history = model.fit(
    x=X_train,
    y=train_targets,
    validation_data=(X_dev, dev_targets),
    epochs=5,
    batch_size=16,
)
|
|
|
|
# Evaluate the model
|
|
def _tags_for_tokens(class_probs, token_lists, fallback_tag):
    """Decode padded model output into exactly one tag per input token.

    Args:
        class_probs: Array of shape (sentences, positions, classes) as
            returned by ``model.predict``; class 0 is the '<PAD>' class.
        token_lists: The tokenized sentences the input rows were built from,
            in the same order.
        fallback_tag: Tag given to tokens that never reached the model
            because their sentence was longer than the padded window.

    Returns:
        ``(best_ids, tags)``: the (sentences, positions) array of chosen
        class ids, and a list holding one tag list per sentence whose length
        equals that sentence's token count.
    """
    # A real token must never be tagged '<PAD>': choose among ids >= 1 only.
    best_ids = np.argmax(class_probs[:, :, 1:], axis=-1) + 1
    window = best_ids.shape[1]
    tags = []
    for row_ids, tokens in zip(best_ids, token_lists):
        n_tokens = len(tokens)
        kept = min(n_tokens, window)
        row_tags = [idx2label[int(class_id)] for class_id in row_ids[:kept]]
        # pad_sequences was called without `truncating`, so Keras' documented
        # default ('pre') applied: an over-long sentence kept its LAST
        # `window` tokens. The tokens cut from the front get the fallback.
        tags.append([fallback_tag] * (n_tokens - kept) + row_tags)
    return best_ids, tags


# Tokens the model never saw are tagged with the most frequent training tag.
fallback_tag = Counter(
    tag for tag_list in train_labels for tag in tag_list
).most_common(1)[0][0]

# Dev predictions: one tag per dev token, no padding positions.
y_pred, y_pred_tags = _tags_for_tokens(model.predict(X_dev), dev_sentences, fallback_tag)
y_true = np.argmax(np.array(y_dev), axis=-1)

# Gold tags come straight from the tokenized expected file, so they are not
# padded or truncated and line up with y_pred_tags token by token.
y_true_tags = [list(tag_list) for tag_list in dev_labels]

# Flatten sentence by sentence. The inner zip() stops at the shorter row, so
# a gold line whose tag count differs from its sentence's token count cannot
# shift the pairing of the sentences that follow.
gold_flat = []
pred_flat = []
for gold_row, pred_row in zip(y_true_tags, y_pred_tags):
    for gold_tag, pred_tag in zip(gold_row, pred_row):
        gold_flat.append(gold_tag)
        pred_flat.append(pred_tag)

# Print the classification report. `labels` must be of the same kind as the
# values being scored (tag strings, not ids); the padding label is left out.
report_labels = sorted(tag for tag in label2idx if tag != '<PAD>')
print(classification_report(gold_flat, pred_flat, labels=report_labels))

# Predict on test data, again one tag per token.
y_test_pred, y_test_pred_tags = _tags_for_tokens(
    model.predict(X_test), test_sentences, fallback_tag
)
|
|
|
|
# Save dev predictions
|
|
# One output line per dev sentence: its predicted tags joined by single spaces.
dev_predictions = [' '.join(tags) for tags in y_pred_tags]
with open('./dev0/out.tsv', 'w', encoding='utf-8') as out_file:
    out_file.writelines("%s\n" % prediction for prediction in dev_predictions)
|
|
|
|
# Save test predictions
|
|
# One output line per test sentence: its predicted tags joined by single spaces.
test_predictions = [' '.join(tags) for tags in y_test_pred_tags]
with open('./testA/out.tsv', 'w', encoding='utf-8') as out_file:
    out_file.writelines("%s\n" % prediction for prediction in test_predictions)
|