136 lines
4.9 KiB
Python
136 lines
4.9 KiB
Python
![]() |
import pandas as pd
|
||
|
import torch
|
||
|
import torch.nn as nn
|
||
|
import torch.optim as optim
|
||
|
from torch.nn.utils.rnn import pad_sequence
|
||
|
from collections import Counter
|
||
|
|
||
|
# Loading data from TSV files
def load_data(file_path):
    """Read a text file into a list of whitespace-separated token lists.

    Every line is stripped and split on whitespace. A blank (or
    whitespace-only) line is kept as the single-token row ``["<pad>"]``,
    so the result has exactly one entry per line of the file.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list containing one list of string tokens per input line.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle]
    return [text.split() if text else ["<pad>"] for text in stripped_lines]
|
||
|
|
||
|
# Load the data.
# NOTE(review): the "train" sentences and labels are read from the dev-0
# split — confirm this is intentional.
train_sentences, train_labels, test_sentences = (
    load_data(data_path)
    for data_path in (
        'en-ner-conll-2003/dev-0/in.tsv',
        'en-ner-conll-2003/dev-0/expected.tsv',
        'en-ner-conll-2003/test-A/in.tsv',
    )
)
|
||
|
|
||
|
# Vocabulary and label mapping
def build_vocab(sentences):
    """Build a token -> integer id vocabulary from tokenised sentences.

    Tokens are numbered from 2 upwards in ``Counter.most_common()`` order
    (most frequent first). The token ``"<pad>"`` is not counted; the
    reserved entries ``'<pad>'`` and ``'<unk>'`` get ids 0 and 1.

    Args:
        sentences: Iterable of token lists.

    Returns:
        A dict mapping each token to its integer id.
    """
    frequencies = Counter()
    for sentence in sentences:
        frequencies.update(token for token in sentence if token != "<pad>")

    vocab = {}
    for token_id, (token, _count) in enumerate(frequencies.most_common(), start=2):
        vocab[token] = token_id
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab
|
||
|
|
||
|
def build_label_mapping(labels):
    """Build a label -> integer id mapping from per-sentence label lists.

    The distinct labels (excluding ``"<pad>"``) are numbered from 1 in
    sorted order, so the mapping is identical on every run; iterating a
    plain ``set`` of strings would make the ids depend on hash
    randomisation. Id 0 is reserved for ``'<pad>'``.

    Args:
        labels: Iterable of label lists, one per sentence.

    Returns:
        A dict mapping each label string to its integer id.
    """
    unique_labels = {
        label for sentence in labels for label in sentence if label != "<pad>"
    }
    label_map = {label: idx for idx, label in enumerate(sorted(unique_labels), start=1)}
    label_map['<pad>'] = 0
    return label_map
|
||
|
|
||
|
# Build the lookup tables from the training sentences and labels.
label_map = build_label_mapping(train_labels)
num_tags = len(label_map)  # counts the '<pad>' entry as well
vocab = build_vocab(train_sentences)
|
||
|
|
||
|
# Data conversion
def convert_to_tensor(sentences, vocab):
    """Encode tokenised sentences as 1-D ``torch.long`` id tensors.

    Tokens missing from ``vocab`` are replaced by the id stored under
    ``vocab['<unk>']``.

    Args:
        sentences: Iterable of token lists.
        vocab: Dict mapping tokens to integer ids; must contain '<unk>'.

    Returns:
        A list with one tensor of token ids per sentence.
    """
    encoded = []
    for sentence in sentences:
        token_ids = [vocab.get(token, vocab['<unk>']) for token in sentence]
        encoded.append(torch.tensor(token_ids, dtype=torch.long))
    return encoded
|
||
|
|
||
|
def convert_labels_to_tensor(labels, label_map):
    """Encode per-sentence label lists as 1-D ``torch.long`` id tensors.

    Args:
        labels: Iterable of label lists, one per sentence.
        label_map: Dict mapping label strings to integer ids.

    Returns:
        A list with one tensor of label ids per sentence.

    Raises:
        KeyError: If a label is not present in ``label_map``.
    """
    encoded = []
    for sentence in labels:
        tag_ids = [label_map[label] for label in sentence]
        encoded.append(torch.tensor(tag_ids, dtype=torch.long))
    return encoded
|
||
|
|
||
|
# Encode the string data as integer id tensors (one 1-D tensor per line).
train_tokens_ids = convert_to_tensor(train_sentences, vocab)
test_tokens_ids = convert_to_tensor(test_sentences, vocab)
train_labels_ids = convert_labels_to_tensor(train_labels, label_map)
|
||
|
|
||
|
# Padding
def pad_data(sequences):
    """Stack variable-length tensors into one batch-first, 0-padded tensor.

    Args:
        sequences: List of tensors to pad to a common length.

    Returns:
        The tensor produced by ``pad_sequence`` with ``batch_first=True``
        and ``padding_value=0``.
    """
    padded_batch = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded_batch
|
||
|
|
||
|
# Pad each collection to its own longest sequence (padding id 0).
padded_train_labels_ids = pad_data(train_labels_ids)
padded_train_tokens_ids = pad_data(train_tokens_ids)
padded_test_tokens_ids = pad_data(test_tokens_ids)
|
||
|
|
||
|
# LSTM model definition
class LSTMNER(nn.Module):
    """Per-token tagger: embedding -> single-layer LSTM -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        num_tags: Size of the output (tag score) dimension.
        embedding_dim: Width of the token embeddings. Defaults to 100.
        hidden_dim: Hidden size of the LSTM. Defaults to 256.
    """

    def __init__(self, vocab_size, num_tags, embedding_dim=100, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_tags)

    def forward(self, x):
        """Return tag scores for every position of ``x``.

        Args:
            x: Integer token ids; the LSTM is batch-first, so callers pass
                a (batch, seq_len) tensor.

        Returns:
            A tensor of scores with a trailing dimension of ``num_tags``.
        """
        emb = self.embedding(x)
        lstm_out, _ = self.lstm(emb)  # final hidden/cell states are unused
        return self.fc(lstm_out)
|
||
|
|
||
|
# Parameters
model = LSTMNER(len(vocab), num_tags)
# Label id 0 is '<pad>'; those positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())

# Training
num_epochs = 25  # SHOULD BE REDUCED BEFORE RUNNING
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    trained_sentences = 0
    # One padded sentence per optimisation step (batch size 1).
    for x, y in zip(padded_train_tokens_ids, padded_train_labels_ids):
        # A row whose labels are all padding (e.g. a blank input line) has
        # nothing to learn from, and the mean-reduced loss over zero
        # non-ignored targets is NaN, which would poison the epoch total.
        if not bool((y != 0).any()):
            continue

        x = x.unsqueeze(0)
        y = y.unsqueeze(0)

        optimizer.zero_grad()
        outputs = model(x)
        # Flatten to (positions, num_tags) vs (positions,) for the loss.
        loss = criterion(outputs.view(-1, num_tags), y.view(-1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        trained_sentences += 1
    # Average over the sentences actually used; max() guards empty data.
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / max(trained_sentences, 1)}")
|
||
|
|
||
|
# Prediction and writing the results to a file
def predict(model, data, label_map, pad_token_id=0):
    """Predict one tag per real (non-padding) token of each sentence.

    Differences from a plain argmax over the padded input:
      * trailing padding positions produce no tags, so each prediction is
        as long as the sentence itself;
      * the '<pad>' class is excluded from the argmax, so a real token
        never receives the padding label;
      * only the batch dimension is squeezed, so one-token sentences work.

    Args:
        model: Module mapping a (1, seq_len) id tensor to
            (1, seq_len, num_tags) scores.
        data: Iterable of 1-D token id tensors (rows of a padded batch or
            unpadded sentences).
        label_map: Dict mapping label strings to integer ids.
        pad_token_id: Token id used for padding. Defaults to 0.

    Returns:
        A list with one list of label strings per sentence; a sentence
        consisting only of padding yields an empty list.
    """
    model.eval()
    reverse_label_map = {idx: label for label, idx in label_map.items()}
    pad_label_id = label_map.get('<pad>')
    predictions = []
    with torch.no_grad():
        for sentence in data:
            # Real length = position of the last non-padding token + 1.
            real_positions = (sentence != pad_token_id).nonzero()
            if real_positions.numel() == 0:
                predictions.append([])
                continue
            length = int(real_positions[-1]) + 1

            scores = model(sentence.unsqueeze(0)).squeeze(0)[:length]
            if pad_label_id is not None and scores.size(-1) > 1:
                # Work on a copy so the model output is left untouched.
                scores = scores.clone()
                scores[:, pad_label_id] = float('-inf')
            predicted_tags = scores.argmax(dim=-1).tolist()
            predictions.append([reverse_label_map[tag] for tag in predicted_tags])
    return predictions
|
||
|
|
||
|
def save_predictions(predictions, file_path):
    """Write predictions to a file, one space-separated line per sentence.

    Tags equal to ``'<pad>'`` are left out of the output line.

    Args:
        predictions: Iterable of lists of tag strings.
        file_path: Destination path; the file is overwritten (UTF-8).
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(
            ' '.join(tag for tag in sentence_preds if tag != '<pad>') + "\n"
            for sentence_preds in predictions
        )
|
||
|
|
||
|
def correct_labels(predictions):
    """Repair BIO sequences in which an I- tag does not continue an entity.

    An ``I-X`` tag is rewritten to ``B-X`` when it is the first tag of the
    sentence or when the preceding tag is not a ``B-X``/``I-X`` tag of
    exactly the same entity type ``X``. The type is compared for equality
    rather than with a suffix match, so e.g. ``B-LOC`` is not treated as
    the start of an ``I-C`` entity.

    Args:
        predictions: Iterable of lists of tag strings.

    Returns:
        A new list of tag lists with the corrected tags; the input is not
        modified.
    """
    corrected = []
    for sentence in predictions:
        fixed_sentence = []
        # Entity type of the preceding B-/I- tag, or None for any other tag.
        previous_type = None
        for tag in sentence:
            prefix, entity_type = tag[:2], tag[2:]
            if prefix == 'I-' and previous_type != entity_type:
                tag = 'B-' + entity_type
            fixed_sentence.append(tag)
            previous_type = entity_type if prefix in ('B-', 'I-') else None
        corrected.append(fixed_sentence)
    return corrected
|
||
|
|
||
|
# Dev split: predict, repair the BIO tags, write the output file.
dev_predictions = predict(model, padded_train_tokens_ids, label_map)
corrected_dev_predictions = correct_labels(dev_predictions)
save_predictions(corrected_dev_predictions, 'en-ner-conll-2003/dev-0/out.tsv')

# Test split: same pipeline.
test_predictions = predict(model, padded_test_tokens_ids, label_map)
corrected_test_predictions = correct_labels(test_predictions)
save_predictions(corrected_test_predictions, 'en-ner-conll-2003/test-A/out.tsv')

print("Predykcje zapisane.")
|