# Forked from kubapok/en-ner-conll-2003.
# File size: 11 KiB.
import pandas as pd
import numpy as np
import csv
import os.path
import shutil
import torch
from tqdm import tqdm
from itertools import islice
from sklearn.model_selection import train_test_split
from torchtext.vocab import Vocab
from collections import Counter
from nltk.tokenize import word_tokenize
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
# Notebook output (warning emitted while importing gensim):
# C:\Users\grzyb\anaconda3\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning. warnings.warn(msg)
class NERModel(torch.nn.Module):
    """Feed-forward tagger that classifies the centre token of a fixed window.

    Each token id in the window is embedded, the embeddings are flattened
    into a single vector and a linear layer maps it to one score per label.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        window_size: Number of token ids passed to ``forward`` at once.
        num_labels: Number of output scores (one per label).

    The defaults reproduce the original hard-coded configuration
    (23628 x 200 embeddings, 3-token window, 9 labels), so ``NERModel()``
    and previously saved state dicts keep working.
    """

    def __init__(self, vocab_size=23628, embedding_dim=200, window_size=3,
                 num_labels=9):
        super().__init__()
        self.window_size = window_size
        self.embedding_dim = embedding_dim
        # Attribute names ``emb`` and ``fc1`` are part of the state-dict
        # layout and must not be renamed.
        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = torch.nn.Linear(window_size * embedding_dim, num_labels)

    def forward(self, x):
        """Return unnormalised label scores for one window.

        Args:
            x: 1-D integer tensor holding ``window_size`` token ids.

        Returns:
            1-D tensor with ``num_labels`` scores.
        """
        x = self.emb(x)
        # Flatten (window_size, embedding_dim) into one feature vector.
        x = x.reshape(self.window_size * self.embedding_dim)
        x = self.fc1(x)
        return x
def process_output(lines):
    """Repair predicted tag sequences and join each one into a string.

    Every ``I-`` tag is rewritten based on the tag that precedes it (after
    that tag's own rewrite):

    * at the start of a line or after ``"O"`` it becomes the matching
      ``B-`` tag;
    * otherwise it becomes ``I-`` plus the entity type of the preceding
      tag, so a span never changes type without a ``B-`` tag.

    All other tags are kept unchanged.

    Args:
        lines: Iterable of label sequences (one sequence per sentence).

    Returns:
        List of strings, one per input sequence, with the repaired labels
        separated by single spaces.
    """
    result = []
    for line in lines:
        last_label = None
        new_line = []
        for label in line:
            if label.startswith("I-"):
                if last_label is None or last_label == "O":
                    # An inside tag cannot open a span: promote it.
                    label = label.replace('I-', 'B-')
                else:
                    # Continue the span with the previous tag's type.
                    label = "I-" + last_label[2:]
            last_label = label
            new_line.append(label)
        result.append(" ".join(new_line))
    return result
def build_vocab(dataset):
    """Build a vocabulary from tokenised documents.

    Args:
        dataset: Iterable of documents; each document is passed to
            ``Counter.update`` (here: a list of tokens).

    Returns:
        A ``Vocab`` built from the token frequencies, with the special
        entries ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
    """
    token_counts = Counter()
    for tokens in dataset:
        token_counts.update(tokens)
    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
    return Vocab(token_counts, specials=special_tokens)
def data_process(dt):
    """Encode tokenised documents as id tensors wrapped in <bos>/<eos>.

    Uses the module-level ``vocab`` for the token-to-id lookup.

    Args:
        dt: Iterable of documents, each an iterable of tokens.

    Returns:
        List of 1-D ``torch.long`` tensors, one per document, laid out as
        ``[<bos>, token ids..., <eos>]``.
    """
    encoded = []
    for document in dt:
        ids = [vocab['<bos>']]
        ids.extend(vocab[token] for token in document)
        ids.append(vocab['<eos>'])
        encoded.append(torch.tensor(ids, dtype=torch.long))
    return encoded
def labels_process(dt):
    """Convert label-id sequences to tensors padded with label 0 at both ends.

    The leading and trailing 0 line the labels up with token sequences that
    carry one extra element at each end.

    Args:
        dt: Iterable of documents, each an iterable of integer label ids.
            Any iterable is accepted (list, tuple, ...), not only lists.

    Returns:
        List of 1-D ``torch.long`` tensors, one per document, laid out as
        ``[0, label ids..., 0]``.
    """
    # Unpacking instead of ``[0] + document + [0]``: list concatenation
    # raises for tuples and performs element-wise addition for arrays.
    return [torch.tensor([0, *document, 0], dtype=torch.long) for document in dt]
def predict(input_tokens, labels, model=None, device=None):
    """Label every interior position of each sequence with a 3-token window.

    For position ``j`` (excluding the first and last element) the slice
    ``[j-1, j, j+1]`` is fed to the model and the arg-max score selects the
    label.

    Args:
        input_tokens: Iterable of 1-D tensors of token ids. One label is
            produced for every element except the first and the last, so
            each tensor should contain only the ids to be labelled plus one
            boundary element at each end.
        labels: Sequence mapping a class index to its label string.
        model: Callable returning per-label scores for a window. Defaults
            to the module-level ``ner_model``.
        device: Device the windows are moved to before the model call.
            Defaults to the module-level ``device_gpu``.

    Returns:
        List of label lists, one per input sequence. Sequences with fewer
        than three elements yield an empty list.
    """
    # Fall back to the notebook-level globals so existing two-argument
    # calls behave as before.
    if model is None:
        model = ner_model
    if device is None:
        device = device_gpu

    results = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for sentence in input_tokens:
            line_results = []
            for j in range(1, len(sentence) - 1):
                window = sentence[j - 1: j + 2].to(device)
                scores = model(window.long())
                line_results.append(labels[int(torch.argmax(scores))])
            results.append(line_results)
    return results
# Training split: column 'a' holds the space-separated labels, column 'b'
# the space-separated tokens of the same line.
train = pd.read_csv('train/train.tsv.xz', sep='\t', names=['a', 'b'])
labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
# Labels become their index in `labels`; tokens become a list of strings.
train["a"] = train["a"].apply(lambda x: [labels.index(y) for y in x.split()])
train["b"] = train["b"].apply(lambda x: x.split())
vocab = build_vocab(train['b'])

# Per-sentence orthographic flags: for every word, (first char is upper-case,
# first char is a digit), flattened into one 1-D tensor per sentence.
# Built in a single torch.tensor call per sentence; concatenating one tiny
# tensor per word re-copies the growing tensor on every step.
tensors = [
    torch.tensor(
        [flag for word in sent for flag in (word[0].isupper(), word[0].isdigit())],
        dtype=torch.get_default_dtype(),
    )
    for sent in train["b"]
]
# Use the first CUDA device when one is present, otherwise fall back to the
# CPU so the script also runs on machines without a GPU.
if torch.cuda.is_available():
    # Notebook output of the original run: 'NVIDIA GeForce RTX 2060'
    torch.cuda.get_device_name(0)
    device_gpu = torch.device("cuda:0")
else:
    device_gpu = torch.device("cpu")

ner_model = NERModel().to(device_gpu)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())

train_labels = labels_process(train['a'])
train_tokens_ids = data_process(train['b'])
# Token ids followed by the sentence's orthographic flags.
# NOTE(review): the flags are appended after the last token id, and the
# training loop only slices positions covered by train_labels, so it appears
# these flag values are never read during training -- confirm the intent.
train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]
# Train for 5 epochs, one optimiser step per token window, and report
# F1 over non-'O' labels (class index 0 is 'O') and accuracy per epoch.
for epoch in range(5):
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0
    ner_model.train()
    for sentence_tensor, sentence_labels in zip(train_tensors, train_labels):
        # Skip the first and last label position (the padded boundaries).
        for j in range(1, len(sentence_labels) - 1):
            X = sentence_tensor[j - 1: j + 2].to(device_gpu)
            Y = sentence_labels[j: j + 1].to(device_gpu)
            Y_predictions = ner_model(X.long())

            # Pull both values out once instead of re-running argmax for
            # every metric below.
            predicted = int(torch.argmax(Y_predictions))
            gold = Y.item()
            is_correct = predicted == gold

            acc_score += int(is_correct)
            if predicted != 0:
                selected_items += 1
                prec_score += int(is_correct)
            if gold != 0:
                relevant_items += 1
                recall_score += int(is_correct)
            items_total += 1

            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()

    # Guard every ratio: an epoch with no predicted or no gold entities
    # (or no data at all) would otherwise raise ZeroDivisionError.
    precision = prec_score / selected_items if selected_items else 0.0
    recall = recall_score / relevant_items if relevant_items else 0.0
    if precision + recall:
        f1_score = (2 * precision * recall) / (precision + recall)
    else:
        f1_score = 0.0
    accuracy = acc_score / items_total if items_total else 0.0
    print(f'epoch: {epoch}')
    print(f'f1: {f1_score}')
    print(f'acc: {accuracy}')
# Notebook output of the training run:
# epoch: 0  f1: 0.6310260230881535  acc: 0.9099004714510215
# epoch: 1  f1: 0.7977381727751791  acc: 0.9539025667888947
# epoch: 2  f1: 0.8635445687583837  acc: 0.9699162783858546
# epoch: 3  f1: 0.9047002002591589  acc: 0.9794417946385082
# epoch: 4  f1: 0.9300697243387956  acc: 0.9852774944170274
def create_tensors_list(data):
    """Build per-sentence orthographic flag tensors from ``data["a"]``.

    For every word two flags are emitted, in this order: whether its first
    character is upper-case and whether its first character is a digit.
    The flags of all words in a sentence are flattened into one tensor.

    Args:
        data: Mapping-like object (e.g. a DataFrame) whose ``"a"`` entry is
            an iterable of sentences, each an iterable of word strings.

    Returns:
        List of 1-D tensors in the default floating dtype, one per
        sentence, of length ``2 * number_of_words``. An empty word
        contributes two zeros.
    """
    tensors = []
    for sent in data["a"]:
        flags = []
        for word in sent:
            # Slice instead of word[0] so an empty token cannot raise.
            first = word[:1]
            flags.extend((first.isupper(), first.isdigit()))
        # One tensor construction per sentence; growing a tensor with
        # torch.cat for every word re-copies it each time.
        tensors.append(torch.tensor(flags, dtype=torch.get_default_dtype()))
    return tensors
# --- dev-0 predictions -------------------------------------------------
dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['a'])
dev["a"] = dev["a"].apply(lambda x: x.split())
dev_tokens_ids = data_process(dev["a"])
# Retained so these module-level names still exist; not used for prediction.
dev_extra_tensors = create_tensors_list(dev)
dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]
# predict() emits one label for every interior position of the sequence it
# receives, so it must be given exactly <bos> + tokens + <eos>. Passing the
# tensors with two appended flags per word would produce extra labels for
# the flag positions and the output lines would no longer match the input
# token count.
results = predict(dev_tokens_ids, labels)
results_processed = process_output(results)
with open("dev-0/out.tsv", "w") as f:
    f.writelines(line + "\n" for line in results_processed)

# --- test-A predictions ------------------------------------------------
test = pd.read_csv('test-A/in.tsv', sep='\t', names=['a'])
test["a"] = test["a"].apply(lambda x: x.split())
test_tokens_ids = data_process(test["a"])
# Retained so these module-level names still exist; not used for prediction.
test_extra_tensors = create_tensors_list(test)
test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]
# Same reasoning as for dev-0: label the token ids only.
results = predict(test_tokens_ids, labels)
results_processed = process_output(results)
with open("test-A/out.tsv", "w") as f:
    f.writelines(line + "\n" for line in results_processed)

# Persist the trained weights.
model_path = "seq_labeling.model"
torch.save(ner_model.state_dict(), model_path)