# Information about the commented-out code and the results is in README.md
import os.path
import shutil
from collections import Counter

import pandas as pd
import torch
from torchtext.vocab import Vocab


# class NERModelWithAlpha(torch.nn.Module):
#     def __init__(self,):
#         super(NERModel, self).__init__()
#         self.emb = torch.nn.Embedding(23629,200)
#         self.fc1 = torch.nn.Linear(1200,9)
#     def forward(self, x):
#         x = self.emb(x)
#         x = x.reshape(1200)
#         x = self.fc1(x)
#         return x


class NERModel(torch.nn.Module):
    """Classify the centre token of a fixed-size window of token ids.

    Each id in the window is embedded, the embeddings are flattened into a
    single vector and a linear layer maps that vector to one score per label.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        window_size: Number of token ids passed to ``forward`` at once.
        num_labels: Number of output scores.

    The defaults reproduce the original hard-coded sizes (23628, 200, 3, 9),
    so ``NERModel()`` stays compatible with previously saved state dicts.
    """

    def __init__(self, vocab_size=23628, embedding_dim=200, window_size=3,
                 num_labels=9):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = torch.nn.Linear(window_size * embedding_dim, num_labels)

    def forward(self, x):
        """Return the label scores for one window of token ids ``x``."""
        x = self.emb(x)
        # Flatten (window, embedding_dim) into the vector the linear layer expects.
        x = x.reshape(self.fc1.in_features)
        return self.fc1(x)


def data_process(dt):
    """Convert tokenised documents into padded tensors of vocabulary ids.

    One padding id is added on each side so that a 3-token window can be
    centred on the first and on the last real token. Uses the module-level
    ``vocab``.
    """
    # NOTE(review): both padding lookups use the empty string as the token
    # name here — confirm these are the intended special-token names.
    return [
        torch.tensor(
            [vocab['']] + [vocab[token] for token in document] + [vocab['']],
            dtype=torch.long,
        )
        for document in dt
    ]


# def data_process(dt):
#     result = []
#     for document in dt:
#         sentence = [vocab[''],vocab['']]
#         for token in document:
#             sentence += [vocab[token]]
#             sentence += [vocab[''] if token.isalpha() else vocab['']]
#         sentence += [vocab[''],vocab['']]
#         result.append(torch.tensor(sentence, dtype = torch.long))
#     return result


def build_vocab(dataset):
    """Build a ``Vocab`` from the token counts of every document in ``dataset``."""
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=['', '', '', ''])  # , '', ''])


def labels_process(dt):
    """Convert label-id lists into tensors padded with label 0 on both sides."""
    return [
        torch.tensor([0] + document + [0], dtype=torch.long)
        for document in dt
    ]


def process(model, x):
    """Return the label name the model scores highest for window ``x``.

    Uses the module-level ``labels`` list to map the index to its name.
    """
    # Gradients are not needed for prediction.
    with torch.no_grad():
        predicted = model(x)
    return labels[torch.argmax(predicted).item()]


def process_dataset(model, path):
    """Predict a label for every whitespace-separated token of a text file.

    Args:
        model: Model called on each 3-token window.
        path: Path of the input file, one document per line.

    Returns:
        A list with one list of label names per input line.
    """
    # Training data is read by pandas as UTF-8, so read inference data the same way.
    with open(path, 'r', encoding='utf-8') as f:
        documents = [line.split() for line in f]

    results = []
    for token_ids in data_process(documents):
        line_results = []
        # Skip the padding positions at index 0 and at the end.
        for j in range(1, len(token_ids) - 1):
            # for j in range(2, len(data_tokens_ids[i]) - 3, 2):
            # x = data_tokens_ids[i][j-2: j+4].to(device_gpu)
            x = token_ids[j - 1: j + 2].to(device_cpu)
            line_results.append(process(model, x))
        results.append(line_results)
    return results


def process_output(lines):
    """Repair inconsistent B-/I- sequences in the model output.

    Handles the case where B- and I- tags do not refer to the same label:
    an ``I-`` tag that starts an entity (no previous tag, or previous tag
    ``O``) becomes ``B-``; an ``I-`` tag that follows another entity tag
    takes over that tag's entity type.

    Args:
        lines: List of label-name lists, one per document.

    Returns:
        A list of strings, each the space-joined labels of one document.
    """
    result = []
    for line in lines:
        last_label = None
        new_line = []
        for label in line:
            if label.startswith("I-"):
                if last_label is None or last_label == "O":
                    label = "B-" + label[2:]
                else:
                    label = "I-" + last_label[2:]
            last_label = label
            new_line.append(label)
        result.append(" ".join(new_line))
    return result


def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _train_epoch(model, tokens_ids, gold_labels, criterion, optimizer):
    """Run one pass over the training data and return ``(f1, accuracy)``.

    The model is updated after every single token window. Precision and
    recall only count tokens whose label id is not 0.
    """
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0

    model.train()
    for doc_tokens, doc_labels in zip(tokens_ids, gold_labels):
        for j in range(1, len(doc_labels) - 1):
            # for j in range(2, len(train_labels[i]) - 2, 2):
            # X = train_tokens_ids[i][j-2: j+4].to(device_gpu)
            X = doc_tokens[j - 1: j + 2].to(device_cpu)
            Y = doc_labels[j: j + 1].to(device_cpu)
            Y_predictions = model(X)

            predicted = torch.argmax(Y_predictions).item()
            gold = Y.item()
            is_hit = predicted == gold

            acc_score += int(is_hit)
            if predicted != 0:
                selected_items += 1
                if is_hit:
                    prec_score += 1
            if gold != 0:
                relevant_items += 1
                if is_hit:
                    recall_score += 1
            items_total += 1

            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()

    # Any of these counters can be zero (e.g. the model predicts only label 0),
    # so guard every division instead of crashing mid-training.
    precision = _safe_div(prec_score, selected_items)
    recall = _safe_div(recall_score, relevant_items)
    f1_score = _safe_div(2 * precision * recall, precision + recall)
    return f1_score, _safe_div(acc_score, items_total)


labels = ['O', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG',
          'B-PER', 'I-PER']
model_path = "seq_labeling.model_old"

# Unpack the training data on first use.
if not os.path.isfile('train/train.tsv'):
    import lzma
    with lzma.open('train/train.tsv.xz', 'rb') as f_in:
        with open('train/train.tsv', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

data = pd.read_csv('train/train.tsv', sep='\t', names=['iob', 'tokens'])
data["iob"] = data["iob"].apply(lambda x: [labels.index(y) for y in x.split()])
data["tokens"] = data["tokens"].apply(lambda x: x.split())

vocab = build_vocab(data['tokens'])

# device_gpu = torch.device("cuda:0")
device_cpu = torch.device("cpu")

ner_model = NERModel().to(device_cpu)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())

train_labels = labels_process(data['iob'])
train_tokens_ids = data_process(data['tokens'])

# Train only when no saved model exists; otherwise load the saved weights.
if not os.path.isfile(model_path):
    for epoch in range(5):
        f1_score, accuracy = _train_epoch(
            ner_model, train_tokens_ids, train_labels, criterion, optimizer
        )
        print(f'epoch: {epoch}')
        print(f'f1: {f1_score}')
        print(f'acc: {accuracy}')
    torch.save(ner_model.state_dict(), model_path)
else:
    ner_model.load_state_dict(
        torch.load(model_path, map_location=torch.device('cpu'))
    )

ner_model.eval()

results = process_dataset(ner_model, "dev-0/in.tsv")
file_content = process_output(results)
with open("dev-0/out.tsv", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in file_content)

results = process_dataset(ner_model, "test-A/in.tsv")
# NOTE(review): unlike dev-0, the test-A predictions are written without
# process_output() post-processing — confirm this is intentional.
file_content = [' '.join(x) for x in results]
with open("test-A/out.tsv", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in file_content)