en-ner-conll-2003/main.py

from os import sep
from nltk import word_tokenize
import pandas as pd
import torch
from torch._C import device
from tqdm import tqdm
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
import spacy
from torchcrf import CRF
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
import csv
import pickle

class Model(torch.nn.Module):
    """Sequence tagger: embedding -> ReLU -> GRU -> linear emissions -> CRF.

    The embedding table is sized from the module-level ``vocab`` object, so
    the vocabulary must be built or loaded before the model is constructed.

    Args:
        num_tags: Number of distinct tags; used both for the width of the
            emission layer and for the CRF.
        seq_length: Output width of the ``fc1`` layer (see note in
            ``__init__``).
    """

    def __init__(self, num_tags, seq_length):
        super(Model, self).__init__()
        self.emb = torch.nn.Embedding(len(vocab.get_itos()), 100)
        self.gru = torch.nn.GRU(100, 256, 1, batch_first=True)
        # The emission width has to equal the CRF's tag count; it used to be
        # hard-coded to 9, which only worked when num_tags happened to be 9.
        self.hidden2tag = torch.nn.Linear(256, num_tags)
        self.crf = CRF(num_tags, batch_first=True)
        self.relu = torch.nn.ReLU()
        # fc1, softmax and sigm are not used by forward() or decode(). They
        # are kept because fc1 contributes parameters to state_dict(), and
        # dropping it would make previously saved checkpoints fail to load.
        self.fc1 = torch.nn.Linear(1, seq_length)
        self.softmax = torch.nn.Softmax(dim=0)
        self.sigm = torch.nn.Sigmoid()

    def _emissions(self, data):
        """Return the per-token tag scores that are fed to the CRF."""
        emb = self.relu(self.emb(data))
        out, _ = self.gru(emb)
        return self.hidden2tag(out)

    def forward(self, data, tags):
        """Return the negated CRF score of ``tags`` for ``data``.

        Args:
            data: Tensor of token ids; the GRU and CRF are batch-first.
            tags: Tensor of tag ids. It is transposed before being handed to
                the CRF, so callers pass it with the sequence axis first
                (the training loop in this file passes ``(chunk_len, 1)``).

        Returns:
            The negated value returned by the CRF layer, used as the
            training loss.
        """
        return -self.crf(self._emissions(data), tags.T)

    def decode(self, data):
        """Return the CRF's decoded tag-id sequences for ``data``.

        The result is whatever ``CRF.decode`` returns; callers in this file
        index it as a list with one tag-id list per batch element.
        """
        return self.crf.decode(self._emissions(data))

    def train_mode(self):
        """Put the CRF sub-module into training mode."""
        self.crf.train()

    def eval_mode(self):
        """Put the CRF sub-module into evaluation mode."""
        self.crf.eval()


def process_document(document):
    """Split ``document`` into tokens on single space characters.

    The separator is an explicit ``" "``, so consecutive spaces produce
    empty-string tokens and an empty document yields ``[""]``.
    """
    tokens = document.split(sep=" ")
    return tokens

def build_vocab(dataset):
    """Build a torchtext vocabulary from an iterable of documents.

    Tokens are counted with ``process_document`` and handed to torchtext's
    ``vocab`` factory ordered by descending frequency (ties keep first-seen
    order). Previously the frequency-sorted mapping was computed but the
    unsorted ``Counter`` was passed instead, so ids followed first-seen
    order. Vocabularies already pickled to disk are unaffected; only newly
    built ones get frequency-ordered ids.

    Args:
        dataset: Iterable of document strings.

    Returns:
        The torchtext vocabulary object, with its default index set to 0.
    """
    counter = Counter()
    for document in dataset:
        counter.update(process_document(document))

    # most_common() yields (token, count) pairs sorted by count, descending.
    ordered_dict = OrderedDict(counter.most_common())

    # NOTE: ``vocab`` here is the module-level name. The script later rebinds
    # that name to the built vocabulary, so this function must be called
    # before that assignment happens.
    v = vocab(ordered_dict)

    # Out-of-vocabulary tokens resolve to index 0. No dedicated unknown token
    # is inserted, so they share the id of the first vocabulary entry.
    v.set_default_index(0)
    return v

def data_process(dt):
    """Encode each document in ``dt`` as a 1-D ``torch.long`` tensor of token ids.

    Documents are split on single spaces and every token is looked up in the
    module-level ``vocab``.
    """
    encoded = []
    for document in dt:
        token_ids = [vocab[token] for token in process_document(document)]
        encoded.append(torch.tensor(token_ids, dtype=torch.long))
    return encoded

def labels_process(dt):
    """Encode each label string in ``dt`` as a 1-D ``torch.long`` tensor of tag ids.

    Label strings are split on single spaces and every tag is looked up in
    the module-level ``labels_vocab`` dict; an unknown tag raises ``KeyError``.
    """
    encoded = []
    for document in dt:
        tag_ids = [labels_vocab[token] for token in process_document(document)]
        encoded.append(torch.tensor(tag_ids, dtype=torch.long))
    return encoded

# --- Run configuration -------------------------------------------------------
# `mode` selects which sections of the script run:
#   "train"    - build and pickle the vocabulary, train, save "model.torch"
#   "eval"     - load vocabulary and model, decode, print metrics, write output
#   "generate" - load vocabulary and model, decode, write output
# Switch modes by changing which of the following lines is uncommented.
# mode = "train"
# mode = "eval"
mode = "generate"

# Destination of the tab-separated prediction file written after decoding.
save_path = "test-A/out.tsv"

# Alternative input (dev-0 split), currently disabled.
# data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['0'])
# data.columns = ["labels", "text"]

# Training file: two tab-separated columns, read as 'labels' and 'data'.
# NOTE(review): read_csv uses its default quoting here; if the text can
# contain bare double-quote tokens, rows may be parsed incorrectly - confirm.
train_target = pd.read_csv("train/train.tsv", sep = '\t', names = ['labels', 'data'])

# Gold labels for the dev-0 split, currently disabled.
# ex_data = pd.read_csv("dev-0/expected.tsv", sep="\t", names=['labels'])

# in_data = data["0"]
# target = ex_data["labels"]

# Active input: the single text column of test-A.
test_data = pd.read_csv("test-A/in.tsv", sep = '\t', names=['0'])
# test_data.columns = ['0']
# data = test_data['0']
in_data = test_data['0']
# target = list(np.zeros(len(in_data)))
# NOTE(review): with the active configuration the documents come from test-A
# while `target` is the label column of the training file, so labels and
# documents are not from the same dataset - presumably a placeholder for
# "generate" mode; confirm before using "train" or "eval".
target = train_target['labels']

# labels_vocab = build_vocab(data['labels'])
# In "train" mode the vocabulary is built from `in_data` and pickled; in every
# other mode it is loaded from the pickle. From here on the name `vocab` refers
# to the vocabulary object rather than the factory imported at the top of the
# file.
if mode == "train":
    vocab = build_vocab(in_data)
    with open("vocab.pickle", "wb") as file:
        pickle.dump(vocab, file)
    print("Vocab saved")
else:
    # SECURITY: unpickling can execute arbitrary code; only load a
    # vocab.pickle that this script produced.
    with open("vocab.pickle", "rb") as file:
        vocab = pickle.load(file)

# Fixed mapping from tag string to integer id (9 tags).
labels_vocab = {
    'O': 0,
    'B-PER': 1,
    'B-LOC': 2,
    'I-PER': 3,
    'B-MISC': 4,
    'I-MISC': 5,
    'I-LOC': 6,
    'B-ORG': 7,
    'I-ORG': 8
}

# Reverse mapping (id -> tag string), used when writing predictions.
inv_labels_vocab = {v: k for k, v in labels_vocab.items()}

# One tensor of token ids / tag ids per document. Both are computed in every
# mode, including "generate".
train_tokens_ids = data_process(in_data)
train_labels = labels_process(target)

num_tags = 9        # equals the number of entries in labels_vocab
NUM_EPOCHS = 5      # passes over the data in "train" mode
seq_length = 5      # documents are fed to the model in chunks of this many tokens

model = Model(num_tags, seq_length)
# Rebinds the module-level name `device` (also imported at the top of the file)
# to the CPU device used for the model and all input tensors.
device = torch.device("cpu")
model.to(device)
# model.cuda(0)

if mode == "train":
    # The model's forward pass already returns the loss (negated CRF score),
    # so no separate criterion object is needed.
    optimizer = torch.optim.Adam(model.parameters())
    for epoch in range(NUM_EPOCHS):
        model.train()
        model.train_mode()
        for doc_idx in tqdm(range(len(train_labels))):
            doc_tokens = train_tokens_ids[doc_idx]
            doc_labels = train_labels[doc_idx]
            # Walk the document in chunks of seq_length tokens. The bound is
            # the full document length so the final (possibly shorter) chunk
            # is trained on too; the previous `len - seq_length` bound
            # silently dropped the last chunk of every document, and the
            # whole document when it had seq_length tokens or fewer.
            for start in range(0, len(doc_tokens), seq_length):
                # Shape (1, chunk_len): a batch holding a single chunk.
                batch_tokens = doc_tokens[start: start + seq_length].unsqueeze(0)
                # Shape (chunk_len, 1); Model.forward transposes it.
                tags = doc_labels[start: start + seq_length].unsqueeze(1)

                # The optimizer holds every model parameter, so one
                # zero_grad() call clears all gradients.
                optimizer.zero_grad()
                loss = model(batch_tokens.to(device), tags.to(device))
                loss.backward()
                optimizer.step()

    torch.save(model.state_dict(), "model.torch")

if mode == "eval" or mode == "generate":
    model.eval()
    model.eval_mode()
    model.load_state_dict(torch.load("model.torch"))

    predicted = []          # flat list of predicted tag ids (for metrics)
    correct = []            # flat list of gold tag ids ("eval" mode only)
    predicted_per_doc = []  # one list of predicted tag ids per document

    # Decoding needs no gradients.
    with torch.no_grad():
        for doc_idx in tqdm(range(len(train_tokens_ids))):
            doc_tokens = train_tokens_ids[doc_idx]
            doc_predicted = []
            # Chunk over the full document length so every token is decoded
            # exactly once, including the final shorter chunk. The previous
            # loop stopped one chunk early and then re-decoded the start of
            # the last chunk as the "rest", which produced predictions for
            # the wrong tokens and too few predictions whenever the document
            # length was a multiple of seq_length.
            for start in range(0, len(doc_tokens), seq_length):
                batch_tokens = doc_tokens[start: start + seq_length].unsqueeze(0)
                # decode() returns one tag-id list per batch element; the
                # batch holds a single chunk.
                doc_predicted += model.decode(batch_tokens.to(device))[0]
            predicted_per_doc.append(doc_predicted)
            predicted += doc_predicted
            # Gold labels are only read when metrics are requested; in
            # "generate" mode they need not correspond to the input at all.
            if mode == "eval":
                correct += train_labels[doc_idx].tolist()

    if mode == "eval":
        print(classification_report(correct, predicted))
        print(accuracy_score(correct, predicted))
        print(f1_score(correct, predicted, average="weighted"))

    # One output row per input document: its tags joined by single spaces.
    # Rows are written from the per-document lists, so no running offset is
    # needed and rows can no longer overlap or drift.
    # newline="" lets the csv writer control line endings itself.
    with open(save_path, "w", newline="") as save:
        writer = csv.writer(save, delimiter='\t', lineterminator='\n')
        for doc_predicted in predicted_per_doc:
            writer.writerow([' '.join(inv_labels_vocab[tag] for tag in doc_predicted)])
working on it 2021-06-20 19:04:16 +02:00			`from os import sep`
			`from nltk import word_tokenize`
			`import pandas as pd`
			`import torch`
progress 2021-06-20 22:03:34 +02:00			`from torch._C import device`
working on it 2021-06-20 19:04:16 +02:00			`from tqdm import tqdm`
			`from torchtext.vocab import vocab`
			`from collections import Counter, OrderedDict`
			`import spacy`
			`from torchcrf import CRF`
			`from torch.utils.data import DataLoader`
its working 2021-06-21 00:43:43 +02:00			`import numpy as np`
			`from sklearn.metrics import accuracy_score, f1_score, classification_report`
zadanie11 2021-06-21 01:38:09 +02:00			`import csv`
			`import pickle`
working on it 2021-06-20 19:04:16 +02:00
			`class Model(torch.nn.Module):`
			`def __init__(self, num_tags, seq_length):`
			`super(Model, self).__init__()`
			`self.emb = torch.nn.Embedding(len(vocab.get_itos()), 100)`
			`self.gru = torch.nn.GRU(100, 256, 1, batch_first=True)`
			`self.hidden2tag = torch.nn.Linear(256, 9)`
			`self.crf = CRF(num_tags, batch_first=True)`
			`self.relu = torch.nn.ReLU()`
			`self.fc1 = torch.nn.Linear(1, seq_length)`
			`self.softmax = torch.nn.Softmax(dim=0)`
			`self.sigm = torch.nn.Sigmoid()`

			`def forward(self, data, tags):`
			`emb = self.relu(self.emb(data))`
			`out, h_n = self.gru(emb)`
			`out = self.hidden2tag(out)`
			`out = self.crf(out, tags.T)`
its working 2021-06-21 00:43:43 +02:00			`return -out`
progress 2021-06-20 22:03:34 +02:00
			`def decode(self, data):`
			`emb = self.relu(self.emb(data))`
			`out, h_n = self.gru(emb)`
			`out = self.hidden2tag(out)`
			`out = self.crf.decode(out)`
working on it 2021-06-20 19:04:16 +02:00			`return out`

its working 2021-06-21 00:43:43 +02:00			`def train_mode(self):`
			`self.crf.train()`

			`def eval_mode(self):`
			`self.crf.eval()`

working on it 2021-06-20 19:04:16 +02:00
			`def process_document(document):`
			`return document.split(" ")`

			`def build_vocab(dataset):`
			`counter = Counter()`
			`for document in dataset:`
			`counter.update(process_document(document))`
			`sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)`
			`ordered_dict = OrderedDict(sorted_by_freq_tuples)`
			`v = vocab(counter)`
zadanie11 2021-06-21 01:38:09 +02:00			`default_index = 0`
working on it 2021-06-20 19:04:16 +02:00			`v.set_default_index(default_index)`
			`return v`

			`def data_process(dt):`
			`return [ torch.tensor([vocab[token] for token in document.split(" ") ], dtype = torch.long) for document in dt]`

			`def labels_process(dt):`
			`return [ torch.tensor([labels_vocab[token] for token in document.split(" ") ], dtype = torch.long) for document in dt]`

zadanie11 2021-06-21 01:38:09 +02:00			`# mode = "train"`
... 2021-06-21 18:57:04 +02:00			`# mode = "eval"`
			`mode = "generate"`
zadanie11 2021-06-21 01:38:09 +02:00
... 2021-06-21 18:57:04 +02:00			`save_path = "test-A/out.tsv"`
zadanie11 2021-06-21 01:38:09 +02:00
... 2021-06-21 18:57:04 +02:00			`# data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['0'])`
zadanie11 2021-06-21 01:38:09 +02:00			`# data.columns = ["labels", "text"]`

... 2021-06-21 18:57:04 +02:00			`train_target = pd.read_csv("train/train.tsv", sep = '\t', names = ['labels', 'data'])`
zadanie11 2021-06-21 01:38:09 +02:00
... 2021-06-21 18:57:04 +02:00			`# ex_data = pd.read_csv("dev-0/expected.tsv", sep="\t", names=['labels'])`
zadanie11 2021-06-21 01:38:09 +02:00
... 2021-06-21 18:57:04 +02:00			`# in_data = data["0"]`
			`# target = ex_data["labels"]`

			`test_data = pd.read_csv("test-A/in.tsv", sep = '\t', names=['0'])`
zadanie11 2021-06-21 01:38:09 +02:00			`# test_data.columns = ['0']`
			`# data = test_data['0']`
... 2021-06-21 18:57:04 +02:00			`in_data = test_data['0']`
zadanie11 2021-06-21 01:38:09 +02:00			`# target = list(np.zeros(len(in_data)))`
... 2021-06-21 18:57:04 +02:00			`target = train_target['labels']`
zadanie11 2021-06-21 01:38:09 +02:00
working on it 2021-06-20 19:04:16 +02:00			`# labels_vocab = build_vocab(data['labels'])`
zadanie11 2021-06-21 01:38:09 +02:00			`if mode == "train":`
			`vocab = build_vocab(in_data)`
			`with open("vocab.pickle", "wb") as file:`
			`pickle.dump(vocab, file)`
			`print("Vocab saved")`
			`else:`
			`with open("vocab.pickle", "rb") as file:`
			`vocab = pickle.load(file)`
working on it 2021-06-20 19:04:16 +02:00
			`labels_vocab = {`
			`'O': 0,`
			`'B-PER': 1,`
			`'B-LOC': 2,`
			`'I-PER': 3,`
			`'B-MISC': 4,`
			`'I-MISC': 5,`
			`'I-LOC': 6,`
			`'B-ORG': 7,`
			`'I-ORG': 8`
			`}`

its working 2021-06-21 00:43:43 +02:00			`inv_labels_vocab = {v: k for k, v in labels_vocab.items()}`

zadanie11 2021-06-21 01:38:09 +02:00			`train_tokens_ids = data_process(in_data)`
			`train_labels = labels_process(target)`
progress 2021-06-20 22:03:34 +02:00
working on it 2021-06-20 19:04:16 +02:00			`num_tags = 9`
			`NUM_EPOCHS = 5`
its working 2021-06-21 00:43:43 +02:00			`seq_length = 5`
working on it 2021-06-20 19:04:16 +02:00
			`model = Model(num_tags, seq_length)`
zadanie11 2021-06-21 01:38:09 +02:00			`device = torch.device("cpu")`
progress 2021-06-20 22:03:34 +02:00			`model.to(device)`
zadanie11 2021-06-21 01:38:09 +02:00			`# model.cuda(0)`
progress 2021-06-20 22:03:34 +02:00
zadanie11 2021-06-21 01:38:09 +02:00			`if mode == "train":`
			`criterion = torch.nn.CrossEntropyLoss()`
			`optimizer = torch.optim.Adam(model.parameters())`
progress 2021-06-20 22:03:34 +02:00			`for i in range(NUM_EPOCHS):`
			`model.train()`
its working 2021-06-21 00:43:43 +02:00			`model.train_mode()`
progress 2021-06-20 22:03:34 +02:00			`#for i in tqdm(range(500)):`
			`for i in tqdm(range(len(train_labels))):`
			`for k in range(0, len(train_tokens_ids[i]) - seq_length, seq_length):`
			`batch_tokens = train_tokens_ids[i][k: k + seq_length].unsqueeze(0)`
			`tags = train_labels[i][k: k + seq_length].unsqueeze(1)`

			`predicted_tags = model(batch_tokens.to(device), tags.to(device))`

			`predicted_tags.backward()`
			`optimizer.step()`
			`model.zero_grad()`
its working 2021-06-21 00:43:43 +02:00			`model.crf.zero_grad()`
			`optimizer.zero_grad()`
progress 2021-06-20 22:03:34 +02:00
			`torch.save(model.state_dict(), "model.torch")`

zadanie11 2021-06-21 01:38:09 +02:00			`if mode == "eval" or mode == "generate":`
progress 2021-06-20 22:03:34 +02:00			`model.eval()`
its working 2021-06-21 00:43:43 +02:00			`model.eval_mode()`
			`predicted = []`
			`correct = []`
			`model.load_state_dict(torch.load("model.torch"))`
zadanie11 2021-06-21 01:38:09 +02:00			`for i in tqdm(range(0, len(train_tokens_ids))):`
change 2021-06-21 11:41:45 +02:00			`last_idx = 0`
working on it 2021-06-20 19:04:16 +02:00			`for k in range(0, len(train_tokens_ids[i]) - seq_length, seq_length):`
			`batch_tokens = train_tokens_ids[i][k: k + seq_length].unsqueeze(0)`
			`tags = train_labels[i][k: k + seq_length].unsqueeze(1)`
progress 2021-06-20 22:03:34 +02:00			`predicted_tags = model.decode(batch_tokens.to(device))`
its working 2021-06-21 00:43:43 +02:00			`predicted += predicted_tags[0]`
			`correct += [x[0] for x in tags.numpy().tolist()]`
change 2021-06-21 11:41:45 +02:00			`last_idx = k`
			`l = len(train_tokens_ids[i])`
			`rest = l - int(l/seq_length) * seq_length`
			`if rest != 0:`
			`batch_tokens = train_tokens_ids[i][last_idx: last_idx + rest].unsqueeze(0)`
			`tags = train_labels[i][last_idx: last_idx + rest].unsqueeze(1)`
			`predicted_tags = model.decode(batch_tokens.to(device))`
			`predicted += predicted_tags[0]`
			`correct += [x[0] for x in tags.numpy().tolist()]`

zadanie11 2021-06-21 01:38:09 +02:00			`if mode == "eval":`
			`print(classification_report(correct, predicted))`
			`print(accuracy_score(correct, predicted))`
			`print(f1_score(correct, predicted, average="weighted"))`
its working 2021-06-21 00:43:43 +02:00
			`predicted = list(map(lambda x: inv_labels_vocab[x], predicted))`
zadanie11 2021-06-21 01:38:09 +02:00			`slices = [len(x.split(" ")) for x in in_data]`
			`with open(save_path, "w") as save:`
			`writer = csv.writer(save, delimiter='\t', lineterminator='\n')`
its working 2021-06-21 00:43:43 +02:00			`accumulator = 0`
			`for slice in slices:`
zadanie11 2021-06-21 01:38:09 +02:00			`writer.writerow([' '.join(predicted[accumulator: accumulator + slice])])`
fix 2021-06-21 18:46:18 +02:00			`accumulator += slice - 1`
its working 2021-06-21 00:43:43 +02:00