forked from kubapok/en-ner-conll-2003
s478841
This commit is contained in:
parent 82c2482af6 · commit eb69e726e3
run.py (new file, +254 lines)
@@ -0,0 +1,254 @@
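"""NER tagger for the en-ner-conll-2003 task.

Trains a window-based classifier (three token embeddings plus nine
hand-crafted orthographic features per position) and writes BIO
predictions for the dev-0 and test-A splits.
"""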
import lzma
from collections import Counter
from string import punctuation

import torch
import torch.nn as nn
import torchtext.vocab
from bidict import bidict

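# bidict keeps the label↔id mapping invertible, so predicted class ids
# can be decoded back to BIO label strings at inference time.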
LABEL_TO_ID = bidict({
    'O': 0,
    'B-PER': 1,
    'B-LOC': 2,
    'I-PER': 3,
    'B-MISC': 4,
    'I-MISC': 5,
    'I-LOC': 6,
    'B-ORG': 7,
    'I-ORG': 8,
})
ID_TO_LABEL = LABEL_TO_ID.inverse


def read_data(path):
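    """Load a dataset file.

    An .xz path is treated as the compressed two-column train set
    (labels TAB text); anything else is read as one whitespace-split
    token sequence per line.
    """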
    print(f"I am reading the data from {path}...")
    if path[-2:] == 'xz':
        # Train file: the first column holds the NER labels, the second the text.
        data = {'text': [], 'tokens': []}
        with lzma.open(path, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip().split('\t')
                tokens, text = line[0].split(), line[1].split()
                # Keep only rows where labels and words align one-to-one.
                if len(tokens) == len(text):
                    data['tokens'].append(tokens)
                    data['text'].append(text)
    else:
        with open(path, 'r', encoding='utf-8') as f:
            data = [line.strip().split() for line in f]
    print("Data loaded")
    return data


def make_vocabulary(dataset):
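    """Build a torchtext vocabulary over all tokens, with '<unk>' as default."""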
    counter = Counter()
    for document in dataset:
        counter.update(document)
    vocab = torchtext.vocab.vocab(
        counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    vocab.set_default_index(0)  # out-of-vocabulary tokens map to '<unk>'
    return vocab


def tokenize_data(data, vocab):
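    """Convert each document to a tensor of ids, wrapped in '<bos>'/'<eos>'."""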
    return [
        torch.tensor([vocab['<bos>']] + [vocab[token] for token in document] +
                     [vocab['<eos>']],
                     dtype=torch.long) for document in data
    ]


def encode_labels(data):
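    """Map label strings to ids; the padding positions get the 'O' id (0)."""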
    data_num = [[LABEL_TO_ID[label] for label in labels] for labels in data]
    return [
        torch.tensor([0] + document + [0], dtype=torch.long)
        for document in data_num
    ]


def add_features(x_base, x_str):
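    """Append nine binary orthographic features for the middle word of the
    three-token window (casing, alphanumeric, numeric, '-', '/', punctuation
    and length cues) to the base tensor of token ids."""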
    word_features = [0, 0, 0, 0, 0, 0, 0, 0, 0]
    if len(x_str) > 1 and len(x_str[1]) > 1:
        word = x_str[1]
        if word.isupper():
            word_features[0] = 1
        if word[0].isupper():
            word_features[1] = 1
        if word.isalnum():
            word_features[2] = 1
        if word.isnumeric():
            word_features[3] = 1
        if '-' in word:
            word_features[4] = 1
        if '/' in word:
            word_features[5] = 1
        if any(char in punctuation for char in word):
            word_features[6] = 1
        if len(word) > 6:
            word_features[7] = 1
        if len(word) < 3:
            word_features[8] = 1
    extra_features = torch.tensor(word_features)
    x_features = torch.cat((x_base, extra_features), 0)
    return x_features


class NERModel(nn.Module):
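    """A single-layer classifier: the 12 input ids (3 window tokens + 9
    binary features) share one embedding table and feed a linear layer
    over the 9 BIO classes."""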
    def __init__(self):
        super(NERModel, self).__init__()
        # 23627 is hard-coded and must match len(vocab) built from the
        # training data, or the embedding lookup will go out of range.
        self.embedding = nn.Embedding(23627, 200)
        # 12 inputs * 200 embedding dims = 2400 features
        self.linear = nn.Linear(2400, 9)

    def forward(self, x):
        x = self.embedding(x)
        x = x.reshape(2400)
        x = self.linear(x)
        return x


def train_model(model,
                data,
                train_labels,
                train_tokens_ids,
                epochs,
                save=False):
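    """Run a per-token training loop over three-token windows.

    Relies on the module-level `criterion`, `optimizer` and `device`
    created in the __main__ block; prints running metrics every tenth
    sentence and optionally saves the model after each epoch.
    """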
    model.train()
    for epoch in range(epochs):
        loss_score = 0
        acc_score = 0
        prec_score = 0
        selected_items = 0
        recall_score = 0
        relevant_items = 0
        items_total = 0
        for i in range(len(train_labels)):
            for j in range(1, len(train_labels[i]) - 1):
                # Three-token window of ids and raw strings around position j.
                X_base = train_tokens_ids[i][j - 1:j + 2]
                X_string = data['text'][i][j - 1:j + 2]
                X_extra = add_features(X_base, X_string)
                Y = train_labels[i][j:j + 1]

                X = X_extra.to(device)
                Y = Y.to(device)

                Y_predictions = model(X)

                pred_class = torch.argmax(Y_predictions)
                y_item = Y.item()
                acc_score += int(pred_class == y_item)
                if pred_class != 0:
                    selected_items += 1
                    if pred_class == y_item:
                        prec_score += 1
                if y_item != 0:
                    relevant_items += 1
                    if pred_class == y_item:
                        recall_score += 1
                items_total += 1

                optimizer.zero_grad()
                loss = criterion(Y_predictions.unsqueeze(0), Y)
                loss.backward()
                optimizer.step()

                loss_score += loss.item()

            # Guard against division by zero before any entity is selected.
            precision = prec_score / selected_items if selected_items else 0
            recall = recall_score / relevant_items if relevant_items else 0
            f1_score = 2 * precision * recall / (
                precision + recall) if precision and recall else 0

            if (i + 1) % 10 == 0:
                print('Epoch: ', epoch)
                print('Loss: ', loss_score / items_total)
                print('Accuracy: ', acc_score / items_total)
                print('F1-score: ', f1_score)
        print('Finished epoch: ', epoch)
        if save:
            torch.save(model, 'model.pt')


def write_results(data, path):
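    """Write one prediction line per input line to `path`."""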
    with open(path, 'w', encoding='utf-8') as f:
        for line in data:
            f.write(f'{line}\n')
    print(f"Data written to the file {path}")


@torch.no_grad()
def predict(model, x_data, vocab, device):
    """Tag every document in `x_data` and return BIO-consistent label lines."""
    tokens_ids = tokenize_data(x_data, vocab)
    preds = []
    for i in range(len(tokens_ids)):
        labels = ''
        for j in range(1, len(tokens_ids[i]) - 1):
            x_base = tokens_ids[i][j - 1:j + 2]
            x_strings = x_data[i][j - 1:j + 2]
            x_features = add_features(x_base, x_strings)  # .to(device)
            try:
                pred = model(x_features)
                label = ID_TO_LABEL[int(torch.argmax(pred))]
                labels += f'{label} '
            except Exception as ex:
                print(f'Exception\t→\t{ex}\t{x_strings}→{x_features}')
        preds.append(labels[:-1])
    print('Done with the inference, now writing it into the file!\n')

    # Post-process so every sequence is valid BIO: an I- tag may not follow
    # 'O' or start a sentence, and it must continue the previous entity type.
    lines = []
    for line in preds:
        prev_label = None
        new_line = []
        for label in line.split():
            if label[0] == 'I':
                if prev_label is None or prev_label == 'O':
                    label = 'B' + label[1:]
                else:
                    label = 'I' + prev_label[1:]
            prev_label = label
            new_line.append(label)
        lines.append(' '.join(new_line))
    return lines


if __name__ == '__main__':
    # * Data loading
    data = read_data('./train/train.tsv.xz')
    vocab = make_vocabulary(data['text'])
    train_tokens_ids = tokenize_data(data['text'], vocab)
    train_labels = encode_labels(data['tokens'])

    # * Model set-up
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('My device is ', device)
    ner_model = NERModel().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(ner_model.parameters())
    epochs = 3

    # * Training
    train_model(ner_model,
                data,
                train_labels,
                train_tokens_ids,
                epochs,
                save=True)

    # * Inference time!!!
    print("Now, let's predict something!")
    # new_model = torch.load(PATH)
    ner_model.cpu()  # inference runs on CPU; add_features returns CPU tensors
    ner_model.eval()

    # * Inference on dev-0 data
    dev_data = read_data('./dev-0/in.tsv')
    write_results(predict(ner_model, dev_data, vocab, device),
                  './dev-0/out.tsv')

    # * Inference on test-A data
    test_data = read_data('./test-A/in.tsv')
    write_results(predict(ner_model, test_data, vocab, device),
                  './test-A/out.tsv')