en-ner-conll-2003/seq.py

49 lines
1.5 KiB
Python
Raw Normal View History

2021-06-21 21:56:24 +02:00
from numpy.lib.shape_base import split
2021-06-21 21:10:27 +02:00
import pandas as pd
import numpy as np
import gensim
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
2021-06-21 21:56:24 +02:00
from collections import Counter
from torchtext.vocab import Vocab
# Functions from jupyter
def build_vocab(dataset):
    """Build a vocabulary from an iterable of tokenised documents.

    Every document in ``dataset`` is fed to one shared ``Counter``; the
    resulting counts are passed to ``Vocab`` together with the special
    tokens ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
    """
    token_counts = Counter()
    for tokens in dataset:
        token_counts.update(tokens)
    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
    return Vocab(token_counts, specials=special_tokens)
def data_process(dt, vocab):
    """Encode tokenised documents as index tensors framed by <bos>/<eos>.

    For each document in ``dt`` one ``torch.long`` tensor is produced that
    holds ``vocab['<bos>']``, then ``vocab[token]`` for every token in order,
    then ``vocab['<eos>']``. Returns the tensors as a list.
    """
    encoded = []
    for tokens in dt:
        indices = [vocab['<bos>']]
        indices.extend(vocab[tok] for tok in tokens)
        indices.append(vocab['<eos>'])
        encoded.append(torch.tensor(indices, dtype=torch.long))
    return encoded
def labels_process(dt, vocab):
    """Convert per-document label lists into zero-framed ``torch.long`` tensors.

    Each label list in ``dt`` gets a ``0`` prepended and appended before being
    turned into a tensor. ``vocab`` is accepted but not used by this function.
    """
    # NOTE(review): the framing zeros presumably line up with the <bos>/<eos>
    # positions that data_process adds -- confirm against the training code.
    tensors = []
    for labels in dt:
        framed = [0] + labels + [0]
        tensors.append(torch.tensor(framed, dtype=torch.long))
    return tensors
2021-06-21 21:10:27 +02:00
# Load data
2021-06-21 21:56:24 +02:00
def _read_tsv(path, names):
    """Read a headerless tab-separated file into a DataFrame, quotes kept literal."""
    import csv  # local import: only this loader needs it

    # With pandas' default quoting, a field that starts with '"' opens a
    # quoted field that can swallow tabs and newlines, merging rows or raising
    # a parser error. The text here is raw, so quote characters must be
    # treated as ordinary data.
    return pd.read_csv(path, sep='\t', names=names, quoting=csv.QUOTE_NONE)


def _split_on_space(column):
    """Split every string of a DataFrame column on single spaces."""
    return [entry.split(sep=" ") for entry in column.values]


def load_data():
    """Load the train, dev and test splits from their fixed relative paths.

    Files read (all tab-separated, no header row):
      * ``train/train.tsv``     -- columns ``labels`` and ``document``
      * ``dev-0/in.tsv``        -- column ``document``
      * ``dev-0/expected.tsv``  -- column ``labels``
      * ``test-A/in.tsv``       -- column ``document``

    Returns:
        A tuple ``(X_train, Y_train, X_dev, Y_dev, X_test)``. The train and
        dev entries are lists of lists obtained by splitting each row on a
        single space. ``X_test`` is the raw array of document strings and is
        returned unsplit.
    """
    train = _read_tsv('train/train.tsv', ['labels', 'document'])
    Y_train = _split_on_space(train['labels'])
    X_train = _split_on_space(train['document'])

    dev = _read_tsv('dev-0/in.tsv', ['document'])
    exp = _read_tsv('dev-0/expected.tsv', ['labels'])
    X_dev = _split_on_space(dev['document'])
    Y_dev = _split_on_space(exp['labels'])

    test = _read_tsv('test-A/in.tsv', ['document'])
    # NOTE(review): unlike the train/dev documents, the test documents are not
    # tokenised here; kept as-is so the return shape stays unchanged.
    X_test = test['document'].values

    return X_train, Y_train, X_dev, Y_dev, X_test
2021-06-21 21:10:27 +02:00
2021-06-21 21:56:24 +02:00
# Script entry point: when run directly, load every dataset split.
if __name__ == "__main__":
    (X_train, Y_train,
     X_dev, Y_dev,
     X_test) = load_data()