forked from kubapok/en-ner-conll-2003
load_data and tokenize
commit 0553a8f27f
parent 37da463746
.gitignore vendored (1 change)
@@ -1,2 +1,3 @@
 *~
 *.pyc
+venv/*
seq.py (47 changes)
@@ -1,21 +1,48 @@
+from numpy.lib.shape_base import split
 import pandas as pd
 import numpy as np
 import gensim
 import torch
 import pandas as pd
-import seaborn as sns
 from sklearn.model_selection import train_test_split
+from collections import Counter
+from torchtext.vocab import Vocab
+
+
+# Functions from jupyter
+def build_vocab(dataset):
+    counter = Counter()
+    for document in dataset:
+        counter.update(document)
+    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
+
+
+def data_process(dt, vocab):
+    return [torch.tensor([vocab['<bos>']] + [vocab[token] for token in document] + [vocab['<eos>']], dtype=torch.long) for document in dt]
+
+
+def labels_process(dt, vocab):
+    return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]
+
 
 # Load data
-train = pd.read_csv('train/train.tsv', sep='\t', names=['labels', 'document'])
-Y_train = train['labels'].values
-X_train = train['document'].values
-
-test = pd.read_csv('test-A/in.tsv', sep='\t', names=['document'])
-X_test = test['document'].values
-
-dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['document'])
-exp = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['labels'])
-X_dev = dev['document'].values
-Y_dev = dev['labels'].values
-
+def load_data():
+    train = pd.read_csv('train/train.tsv', sep='\t',
+                        names=['labels', 'document'])
+
+    Y_train = [y.split(sep=" ") for y in train['labels'].values]
+    X_train = [x.split(sep=" ") for x in train['document'].values]
+
+    dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['document'])
+    exp = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['labels'])
+    X_dev = [x.split(sep=" ") for x in dev['document'].values]
+    Y_dev = [y.split(sep=" ") for y in exp['labels'].values]
+
+    test = pd.read_csv('test-A/in.tsv', sep='\t', names=['document'])
+    X_test = test['document'].values
+
+    return X_train, Y_train, X_dev, Y_dev, X_test
+
+
+if __name__ == "__main__":
+    X_train, Y_train, X_dev, Y_dev, X_test = load_data()
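
A minimal usage sketch (not part of this commit) of how the new helpers compose, assuming the file is importable as seq and the legacy torchtext Vocab API targeted above (a Vocab built from a Counter, with vocab[token] lookup). The label_vocab mapping and the *_ids names are illustrative assumptions, not code from the repository.

# Sketch: load the splits, build a token vocabulary, and numericalize.
from seq import load_data, build_vocab, data_process, labels_process

X_train, Y_train, X_dev, Y_dev, X_test = load_data()

# Token vocabulary over the whitespace-tokenized training documents.
vocab = build_vocab(X_train)

# One LongTensor per document, wrapped in <bos>/<eos> indices.
train_tensors = data_process(X_train, vocab)
dev_tensors = data_process(X_dev, vocab)

# labels_process expects integer-coded tag sequences, so the string
# tags are mapped through a (hypothetical) label vocabulary first.
label_vocab = build_vocab(Y_train)
Y_train_ids = [[label_vocab[tag] for tag in doc] for doc in Y_train]
train_label_tensors = labels_process(Y_train_ids, label_vocab)

Note that X_test is left unsplit by load_data, so it would need the same whitespace split before being passed to data_process.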