load_data and tokenize

This commit is contained in:
Maciej Sobkowiak 2021-06-21 21:56:24 +02:00
parent 37da463746
commit 0553a8f27f
2 changed files with 38 additions and 10 deletions

1
.gitignore vendored
View File

@ -1,2 +1,3 @@
*~ *~
*.pyc *.pyc
venv/*

47
seq.py
View File

@ -1,21 +1,48 @@
from numpy.lib.shape_base import split
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import gensim import gensim
import torch import torch
import pandas as pd import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from collections import Counter
from torchtext.vocab import Vocab
# Functions from jupyter
def build_vocab(dataset):
    """Build a torchtext ``Vocab`` from tokenized documents.

    Args:
        dataset: Iterable of documents. Each document is itself an iterable
            of hashable tokens; every element is counted once per
            occurrence (a plain string would be counted character by
            character).

    Returns:
        The ``Vocab`` constructed from the token counts, with ``<unk>``,
        ``<pad>``, ``<bos>`` and ``<eos>`` passed as ``specials``.
    """
    # NOTE(review): ``Vocab(counter, specials=...)`` looks like the
    # Counter-based constructor of older torchtext releases -- confirm the
    # installed torchtext version still accepts this call.
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
def data_process(dt, vocab):
    """Convert tokenized documents into ``torch.long`` index tensors.

    Args:
        dt: Iterable of documents, each an iterable of tokens.
        vocab: Mapping-like object indexed with ``vocab[token]``; it is also
            indexed with ``'<bos>'`` and ``'<eos>'``.

    Returns:
        A list with one 1-D tensor per document, laid out as
        ``[vocab['<bos>'], <token indices...>, vocab['<eos>']]``.
    """
    return [
        torch.tensor(
            [vocab['<bos>'], *(vocab[token] for token in document), vocab['<eos>']],
            dtype=torch.long,
        )
        for document in dt
    ]
def labels_process(dt, vocab):
    """Convert label sequences into ``torch.long`` tensors padded with 0.

    Args:
        dt: Iterable of label sequences. The elements must be values that
            ``torch.tensor(..., dtype=torch.long)`` accepts (string labels
            have to be mapped to integers beforehand).
        vocab: Accepted but not used by this function.

    Returns:
        A list with one 1-D tensor per sequence, laid out as
        ``[0, <labels...>, 0]``.
    """
    # NOTE(review): the surrounding zeros presumably line up with the
    # <bos>/<eos> positions that data_process adds -- confirm.
    # Unpacking (rather than list concatenation) gives the same result for
    # lists and also pads tuples / arrays correctly instead of raising or
    # broadcasting.
    return [torch.tensor([0, *document, 0], dtype=torch.long) for document in dt]
# Load data
def load_data():
    """Read the train, dev and test TSV files and tokenize them.

    Files are read relative to the current working directory:
    ``train/train.tsv`` (columns: labels, document), ``dev-0/in.tsv``
    (document), ``dev-0/expected.tsv`` (labels) and ``test-A/in.tsv``
    (document).

    Returns:
        A tuple ``(X_train, Y_train, X_dev, Y_dev, X_test)``. The train and
        dev entries are lists of lists of strings, produced by splitting
        each row on a single space. ``X_test`` is the raw ``document``
        column values (one unsplit string per row).
    """
    # NOTE(review): pandas' default quoting treats '"' as a quote character;
    # if the raw text can contain quotes, rows may be parsed differently than
    # intended (``quoting=csv.QUOTE_NONE`` avoids that) -- verify on the data.
    train = pd.read_csv('train/train.tsv', sep='\t',
                        names=['labels', 'document'])
    Y_train = [y.split(sep=" ") for y in train['labels'].values]
    X_train = [x.split(sep=" ") for x in train['document'].values]

    dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['document'])
    exp = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['labels'])
    X_dev = [x.split(sep=" ") for x in dev['document'].values]
    Y_dev = [y.split(sep=" ") for y in exp['labels'].values]

    test = pd.read_csv('test-A/in.tsv', sep='\t', names=['document'])
    # NOTE(review): unlike X_train / X_dev, the test documents are returned
    # unsplit -- confirm whether that is intentional.
    X_test = test['document'].values
    return X_train, Y_train, X_dev, Y_dev, X_test
if __name__ == "__main__":
    # Script entry point: load every data split when run directly.
    X_train, Y_train, X_dev, Y_dev, X_test = load_data()