"""Data-loading and tensor-preparation helpers for a token-labelling task."""

from collections import Counter

import gensim
import numpy as np
import pandas as pd
import torch
# `split` is imported from the public numpy namespace: the original path
# `numpy.lib.shape_base` is a private module that is not importable on
# NumPy 2.x, which would make this whole file fail at import time.
from numpy import split
from sklearn.model_selection import train_test_split
from torchtext.vocab import Vocab

# Special vocabulary tokens.
# NOTE(review): the original literals were four empty strings, which looks
# like angle-bracketed tokens stripped by an HTML-unaware export. They are
# restored here to the conventional names -- confirm against the notebook.
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
SPECIAL_TOKENS = [UNK_TOKEN, PAD_TOKEN, BOS_TOKEN, EOS_TOKEN]


# Functions ported from the Jupyter notebook
def build_vocab(dataset):
    """Build a vocabulary from tokenized documents.

    Args:
        dataset: Iterable of documents, each an iterable of tokens.

    Returns:
        A ``Vocab`` constructed from the token counts, with
        ``SPECIAL_TOKENS`` passed as its specials.
    """
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=SPECIAL_TOKENS)


def data_process(dt, vocab):
    """Convert tokenized documents into index tensors.

    Each document is wrapped with the index of ``BOS_TOKEN`` at the start
    and the index of ``EOS_TOKEN`` at the end.

    Args:
        dt: Iterable of documents, each an iterable of tokens.
        vocab: Mapping from token to integer index (supports ``vocab[token]``).

    Returns:
        A list with one 1-D ``torch.long`` tensor per document.
    """
    bos_idx = vocab[BOS_TOKEN]
    eos_idx = vocab[EOS_TOKEN]
    return [
        torch.tensor(
            [bos_idx] + [vocab[token] for token in document] + [eos_idx],
            dtype=torch.long,
        )
        for document in dt
    ]


def labels_process(dt, vocab):
    """Convert per-token label sequences into tensors padded with 0 at both ends.

    The extra leading and trailing 0 presumably line the labels up with the
    boundary tokens that ``data_process`` adds -- confirm with the training
    code.

    Args:
        dt: Iterable of label sequences. Labels must already be integers;
            ``load_data`` returns them as strings, so they have to be mapped
            to ids before being passed here.
        vocab: Unused; kept so the signature mirrors ``data_process``.

    Returns:
        A list with one 1-D ``torch.long`` tensor per label sequence.
    """
    return [torch.tensor([0, *document, 0], dtype=torch.long) for document in dt]


def _split_on_spaces(values):
    """Split every string in *values* on single spaces."""
    return [value.split(" ") for value in values]


# Load data
def load_data():
    """Read the train, dev and test TSV files from the working directory.

    Returns:
        A tuple ``(X_train, Y_train, X_dev, Y_dev, X_test)``. The train and
        dev entries are lists of space-split token (or label) lists.
        ``X_test`` is the raw array of document strings.
    """
    train = pd.read_csv('train/train.tsv', sep='\t', names=['labels', 'document'])
    Y_train = _split_on_spaces(train['labels'].values)
    X_train = _split_on_spaces(train['document'].values)

    dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['document'])
    exp = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['labels'])
    X_dev = _split_on_spaces(dev['document'].values)
    Y_dev = _split_on_spaces(exp['labels'].values)

    test = pd.read_csv('test-A/in.tsv', sep='\t', names=['document'])
    # NOTE(review): unlike the train/dev documents, the test documents are
    # returned untokenized. Kept as-is so existing callers see the same shape.
    X_test = test['document'].values

    return X_train, Y_train, X_dev, Y_dev, X_test


if __name__ == "__main__":
    X_train, Y_train, X_dev, Y_dev, X_test = load_data()