From 0553a8f27f7905b414c8b4683fa53c9bf87d774e Mon Sep 17 00:00:00 2001 From: Maciej Sobkowiak Date: Mon, 21 Jun 2021 21:56:24 +0200 Subject: [PATCH] load_data and tokenize --- .gitignore | 1 + seq.py | 47 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 2f836aa..3116d85 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *~ *.pyc +venv/* \ No newline at end of file diff --git a/seq.py b/seq.py index c57a458..a7f447b 100644 --- a/seq.py +++ b/seq.py @@ -1,21 +1,48 @@ +from numpy.lib.shape_base import split import pandas as pd import numpy as np import gensim import torch import pandas as pd -import seaborn as sns from sklearn.model_selection import train_test_split +from collections import Counter +from torchtext.vocab import Vocab + + +# Functions from jupyter +def build_vocab(dataset): + counter = Counter() + for document in dataset: + counter.update(document) + return Vocab(counter, specials=['', '', '', '']) + + +def data_process(dt, vocab): + return [torch.tensor([vocab['']] + [vocab[token] for token in document] + [vocab['']], dtype=torch.long) for document in dt] + + +def labels_process(dt, vocab): + return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt] # Load data -train = pd.read_csv('train/train.tsv', sep='\t', names=['labels', 'document']) -Y_train = train['labels'].values -X_train = train['document'].values +def load_data(): + train = pd.read_csv('train/train.tsv', sep='\t', + names=['labels', 'document']) -test = pd.read_csv('test-A/in.tsv', sep='\t', names=['document']) -X_test = test['document'].values + Y_train = [y.split(sep=" ") for y in train['labels'].values] + X_train = [x.split(sep=" ") for x in train['document'].values] -dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['document']) -exp = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['labels']) -X_dev = dev['document'].values -Y_dev = dev['labels'].values + dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['document']) + exp = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['labels']) + X_dev = [x.split(sep=" ") for x in dev['document'].values] + Y_dev = [y.split(sep=" ") for y in exp['labels'].values] + + test = pd.read_csv('test-A/in.tsv', sep='\t', names=['document']) + X_test = test['document'].values + + return X_train, Y_train, X_dev, Y_dev, X_test + + +if __name__ == "__main__": + X_train, Y_train, X_dev, Y_dev, X_test = load_data()