3.3 KiB
3.3 KiB
import pandas as pd
import numpy as np
import os.path
import gzip
import shutil
import torch
# Unpack the xz-compressed training set only when the plain TSV is not
# already on disk; later runs reuse the extracted file.
if not os.path.isfile('train/train.tsv'):
    import lzma

    with lzma.open('train/train.tsv.xz', 'rb') as packed, \
            open('train/train.tsv', 'wb') as unpacked:
        # Stream the bytes across instead of loading the archive in memory.
        shutil.copyfileobj(packed, unpacked)
# Each TSV row is read as two columns: a whitespace-separated label sequence
# and a whitespace-separated token sequence.
raw_data = pd.read_csv('train/train.tsv', sep='\t', names=['labels', 'text'])

# Flatten to one row per (label, word) pair with simple surface features.
data = []
for labels, text in zip(raw_data['labels'], raw_data['text']):
    # zip() stops at the shorter sequence, so any unmatched trailing
    # labels or words in a row are dropped.
    for label, word in zip(labels.split(), text.split()):
        data.append([
            label,
            word,
            len(word),
            any(c.isdigit() for c in word),
            # Test only the first character so the value matches the
            # 'CapitalFirst' column name; word.isupper() was True only for
            # words with no lowercase letters at all.
            word[:1].isupper(),
        ])

df = pd.DataFrame(
    data,
    columns=['Label', 'Word', 'WordLen', 'WordHasDigit', 'CapitalFirst'],
)

# Sanity check: show rows with a missing label. `== None` compares
# element-wise and is always False, so use isna() to actually detect them.
df[df["Label"].isna()]
| Label | Word | WordLen | WordHasDigit | CapitalFirst |
|---|---|---|---|---|
def labels_process(dt, *, pad_label=0):
    """Convert per-document label sequences into padded ``torch.long`` tensors.

    One ``pad_label`` is placed at each end of every document, presumably so
    the labels stay aligned with the ``<bos>``/``<eos>`` tokens that
    ``data_process`` adds around the token ids.

    Args:
        dt: Iterable of documents, each an iterable of integer label ids.
        pad_label: Label id placed before and after every document
            (default ``0``, the value that used to be hard-coded).

    Returns:
        A list with one 1-D ``torch.long`` tensor per document, each two
        elements longer than the document it was built from.
    """
    # Unpacking instead of ``[0] + document + [0]`` accepts any iterable:
    # list concatenation raised TypeError for tuples and silently
    # broadcast-added (no padding at all) for NumPy arrays.
    return [
        torch.tensor([pad_label, *document, pad_label], dtype=torch.long)
        for document in dt
    ]
def data_process(dt):
    """Encode tokenised documents as ``torch.long`` id tensors.

    Every document is framed with the ``'<bos>'`` id in front and the
    ``'<eos>'`` id behind. ``vocab`` is resolved from module scope (it is
    defined outside this function) and must support ``vocab[token]`` lookups,
    including for ``'<bos>'`` and ``'<eos>'``.

    Args:
        dt: Iterable of documents, each an iterable of tokens.

    Returns:
        A list with one 1-D ``torch.long`` tensor per document.
    """
    encoded = []
    for document in dt:
        # Lookups happen left to right: <bos>, each token, then <eos>.
        token_ids = [
            vocab['<bos>'],
            *(vocab[token] for token in document),
            vocab['<eos>'],
        ]
        encoded.append(torch.tensor(token_ids, dtype=torch.long))
    return encoded