en-ner-conll-2003/seq_labeling.py.ipynb
2021-06-01 16:48:56 +02:00

3.3 KiB

import pandas as pd
import numpy as np
import os.path
import gzip
import shutil
import torch
# Extract the xz-compressed training file once; when the plain TSV is already
# on disk this step is skipped entirely.
if not os.path.isfile('train/train.tsv'):
    import lzma
    # The archive is opened first, so a missing .xz fails before the output
    # file is created.
    with lzma.open('train/train.tsv.xz', 'rb') as packed, \
            open('train/train.tsv', 'wb') as unpacked:
        shutil.copyfileobj(packed, unpacked)
# Load the training data: each row holds a space-separated label string in the
# first column and the space-separated words it annotates in the second.
raw_data = pd.read_csv('train/train.tsv', sep='\t', names=['labels', 'text'])

# Flatten the rows into one record per (label, word) pair and attach simple
# surface features of the word.
data = []
for labels, text in zip(raw_data['labels'], raw_data['text']):
    data.extend(
        [
            label,
            word,
            len(word),
            any(c.isdigit() for c in word),
            # CapitalFirst must describe the first character only;
            # str.isupper() on the whole word would miss e.g. "London".
            # word[0] is safe because str.split() never yields empty tokens.
            word[0].isupper(),
        ]
        for label, word in zip(labels.split(), text.split())
    )
df = pd.DataFrame(
    data,
    columns=['Label', 'Word', 'WordLen', 'WordHasDigit', 'CapitalFirst'],
)

# Sanity check: show rows with a missing label. `== None` compares
# element-wise and is always False in pandas, so isna() is required here.
df[df["Label"].isna()]
Label Word WordLen WordHasDigit CapitalFirst
def labels_process(dt):
    """Turn per-document label id sequences into padded ``torch.long`` tensors.

    A ``0`` is placed before and after every document's labels, presumably to
    line up with the ``<bos>``/``<eos>`` positions that ``data_process`` adds
    to the token sequences.

    Args:
        dt: Iterable of documents; each document is an iterable of integer
            label ids (list, tuple, generator, ...).

    Returns:
        A list with one 1-D ``torch.long`` tensor per document, each two
        elements longer than its input.
    """
    # Unpacking instead of list concatenation keeps the result identical for
    # lists while also accepting tuples, generators and arrays (for which
    # `[0] + document` either raises or silently does element-wise addition).
    return [
        torch.tensor([0, *document, 0], dtype=torch.long)
        for document in dt
    ]

def data_process(dt):
    """Encode tokenized documents as ``torch.long`` tensors of vocabulary ids.

    Every token is looked up in the module-level ``vocab``; the resulting id
    sequence is prefixed with ``vocab['<bos>']`` and suffixed with
    ``vocab['<eos>']``.

    Args:
        dt: Iterable of documents, each an iterable of tokens usable as
            ``vocab`` keys.

    Returns:
        A list with one 1-D ``torch.long`` tensor per document.
    """
    encoded = []
    for document in dt:
        # Lookups stay in the order <bos>, tokens, <eos> for every document.
        ids = [vocab['<bos>']]
        ids.extend(vocab[token] for token in document)
        ids.append(vocab['<eos>'])
        encoded.append(torch.tensor(ids, dtype=torch.long))
    return encoded