update roberta script

This commit is contained in:
s440058 2021-06-22 14:25:22 +02:00
parent 99eb245d6c
commit 8cf99e9f55
3 changed files with 10427 additions and 3 deletions

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -3,7 +3,7 @@ import torch
PATHS = ['train/in.tsv', 'train/expected.tsv', 'dev-0/in.tsv', 'test-A/in.tsv', './dev-0/out.tsv', './test-A/out.tsv'] PATHS = ['train/in.tsv', 'train/expected.tsv', 'dev-0/in.tsv', 'test-A/in.tsv', './dev-0/out.tsv', './test-A/out.tsv']
OUTPUT_PATHS = ['dev-0/out.tsv', 'test-A/out.tsv'] OUTPUT_PATHS = ['dev-0/out.tsv', 'test-A/out.tsv']
PRE_TRAINED = 'roberta-base' PRE_TRAINED = ['roberta-base']
def get_data(path): def get_data(path):
data = [] data = []
@@ -32,8 +32,8 @@ class IMDbDataset(torch.utils.data.Dataset):
return len(self.labels) return len(self.labels)
def prepare(data_train_X, data_train_Y): def prepare(data_train_X, data_train_Y):
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED) tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED[0])
model = AutoModelForSequenceClassification.from_pretrained(PRE_TRAINED, num_labels=2) model = AutoModelForSequenceClassification.from_pretrained(PRE_TRAINED[0], num_labels=2)
device = torch.device("cpu") device = torch.device("cpu")
model.to(device) model.to(device)
encoded_input = tokenizer([text[0] for text in list(zip(data_train_X, data_train_Y))], truncation=True, padding=True) encoded_input = tokenizer([text[0] for text in list(zip(data_train_X, data_train_Y))], truncation=True, padding=True)

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff