hf roberta base epoch1 (fix)

Jakub Pokrywka 2021-12-14 12:30:15 +01:00
parent f986c74861
commit dc6eb48ec2
5 changed files with 19048 additions and 19037 deletions

File diff suppressed because it is too large.


@@ -1,26 +1,38 @@
 import pickle
-from datasets import load_dataset
+from datasets import load_dataset, Dataset
 from transformers import AutoTokenizer
 from tqdm import tqdm
 from sklearn.preprocessing import MinMaxScaler
 import numpy as np
-dataset = load_dataset('csv', sep='\t', data_files={'train': ['./train_huggingface_format.csv'], 'test': ['./dev-0_huggingface_format.csv']})
-test_dataset_A = load_dataset('csv', sep='\t', data_files='test-A_huggingface_format.csv')
-
+#dataset = load_dataset('csv', sep='\t', data_files={'train': ['./train_huggingface_format.csv'], 'test': ['./dev-0_huggingface_format.csv']})
+#test_dataset_A = load_dataset('csv', sep='\t', data_files='test-A_huggingface_format.csv')
+#
 tokenizer = AutoTokenizer.from_pretrained('roberta-base')
-
+#
 def tokenize_function(examples):
     t = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
     return t
-
-test_tokenized_datasets_A = test_dataset_A.map(tokenize_function, batched=True)
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
-train_dataset = tokenized_datasets["train"].shuffle(seed=42)
-eval_dataset_full = tokenized_datasets["test"]
-eval_dataset_small = tokenized_datasets["test"].select(range(2000))
-test_dataset_A = test_tokenized_datasets_A["train"]
+#
+#test_tokenized_datasets_A = test_dataset_A.map(tokenize_function, batched=True)
+#tokenized_datasets = dataset.map(tokenize_function, batched=True)
+def get_dataset_dict(dataset):
+    with open(dataset) as f_in:
+        next(f_in)
+        d = dict()
+        d['year'] = list()
+        d['text'] = list()
+        for l in f_in:
+            y,t = l.rstrip().split('\t')
+            d['year'].append(y)
+            d['text'].append(t)
+        return d
+
+train_dataset = Dataset.from_dict(get_dataset_dict('train_huggingface_format.csv')).map(tokenize_function, batched=True).shuffle(seed=42)
+eval_dataset_full = Dataset.from_dict(get_dataset_dict('dev-0_huggingface_format.csv')).map(tokenize_function, batched=True).shuffle(seed=42)
+eval_dataset_small = eval_dataset_full.select(range(2000))
+test_dataset_A = Dataset.from_dict(get_dataset_dict('test-A_huggingface_format.csv')).map(tokenize_function, batched=True).shuffle(seed=42)
 
 scalers = dict()

@@ -34,7 +46,7 @@ def add_scaled(example):
 train_dataset = train_dataset.map(add_scaled)
 eval_dataset_full = eval_dataset_full.map(add_scaled)
 eval_dataset_small = eval_dataset_small.map(add_scaled)
-#test_dataset_A = test_dataset_A.map(add_scaled)
+test_dataset_A = test_dataset_A.map(add_scaled)
 
 with open('train_dataset.pickle','wb') as f_p:
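Note: the second hunk's context references a `scalers` dict and a MinMaxScaler-based `add_scaled` step that are defined between the two hunks and not shown in this diff. A minimal sketch of what that stage presumably looks like; the 'label' field name and fitting the scaler on the training years only are assumptions, not taken from this commit:

# Hypothetical sketch of the scaling stage referenced above; the 'label' field
# name and the fit-on-train-only choice are assumptions, not part of this diff.
scalers['year'] = MinMaxScaler().fit(
    np.array(train_dataset['year'], dtype=float).reshape(-1, 1))

def add_scaled(example):
    # Map the raw year onto [0, 1] so it can be regressed with a single output.
    year = np.array([[float(example['year'])]])
    example['label'] = scalers['year'].transform(year)[0][0]
    return example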


@@ -10,7 +10,6 @@ from tqdm.auto import tqdm
 BATCH_SIZE = 4
 
 with open('train_dataset.pickle','rb') as f_p:
     train_dataset = pickle.load(f_p)

@@ -29,14 +28,14 @@ model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_l
 optimizer = AdamW(model.parameters(), lr=1e-6)
-num_epochs = 3
+num_epochs = 1
 num_training_steps = num_epochs * len(train_dataloader)
-lr_scheduler = get_scheduler(
-    "linear",
-    optimizer=optimizer,
-    num_warmup_steps=0,
-    num_training_steps=num_training_steps
-)
+#lr_scheduler = get_scheduler(
+#    "linear",
+#    optimizer=optimizer,
+#    num_warmup_steps=0,
+#    num_training_steps=num_training_steps
+#)
 
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

@@ -71,7 +70,7 @@ def eval():
         outputs = model(**batch)
         loss = outputs.loss
         eval_loss += loss.item()
-    print(f'eval loss: {eval_loss / i }')
+    print(f'epoch {epoch} eval loss: {eval_loss / i }')
     model.train()

@@ -84,14 +83,14 @@ for epoch in range(num_epochs):
         loss.backward()
         optimizer.step()
-        lr_scheduler.step()
+        #lr_scheduler.step()
         optimizer.zero_grad()
         progress_bar.update(1)
         train_loss += loss.item()
         #import pdb; pdb.set_trace()
         if i % 5000 == 0 and i > 1 :
-            print(f'train loss: {train_loss / 5000 }', end = '\t\t')
+            print(f' epoch {epoch} train loss: {train_loss / 5000 }', end = '\t\t')
             train_loss = 0.0
             eval()
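Note: the per-epoch checkpoint that the prediction script loads below (`./roberta_year_prediction/epoch_0`) is written outside the hunks shown here. A minimal sketch of how that save step presumably fits at the end of each epoch; the exact directory handling is an assumption:

# Minimal sketch, assuming one checkpoint directory per epoch; the save call is
# inferred from the 'epoch_0' path used below and is not part of this diff.
for epoch in range(num_epochs):
    # ... one pass over train_dataloader with the train/eval logging shown above ...
    model.save_pretrained(f'./roberta_year_prediction/epoch_{epoch}')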


@@ -20,7 +20,7 @@ with open('eval_dataset_full.pickle','rb') as f_p:
     eval_dataset_full = pickle.load(f_p)
 
 device = 'cuda'
-model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction')
+model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_0')
 model.eval()
 model.to(device)

@@ -46,7 +46,7 @@ def predict(dataset, out_f):
             del batch[c]
         outputs.extend(model(**batch).logits.tolist())
         progress_bar.update(1)
     outputs_transformed = scalers['year'].inverse_transform(outputs)
 
     with open(out_f,'w') as f_out:
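Note: only a fragment of `predict(dataset, out_f)` appears in the hunk above. A hedged reconstruction of the whole helper for context; the batch handling, the dropped columns, and the output formatting are assumptions, only the logits collection and the `inverse_transform` step come from the diff (`model`, `device` and `scalers` are set up earlier in this script):

# Hedged reconstruction; model, device and scalers come from the surrounding
# script, and everything except the logits/inverse_transform lines is assumed.
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

def predict(dataset, out_f, batch_size=4):
    dataloader = DataLoader(dataset, batch_size=batch_size)
    progress_bar = tqdm(range(len(dataloader)))
    outputs = []
    with torch.no_grad():
        for batch in dataloader:
            # Drop non-tensor columns (e.g. raw 'text'/'year') before the forward pass.
            for c in [k for k in batch if k not in ('input_ids', 'attention_mask')]:
                del batch[c]
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs.extend(model(**batch).logits.tolist())
            progress_bar.update(1)
    # Undo the MinMaxScaler applied during preprocessing to recover calendar years.
    outputs_transformed = scalers['year'].inverse_transform(outputs)
    with open(out_f, 'w') as f_out:
        for year in outputs_transformed:
            f_out.write(str(year[0]) + '\n')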

File diff suppressed because it is too large.