This commit is contained in:
kubapok 2021-12-13 15:23:55 +01:00
parent 543b6e4e0f
commit f986c74861
3 changed files with 36 additions and 36 deletions

View File

@ -17,37 +17,37 @@ def tokenize_function(examples):
# Tokenize the held-out test-A data and the main dataset in batches.
test_tokenized_datasets_A = test_dataset_A.map(tokenize_function, batched=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Shuffle the training split with a fixed seed so runs are reproducible.
train_dataset = tokenized_datasets["train"].shuffle(seed=42)

# Full evaluation split plus a small prefix of it (at most 2000 rows).
eval_dataset_full = tokenized_datasets["test"]
# Clamp the subset size so a split with fewer than 2000 rows does not make
# select() fail on out-of-range indices.
eval_dataset_small = eval_dataset_full.select(
    range(min(2000, len(eval_dataset_full)))
)
test_dataset_A = test_tokenized_datasets_A["train"]

# Fit one min-max scaler per numeric factor on the training split only; the
# fitted scalers are reused for every other split.
scalers = {
    'year': MinMaxScaler().fit(np.array(train_dataset['year']).reshape(-1, 1)),
}
def add_scaled(example):
    """Add a min-max scaled copy of each numeric factor to one example.

    For every factor name handled here (currently only ``'year'``) the value
    stored under ``example[factor]`` is transformed with the fitted scaler in
    the module-level ``scalers`` dict and written back under
    ``factor + '_scaled'`` as a plain Python scalar.

    Args:
        example: A single row that supports item access by column name.

    Returns:
        The same ``example`` object, mutated in place with the added key(s).
    """
    for factor in ('year',):
        # The scaler expects a 2-D (n_samples, n_features) array, so the
        # single value is reshaped to one row with one column.
        column = np.array(example[factor]).reshape(-1, 1)
        # .item() unwraps the single scaled value into a Python scalar.
        example[factor + '_scaled'] = scalers[factor].transform(column).item()
    return example
# Attach the scaled factor column(s) to the training and evaluation splits.
train_dataset = train_dataset.map(add_scaled)
eval_dataset_full = eval_dataset_full.map(add_scaled)
eval_dataset_small = eval_dataset_small.map(add_scaled)
# NOTE(review): scaling is disabled for test_dataset_A -- presumably that
# split has no 'year' column; confirm before re-enabling.
#test_dataset_A = test_dataset_A.map(add_scaled)

# Persist every prepared split and the fitted scalers, in the same order and
# under the same file names as before.
for pickle_path, artifact in (
    ('train_dataset.pickle', train_dataset),
    ('eval_dataset_small.pickle', eval_dataset_small),
    ('eval_dataset_full.pickle', eval_dataset_full),
    ('test_dataset_A.pickle', test_dataset_A),
    ('scalers.pickle', scalers),
):
    with open(pickle_path, 'wb') as f_p:
        pickle.dump(artifact, f_p)

View File

@ -29,7 +29,7 @@ model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_l
optimizer = AdamW(model.parameters(), lr=1e-6)

# Number of passes over the training dataloader.
num_epochs = 3
# Total number of batches processed across all epochs.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
"linear",
@ -95,4 +95,4 @@ for epoch in range(num_epochs):
train_loss = 0.0
eval()
model.save_pretrained('roberta_year_prediction')
model.save_pretrained(f'roberta_year_prediction/epoch_{epoch}')

View File

@ -23,7 +23,7 @@ with open('test-A_huggingface_format.csv','r') as f_p:
test_dataset = f_p.readlines()
device = 'cuda'

# Load the fine-tuned weights once, from the per-epoch checkpoint directory.
# NOTE(review): 'epoch_0' looks like the checkpoint saved after the first
# training epoch -- confirm this is the one intended for inference.
model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_0')
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Switch the model to evaluation mode and move it to the target device.
model.eval()
model.to(device)