diff --git a/hf_roberta_base/02_load_dataset.py b/hf_roberta_base/02_load_dataset.py
index 6fc1981..c3a523a 100644
--- a/hf_roberta_base/02_load_dataset.py
+++ b/hf_roberta_base/02_load_dataset.py
@@ -17,37 +17,37 @@ def tokenize_function(examples):
 test_tokenized_datasets_A = test_dataset_A.map(tokenize_function, batched=True)
 tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
-#train_dataset = tokenized_datasets["train"].shuffle(seed=42)
+train_dataset = tokenized_datasets["train"].shuffle(seed=42)
 eval_dataset_full = tokenized_datasets["test"]
-#eval_dataset_small = tokenized_datasets["test"].select(range(2000))
-#test_dataset_A = test_tokenized_datasets_A["train"]
-#
-#
-#scalers = dict()
-#scalers['year'] = MinMaxScaler().fit(np.array(train_dataset['year']).reshape(-1,1))
-#
-#def add_scaled(example):
-#    for factor in ('year',):
-#        example[factor + '_scaled'] = scalers[factor].transform(np.array(example[factor]).reshape(-1,1)).reshape(1,-1)[0].item()
-#    return example
-#
-#train_dataset = train_dataset.map(add_scaled)
-#eval_dataset_full = eval_dataset_full.map(add_scaled)
-#eval_dataset_small = eval_dataset_small.map(add_scaled)
-##test_dataset_A = test_dataset_A.map(add_scaled)
-#
-#
-#with open('train_dataset.pickle','wb') as f_p:
-#    pickle.dump(train_dataset, f_p)
-#
-#with open('eval_dataset_small.pickle','wb') as f_p:
-#    pickle.dump(eval_dataset_small, f_p)
-#
-#with open('eval_dataset_full.pickle','wb') as f_p:
-#    pickle.dump(eval_dataset_full, f_p)
-#
-#with open('test_dataset_A.pickle','wb') as f_p:
-#    pickle.dump(test_dataset_A, f_p)
-#
-#with open('scalers.pickle','wb') as f_p:
-#    pickle.dump(scalers, f_p)
+eval_dataset_small = tokenized_datasets["test"].select(range(2000))
+test_dataset_A = test_tokenized_datasets_A["train"]
+
+
+scalers = dict()
+scalers['year'] = MinMaxScaler().fit(np.array(train_dataset['year']).reshape(-1,1))
+
+def add_scaled(example):
+    for factor in ('year',):
+        example[factor + '_scaled'] = scalers[factor].transform(np.array(example[factor]).reshape(-1,1)).reshape(1,-1)[0].item()
+    return example
+
+train_dataset = train_dataset.map(add_scaled)
+eval_dataset_full = eval_dataset_full.map(add_scaled)
+eval_dataset_small = eval_dataset_small.map(add_scaled)
+#test_dataset_A = test_dataset_A.map(add_scaled)
+
+
+with open('train_dataset.pickle','wb') as f_p:
+    pickle.dump(train_dataset, f_p)
+
+with open('eval_dataset_small.pickle','wb') as f_p:
+    pickle.dump(eval_dataset_small, f_p)
+
+with open('eval_dataset_full.pickle','wb') as f_p:
+    pickle.dump(eval_dataset_full, f_p)
+
+with open('test_dataset_A.pickle','wb') as f_p:
+    pickle.dump(test_dataset_A, f_p)
+
+with open('scalers.pickle','wb') as f_p:
+    pickle.dump(scalers, f_p)
diff --git a/hf_roberta_base/03_train_pytorch_regression.py b/hf_roberta_base/03_train_pytorch_regression.py
index ca760eb..8737f45 100644
--- a/hf_roberta_base/03_train_pytorch_regression.py
+++ b/hf_roberta_base/03_train_pytorch_regression.py
@@ -29,7 +29,7 @@ model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_l
 
 optimizer = AdamW(model.parameters(), lr=1e-6)
 
-num_epochs = 1
+num_epochs = 3
 num_training_steps = num_epochs * len(train_dataloader)
 lr_scheduler = get_scheduler(
     "linear",
@@ -95,4 +95,4 @@ for epoch in range(num_epochs):
     train_loss = 0.0
 
     eval()
-model.save_pretrained('roberta_year_prediction')
+    model.save_pretrained(f'roberta_year_prediction/epoch_{epoch}')
diff --git a/hf_roberta_base/04_predict_from_file.py b/hf_roberta_base/04_predict_from_file.py
index c02d895..c0adbde 100644
--- a/hf_roberta_base/04_predict_from_file.py
+++ b/hf_roberta_base/04_predict_from_file.py
@@ -23,7 +23,7 @@ with open('test-A_huggingface_format.csv','r') as f_p:
     test_dataset = f_p.readlines()
 
 device = 'cuda'
-model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction')
+model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_0')
 tokenizer = AutoTokenizer.from_pretrained('roberta-base')
 model.eval()
 model.to(device)