From 33e11dad3d4cc010afd82ab5adb1acb520cd2b9c Mon Sep 17 00:00:00 2001
From: kubapok
Date: Sat, 2 Jul 2022 12:02:13 +0200
Subject: [PATCH] hf_roberta_base

---
 hf_roberta_base/01_create_datasets.py | 16 +++
 hf_roberta_base/02_load_dataset.py | 70 ++++++++++
 .../03_train_pytorch_regression.py | 127 ++++++++++++++++++
 hf_roberta_base/04_predict.py | 48 +++++++
 hf_roberta_base/04_predict_from_file.py | 53 ++++++++
 hf_roberta_base/config.py | 4 +
 6 files changed, 318 insertions(+)
 create mode 100644 hf_roberta_base/01_create_datasets.py
 create mode 100644 hf_roberta_base/02_load_dataset.py
 create mode 100644 hf_roberta_base/03_train_pytorch_regression.py
 create mode 100644 hf_roberta_base/04_predict.py
 create mode 100644 hf_roberta_base/04_predict_from_file.py
 create mode 100644 hf_roberta_base/config.py

diff --git a/hf_roberta_base/01_create_datasets.py b/hf_roberta_base/01_create_datasets.py
new file mode 100644
index 0000000..4a1e22f
--- /dev/null
+++ b/hf_roberta_base/01_create_datasets.py
@@ -0,0 +1,16 @@
+
+for split in 'train', 'dev-0':
+    with open(f'../{split}/in.tsv') as f_in, open(f'../{split}/expected.tsv') as f_exp, open(f'./{split}_huggingface_format.csv', 'w') as f_hf:
+        f_hf.write('year_start_float\tyear_end_float\tyear_middle_float\tyear_middle_int\ttext\n')
+        for line_in, line_exp in zip(f_in, f_exp):
+            year_start_float, year_end_float = line_exp.rstrip().split(',')
+            year_middle_float = (float(year_start_float) + float(year_end_float)) / 2
+            year_middle_int = round(year_middle_float)
+            f_hf.write(f'{year_start_float}\t{year_end_float}\t{year_middle_float}\t{year_middle_int}\t{line_in}')
+
+for split in ('test-A',):
+    with open(f'../{split}/in.tsv') as f_in, open(f'./{split}_huggingface_format.csv', 'w') as f_hf:
+        f_hf.write('year_start_float\tyear_end_float\tyear_middle_float\tyear_middle_int\ttext\n')
+        for line_in in f_in:
+            expected = '0.0\t0.0\t0.0\t0'
+            f_hf.write(expected + '\t' + line_in)
diff --git a/hf_roberta_base/02_load_dataset.py b/hf_roberta_base/02_load_dataset.py
new file mode 100644
index 0000000..a08d499
--- /dev/null
+++ b/hf_roberta_base/02_load_dataset.py
@@ -0,0 +1,70 @@
+from config import MODEL, TEST
+import pickle
+from datasets import load_dataset, Dataset
+from transformers import AutoTokenizer
+from tqdm import tqdm
+from sklearn.preprocessing import MinMaxScaler
+import numpy as np
+
+values = ('year_start_float', 'year_end_float', 'year_middle_float', 'year_middle_int', 'text')
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+def tokenize_function(examples):
+    t = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
+    return t
+
+def get_dataset_dict(dataset):
+    with open(dataset) as f_in:
+        next(f_in)
+        d = dict()
+        for v in values:
+            d[v] = list()
+        for l in f_in:
+            args = l.rstrip().split('\t')
+            for v, a in zip(values, args):
+                d[v].append(a)
+    return d
+
+train_dataset = Dataset.from_dict(get_dataset_dict('train_huggingface_format.csv')).map(tokenize_function, batched=True).shuffle(seed=42)
+eval_dataset_full = Dataset.from_dict(get_dataset_dict('dev-0_huggingface_format.csv')).map(tokenize_function, batched=True)
+eval_dataset_small = eval_dataset_full.shuffle(seed=42).select(range(2000))
+test_dataset_A = Dataset.from_dict(get_dataset_dict('test-A_huggingface_format.csv')).map(tokenize_function, batched=True)
+
+if TEST:
+    train_dataset = train_dataset.select(range(500))
+    eval_dataset_full = eval_dataset_full.select(range(400))
+    eval_dataset_small = eval_dataset_small.select(range(50))
+    test_dataset_A = test_dataset_A.select(range(200))
+
+
+scalers = dict()
+values_to_scale = ('year_start_float', 'year_end_float', 'year_middle_float')
+for v in values_to_scale:
+    scalers[v] = MinMaxScaler().fit(np.array(train_dataset[v]).reshape(-1, 1))
+
+def add_scaled(example):
+    for factor in values_to_scale:
+        example[factor + '_scaled'] = scalers[factor].transform(np.array(example[factor]).reshape(-1,1)).reshape(1,-1)[0].item()
+    return example
+
+
+train_dataset = train_dataset.map(add_scaled)
+eval_dataset_full = eval_dataset_full.map(add_scaled)
+eval_dataset_small = eval_dataset_small.map(add_scaled)
+test_dataset_A = test_dataset_A.map(add_scaled)
+
+
+with open('train_dataset.pickle', 'wb') as f_p:
+    pickle.dump(train_dataset, f_p)
+
+with open('eval_dataset_small.pickle', 'wb') as f_p:
+    pickle.dump(eval_dataset_small, f_p)
+
+with open('eval_dataset_full.pickle', 'wb') as f_p:
+    pickle.dump(eval_dataset_full, f_p)
+
+with open('test_dataset_A.pickle', 'wb') as f_p:
+    pickle.dump(test_dataset_A, f_p)
+
+with open('scalers.pickle', 'wb') as f_p:
+    pickle.dump(scalers, f_p)
diff --git a/hf_roberta_base/03_train_pytorch_regression.py b/hf_roberta_base/03_train_pytorch_regression.py
new file mode 100644
index 0000000..5694d43
--- /dev/null
+++ b/hf_roberta_base/03_train_pytorch_regression.py
@@ -0,0 +1,127 @@
+from config import MODEL, TEST
+import pickle
+from datasets import load_dataset
+from transformers import AutoTokenizer, RobertaModel, RobertaTokenizer
+from torch.utils.data import DataLoader
+from transformers import AutoModelForSequenceClassification
+from transformers import AdamW
+from transformers import get_scheduler
+import torch
+from tqdm.auto import tqdm
+
+BATCH_SIZE = 1
+EARLY_STOPPING = 3
+WARMUP_STEPS = 10_000
+
+STEPS_EVAL = 5_000
+
+if TEST:
+    STEPS_EVAL = 100
+    WARMUP_STEPS = 10
+
+with open('train_dataset.pickle', 'rb') as f_p:
+    train_dataset = pickle.load(f_p)
+
+with open('eval_dataset_small.pickle', 'rb') as f_p:
+    eval_dataset_small = pickle.load(f_p)
+
+with open('eval_dataset_full.pickle', 'rb') as f_p:
+    eval_dataset_full = pickle.load(f_p)
+
+
+
+train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
+eval_dataloader_small = DataLoader(eval_dataset_small, batch_size=BATCH_SIZE)
+eval_dataloader_full = DataLoader(eval_dataset_full, batch_size=BATCH_SIZE)
+
+model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)
+optimizer = AdamW(model.parameters(), lr=1e-6)
+
+
+num_epochs = 1
+num_training_steps = num_epochs * len(train_dataloader)
+lr_scheduler = get_scheduler(
+    "linear",
+    optimizer=optimizer,
+    num_warmup_steps=WARMUP_STEPS,
+    num_training_steps=num_training_steps
+)
+
+
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+model.to(device)
+
+
+progress_bar = tqdm(range(num_training_steps))
+model.train()
+
+
+# transform_batch: convert a default-collated HF Dataset batch into model inputs;
+# the tokenized columns arrive as lists of per-position tensors, hence stack + permute.
+def transform_batch(batch):
+    batch['input_ids'] = torch.stack(batch['input_ids']).permute(1,0).to(device)
+    batch['attention_mask'] = torch.stack(batch['attention_mask']).permute(1,0).to(device)
+    batch['labels'] = batch['year_middle_float_scaled'].to(device).float()
+
+    batch['labels'].to(device)
+    batch['input_ids'].to(device)
+    batch['attention_mask'].to(device)
+
+    for c in set(batch.keys()) - {'input_ids', 'attention_mask', 'labels'}:
+        del batch[c]
+    return batch
+
+
+def eval(full=False):
+    model.eval()
+    eval_loss = 0.0
+    dataloader = eval_dataloader_full if full else eval_dataloader_small
+    for i, batch in enumerate(dataloader):
+        batch = transform_batch(batch)
+        outputs = model(**batch)
+        loss = outputs.loss
+        eval_loss += loss.item()
+    print(f'epoch {epoch} eval loss: {eval_loss / len(dataloader)}')
+    model.train()
+    return eval_loss
+
+
+best_eval_loss = 9999
+epochs_without_progress = 0
+for epoch in range(num_epochs):
+    train_loss = 0.0
+    for i, batch in enumerate(train_dataloader):
+        batch = transform_batch(batch)
+        outputs = model(**batch)
+        loss = outputs.loss
+        loss.backward()
+        train_loss += loss.item()
+        progress_bar.update(1)
+
+        # DELAYED UPDATE (gradient accumulation, currently disabled)
+        #if i % 16 == 1 and i > 1:
+        #    optimizer.step()
+        #    #lr_scheduler.step()
+        #    optimizer.zero_grad()
+
+        # immediate update on every step
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+
+        if i % STEPS_EVAL == 0 and i > 1:
+            print(f' epoch {epoch} train loss: {train_loss / STEPS_EVAL }', end='\t\t')
+            train_loss = 0.0
+            eval(full=False)
+
+    model.save_pretrained(f'roberta_year_prediction/epoch_{epoch}')
+    eval_loss = eval(full=True)
+
+    if eval_loss < best_eval_loss:
+        model.save_pretrained(f'roberta_year_prediction/epoch_best')
+        best_eval_loss = eval_loss
+    else:
+        epochs_without_progress += 1
+
+    if epochs_without_progress > EARLY_STOPPING:
+        break
diff --git a/hf_roberta_base/04_predict.py b/hf_roberta_base/04_predict.py
new file mode 100644
index 0000000..ac0b51f
--- /dev/null
+++ b/hf_roberta_base/04_predict.py
@@ -0,0 +1,48 @@
+import pickle
+import torch
+from transformers import AutoModelForSequenceClassification
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+
+with open('eval_dataset_full.pickle','rb') as f_p:
+    eval_dataset_full = pickle.load(f_p)
+
+with open('test_dataset_A.pickle','rb') as f_p:
+    test_dataset = pickle.load(f_p)
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_best')
+model.eval()
+model.to(device)
+
+with open('scalers.pickle', 'rb') as f_scaler:
+    scalers = pickle.load(f_scaler)
+
+def predict(dataset, out_f):
+    eval_dataloader = DataLoader(dataset, batch_size=1)
+    outputs = []
+
+    progress_bar = tqdm(range(len(eval_dataloader)))
+
+    for batch in eval_dataloader:
+        batch['input_ids'] = torch.stack(batch['input_ids']).permute(1,0).to(device)
+        batch['attention_mask'] = torch.stack(batch['attention_mask']).permute(1,0).to(device)
+        batch['labels'] = batch['year_middle_float_scaled'].to(device).float()
+
+        batch['labels'].to(device)
+        batch['input_ids'].to(device)
+        batch['attention_mask'].to(device)
+
+        for c in set(batch.keys()) - {'input_ids', 'attention_mask', 'labels'}:
+            del batch[c]
+        outputs.extend(model(**batch).logits.tolist())
+        progress_bar.update(1)
+    outputs_transformed = scalers['year_middle_float'].inverse_transform(outputs)
+
+    with open(out_f,'w') as f_out:
+
+        for o in outputs_transformed:
+            f_out.write(str(o[0]) + '\n')
+
+predict(eval_dataset_full, '../dev-0/out.tsv')
+predict(test_dataset, '../test-A/out.tsv')
diff --git a/hf_roberta_base/04_predict_from_file.py b/hf_roberta_base/04_predict_from_file.py
new file mode 100644
index 0000000..186df77
--- /dev/null
+++ b/hf_roberta_base/04_predict_from_file.py
@@ -0,0 +1,53 @@
+import pickle
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+
+#with open('train_dataset.pickle','rb') as f_p:
+#    train_dataset = pickle.load(f_p)
+#
+#with open('eval_dataset_small.pickle','rb') as f_p:
+#    eval_dataset_small = pickle.load(f_p)
+#
+#with open('eval_dataset_full.pickle','rb') as f_p:
+#    eval_dataset_full = pickle.load(f_p)
+#
+#with open('test_dataset_A.pickle','rb') as f_p:
+#    test_dataset_A = pickle.load(f_p)
+
+with open('dev-0_huggingface_format.csv','r') as f_p:
+    eval_dataset_full = f_p.readlines()
+
+with open('test-A_huggingface_format.csv','r') as f_p:
+    test_dataset = f_p.readlines()
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_best')
+tokenizer = AutoTokenizer.from_pretrained('roberta-base')
+model.eval()
+model.to(device)
+
+with open('scalers.pickle', 'rb') as f_scaler:
+    scalers = pickle.load(f_scaler)
+
+# tokenize each row on the fly instead of reading the pickled datasets
+def predict(dataset, out_f):
+    outputs = []
+
+    for sample in tqdm(dataset[1:]):
+        *_, t = sample.split('\t')  # the text is the last of the five tab-separated columns
+        t = t.rstrip()
+
+        t = tokenizer(t, padding="max_length", truncation=True, max_length=512, return_tensors='pt').to(device)
+
+        outputs.extend(model(**t).logits.tolist())
+    outputs_transformed = scalers['year_middle_float'].inverse_transform(outputs)
+
+    with open(out_f,'w') as f_out:
+
+        for o in outputs_transformed:
+            f_out.write(str(o[0]) + '\n')
+
+predict(eval_dataset_full, '../dev-0/out.tsv')
+predict(test_dataset, '../test-A/out.tsv')
diff --git a/hf_roberta_base/config.py b/hf_roberta_base/config.py
new file mode 100644
index 0000000..150159e
--- /dev/null
+++ b/hf_roberta_base/config.py
@@ -0,0 +1,4 @@
+#MODEL = '/home/wmi/RoBERTa/without_date/checkpoint-1325000'
+MODEL = 'roberta-base'
+TEST = True  # when True, the other scripts run on small data subsets as a quick smoke test
+
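
For reference, a minimal inference sketch (not part of the patch) showing how the artifacts produced above can be used to predict a year for a single text. It assumes 02_load_dataset.py and 03_train_pytorch_regression.py have already been run from hf_roberta_base/, so that roberta_year_prediction/epoch_best and scalers.pickle exist; the input string is a hypothetical example.

import pickle

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_best')
model.eval()
model.to(device)

# the MinMaxScalers fitted on the training years in 02_load_dataset.py
with open('scalers.pickle', 'rb') as f_scaler:
    scalers = pickle.load(f_scaler)

text = 'Example input text.'  # hypothetical sample
encoded = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt').to(device)
with torch.no_grad():
    scaled = model(**encoded).logits.cpu().numpy()  # shape (1, 1), prediction in scaled space
year = scalers['year_middle_float'].inverse_transform(scaled)[0][0]
print(year)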