From 69cd2f5a8b54fe7cbfea7c35ae24db6244532247 Mon Sep 17 00:00:00 2001
From: Jakub Pokrywka
Date: Wed, 15 Dec 2021 14:40:12 +0100
Subject: [PATCH] fixes

---
 hf_challam_roberta_base/01_create_datasets.py |  13 ---
 hf_challam_roberta_base/02_load_dataset.py    |  53 ----------
 .../03_train_pytorch_regression.py            |  98 -------------------
 hf_challam_roberta_base/04_predict.py         |  55 -----------
 .../04_predict_from_file.py                   |  52 ----------
 hf_roberta_base/02_load_dataset.py            |  23 ++---
 .../03_train_pytorch_regression.py            |  71 ++++++++++----
 hf_roberta_base/04_predict.py                 |  19 +---
 hf_roberta_base/04_predict_from_file.py       |   2 +-
 hf_roberta_base/config.py                     |   2 +
 10 files changed, 70 insertions(+), 318 deletions(-)
 delete mode 100644 hf_challam_roberta_base/01_create_datasets.py
 delete mode 100644 hf_challam_roberta_base/02_load_dataset.py
 delete mode 100644 hf_challam_roberta_base/03_train_pytorch_regression.py
 delete mode 100644 hf_challam_roberta_base/04_predict.py
 delete mode 100644 hf_challam_roberta_base/04_predict_from_file.py
 create mode 100644 hf_roberta_base/config.py

diff --git a/hf_challam_roberta_base/01_create_datasets.py b/hf_challam_roberta_base/01_create_datasets.py
deleted file mode 100644
index 2288a27..0000000
--- a/hf_challam_roberta_base/01_create_datasets.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import datetime
-
-for split in 'train', 'dev-0':
-    with open(f'../{split}/in.tsv') as f_in, open(f'../{split}/expected.tsv') as f_exp, open(f'./{split}_huggingface_format.csv', 'w') as f_hf:
-        f_hf.write('year\ttext\n')
-        for line_in,line_exp in zip(f_in,f_exp):
-            f_hf.write(line_exp.rstrip() + '\t' + line_in.split('\t')[1])
-
-for split in ('test-A',):
-    with open(f'../{split}/in.tsv') as f_in, open(f'./{split}_huggingface_format.csv', 'w') as f_hf:
-        f_hf.write('year\ttext\n')
-        for line_in in f_in:
-            f_hf.write('0.0' + '\t' + line_in.split('\t')[1])
diff --git a/hf_challam_roberta_base/02_load_dataset.py b/hf_challam_roberta_base/02_load_dataset.py
deleted file mode 100644
index 49f5bee..0000000
--- a/hf_challam_roberta_base/02_load_dataset.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import pickle
-from datasets import load_dataset
-from transformers import AutoTokenizer
-from tqdm import tqdm
-from sklearn.preprocessing import MinMaxScaler
-import numpy as np
-
-dataset = load_dataset('csv', sep='\t', data_files={'train': ['./train_huggingface_format.csv'], 'test': ['./dev-0_huggingface_format.csv']})
-test_dataset_A = load_dataset('csv', sep='\t', data_files='test-A_huggingface_format.csv')
-
-tokenizer = AutoTokenizer.from_pretrained('without_date/checkpoint-395000')
-
-def tokenize_function(examples):
-    t = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-    return t
-
-test_tokenized_datasets_A = test_dataset_A.map(tokenize_function, batched=True)
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
-
-train_dataset = tokenized_datasets["train"].shuffle(seed=42)
-eval_dataset_full = tokenized_datasets["test"]
-eval_dataset_small = tokenized_datasets["test"].select(range(2000))
-test_dataset_A = test_tokenized_datasets_A["train"]
-
-
-scalers = dict()
-scalers['year'] = MinMaxScaler().fit(np.array(train_dataset['year']).reshape(-1,1))
-
-def add_scaled(example):
-    for factor in ('year',):
-        example[factor + '_scaled'] = scalers[factor].transform(np.array(example[factor]).reshape(-1,1)).reshape(1,-1)[0].item()
-    return example
-
-train_dataset = train_dataset.map(add_scaled)
-eval_dataset_full = eval_dataset_full.map(add_scaled)
-eval_dataset_small = eval_dataset_small.map(add_scaled)
-#test_dataset_A = test_dataset_A.map(add_scaled)
-
-
-with open('train_dataset.pickle','wb') as f_p:
-    pickle.dump(train_dataset, f_p)
-
-with open('eval_dataset_small.pickle','wb') as f_p:
-    pickle.dump(eval_dataset_small, f_p)
-
-with open('eval_dataset_full.pickle','wb') as f_p:
-    pickle.dump(eval_dataset_full, f_p)
-
-with open('test_dataset_A.pickle','wb') as f_p:
-    pickle.dump(test_dataset_A, f_p)
-
-with open('scalers.pickle','wb') as f_p:
-    pickle.dump(scalers, f_p)
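Both the deleted loader above and its kept hf_roberta_base counterpart squeeze the year target into [0, 1] with a MinMaxScaler fitted on the training years, and pickle the fitted scaler so predictions can later be mapped back to calendar years. A minimal, standalone sketch of that round trip (the year values are made up; only the file name and dict layout mirror the scripts):

    import pickle
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    years = np.array([1814.0, 1905.5, 2013.99]).reshape(-1, 1)   # toy values, not the real corpus
    scaler = MinMaxScaler().fit(years)                           # fit on training years only

    scaled = scaler.transform(years)                             # values in [0, 1], used as regression labels
    restored = scaler.inverse_transform(scaled)                  # back to years, as done at prediction time

    with open('scalers.pickle', 'wb') as f_p:                    # same layout the scripts pickle
        pickle.dump({'year': scaler}, f_p)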
diff --git a/hf_challam_roberta_base/03_train_pytorch_regression.py b/hf_challam_roberta_base/03_train_pytorch_regression.py
deleted file mode 100644
index ceffade..0000000
--- a/hf_challam_roberta_base/03_train_pytorch_regression.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import pickle
-from datasets import load_dataset
-from transformers import AutoTokenizer, RobertaModel, RobertaTokenizer
-from torch.utils.data import DataLoader
-from transformers import AutoModelForSequenceClassification
-from transformers import AdamW
-from transformers import get_scheduler
-import torch
-from tqdm.auto import tqdm
-
-BATCH_SIZE = 4
-
-
-with open('train_dataset.pickle','rb') as f_p:
-    train_dataset = pickle.load(f_p)
-
-with open('eval_dataset_small.pickle','rb') as f_p:
-    eval_dataset_small = pickle.load(f_p)
-
-with open('eval_dataset_full.pickle','rb') as f_p:
-    eval_dataset_full = pickle.load(f_p)
-
-
-train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
-eval_dataloader = DataLoader(eval_dataset_small, batch_size=BATCH_SIZE)
-
-
-model = AutoModelForSequenceClassification.from_pretrained('without_date/checkpoint-395000', num_labels=1)
-optimizer = AdamW(model.parameters(), lr=1e-6)
-
-
-num_epochs = 1
-num_training_steps = num_epochs * len(train_dataloader)
-lr_scheduler = get_scheduler(
-    "linear",
-    optimizer=optimizer,
-    num_warmup_steps=0,
-    num_training_steps=num_training_steps
-)
-
-
-device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-model.to(device)
-
-
-progress_bar = tqdm(range(num_training_steps))
-model.train()
-
-model.train()
-model.to(device)
-
-def transform_batch(batch):
-    batch['input_ids'] = torch.stack(batch['input_ids']).permute(1,0).to(device)
-    batch['attention_mask'] = torch.stack(batch['attention_mask']).permute(1,0).to(device)
-    batch['labels'] = batch['year_scaled'].to(device).float()
-
-    batch['labels'].to(device)
-    batch['input_ids'].to(device)
-    batch['attention_mask'].to(device)
-
-    for c in set(batch.keys()) - {'input_ids', 'attention_mask', 'labels'}:
-        del batch[c]
-    return batch
-
-
-def eval():
-    model.eval()
-    eval_loss = 0.0
-    for i, batch in enumerate(eval_dataloader):
-        batch = transform_batch(batch)
-        outputs = model(**batch)
-        loss = outputs.loss
-        eval_loss += loss.item()
-    print(f'eval loss: {eval_loss / i }')
-    model.train()
-
-
-for epoch in range(num_epochs):
-    train_loss = 0.0
-    for i, batch in enumerate(train_dataloader):
-        batch = transform_batch(batch)
-        outputs = model(**batch)
-        loss = outputs.loss
-        loss.backward()
-
-        optimizer.step()
-        lr_scheduler.step()
-        optimizer.zero_grad()
-        progress_bar.update(1)
-
-        train_loss += loss.item()
-        #import pdb; pdb.set_trace()
-        if i % 5000 == 0 and i > 1 :
-            print(f'train loss: {train_loss / 5000 }', end = '\t\t')
-            train_loss = 0.0
-            eval()
-
-model.save_pretrained('roberta_year_prediction')
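The transform_batch helper above exists because the pickled Hugging Face dataset is fed to a plain DataLoader: the default collate turns each tokenizer column into a list of per-position tensors of shape (batch,), which the script stacks and permutes back into a (batch, seq_len) tensor. A small self-contained sketch of that collation quirk (toy tensors, illustrative sizes):

    import torch

    # What the default collate yields for a column stored as lists of token ids:
    # a list of seq_len tensors, each of shape (batch_size,).
    seq_len, batch_size = 6, 4
    column = [torch.randint(0, 100, (batch_size,)) for _ in range(seq_len)]

    input_ids = torch.stack(column).permute(1, 0)   # -> shape (batch_size, seq_len)
    assert input_ids.shape == (batch_size, seq_len)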
diff --git a/hf_challam_roberta_base/04_predict.py b/hf_challam_roberta_base/04_predict.py
deleted file mode 100644
index b9d30fd..0000000
--- a/hf_challam_roberta_base/04_predict.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import pickle
-import torch
-from transformers import AutoModelForSequenceClassification
-from torch.utils.data import DataLoader
-from tqdm.auto import tqdm
-
-with open('train_dataset.pickle','rb') as f_p:
-    train_dataset = pickle.load(f_p)
-
-with open('eval_dataset_small.pickle','rb') as f_p:
-    eval_dataset_small = pickle.load(f_p)
-
-with open('eval_dataset_full.pickle','rb') as f_p:
-    eval_dataset_full = pickle.load(f_p)
-
-with open('test_dataset_A.pickle','rb') as f_p:
-    test_dataset_A = pickle.load(f_p)
-
-
-device = 'cuda'
-model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction')
-model.eval()
-model.to(device)
-
-with open('scalers.pickle', 'rb') as f_scaler:
-    scalers = pickle.load(f_scaler)
-
-def predict(dataset, out_f):
-    eval_dataloader = DataLoader(dataset, batch_size=1)
-    outputs = []
-
-    progress_bar = tqdm(range(len(eval_dataloader)))
-
-    for batch in eval_dataloader:
-        batch['input_ids'] = torch.stack(batch['input_ids']).permute(1,0).to(device)
-        batch['attention_mask'] = torch.stack(batch['attention_mask']).permute(1,0).to(device)
-        batch['labels'] = batch['year_scaled'].to(device).float()
-
-        batch['labels'].to(device)
-        batch['input_ids'].to(device)
-        batch['attention_mask'].to(device)
-
-        for c in set(batch.keys()) - {'input_ids', 'attention_mask', 'labels'}:
-            del batch[c]
-        outputs.extend(model(**batch).logits.tolist())
-        progress_bar.update(1)
-    outputs_transformed = scalers['year'].inverse_transform(outputs)
-
-    with open(out_f,'w') as f_out:
-
-        for o in outputs_transformed:
-            f_out.write(str(o[0]) + '\n')
-
-predict(eval_dataset_full, '../dev-0/out.tsv')
-predict(eval_dataset_full, '../test-A/out.tsv')
diff --git a/hf_challam_roberta_base/04_predict_from_file.py b/hf_challam_roberta_base/04_predict_from_file.py
deleted file mode 100644
index 6012ef1..0000000
--- a/hf_challam_roberta_base/04_predict_from_file.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import pickle
-import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from torch.utils.data import DataLoader
-from tqdm.auto import tqdm
-
-#with open('train_dataset.pickle','rb') as f_p:
-#    train_dataset = pickle.load(f_p)
-#
-#with open('eval_dataset_small.pickle','rb') as f_p:
-#    eval_dataset_small = pickle.load(f_p)
-#
-#with open('eval_dataset_full.pickle','rb') as f_p:
-#    eval_dataset_full = pickle.load(f_p)
-#
-#with open('test_dataset_A.pickle','rb') as f_p:
-#    test_dataset_A = pickle.load(f_p)
-
-with open('dev-0_huggingface_format.csv','r') as f_p:
-    eval_dataset_full = f_p.readlines()
-
-with open('test-A_huggingface_format.csv','r') as f_p:
-    test_dataset = f_p.readlines()
-
-device = 'cuda'
-model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction')
-tokenizer = AutoTokenizer.from_pretrained('without_date/checkpoint-395000/')
-model.eval()
-model.to(device)
-
-with open('scalers.pickle', 'rb') as f_scaler:
-    scalers = pickle.load(f_scaler)
-
-def predict(dataset, out_f):
-    outputs = []
-
-    for sample in tqdm(dataset[1:]):
-        y, t = sample.split('\t')
-        t = t.rstrip()
-
-        t = tokenizer(t, padding="max_length", truncation=True, max_length=512, return_tensors='pt').to('cuda')
-
-        outputs.extend(model(**t).logits.tolist())
-    outputs_transformed = scalers['year'].inverse_transform(outputs)
-
-    with open(out_f,'w') as f_out:
-
-        for o in outputs_transformed:
-            f_out.write(str(o[0]) + '\n')
-
-predict(eval_dataset_full, '../dev-0/out.tsv')
-predict(test_dataset, '../test-A/out.tsv')
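The deleted file-based predictor above tokenizes one raw line at a time, takes the single regression logit, and maps it back to a year with the stored scaler. A hedged sketch of that per-line step; predict_year is a hypothetical helper, not part of the repository, and the model, tokenizer and scaler arguments stand in for the objects the script loads from disk:

    import torch

    def predict_year(text, model, tokenizer, year_scaler, device='cuda'):
        # Tokenize exactly as in training: fixed length, truncated to 512 tokens.
        enc = tokenizer(text, padding="max_length", truncation=True,
                        max_length=512, return_tensors='pt').to(device)
        with torch.no_grad():                        # inference only; the script skips this guard
            logit = model(**enc).logits              # shape (1, 1): one scaled-year value
        return year_scaler.inverse_transform(logit.cpu().numpy())[0][0]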
diff --git a/hf_roberta_base/02_load_dataset.py b/hf_roberta_base/02_load_dataset.py
index 2f79e79..82ad6cf 100644
--- a/hf_roberta_base/02_load_dataset.py
+++ b/hf_roberta_base/02_load_dataset.py
@@ -1,3 +1,4 @@
+from config import MODEL, TEST
 import pickle
 from datasets import load_dataset, Dataset
 from transformers import AutoTokenizer
@@ -5,17 +6,10 @@ from tqdm import tqdm
 from sklearn.preprocessing import MinMaxScaler
 import numpy as np
 
-#dataset = load_dataset('csv', sep='\t', data_files={'train': ['./train_huggingface_format.csv'], 'test': ['./dev-0_huggingface_format.csv']})
-#test_dataset_A = load_dataset('csv', sep='\t', data_files='test-A_huggingface_format.csv')
-#
-tokenizer = AutoTokenizer.from_pretrained('roberta-base')
-#
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
 def tokenize_function(examples):
     t = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
     return t
-#
-#test_tokenized_datasets_A = test_dataset_A.map(tokenize_function, batched=True)
-#tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
 def get_dataset_dict(dataset):
     with open(dataset) as f_in:
@@ -30,9 +24,16 @@ def get_dataset_dict(dataset):
     return d
 
 train_dataset = Dataset.from_dict(get_dataset_dict('train_huggingface_format.csv')).map(tokenize_function, batched=True).shuffle(seed=42)
-eval_dataset_full = Dataset.from_dict(get_dataset_dict('dev-0_huggingface_format.csv')).map(tokenize_function, batched=True).shuffle(seed=42)
-eval_dataset_small = eval_dataset_full.select(range(2000))
-test_dataset_A = Dataset.from_dict(get_dataset_dict('test-A_huggingface_format.csv')).map(tokenize_function, batched=True).shuffle(seed=42)
+eval_dataset_full = Dataset.from_dict(get_dataset_dict('dev-0_huggingface_format.csv')).map(tokenize_function, batched=True)
+eval_dataset_small = eval_dataset_full.shuffle(seed=42).select(range(2000))
+test_dataset_A = Dataset.from_dict(get_dataset_dict('test-A_huggingface_format.csv')).map(tokenize_function, batched=True)
+
+if TEST:
+    train_dataset = train_dataset.select(range(500))
+    eval_dataset_full = eval_dataset_full.select(range(400))
+    eval_dataset_small = eval_dataset_small.select(range(50))
+    test_dataset_A = test_dataset_A.select(range(200))
+
 
 
 scalers = dict()
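Only the signature, the opening with-statement and the return of get_dataset_dict appear in the hunks above; its body is unchanged by this patch and therefore not shown. Judging from the year\ttext files written by 01_create_datasets.py, it presumably does something close to the sketch below (an assumption about unshown code, not the actual function):

    def get_dataset_dict(dataset):
        # Read a 'year<TAB>text' file (as written by 01_create_datasets.py)
        # into the dict-of-columns layout expected by Dataset.from_dict.
        with open(dataset) as f_in:
            d = {'year': [], 'text': []}
            for line in f_in:
                if line.startswith('year\t'):
                    continue                          # skip the header row
                year, text = line.rstrip('\n').split('\t', 1)
                d['year'].append(float(year))
                d['text'].append(text)
        return d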
diff --git a/hf_roberta_base/03_train_pytorch_regression.py b/hf_roberta_base/03_train_pytorch_regression.py
index 1c39f50..1b5eb30 100644
--- a/hf_roberta_base/03_train_pytorch_regression.py
+++ b/hf_roberta_base/03_train_pytorch_regression.py
@@ -1,3 +1,4 @@
+from config import MODEL, TEST
 import pickle
 from datasets import load_dataset
 from transformers import AutoTokenizer, RobertaModel, RobertaTokenizer
@@ -9,6 +10,14 @@ import torch
 from tqdm.auto import tqdm
 
 BATCH_SIZE = 4
+EARLY_STOPPING = 3
+WARMUP_STEPS = 10_000
+
+STEPS_EVAL = 5_000
+if TEST:
+    STEPS_EVAL = 100
+    WARMUP_STEPS = 10
+
 
 with open('train_dataset.pickle','rb') as f_p:
     train_dataset = pickle.load(f_p)
@@ -21,21 +30,21 @@ with open('eval_dataset_full.pickle','rb') as f_p:
 
 
 train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
-eval_dataloader = DataLoader(eval_dataset_small, batch_size=BATCH_SIZE)
+eval_dataloader_small = DataLoader(eval_dataset_small, batch_size=BATCH_SIZE)
+eval_dataloader_full = DataLoader(eval_dataset_full, batch_size=BATCH_SIZE)
 
-
-model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=1)
+model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)
 optimizer = AdamW(model.parameters(), lr=1e-6)
 
 
-num_epochs = 1
+num_epochs = 5
 num_training_steps = num_epochs * len(train_dataloader)
-#lr_scheduler = get_scheduler(
-#    "linear",
-#    optimizer=optimizer,
-#    num_warmup_steps=0,
-#    num_training_steps=num_training_steps
-#)
+lr_scheduler = get_scheduler(
+    "linear",
+    optimizer=optimizer,
+    num_warmup_steps=WARMUP_STEPS,
+    num_training_steps=num_training_steps
+)
 
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 model.to(device)
@@ -62,18 +71,22 @@ def transform_batch(batch):
     return batch
 
 
-def eval():
+def eval(full = False):
     model.eval()
     eval_loss = 0.0
-    for i, batch in enumerate(eval_dataloader):
+    dataloader = eval_dataloader_full if full else eval_dataloader_small
+    for i, batch in enumerate(dataloader):
         batch = transform_batch(batch)
         outputs = model(**batch)
         loss = outputs.loss
         eval_loss += loss.item()
     print(f'epoch {epoch} eval loss: {eval_loss / i }')
     model.train()
+    return eval_loss
 
 
+best_eval_loss = 9999
+epochs_without_progress = 0
 for epoch in range(num_epochs):
     train_loss = 0.0
     for i, batch in enumerate(train_dataloader):
@@ -81,17 +94,33 @@ for epoch in range(num_epochs):
         outputs = model(**batch)
         loss = outputs.loss
         loss.backward()
-
-        optimizer.step()
-        #lr_scheduler.step()
-        optimizer.zero_grad()
+        train_loss += loss.item()
         progress_bar.update(1)
 
-        train_loss += loss.item()
-        #import pdb; pdb.set_trace()
-        if i % 5000 == 0 and i > 1 :
-            print(f' epoch {epoch} train loss: {train_loss / 5000 }', end = '\t\t')
+        # DELAYED UPDATE
+        #if i % 16 == 1 and i > 1:
+        #    optimizer.step()
+        #    #lr_scheduler.step()
+        #    optimizer.zero_grad()
+
+        # IMMEDIATE UPDATE (no gradient accumulation)
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+
+        if i % STEPS_EVAL == 0 and i > 1 :
+            print(f' epoch {epoch} train loss: {train_loss / STEPS_EVAL }', end = '\t\t')
             train_loss = 0.0
-            eval()
+            eval(full = False)
 
     model.save_pretrained(f'roberta_year_prediction/epoch_{epoch}')
+    eval_loss = eval(full=True)
+
+    if eval_loss < best_eval_loss:
+        model.save_pretrained(f'roberta_year_prediction/epoch_best')
+        best_eval_loss = eval_loss
+    else:
+        epochs_without_progress += 1
+
+    if epochs_without_progress > EARLY_STOPPING:
+        break
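The loop above keeps a commented-out "delayed update" branch next to the now-active per-step optimizer update. Enabled, that branch would amount to ordinary gradient accumulation; the sketch below shows the general pattern as a drop-in function (the function name, the accum_steps default and the loss scaling are mine, not taken from the repository, whose commented-out code does not scale the loss):

    def train_one_epoch_accumulated(model, train_dataloader, optimizer, lr_scheduler,
                                    transform_batch, accum_steps=16):
        # Back-propagate every batch, but apply the optimizer update only every
        # `accum_steps` batches, so the effective batch size is BATCH_SIZE * accum_steps.
        model.train()
        for i, batch in enumerate(train_dataloader):
            batch = transform_batch(batch)
            loss = model(**batch).loss / accum_steps   # scale so the summed gradient matches one large batch
            loss.backward()
            if (i + 1) % accum_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()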
diff --git a/hf_roberta_base/04_predict.py b/hf_roberta_base/04_predict.py
index c39ed6d..e4a777e 100644
--- a/hf_roberta_base/04_predict.py
+++ b/hf_roberta_base/04_predict.py
@@ -4,23 +4,14 @@ from transformers import AutoModelForSequenceClassification
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
-#with open('train_dataset.pickle','rb') as f_p:
-#    train_dataset = pickle.load(f_p)
-#
-#with open('eval_dataset_small.pickle','rb') as f_p:
-#    eval_dataset_small = pickle.load(f_p)
-#
-#with open('eval_dataset_full.pickle','rb') as f_p:
-#    eval_dataset_full = pickle.load(f_p)
-#
-#with open('test_dataset_A.pickle','rb') as f_p:
-#    test_dataset_A = pickle.load(f_p)
-
 with open('eval_dataset_full.pickle','rb') as f_p:
     eval_dataset_full = pickle.load(f_p)
 
+with open('test_dataset_A.pickle','rb') as f_p:
+    test_dataset = pickle.load(f_p)
+
 device = 'cuda'
-model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_0')
+model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_best')
 model.eval()
 model.to(device)
 
@@ -54,4 +45,4 @@ def predict(dataset, out_f):
             f_out.write(str(o[0]) + '\n')
 
 predict(eval_dataset_full, '../dev-0/out.tsv')
-predict(eval_dataset_full, '../test-A/out.tsv')
+predict(test_dataset, '../test-A/out.tsv')
diff --git a/hf_roberta_base/04_predict_from_file.py b/hf_roberta_base/04_predict_from_file.py
index c0adbde..186df77 100644
--- a/hf_roberta_base/04_predict_from_file.py
+++ b/hf_roberta_base/04_predict_from_file.py
@@ -23,7 +23,7 @@ with open('test-A_huggingface_format.csv','r') as f_p:
     test_dataset = f_p.readlines()
 
 device = 'cuda'
-model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_0')
+model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_best')
 tokenizer = AutoTokenizer.from_pretrained('roberta-base')
 model.eval()
 model.to(device)
diff --git a/hf_roberta_base/config.py b/hf_roberta_base/config.py
new file mode 100644
index 0000000..398de52
--- /dev/null
+++ b/hf_roberta_base/config.py
@@ -0,0 +1,2 @@
+MODEL = '../MODELS/without_date/checkpoint-395000'
+TEST=False
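config.py centralizes the two settings the hf_roberta_base scripts now import. Flipping TEST to True and re-running steps 02-04 gives a quick smoke test on the truncated datasets; the snippet below only illustrates how the flags are consumed, mirroring the imports already shown in the diffs:

    from config import MODEL, TEST

    print(f'fine-tuning from: {MODEL}')      # '../MODELS/without_date/checkpoint-395000'
    if TEST:
        print('smoke-test mode: datasets are truncated to a few hundred examples')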