From d6d7a4dbda3bad35d70c0bf6a17b4d8b3a9e0664 Mon Sep 17 00:00:00 2001
From: kubapok
Date: Fri, 24 Sep 2021 15:29:02 +0200
Subject: [PATCH] a

---
 roberta_temp/01_create_datasets.py           | 33 +++++--
 .../01a_create_guess_date_datasets.py        | 17 ++++
 roberta_temp/02_load_dataset.py              | 24 +++++-
 roberta_temp/02a_load_guess_date_datasets.py | 53 ++++++++++++
 roberta_temp/03_train.py                     | 14 ++-
 roberta_temp/03a_train_guess_day.py          | 70 +++++++++++++++
 roberta_temp/04_predict.py                   | 85 +++++++++++++++++++
 7 files changed, 280 insertions(+), 16 deletions(-)
 create mode 100644 roberta_temp/01a_create_guess_date_datasets.py
 create mode 100644 roberta_temp/02a_load_guess_date_datasets.py
 create mode 100644 roberta_temp/03a_train_guess_day.py
 create mode 100644 roberta_temp/04_predict.py

diff --git a/roberta_temp/01_create_datasets.py b/roberta_temp/01_create_datasets.py
index 4512557..2884c65 100644
--- a/roberta_temp/01_create_datasets.py
+++ b/roberta_temp/01_create_datasets.py
@@ -1,17 +1,32 @@
+import datetime
 from config import LABELS_DICT
 
 
-with open('../test-A/in.tsv','r') as f_in, open(f'../test-A/huggingface_format_year.tsv', 'w') as f_hf:
-    f_hf.write('text\n')
+with open('../test-A/in.csv','r') as f_in, open(f'../test-A/huggingface_format_year.csv', 'w') as f_hf:
+    f_hf.write('text\tyear_cont\tdate\tday_of_year\tday_of_month\tmonth\tyear\tweekday\tlabel\n')
     for line_in in f_in:
-        year, _, text = line_in.split('\t')
-        f_hf.write(year + '\t' + text)
+        year_cont, date, text = line_in.rstrip('\n').split('\t')
+        d = datetime.datetime.strptime(date,"%Y%m%d")
+        day_of_year = str(d.timetuple().tm_yday)
+        day_of_month = str(d.day)
+        month = str(d.month)
+        year = str(d.year)
+        weekday = str(d.weekday())
+        day_of_year = str(d.timetuple().tm_yday)
+        f_hf.write(text +'\t' +year_cont +'\t'+ date + '\t' + day_of_year + '\t' + day_of_month + '\t' + month + '\t' + year + '\t' + weekday + '\t' + str('0') + '\n')
 
 for dataset in 'train', 'dev-0':
-    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/expected.tsv') as f_exp, open(f'../{dataset}/huggingface_format_year.tsv','w') as f_hf:
-        f_hf.write('text\tyear\tlabel\n')
+    with open(f'../{dataset}/in.csv') as f_in, open(f'../{dataset}/expected.csv') as f_exp, open(f'../{dataset}/huggingface_format_year.csv','w') as f_hf:
+        f_hf.write('text\tyear_cont\tdate\tday_of_year\tday_of_month\tmonth\tyear\tweekday\tlabel\n')
         for line_in, line_exp in zip(f_in, f_exp):
-            label = LABELS_DICT[line_exp.rstrip('\n')]
-            year,_,text = line_in.rstrip('\n').split('\t')
-            f_hf.write(text +'\t' +year +'\t'+ str(label) + '\n')
+            label = str(LABELS_DICT[line_exp.rstrip('\n')])
+            year_cont,date,text = line_in.rstrip('\n').split('\t')
+            d = datetime.datetime.strptime(date,"%Y%m%d")
+            day_of_year = str(d.timetuple().tm_yday)
+            day_of_month = str(d.day)
+            month = str(d.month)
+            year = str(d.year)
+            weekday = str(d.weekday())
+            day_of_year = str(d.timetuple().tm_yday)
+            f_hf.write(text +'\t' +year_cont +'\t'+ date + '\t'+ day_of_year + '\t' + day_of_month + '\t' + month + '\t' + year + '\t' + weekday + '\t' + label + '\n')
 
diff --git a/roberta_temp/01a_create_guess_date_datasets.py b/roberta_temp/01a_create_guess_date_datasets.py
new file mode 100644
index 0000000..0d0f498
--- /dev/null
+++ b/roberta_temp/01a_create_guess_date_datasets.py
@@ -0,0 +1,17 @@
+import datetime
+for dataset in 'train', 'dev-0':
+    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/expected.tsv') as f_exp, open(f'../{dataset}/huggingface_guess_day.csv','w') as f_hf:
+        f_hf.write('text\tyear_cont\tdate\tday_of_year\tday_of_month\tmonth\tyear\tweekday\tlabel\n')
+        for line_in, line_exp in zip(f_in, f_exp):
+            year_cont,date,text = line_in.rstrip('\n').split('\t')
+            d = datetime.datetime.strptime(date,"%Y%m%d")
+            day_of_year = str(d.timetuple().tm_yday)
+            day_of_month = str(d.day)
+            month = str(d.month)
+            year = str(d.year)
+            weekday = str(d.weekday())
+            day_of_year = str(d.timetuple().tm_yday)
+            #label = f'year:{year} month:{month} day:{day_of_month} weekday:{weekday}'
+            label = weekday
+            f_hf.write(text +'\t' +year_cont +'\t'+ date + '\t'+ day_of_year + '\t' + day_of_month + '\t' + month + '\t' + year + '\t' + weekday + '\t' + label + '\n')
+
diff --git a/roberta_temp/02_load_dataset.py b/roberta_temp/02_load_dataset.py
index e1c288d..b9b609a 100644
--- a/roberta_temp/02_load_dataset.py
+++ b/roberta_temp/02_load_dataset.py
@@ -2,23 +2,39 @@ import pickle
 from datasets import load_dataset
 from transformers import AutoTokenizer
 from config import MODEL
+from tqdm import tqdm
 
-dataset = load_dataset('csv', sep='\t', data_files={'train': ['../train/huggingface_format_year.tsv'], 'test': ['../dev-0/huggingface_format_year.tsv']})
-test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format_year.tsv')
+dataset = load_dataset('csv', sep='\t', data_files={'train': ['../train/huggingface_format_year.csv'], 'test': ['../dev-0/huggingface_format_year.csv']})
+test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format_year.csv')
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 
 def tokenize_function(examples):
-    return tokenizer(examples["text"], padding="max_length", truncation=True)
+    t = tokenizer(examples["text"], padding="max_length", truncation=True)
+    examples['year'] = [x - 1995 for x in examples['year']]
+    for column in 'date', 'day_of_month', 'day_of_year', 'month', 'year', 'weekday', 'year_cont':
+        t[column] = [[a] * b.index(1) + [0] *(len(b) - b.index(1)) for a,b in zip(examples[column], t['input_ids'])]
+    return t
 
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
 test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+
+#for d in ('train', 'test'):
+#    for i in tqdm(range(len(tokenized_datasets[d]))):
+#        tokenized_datasets[d][i][column] = [tokenized_datasets[d][i][column] ] * 512 #len(tokenized_datasets[d][i]['input_ids'])
+#
+#d = 'train'
+#for column in tqdm(('date', 'day_of_month', 'day_of_year', 'month', 'year', 'year_cont')):
+#    for i in tqdm(range(len(test_tokenized_datasets[d]))):
+#        test_tokenized_datasets[d][i][column] = [test_tokenized_datasets[d][i][column] ] * 512 #len(test_tokenized_datasets[d][i]['input_ids'])
 
 train_dataset = tokenized_datasets["train"].shuffle(seed=42)
 eval_dataset_full = tokenized_datasets["test"]
 eval_dataset_small = tokenized_datasets["test"].select(range(2000))
 test_dataset = test_tokenized_datasets["train"]
 
+
 with open('train_dataset.pickle','wb') as f_p:
     pickle.dump(train_dataset, f_p)
 
diff --git a/roberta_temp/02a_load_guess_date_datasets.py b/roberta_temp/02a_load_guess_date_datasets.py
new file mode 100644
index 0000000..6f4065d
--- /dev/null
+++ b/roberta_temp/02a_load_guess_date_datasets.py
@@ -0,0 +1,53 @@
+import pickle
+from datasets import load_dataset
+from transformers import AutoTokenizer
+from config import MODEL
+from tqdm import tqdm
+
+dataset = load_dataset('csv', sep='\t', data_files={'train': ['../train/huggingface_guess_day.csv'], 'test': ['../dev-0/huggingface_guess_day.csv']})
+test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format_year.tsv')
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+
+def tokenize_function(examples):
+    t = tokenizer(examples["text"], padding="max_length", truncation=True)
+    examples['year'] = [x - 1995 for x in examples['year']]
+    for column in 'date', 'day_of_month', 'day_of_year', 'month', 'year', 'weekday', 'year_cont':
+        try:
+            t[column] = [[a] * b.index(1) + [0] *(len(b) - b.index(1)) for a,b in zip(examples[column], t['input_ids'])]
+        except:
+            pass
+    return t
+
+test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+
+#for d in ('train', 'test'):
+#    for i in tqdm(range(len(tokenized_datasets[d]))):
+#        tokenized_datasets[d][i][column] = [tokenized_datasets[d][i][column] ] * 512 #len(tokenized_datasets[d][i]['input_ids'])
+#
+#d = 'train'
+#for column in tqdm(('date', 'day_of_month', 'day_of_year', 'month', 'year', 'year_cont')):
+#    for i in tqdm(range(len(test_tokenized_datasets[d]))):
+#        test_tokenized_datasets[d][i][column] = [test_tokenized_datasets[d][i][column] ] * 512 #len(test_tokenized_datasets[d][i]['input_ids'])
+
+train_dataset = tokenized_datasets["train"].shuffle(seed=42)
+eval_dataset_full = tokenized_datasets["test"]
+eval_dataset_small = tokenized_datasets["test"].select(range(2000))
+test_dataset = test_tokenized_datasets["train"]
+
+
+with open('train_dataset.pickle','wb') as f_p:
+    pickle.dump(train_dataset, f_p)
+
+with open('eval_dataset_small.pickle','wb') as f_p:
+    pickle.dump(eval_dataset_small, f_p)
+
+with open('eval_dataset_full.pickle','wb') as f_p:
+    pickle.dump(eval_dataset_full, f_p)
+
+with open('test_dataset.pickle','wb') as f_p:
+    pickle.dump(test_dataset, f_p)
+
+
diff --git a/roberta_temp/03_train.py b/roberta_temp/03_train.py
index e72a902..1fb51ed 100644
--- a/roberta_temp/03_train.py
+++ b/roberta_temp/03_train.py
@@ -16,7 +16,15 @@ with open('test_dataset.pickle','rb') as f_p:
 
 from transformers import AutoModelForSequenceClassification
 
-model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
+#model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
+model_clean = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
+model = AutoModelForSequenceClassification.from_pretrained('test_trainer_guess_weekday/checkpoint-6000',num_labels=7)
+import torch
+with torch.no_grad():
+    model.classifier.dense.weight = model_clean.classifier.dense.weight
+    model.classifier.out_proj.weight = model_clean.classifier.out_proj.weight
+
+del model_clean
 
 
 from transformers import TrainingArguments
@@ -27,9 +35,9 @@ training_args = TrainingArguments("test_trainer",
         evaluation_strategy='steps',
         #eval_steps=2_000,
         #save_steps=2_000,
-        eval_steps=20_000,
+        eval_steps=2_000,
         save_steps=20_000,
-        num_train_epochs=1,
+        num_train_epochs=5,
         gradient_accumulation_steps=2,
         learning_rate = 1e-6,
         #warmup_steps=4_000,
diff --git a/roberta_temp/03a_train_guess_day.py b/roberta_temp/03a_train_guess_day.py
new file mode 100644
index 0000000..40935ac
--- /dev/null
+++ b/roberta_temp/03a_train_guess_day.py
@@ -0,0 +1,70 @@
+import pickle
+from config import LABELS_LIST, MODEL
+
+with open('train_dataset.pickle','rb') as f_p:
+    train_dataset = pickle.load(f_p)
+
+with open('eval_dataset_small.pickle','rb') as f_p:
+    eval_dataset_small = pickle.load(f_p)
+
+with open('eval_dataset_full.pickle','rb') as f_p:
+    eval_dataset_full = pickle.load(f_p)
+
+
+
+from transformers import AutoModelForSequenceClassification
+
+model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
+
+from transformers import TrainingArguments
+
+
+training_args = TrainingArguments("test_trainer",
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        evaluation_strategy='steps',
+        #eval_steps=2_000,
+        #save_steps=2_000,
+        eval_steps=2_000,
+        save_steps=20_000,
+        num_train_epochs=1,
+        gradient_accumulation_steps=2,
+        learning_rate = 1e-6,
+        #warmup_steps=4_000,
+        warmup_steps=4,
+        load_best_model_at_end=True,
+        )
+
+import numpy as np
+from datasets import load_metric
+
+metric = load_metric("accuracy")
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels)
+
+
+from transformers import Trainer
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset_small,
+    compute_metrics=compute_metrics,
+    )
+
+#trainer.train(resume_from_checkpoint=True)
+trainer.train()
+trainer.save_model("./roberta-retrained")
+trainer.evaluate()
+
+
+eval_predictions = trainer.predict(eval_dataset_full).predictions.argmax(1)
+
+with open('../dev-0/out.tsv', 'w') as f_out:
+    for pred in eval_predictions:
+        f_out.write(LABELS_LIST[pred] + '\n')
+
diff --git a/roberta_temp/04_predict.py b/roberta_temp/04_predict.py
new file mode 100644
index 0000000..a66e3ee
--- /dev/null
+++ b/roberta_temp/04_predict.py
@@ -0,0 +1,85 @@
+import pickle
+from config import LABELS_LIST, MODEL
+
+with open('train_dataset.pickle','rb') as f_p:
+    train_dataset = pickle.load(f_p)
+
+with open('eval_dataset_small.pickle','rb') as f_p:
+    eval_dataset_small = pickle.load(f_p)
+
+with open('eval_dataset_full.pickle','rb') as f_p:
+    eval_dataset_full = pickle.load(f_p)
+
+with open('test_dataset.pickle','rb') as f_p:
+    test_dataset = pickle.load(f_p)
+
+
+from transformers import AutoModelForSequenceClassification
+
+model = AutoModelForSequenceClassification.from_pretrained('roberta-retrained/')
+
+from transformers import TrainingArguments
+
+
+training_args = TrainingArguments("test_trainer",
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        evaluation_strategy='steps',
+        #eval_steps=2_000,
+        #save_steps=2_000,
+        eval_steps=2_000,
+        save_steps=20_000,
+        num_train_epochs=1,
+        gradient_accumulation_steps=2,
+        learning_rate = 1e-6,
+        #warmup_steps=4_000,
+        warmup_steps=4,
+        load_best_model_at_end=True,
+        )
+
+import numpy as np
+from datasets import load_metric
+
+metric = load_metric("accuracy")
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels)
+
+
+from transformers import Trainer
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset_small,
+    compute_metrics=compute_metrics,
+    )
+
+
+eval_predictions = trainer.predict(eval_dataset_full).predictions.argmax(1)
+
+with open('../dev-0/out.tsv', 'w') as f_out:
+    for pred in eval_predictions:
+        f_out.write(LABELS_LIST[pred] + '\n')
+
+test_predictions = trainer.predict(test_dataset).predictions.argmax(1)
+with open('../test-A/out.tsv', 'w') as f_out:
+    for pred in test_predictions:
+        f_out.write(LABELS_LIST[pred] + '\n')
+
+#model = AutoModelForSequenceClassification.from_pretrained('roberta-retrained/')
+
+#for dataset in ('dev-0', 'test-A'):
+#    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/out.tsv','w') as f_out:
+#        for line_in in tqdm(f_in, total=150_000):
+#            _,_, text = line_in.split('\t')
+#            text = text.rstrip('\n')
+#            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
+#            outputs = model(**inputs)
+#            probs = outputs[0].softmax(1)
+#            prediction = LABELS_LIST[probs.argmax(1)]
+#            f_out.write(prediction + '\n')
+#
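
All of the scripts above import MODEL, LABELS_DICT and LABELS_LIST from a local config.py that is not part of this patch. A minimal sketch of what it presumably provides is given below; only the three names and the seven-class setup follow from the code above, while the model name and label values are assumed placeholders:

    # config.py -- hypothetical sketch; the real file is not included in this patch
    # MODEL: any sequence-classification checkpoint name ("roberta-base" is an assumed placeholder)
    MODEL = "roberta-base"
    # LABELS_LIST: the 7 class names written to out.tsv (actual values are not shown in this patch)
    LABELS_LIST = ["label_0", "label_1", "label_2", "label_3", "label_4", "label_5", "label_6"]
    # LABELS_DICT: maps each class name to the integer id used in 01_create_datasets.py
    LABELS_DICT = {label: i for i, label in enumerate(LABELS_LIST)}

The scripts appear to be meant to run in numeric order: 01/01a build the tab-separated input files, 02/02a tokenize them and pickle the datasets, 03a trains the weekday-guessing model, 03_train.py loads a weekday checkpoint (test_trainer_guess_weekday/checkpoint-6000) while resetting the classifier head weights to those of a freshly initialized model, and 04_predict.py writes out.tsv for dev-0 and test-A.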