diff --git a/roberta_temp/01_create_datasets.py b/roberta_temp/01_create_datasets.py
new file mode 100644
index 0000000..4512557
--- /dev/null
+++ b/roberta_temp/01_create_datasets.py
@@ -0,0 +1,19 @@
+from config import LABELS_DICT
+
+with open('../test-A/in.tsv','r') as f_in, open('../test-A/huggingface_format_year.tsv', 'w') as f_hf:
+    # Header must name both columns written per row below (year, then text);
+    # a bare 'text' header would break tab-separated parsing of two-column rows.
+    f_hf.write('year\ttext\n')
+    for line_in in f_in:
+        year, _, text = line_in.split('\t')
+        f_hf.write(year + '\t' + text)
+
+
+for dataset in 'train', 'dev-0':
+    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/expected.tsv') as f_exp, open(f'../{dataset}/huggingface_format_year.tsv','w') as f_hf:
+        f_hf.write('text\tyear\tlabel\n')
+        for line_in, line_exp in zip(f_in, f_exp):
+            label = LABELS_DICT[line_exp.rstrip('\n')]
+            year,_,text = line_in.rstrip('\n').split('\t')
+            f_hf.write(text +'\t' +year +'\t'+ str(label) + '\n')
+
diff --git a/roberta_temp/02_load_dataset.py b/roberta_temp/02_load_dataset.py
new file mode 100644
index 0000000..e1c288d
--- /dev/null
+++ b/roberta_temp/02_load_dataset.py
@@ -0,0 +1,35 @@
+import pickle
+from datasets import load_dataset
+from transformers import AutoTokenizer
+from config import MODEL
+
+# Tab-separated files produced by 01_create_datasets.py; 'sep' is forwarded to pandas.read_csv.
+dataset = load_dataset('csv', sep='\t', data_files={'train': ['../train/huggingface_format_year.tsv'], 'test': ['../dev-0/huggingface_format_year.tsv']})
+test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format_year.tsv')
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+
+def tokenize_function(examples):
+    return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
+
+train_dataset = tokenized_datasets["train"].shuffle(seed=42)
+eval_dataset_full = tokenized_datasets["test"]
+eval_dataset_small = tokenized_datasets["test"].select(range(2000))
+test_dataset = test_tokenized_datasets["train"]
+
+with open('train_dataset.pickle','wb') as f_p:
+    pickle.dump(train_dataset, f_p)
+
+with open('eval_dataset_small.pickle','wb') as f_p:
+    pickle.dump(eval_dataset_small, f_p)
+
+with open('eval_dataset_full.pickle','wb') as f_p:
+    pickle.dump(eval_dataset_full, f_p)
+
+with open('test_dataset.pickle','wb') as f_p:
+    pickle.dump(test_dataset, f_p)
+
+
diff --git a/roberta_temp/03_train.py b/roberta_temp/03_train.py
new file mode 100644
index 0000000..e72a902
--- /dev/null
+++ b/roberta_temp/03_train.py
@@ -0,0 +1,77 @@
+import pickle
+from config import LABELS_LIST, MODEL
+
+with open('train_dataset.pickle','rb') as f_p:
+    train_dataset = pickle.load(f_p)
+
+with open('eval_dataset_small.pickle','rb') as f_p:
+    eval_dataset_small = pickle.load(f_p)
+
+with open('eval_dataset_full.pickle','rb') as f_p:
+    eval_dataset_full = pickle.load(f_p)
+
+with open('test_dataset.pickle','rb') as f_p:
+    test_dataset = pickle.load(f_p)
+
+
+from transformers import AutoModelForSequenceClassification
+
+# One output per category defined in config.py (currently 7 classes).
+model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=len(LABELS_LIST))
+
+from transformers import TrainingArguments
+
+
+training_args = TrainingArguments("test_trainer",
+                                  per_device_train_batch_size=4,
+                                  per_device_eval_batch_size=4,
+                                  evaluation_strategy='steps',
+                                  #eval_steps=2_000,
+                                  #save_steps=2_000,
+                                  eval_steps=20_000,
+                                  save_steps=20_000,
+                                  num_train_epochs=1,
+                                  gradient_accumulation_steps=2,
+                                  learning_rate = 1e-6,
+                                  #warmup_steps=4_000,
+                                  warmup_steps=4,
+                                  load_best_model_at_end=True,
+                                  )
+
+import numpy as np
+from datasets import load_metric
+
+metric = load_metric("accuracy")
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels)
+
+
+from transformers import Trainer
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset_small,
+    compute_metrics=compute_metrics,
+    )
+
+#trainer.train(resume_from_checkpoint=True)
+trainer.train()
+trainer.save_model("./roberta-retrained")
+trainer.evaluate()
+
+
+eval_predictions = trainer.predict(eval_dataset_full).predictions.argmax(1)
+
+with open('../dev-0/out.tsv', 'w') as f_out:
+    for pred in eval_predictions:
+        f_out.write(LABELS_LIST[pred] + '\n')
+
+test_predictions = trainer.predict(test_dataset).predictions.argmax(1)
+with open('../test-A/out.tsv', 'w') as f_out:
+    for pred in test_predictions:
+        f_out.write(LABELS_LIST[pred] + '\n')
diff --git a/roberta_temp/config.py b/roberta_temp/config.py
new file mode 100644
index 0000000..4b62f72
--- /dev/null
+++ b/roberta_temp/config.py
@@ -0,0 +1,18 @@
+LABELS_DICT = {'news':0,
+               'sport':1,
+               'business':2,
+               'opinion':3,
+               'culture':4,
+               'lifestyle':5,
+               'removed':6}
+
+
+LABELS_LIST = ['news',
+               'sport',
+               'business',
+               'opinion',
+               'culture',
+               'lifestyle',
+               'removed']
+
+MODEL = 'roberta-base'
diff --git a/roberta_temp/howto b/roberta_temp/howto
new file mode 100644
index 0000000..54b3df3
--- /dev/null
+++ b/roberta_temp/howto
@@ -0,0 +1,3 @@
+conda activate temproberta
+
+pip install datasets==1.8.0
diff --git a/roberta_temp/logs b/roberta_temp/logs
new file mode 100644
index 0000000..03af046
--- /dev/null
+++ b/roberta_temp/logs
@@ -0,0 +1,304 @@
+{'loss': 1.5985, 'learning_rate': 9.96656735733833e-07, 'epoch': 0.0}
+{'loss': 1.4434, 'learning_rate': 9.932865096590678e-07, 'epoch': 0.01}
+{'loss': 1.4104, 'learning_rate': 9.899162835843027e-07, 'epoch': 0.01}
+{'loss': 1.3694, 'learning_rate': 9.865460575095376e-07, 'epoch': 0.01}
+{'loss': 1.3011, 'learning_rate': 9.831758314347725e-07, 'epoch': 0.02}
+{'loss': 1.2598, 'learning_rate': 9.798056053600075e-07, 'epoch': 0.02}
+{'loss': 1.2188, 'learning_rate': 9.764353792852424e-07, 'epoch': 0.02}
+{'loss': 1.2129, 'learning_rate': 9.730651532104773e-07, 'epoch': 0.03}
+{'loss': 1.1953, 'learning_rate': 9.696949271357122e-07, 'epoch': 0.03}
+{'loss': 1.1626, 'learning_rate': 
9.66324701060947e-07, 'epoch': 0.03} +{'loss': 1.1446, 'learning_rate': 9.62954474986182e-07, 'epoch': 0.04} +{'loss': 1.123, 'learning_rate': 9.59584248911417e-07, 'epoch': 0.04} +{'loss': 1.0813, 'learning_rate': 9.562140228366518e-07, 'epoch': 0.04} +{'loss': 1.107, 'learning_rate': 9.528437967618867e-07, 'epoch': 0.05} +{'loss': 1.0496, 'learning_rate': 9.494735706871216e-07, 'epoch': 0.05} +{'loss': 1.0236, 'learning_rate': 9.461033446123565e-07, 'epoch': 0.05} +{'loss': 1.0117, 'learning_rate': 9.427331185375914e-07, 'epoch': 0.06} +{'loss': 1.0022, 'learning_rate': 9.393628924628264e-07, 'epoch': 0.06} +{'loss': 0.9896, 'learning_rate': 9.359926663880613e-07, 'epoch': 0.06} +{'loss': 0.971, 'learning_rate': 9.326224403132962e-07, 'epoch': 0.07} +{'loss': 0.9597, 'learning_rate': 9.292522142385311e-07, 'epoch': 0.07} +{'loss': 0.938, 'learning_rate': 9.25881988163766e-07, 'epoch': 0.07} +{'loss': 0.927, 'learning_rate': 9.225117620890009e-07, 'epoch': 0.08} +{'loss': 0.9277, 'learning_rate': 9.191415360142358e-07, 'epoch': 0.08} +{'loss': 0.9196, 'learning_rate': 9.157713099394707e-07, 'epoch': 0.08} +{'loss': 0.9034, 'learning_rate': 9.124010838647056e-07, 'epoch': 0.09} +{'loss': 0.916, 'learning_rate': 9.090308577899405e-07, 'epoch': 0.09} +{'loss': 0.899, 'learning_rate': 9.056606317151754e-07, 'epoch': 0.09} +{'loss': 0.9338, 'learning_rate': 9.022904056404103e-07, 'epoch': 0.1} +{'loss': 0.9074, 'learning_rate': 8.989201795656452e-07, 'epoch': 0.1} +{'loss': 0.845, 'learning_rate': 8.955499534908801e-07, 'epoch': 0.1} +{'loss': 0.8706, 'learning_rate': 8.92179727416115e-07, 'epoch': 0.11} +{'loss': 0.8712, 'learning_rate': 8.8880950134135e-07, 'epoch': 0.11} +{'loss': 0.8563, 'learning_rate': 8.854392752665848e-07, 'epoch': 0.11} +{'loss': 0.8663, 'learning_rate': 8.820690491918198e-07, 'epoch': 0.12} +{'loss': 0.8732, 'learning_rate': 8.786988231170547e-07, 'epoch': 0.12} +{'loss': 0.86, 'learning_rate': 8.753285970422896e-07, 'epoch': 0.12} +{'loss': 
0.8505, 'learning_rate': 8.719583709675245e-07, 'epoch': 0.13} +{'loss': 0.8694, 'learning_rate': 8.685881448927593e-07, 'epoch': 0.13} +{'loss': 0.8466, 'learning_rate': 8.652179188179942e-07, 'epoch': 0.13} +{'eval_loss': 0.8035842180252075, 'eval_accuracy': 0.7115, 'eval_runtime': 48.7576, 'eval_samples_per_second': 41.019, 'eval_steps_per_second': 10.255, 'epoch': 0.13} +{'loss': 0.8135, 'learning_rate': 8.618476927432292e-07, 'epoch': 0.14} +{'loss': 0.8507, 'learning_rate': 8.584774666684641e-07, 'epoch': 0.14} +{'loss': 0.8194, 'learning_rate': 8.55107240593699e-07, 'epoch': 0.14} +{'loss': 0.8195, 'learning_rate': 8.517370145189338e-07, 'epoch': 0.15} +{'loss': 0.8242, 'learning_rate': 8.483667884441687e-07, 'epoch': 0.15} +{'loss': 0.8514, 'learning_rate': 8.449965623694038e-07, 'epoch': 0.16} +{'loss': 0.8117, 'learning_rate': 8.416263362946387e-07, 'epoch': 0.16} +{'loss': 0.8211, 'learning_rate': 8.382561102198736e-07, 'epoch': 0.16} +{'loss': 0.8021, 'learning_rate': 8.348858841451084e-07, 'epoch': 0.17} +{'loss': 0.8135, 'learning_rate': 8.315156580703433e-07, 'epoch': 0.17} +{'loss': 0.7927, 'learning_rate': 8.281454319955782e-07, 'epoch': 0.17} +{'loss': 0.7682, 'learning_rate': 8.247752059208132e-07, 'epoch': 0.18} +{'loss': 0.8137, 'learning_rate': 8.214049798460481e-07, 'epoch': 0.18} +{'loss': 0.8133, 'learning_rate': 8.180347537712829e-07, 'epoch': 0.18} +{'loss': 0.81, 'learning_rate': 8.146645276965178e-07, 'epoch': 0.19} +{'loss': 0.7963, 'learning_rate': 8.112943016217527e-07, 'epoch': 0.19} +{'loss': 0.7781, 'learning_rate': 8.079240755469876e-07, 'epoch': 0.19} +{'loss': 0.8009, 'learning_rate': 8.045538494722226e-07, 'epoch': 0.2} +{'loss': 0.7945, 'learning_rate': 8.011836233974574e-07, 'epoch': 0.2} +{'loss': 0.7793, 'learning_rate': 7.978133973226924e-07, 'epoch': 0.2} +{'loss': 0.8012, 'learning_rate': 7.944431712479273e-07, 'epoch': 0.21} +{'loss': 0.7858, 'learning_rate': 7.910729451731622e-07, 'epoch': 0.21} +{'loss': 0.7899, 
'learning_rate': 7.877027190983972e-07, 'epoch': 0.21} +{'loss': 0.7696, 'learning_rate': 7.84332493023632e-07, 'epoch': 0.22} +{'loss': 0.7749, 'learning_rate': 7.809622669488669e-07, 'epoch': 0.22} +{'loss': 0.781, 'learning_rate': 7.775920408741018e-07, 'epoch': 0.22} +{'loss': 0.7769, 'learning_rate': 7.742218147993367e-07, 'epoch': 0.23} +{'loss': 0.8053, 'learning_rate': 7.708515887245716e-07, 'epoch': 0.23} +{'loss': 0.7649, 'learning_rate': 7.674813626498065e-07, 'epoch': 0.23} +{'loss': 0.7428, 'learning_rate': 7.641111365750414e-07, 'epoch': 0.24} +{'loss': 0.746, 'learning_rate': 7.607409105002763e-07, 'epoch': 0.24} +{'loss': 0.7635, 'learning_rate': 7.573706844255112e-07, 'epoch': 0.24} +{'loss': 0.7728, 'learning_rate': 7.540004583507461e-07, 'epoch': 0.25} +{'loss': 0.7821, 'learning_rate': 7.50630232275981e-07, 'epoch': 0.25} +{'loss': 0.7348, 'learning_rate': 7.47260006201216e-07, 'epoch': 0.25} +{'loss': 0.7643, 'learning_rate': 7.438897801264509e-07, 'epoch': 0.26} +{'loss': 0.7521, 'learning_rate': 7.405195540516858e-07, 'epoch': 0.26} +{'loss': 0.7755, 'learning_rate': 7.371493279769207e-07, 'epoch': 0.26} +{'loss': 0.7596, 'learning_rate': 7.337791019021555e-07, 'epoch': 0.27} +{'loss': 0.7626, 'learning_rate': 7.304088758273904e-07, 'epoch': 0.27} +{'eval_loss': 0.7012397646903992, 'eval_accuracy': 0.7305, 'eval_runtime': 48.2546, 'eval_samples_per_second': 41.447, 'eval_steps_per_second': 10.362, 'epoch': 0.27} +{'loss': 0.7675, 'learning_rate': 7.270386497526254e-07, 'epoch': 0.27} +{'loss': 0.7742, 'learning_rate': 7.236684236778603e-07, 'epoch': 0.28} +{'loss': 0.7682, 'learning_rate': 7.202981976030952e-07, 'epoch': 0.28} +{'loss': 0.7625, 'learning_rate': 7.1692797152833e-07, 'epoch': 0.28} +{'loss': 0.7523, 'learning_rate': 7.135577454535649e-07, 'epoch': 0.29} +{'loss': 0.7553, 'learning_rate': 7.101875193787999e-07, 'epoch': 0.29} +{'loss': 0.728, 'learning_rate': 7.068172933040349e-07, 'epoch': 0.29} +{'loss': 0.7547, 
'learning_rate': 7.034470672292698e-07, 'epoch': 0.3} +{'loss': 0.7218, 'learning_rate': 7.000768411545046e-07, 'epoch': 0.3} +{'loss': 0.7513, 'learning_rate': 6.967066150797395e-07, 'epoch': 0.3} +{'loss': 0.7499, 'learning_rate': 6.933363890049744e-07, 'epoch': 0.31} +{'loss': 0.752, 'learning_rate': 6.899661629302094e-07, 'epoch': 0.31} +{'loss': 0.7424, 'learning_rate': 6.865959368554443e-07, 'epoch': 0.31} +{'loss': 0.7438, 'learning_rate': 6.832257107806791e-07, 'epoch': 0.32} +{'loss': 0.7331, 'learning_rate': 6.79855484705914e-07, 'epoch': 0.32} +{'loss': 0.7384, 'learning_rate': 6.764852586311489e-07, 'epoch': 0.32} +{'loss': 0.7305, 'learning_rate': 6.731150325563838e-07, 'epoch': 0.33} +{'loss': 0.7172, 'learning_rate': 6.697448064816188e-07, 'epoch': 0.33} +{'loss': 0.7233, 'learning_rate': 6.663745804068536e-07, 'epoch': 0.33} +{'loss': 0.7342, 'learning_rate': 6.630043543320885e-07, 'epoch': 0.34} +{'loss': 0.7204, 'learning_rate': 6.596341282573235e-07, 'epoch': 0.34} +{'loss': 0.7342, 'learning_rate': 6.562639021825584e-07, 'epoch': 0.34} +{'loss': 0.7429, 'learning_rate': 6.528936761077934e-07, 'epoch': 0.35} +{'loss': 0.7311, 'learning_rate': 6.495234500330282e-07, 'epoch': 0.35} +{'loss': 0.7137, 'learning_rate': 6.461532239582631e-07, 'epoch': 0.35} +{'loss': 0.7306, 'learning_rate': 6.42782997883498e-07, 'epoch': 0.36} +{'loss': 0.7233, 'learning_rate': 6.394127718087329e-07, 'epoch': 0.36} +{'loss': 0.7236, 'learning_rate': 6.360425457339678e-07, 'epoch': 0.36} +{'loss': 0.7233, 'learning_rate': 6.326723196592027e-07, 'epoch': 0.37} +{'loss': 0.7432, 'learning_rate': 6.293020935844376e-07, 'epoch': 0.37} +{'loss': 0.7007, 'learning_rate': 6.259318675096725e-07, 'epoch': 0.37} +{'loss': 0.7007, 'learning_rate': 6.225616414349074e-07, 'epoch': 0.38} +{'loss': 0.7075, 'learning_rate': 6.191914153601423e-07, 'epoch': 0.38} +{'loss': 0.7009, 'learning_rate': 6.158211892853771e-07, 'epoch': 0.38} +{'loss': 0.7243, 'learning_rate': 
6.124509632106122e-07, 'epoch': 0.39} +{'loss': 0.7153, 'learning_rate': 6.090807371358471e-07, 'epoch': 0.39} +{'loss': 0.6989, 'learning_rate': 6.05710511061082e-07, 'epoch': 0.39} +{'loss': 0.7306, 'learning_rate': 6.023402849863169e-07, 'epoch': 0.4} +{'loss': 0.6969, 'learning_rate': 5.989700589115517e-07, 'epoch': 0.4} +{'loss': 0.7261, 'learning_rate': 5.955998328367867e-07, 'epoch': 0.4} +{'eval_loss': 0.6748808026313782, 'eval_accuracy': 0.754, 'eval_runtime': 47.9976, 'eval_samples_per_second': 41.669, 'eval_steps_per_second': 10.417, 'epoch': 0.4} +{'loss': 0.7291, 'learning_rate': 5.922296067620216e-07, 'epoch': 0.41} +{'loss': 0.7322, 'learning_rate': 5.888593806872565e-07, 'epoch': 0.41} +{'loss': 0.7045, 'learning_rate': 5.854891546124914e-07, 'epoch': 0.41} +{'loss': 0.6872, 'learning_rate': 5.821189285377262e-07, 'epoch': 0.42} +{'loss': 0.7286, 'learning_rate': 5.787487024629611e-07, 'epoch': 0.42} +{'loss': 0.7197, 'learning_rate': 5.753784763881961e-07, 'epoch': 0.42} +{'loss': 0.704, 'learning_rate': 5.72008250313431e-07, 'epoch': 0.43} +{'loss': 0.7027, 'learning_rate': 5.68638024238666e-07, 'epoch': 0.43} +{'loss': 0.7077, 'learning_rate': 5.652677981639008e-07, 'epoch': 0.43} +{'loss': 0.7234, 'learning_rate': 5.618975720891357e-07, 'epoch': 0.44} +{'loss': 0.699, 'learning_rate': 5.585273460143706e-07, 'epoch': 0.44} +{'loss': 0.6987, 'learning_rate': 5.551571199396056e-07, 'epoch': 0.44} +{'loss': 0.6861, 'learning_rate': 5.517868938648405e-07, 'epoch': 0.45} +{'loss': 0.6883, 'learning_rate': 5.484166677900753e-07, 'epoch': 0.45} +{'loss': 0.6772, 'learning_rate': 5.450464417153102e-07, 'epoch': 0.45} +{'loss': 0.6853, 'learning_rate': 5.416762156405451e-07, 'epoch': 0.46} +{'loss': 0.6806, 'learning_rate': 5.3830598956578e-07, 'epoch': 0.46} +{'loss': 0.6824, 'learning_rate': 5.34935763491015e-07, 'epoch': 0.47} +{'loss': 0.7165, 'learning_rate': 5.315655374162498e-07, 'epoch': 0.47} +{'loss': 0.6938, 'learning_rate': 
5.281953113414847e-07, 'epoch': 0.47} +{'loss': 0.7011, 'learning_rate': 5.248250852667196e-07, 'epoch': 0.48} +{'loss': 0.6793, 'learning_rate': 5.214548591919546e-07, 'epoch': 0.48} +{'loss': 0.7228, 'learning_rate': 5.180846331171896e-07, 'epoch': 0.48} +{'loss': 0.6816, 'learning_rate': 5.147144070424244e-07, 'epoch': 0.49} +{'loss': 0.7034, 'learning_rate': 5.113441809676593e-07, 'epoch': 0.49} +{'loss': 0.6742, 'learning_rate': 5.079739548928942e-07, 'epoch': 0.49} +{'loss': 0.6889, 'learning_rate': 5.046037288181291e-07, 'epoch': 0.5} +{'loss': 0.7136, 'learning_rate': 5.01233502743364e-07, 'epoch': 0.5} +{'loss': 0.7001, 'learning_rate': 4.978632766685989e-07, 'epoch': 0.5} +{'loss': 0.7121, 'learning_rate': 4.944930505938338e-07, 'epoch': 0.51} +{'loss': 0.6866, 'learning_rate': 4.911228245190687e-07, 'epoch': 0.51} +{'loss': 0.693, 'learning_rate': 4.877525984443036e-07, 'epoch': 0.51} +{'loss': 0.6929, 'learning_rate': 4.843823723695385e-07, 'epoch': 0.52} +{'loss': 0.6979, 'learning_rate': 4.810121462947734e-07, 'epoch': 0.52} +{'loss': 0.6989, 'learning_rate': 4.776419202200083e-07, 'epoch': 0.52} +{'loss': 0.6848, 'learning_rate': 4.7427169414524326e-07, 'epoch': 0.53} +{'loss': 0.693, 'learning_rate': 4.7090146807047816e-07, 'epoch': 0.53} +{'loss': 0.6549, 'learning_rate': 4.67531241995713e-07, 'epoch': 0.53} +{'loss': 0.7135, 'learning_rate': 4.64161015920948e-07, 'epoch': 0.54} +{'loss': 0.6701, 'learning_rate': 4.6079078984618283e-07, 'epoch': 0.54} +{'eval_loss': 0.6424754858016968, 'eval_accuracy': 0.766, 'eval_runtime': 48.0613, 'eval_samples_per_second': 41.614, 'eval_steps_per_second': 10.403, 'epoch': 0.54} +{'loss': 0.6826, 'learning_rate': 4.5742056377141774e-07, 'epoch': 0.54} +{'loss': 0.6939, 'learning_rate': 4.540503376966527e-07, 'epoch': 0.55} +{'loss': 0.6811, 'learning_rate': 4.5068011162188756e-07, 'epoch': 0.55} +{'loss': 0.695, 'learning_rate': 4.4730988554712246e-07, 'epoch': 0.55} +{'loss': 0.6861, 'learning_rate': 
4.439396594723574e-07, 'epoch': 0.56} +{'loss': 0.7125, 'learning_rate': 4.405694333975923e-07, 'epoch': 0.56} +{'loss': 0.6876, 'learning_rate': 4.371992073228272e-07, 'epoch': 0.56} +{'loss': 0.6839, 'learning_rate': 4.338289812480621e-07, 'epoch': 0.57} +{'loss': 0.6864, 'learning_rate': 4.30458755173297e-07, 'epoch': 0.57} +{'loss': 0.7134, 'learning_rate': 4.2708852909853196e-07, 'epoch': 0.57} +{'loss': 0.6977, 'learning_rate': 4.237183030237668e-07, 'epoch': 0.58} +{'loss': 0.7034, 'learning_rate': 4.203480769490017e-07, 'epoch': 0.58} +{'loss': 0.6755, 'learning_rate': 4.1697785087423663e-07, 'epoch': 0.58} +{'loss': 0.6755, 'learning_rate': 4.1360762479947154e-07, 'epoch': 0.59} +{'loss': 0.7243, 'learning_rate': 4.1023739872470644e-07, 'epoch': 0.59} +{'loss': 0.6836, 'learning_rate': 4.0686717264994135e-07, 'epoch': 0.59} +{'loss': 0.6614, 'learning_rate': 4.0349694657517626e-07, 'epoch': 0.6} +{'loss': 0.6693, 'learning_rate': 4.001267205004111e-07, 'epoch': 0.6} +{'loss': 0.6732, 'learning_rate': 3.967564944256461e-07, 'epoch': 0.6} +{'loss': 0.6631, 'learning_rate': 3.93386268350881e-07, 'epoch': 0.61} +{'loss': 0.6508, 'learning_rate': 3.9001604227611584e-07, 'epoch': 0.61} +{'loss': 0.6704, 'learning_rate': 3.866458162013508e-07, 'epoch': 0.61} +{'loss': 0.6752, 'learning_rate': 3.8327559012658565e-07, 'epoch': 0.62} +{'loss': 0.6699, 'learning_rate': 3.7990536405182056e-07, 'epoch': 0.62} +{'loss': 0.6702, 'learning_rate': 3.765351379770555e-07, 'epoch': 0.62} +{'loss': 0.6564, 'learning_rate': 3.731649119022904e-07, 'epoch': 0.63} +{'loss': 0.6714, 'learning_rate': 3.6979468582752533e-07, 'epoch': 0.63} +{'loss': 0.6715, 'learning_rate': 3.664244597527602e-07, 'epoch': 0.63} +{'loss': 0.6754, 'learning_rate': 3.630542336779951e-07, 'epoch': 0.64} +{'loss': 0.6664, 'learning_rate': 3.5968400760323006e-07, 'epoch': 0.64} +{'loss': 0.6747, 'learning_rate': 3.563137815284649e-07, 'epoch': 0.64} +{'loss': 0.6786, 'learning_rate': 3.529435554536998e-07, 
'epoch': 0.65} +{'loss': 0.7, 'learning_rate': 3.495733293789347e-07, 'epoch': 0.65} +{'loss': 0.6933, 'learning_rate': 3.4620310330416963e-07, 'epoch': 0.65} +{'loss': 0.6725, 'learning_rate': 3.4283287722940454e-07, 'epoch': 0.66} +{'loss': 0.6428, 'learning_rate': 3.3946265115463945e-07, 'epoch': 0.66} +{'loss': 0.6433, 'learning_rate': 3.3609242507987436e-07, 'epoch': 0.66} +{'loss': 0.6597, 'learning_rate': 3.327221990051092e-07, 'epoch': 0.67} +{'loss': 0.6531, 'learning_rate': 3.2935197293034417e-07, 'epoch': 0.67} +{'loss': 0.6835, 'learning_rate': 3.259817468555791e-07, 'epoch': 0.67} +{'eval_loss': 0.6216332912445068, 'eval_accuracy': 0.768, 'eval_runtime': 48.0111, 'eval_samples_per_second': 41.657, 'eval_steps_per_second': 10.414, 'epoch': 0.67} +{'loss': 0.643, 'learning_rate': 3.2261152078081393e-07, 'epoch': 0.68} +{'loss': 0.6761, 'learning_rate': 3.192412947060489e-07, 'epoch': 0.68} +{'loss': 0.6655, 'learning_rate': 3.1587106863128375e-07, 'epoch': 0.68} +{'loss': 0.6587, 'learning_rate': 3.1250084255651866e-07, 'epoch': 0.69} +{'loss': 0.6904, 'learning_rate': 3.091306164817536e-07, 'epoch': 0.69} +{'loss': 0.6702, 'learning_rate': 3.0576039040698847e-07, 'epoch': 0.69} +{'loss': 0.6354, 'learning_rate': 3.0239016433222343e-07, 'epoch': 0.7} +{'loss': 0.6671, 'learning_rate': 2.990199382574583e-07, 'epoch': 0.7} +{'loss': 0.6635, 'learning_rate': 2.956497121826932e-07, 'epoch': 0.7} +{'loss': 0.6455, 'learning_rate': 2.9227948610792815e-07, 'epoch': 0.71} +{'loss': 0.638, 'learning_rate': 2.88909260033163e-07, 'epoch': 0.71} +{'loss': 0.653, 'learning_rate': 2.855390339583979e-07, 'epoch': 0.71} +{'loss': 0.6725, 'learning_rate': 2.821688078836328e-07, 'epoch': 0.72} +{'loss': 0.6551, 'learning_rate': 2.7879858180886773e-07, 'epoch': 0.72} +{'loss': 0.6526, 'learning_rate': 2.7542835573410264e-07, 'epoch': 0.72} +{'loss': 0.6806, 'learning_rate': 2.7205812965933754e-07, 'epoch': 0.73} +{'loss': 0.6892, 'learning_rate': 2.6868790358457245e-07, 
'epoch': 0.73} +{'loss': 0.6721, 'learning_rate': 2.653176775098073e-07, 'epoch': 0.73} +{'loss': 0.6647, 'learning_rate': 2.6194745143504227e-07, 'epoch': 0.74} +{'loss': 0.6435, 'learning_rate': 2.585772253602772e-07, 'epoch': 0.74} +{'loss': 0.6609, 'learning_rate': 2.5520699928551203e-07, 'epoch': 0.74} +{'loss': 0.6616, 'learning_rate': 2.51836773210747e-07, 'epoch': 0.75} +{'loss': 0.6725, 'learning_rate': 2.4846654713598184e-07, 'epoch': 0.75} +{'loss': 0.6717, 'learning_rate': 2.4509632106121675e-07, 'epoch': 0.75} +{'loss': 0.6418, 'learning_rate': 2.417260949864517e-07, 'epoch': 0.76} +{'loss': 0.6396, 'learning_rate': 2.3835586891168657e-07, 'epoch': 0.76} +{'loss': 0.6584, 'learning_rate': 2.349856428369215e-07, 'epoch': 0.77} +{'loss': 0.6557, 'learning_rate': 2.316154167621564e-07, 'epoch': 0.77} +{'loss': 0.6671, 'learning_rate': 2.282451906873913e-07, 'epoch': 0.77} +{'loss': 0.6591, 'learning_rate': 2.248749646126262e-07, 'epoch': 0.78} +{'loss': 0.6708, 'learning_rate': 2.215047385378611e-07, 'epoch': 0.78} +{'loss': 0.6771, 'learning_rate': 2.1813451246309604e-07, 'epoch': 0.78} +{'loss': 0.6727, 'learning_rate': 2.1476428638833092e-07, 'epoch': 0.79} +{'loss': 0.6466, 'learning_rate': 2.1139406031356583e-07, 'epoch': 0.79} +{'loss': 0.6665, 'learning_rate': 2.0802383423880073e-07, 'epoch': 0.79} +{'loss': 0.6445, 'learning_rate': 2.0465360816403561e-07, 'epoch': 0.8} +{'loss': 0.6471, 'learning_rate': 2.0128338208927055e-07, 'epoch': 0.8} +{'loss': 0.6812, 'learning_rate': 1.9791315601450546e-07, 'epoch': 0.8} +{'loss': 0.6925, 'learning_rate': 1.9454292993974034e-07, 'epoch': 0.81} +{'loss': 0.63, 'learning_rate': 1.9117270386497524e-07, 'epoch': 0.81} +{'eval_loss': 0.623985230922699, 'eval_accuracy': 0.779, 'eval_runtime': 47.928, 'eval_samples_per_second': 41.729, 'eval_steps_per_second': 10.432, 'epoch': 0.81} +{'loss': 0.6499, 'learning_rate': 1.8780247779021015e-07, 'epoch': 0.81} +{'loss': 0.6684, 'learning_rate': 1.8443225171544509e-07, 
'epoch': 0.82} +{'loss': 0.6976, 'learning_rate': 1.8106202564067997e-07, 'epoch': 0.82} +{'loss': 0.6495, 'learning_rate': 1.7769179956591487e-07, 'epoch': 0.82} +{'loss': 0.6589, 'learning_rate': 1.7432157349114978e-07, 'epoch': 0.83} +{'loss': 0.6599, 'learning_rate': 1.7095134741638466e-07, 'epoch': 0.83} +{'loss': 0.6584, 'learning_rate': 1.675811213416196e-07, 'epoch': 0.83} +{'loss': 0.6276, 'learning_rate': 1.642108952668545e-07, 'epoch': 0.84} +{'loss': 0.6491, 'learning_rate': 1.6084066919208939e-07, 'epoch': 0.84} +{'loss': 0.6616, 'learning_rate': 1.574704431173243e-07, 'epoch': 0.84} +{'loss': 0.6788, 'learning_rate': 1.541002170425592e-07, 'epoch': 0.85} +{'loss': 0.6773, 'learning_rate': 1.5072999096779413e-07, 'epoch': 0.85} +{'loss': 0.6602, 'learning_rate': 1.4735976489302901e-07, 'epoch': 0.85} +{'loss': 0.6625, 'learning_rate': 1.4398953881826392e-07, 'epoch': 0.86} +{'loss': 0.6739, 'learning_rate': 1.4061931274349883e-07, 'epoch': 0.86} +{'loss': 0.6581, 'learning_rate': 1.372490866687337e-07, 'epoch': 0.86} +{'loss': 0.648, 'learning_rate': 1.3387886059396864e-07, 'epoch': 0.87} +{'loss': 0.6596, 'learning_rate': 1.3050863451920355e-07, 'epoch': 0.87} +{'loss': 0.6626, 'learning_rate': 1.2713840844443846e-07, 'epoch': 0.87} +{'loss': 0.6221, 'learning_rate': 1.2376818236967334e-07, 'epoch': 0.88} +{'loss': 0.6426, 'learning_rate': 1.2039795629490825e-07, 'epoch': 0.88} +{'loss': 0.6636, 'learning_rate': 1.1702773022014316e-07, 'epoch': 0.88} +{'loss': 0.6432, 'learning_rate': 1.1365750414537808e-07, 'epoch': 0.89} +{'loss': 0.6399, 'learning_rate': 1.1028727807061297e-07, 'epoch': 0.89} +{'loss': 0.6711, 'learning_rate': 1.0691705199584786e-07, 'epoch': 0.89} +{'loss': 0.6518, 'learning_rate': 1.0354682592108279e-07, 'epoch': 0.9} +{'loss': 0.6644, 'learning_rate': 1.0017659984631768e-07, 'epoch': 0.9} +{'loss': 0.6708, 'learning_rate': 9.68063737715526e-08, 'epoch': 0.9} +{'loss': 0.6548, 'learning_rate': 9.34361476967875e-08, 'epoch': 0.91} 
+{'loss': 0.6562, 'learning_rate': 9.006592162202239e-08, 'epoch': 0.91} +{'loss': 0.6766, 'learning_rate': 8.669569554725731e-08, 'epoch': 0.91} +{'loss': 0.6482, 'learning_rate': 8.33254694724922e-08, 'epoch': 0.92} +{'loss': 0.6704, 'learning_rate': 7.995524339772712e-08, 'epoch': 0.92} +{'loss': 0.6484, 'learning_rate': 7.658501732296202e-08, 'epoch': 0.92} +{'loss': 0.6528, 'learning_rate': 7.321479124819691e-08, 'epoch': 0.93} +{'loss': 0.6646, 'learning_rate': 6.984456517343183e-08, 'epoch': 0.93} +{'loss': 0.6562, 'learning_rate': 6.647433909866673e-08, 'epoch': 0.93} +{'loss': 0.6415, 'learning_rate': 6.310411302390165e-08, 'epoch': 0.94} +{'loss': 0.6495, 'learning_rate': 5.973388694913654e-08, 'epoch': 0.94} +{'loss': 0.6527, 'learning_rate': 5.636366087437145e-08, 'epoch': 0.94} +{'eval_loss': 0.6111205816268921, 'eval_accuracy': 0.7775, 'eval_runtime': 48.0776, 'eval_samples_per_second': 41.599, 'eval_steps_per_second': 10.4, 'epoch': 0.94} +{'loss': 0.6629, 'learning_rate': 5.299343479960636e-08, 'epoch': 0.95} +{'loss': 0.6689, 'learning_rate': 4.9623208724841265e-08, 'epoch': 0.95} +{'loss': 0.6499, 'learning_rate': 4.625298265007616e-08, 'epoch': 0.95} +{'loss': 0.6584, 'learning_rate': 4.288275657531107e-08, 'epoch': 0.96} +{'loss': 0.6276, 'learning_rate': 3.9512530500545974e-08, 'epoch': 0.96} +{'loss': 0.6629, 'learning_rate': 3.614230442578088e-08, 'epoch': 0.96} +{'loss': 0.6573, 'learning_rate': 3.277207835101579e-08, 'epoch': 0.97} +{'loss': 0.6632, 'learning_rate': 2.940185227625069e-08, 'epoch': 0.97} +{'loss': 0.6425, 'learning_rate': 2.6031626201485594e-08, 'epoch': 0.97} +{'loss': 0.659, 'learning_rate': 2.26614001267205e-08, 'epoch': 0.98} +{'loss': 0.6864, 'learning_rate': 1.9291174051955406e-08, 'epoch': 0.98} +{'loss': 0.6621, 'learning_rate': 1.5920947977190307e-08, 'epoch': 0.98} +{'loss': 0.6644, 'learning_rate': 1.2550721902425214e-08, 'epoch': 0.99} +{'loss': 0.6386, 'learning_rate': 9.180495827660118e-09, 'epoch': 0.99} 
+{'loss': 0.634, 'learning_rate': 5.810269752895023e-09, 'epoch': 0.99} +{'loss': 0.6659, 'learning_rate': 2.4400436781299287e-09, 'epoch': 1.0} +{'train_runtime': 70469.6653, 'train_samples_per_second': 16.843, 'train_steps_per_second': 2.105, 'train_loss': 0.7453787245144715, 'epoch': 1.0}