From d6d7a4dbda3bad35d70c0bf6a17b4d8b3a9e0664 Mon Sep 17 00:00:00 2001
From: kubapok
Date: Fri, 24 Sep 2021 15:29:02 +0200
Subject: [PATCH] a

---
 roberta_temp/01_create_datasets.py           | 33 +++++--
 .../01a_create_guess_date_datasets.py        | 17 ++++
 roberta_temp/02_load_dataset.py              | 24 +++++-
 roberta_temp/02a_load_guess_date_datasets.py | 53 ++++++++++++
 roberta_temp/03_train.py                     | 14 ++-
 roberta_temp/03a_train_guess_day.py          | 70 +++++++++++++++
 roberta_temp/04_predict.py                   | 85 +++++++++++++++++++
 7 files changed, 280 insertions(+), 16 deletions(-)
 create mode 100644 roberta_temp/01a_create_guess_date_datasets.py
 create mode 100644 roberta_temp/02a_load_guess_date_datasets.py
 create mode 100644 roberta_temp/03a_train_guess_day.py
 create mode 100644 roberta_temp/04_predict.py

diff --git a/roberta_temp/01_create_datasets.py b/roberta_temp/01_create_datasets.py
index 4512557..2884c65 100644
--- a/roberta_temp/01_create_datasets.py
+++ b/roberta_temp/01_create_datasets.py
@@ -1,17 +1,32 @@
+import datetime
 from config import LABELS_DICT
 
 
-with open('../test-A/in.tsv','r') as f_in, open(f'../test-A/huggingface_format_year.tsv', 'w') as f_hf:
-    f_hf.write('text\n')
+with open('../test-A/in.csv','r') as f_in, open(f'../test-A/huggingface_format_year.csv', 'w') as f_hf:
+    f_hf.write('text\tyear_cont\tdate\tday_of_year\tday_of_month\tmonth\tyear\tweekday\tlabel\n')
     for line_in in f_in:
-        year, _, text = line_in.split('\t')
-        f_hf.write(year + '\t' + text)
+        year_cont, date, text = line_in.rstrip('\n').split('\t')
+        d = datetime.datetime.strptime(date,"%Y%m%d")
+        day_of_year = str(d.timetuple().tm_yday)
+        day_of_month = str(d.day)
+        month = str(d.month)
+        year = str(d.year)
+        weekday = str(d.weekday())
+        day_of_year = str(d.timetuple().tm_yday)
+        f_hf.write(text +'\t' +year_cont +'\t'+ date + '\t' + day_of_year + '\t' + day_of_month + '\t' + month + '\t' + year + '\t' + weekday + '\t' + str('0') + '\n')
 
 for dataset in 'train', 'dev-0':
-    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/expected.tsv') as f_exp, open(f'../{dataset}/huggingface_format_year.tsv','w') as f_hf:
-        f_hf.write('text\tyear\tlabel\n')
+    with open(f'../{dataset}/in.csv') as f_in, open(f'../{dataset}/expected.csv') as f_exp, open(f'../{dataset}/huggingface_format_year.csv','w') as f_hf:
+        f_hf.write('text\tyear_cont\tdate\tday_of_year\tday_of_month\tmonth\tyear\tweekday\tlabel\n')
         for line_in, line_exp in zip(f_in, f_exp):
-            label = LABELS_DICT[line_exp.rstrip('\n')]
-            year,_,text = line_in.rstrip('\n').split('\t')
-            f_hf.write(text +'\t' +year +'\t'+ str(label) + '\n')
+            label = str(LABELS_DICT[line_exp.rstrip('\n')])
+            year_cont,date,text = line_in.rstrip('\n').split('\t')
+            d = datetime.datetime.strptime(date,"%Y%m%d")
+            day_of_year = str(d.timetuple().tm_yday)
+            day_of_month = str(d.day)
+            month = str(d.month)
+            year = str(d.year)
+            weekday = str(d.weekday())
+            day_of_year = str(d.timetuple().tm_yday)
+            f_hf.write(text +'\t' +year_cont +'\t'+ date + '\t'+ day_of_year + '\t' + day_of_month + '\t' + month + '\t' + year + '\t' + weekday + '\t' + label + '\n')
 
diff --git a/roberta_temp/01a_create_guess_date_datasets.py b/roberta_temp/01a_create_guess_date_datasets.py
new file mode 100644
index 0000000..0d0f498
--- /dev/null
+++ b/roberta_temp/01a_create_guess_date_datasets.py
@@ -0,0 +1,17 @@
+import datetime
+for dataset in 'train', 'dev-0':
+    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/expected.tsv') as f_exp, open(f'../{dataset}/huggingface_guess_day.csv','w') as f_hf:
+        f_hf.write('text\tyear_cont\tdate\tday_of_year\tday_of_month\tmonth\tyear\tweekday\tlabel\n')
+        for line_in, line_exp in zip(f_in, f_exp):
+            year_cont,date,text = line_in.rstrip('\n').split('\t')
+            d = datetime.datetime.strptime(date,"%Y%m%d")
+            day_of_year = str(d.timetuple().tm_yday)
+            day_of_month = str(d.day)
+            month = str(d.month)
+            year = str(d.year)
+            weekday = str(d.weekday())
+            day_of_year = str(d.timetuple().tm_yday)
+            #label = f'year:{year} month:{month} day:{day_of_month} weekday:{weekday}'
+            label = weekday
+            f_hf.write(text +'\t' +year_cont +'\t'+ date + '\t'+ day_of_year + '\t' + day_of_month + '\t' + month + '\t' + year + '\t' + weekday + '\t' + label + '\n')
+
diff --git a/roberta_temp/02_load_dataset.py b/roberta_temp/02_load_dataset.py
index e1c288d..b9b609a 100644
--- a/roberta_temp/02_load_dataset.py
+++ b/roberta_temp/02_load_dataset.py
@@ -2,23 +2,39 @@ import pickle
 from datasets import load_dataset
 from transformers import AutoTokenizer
 from config import MODEL
+from tqdm import tqdm
 
-dataset = load_dataset('csv', sep='\t', data_files={'train': ['../train/huggingface_format_year.tsv'], 'test': ['../dev-0/huggingface_format_year.tsv']})
-test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format_year.tsv')
+dataset = load_dataset('csv', sep='\t', data_files={'train': ['../train/huggingface_format_year.csv'], 'test': ['../dev-0/huggingface_format_year.csv']})
+test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format_year.csv')
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 
 def tokenize_function(examples):
-    return tokenizer(examples["text"], padding="max_length", truncation=True)
+    t = tokenizer(examples["text"], padding="max_length", truncation=True)
+    examples['year'] = [x - 1995 for x in examples['year']]
+    for column in 'date', 'day_of_month', 'day_of_year', 'month', 'year', 'weekday', 'year_cont':
+        t[column] = [[a] * b.index(1) + [0] *(len(b) - b.index(1)) for a,b in zip(examples[column], t['input_ids'])]
+    return t
 
-tokenized_datasets = dataset.map(tokenize_function, batched=True)
 test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+
+#for d in ('train', 'test'):
+#    for i in tqdm(range(len(tokenized_datasets[d]))):
+#        tokenized_datasets[d][i][column] = [tokenized_datasets[d][i][column] ] * 512 #len(tokenized_datasets[d][i]['input_ids'])
+#
+#d = 'train'
+#for column in tqdm(('date', 'day_of_month', 'day_of_year', 'month', 'year', 'year_cont')):
+#    for i in tqdm(range(len(test_tokenized_datasets[d]))):
+#        test_tokenized_datasets[d][i][column] = [test_tokenized_datasets[d][i][column] ] * 512 #len(test_tokenized_datasets[d][i]['input_ids'])
 
 train_dataset = tokenized_datasets["train"].shuffle(seed=42)
 eval_dataset_full = tokenized_datasets["test"]
 eval_dataset_small = tokenized_datasets["test"].select(range(2000))
 test_dataset = test_tokenized_datasets["train"]
 
+
 with open('train_dataset.pickle','wb') as f_p:
     pickle.dump(train_dataset, f_p)
 
diff --git a/roberta_temp/02a_load_guess_date_datasets.py b/roberta_temp/02a_load_guess_date_datasets.py
new file mode 100644
index 0000000..6f4065d
--- /dev/null
+++ b/roberta_temp/02a_load_guess_date_datasets.py
@@ -0,0 +1,53 @@
+import pickle
+from datasets import load_dataset
+from transformers import AutoTokenizer
+from config import MODEL
+from tqdm import tqdm
+
+dataset = load_dataset('csv', sep='\t', data_files={'train': ['../train/huggingface_guess_day.csv'], 'test': ['../dev-0/huggingface_guess_day.csv']})
+test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format_year.tsv')
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+
+def tokenize_function(examples):
+    t = tokenizer(examples["text"], padding="max_length", truncation=True)
+    examples['year'] = [x - 1995 for x in examples['year']]
+    for column in 'date', 'day_of_month', 'day_of_year', 'month', 'year', 'weekday', 'year_cont':
+        try:
+            t[column] = [[a] * b.index(1) + [0] *(len(b) - b.index(1)) for a,b in zip(examples[column], t['input_ids'])]
+        except:
+            pass
+    return t
+
+test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+
+#for d in ('train', 'test'):
+#    for i in tqdm(range(len(tokenized_datasets[d]))):
+#        tokenized_datasets[d][i][column] = [tokenized_datasets[d][i][column] ] * 512 #len(tokenized_datasets[d][i]['input_ids'])
+#
+#d = 'train'
+#for column in tqdm(('date', 'day_of_month', 'day_of_year', 'month', 'year', 'year_cont')):
+#    for i in tqdm(range(len(test_tokenized_datasets[d]))):
+#        test_tokenized_datasets[d][i][column] = [test_tokenized_datasets[d][i][column] ] * 512 #len(test_tokenized_datasets[d][i]['input_ids'])
+
+train_dataset = tokenized_datasets["train"].shuffle(seed=42)
+eval_dataset_full = tokenized_datasets["test"]
+eval_dataset_small = tokenized_datasets["test"].select(range(2000))
+test_dataset = test_tokenized_datasets["train"]
+
+
+with open('train_dataset.pickle','wb') as f_p:
+    pickle.dump(train_dataset, f_p)
+
+with open('eval_dataset_small.pickle','wb') as f_p:
+    pickle.dump(eval_dataset_small, f_p)
+
+with open('eval_dataset_full.pickle','wb') as f_p:
+    pickle.dump(eval_dataset_full, f_p)
+
+with open('test_dataset.pickle','wb') as f_p:
+    pickle.dump(test_dataset, f_p)
+
+
diff --git a/roberta_temp/03_train.py b/roberta_temp/03_train.py
index e72a902..1fb51ed 100644
--- a/roberta_temp/03_train.py
+++ b/roberta_temp/03_train.py
@@ -16,7 +16,15 @@ with open('test_dataset.pickle','rb') as f_p:
 
 from transformers import AutoModelForSequenceClassification
 
-model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
+#model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
+model_clean = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
+model = AutoModelForSequenceClassification.from_pretrained('test_trainer_guess_weekday/checkpoint-6000',num_labels=7)
+import torch
+with torch.no_grad():
+    model.classifier.dense.weight = model_clean.classifier.dense.weight
+    model.classifier.out_proj.weight = model_clean.classifier.out_proj.weight
+
+del model_clean
 
 
 from transformers import TrainingArguments
@@ -27,9 +35,9 @@ training_args = TrainingArguments("test_trainer",
         evaluation_strategy='steps',
         #eval_steps=2_000,
         #save_steps=2_000,
-        eval_steps=20_000,
+        eval_steps=2_000,
         save_steps=20_000,
-        num_train_epochs=1,
+        num_train_epochs=5,
         gradient_accumulation_steps=2,
         learning_rate = 1e-6,
         #warmup_steps=4_000,
diff --git a/roberta_temp/03a_train_guess_day.py b/roberta_temp/03a_train_guess_day.py
new file mode 100644
index 0000000..40935ac
--- /dev/null
+++ b/roberta_temp/03a_train_guess_day.py
@@ -0,0 +1,70 @@
+import pickle
+from config import LABELS_LIST, MODEL
+
+with open('train_dataset.pickle','rb') as f_p:
+    train_dataset = pickle.load(f_p)
+
+with open('eval_dataset_small.pickle','rb') as f_p:
+    eval_dataset_small = pickle.load(f_p)
+
+with open('eval_dataset_full.pickle','rb') as f_p:
+    eval_dataset_full = pickle.load(f_p)
+
+
+
+from transformers import AutoModelForSequenceClassification
+
+model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
+
+from transformers import TrainingArguments
+
+
+training_args = TrainingArguments("test_trainer",
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        evaluation_strategy='steps',
+        #eval_steps=2_000,
+        #save_steps=2_000,
+        eval_steps=2_000,
+        save_steps=20_000,
+        num_train_epochs=1,
+        gradient_accumulation_steps=2,
+        learning_rate = 1e-6,
+        #warmup_steps=4_000,
+        warmup_steps=4,
+        load_best_model_at_end=True,
+        )
+
+import numpy as np
+from datasets import load_metric
+
+metric = load_metric("accuracy")
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels)
+
+
+from transformers import Trainer
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset_small,
+    compute_metrics=compute_metrics,
+    )
+
+#trainer.train(resume_from_checkpoint=True)
+trainer.train()
+trainer.save_model("./roberta-retrained")
+trainer.evaluate()
+
+
+eval_predictions = trainer.predict(eval_dataset_full).predictions.argmax(1)
+
+with open('../dev-0/out.tsv', 'w') as f_out:
+    for pred in eval_predictions:
+        f_out.write(LABELS_LIST[pred] + '\n')
+
diff --git a/roberta_temp/04_predict.py b/roberta_temp/04_predict.py
new file mode 100644
index 0000000..a66e3ee
--- /dev/null
+++ b/roberta_temp/04_predict.py
@@ -0,0 +1,85 @@
+import pickle
+from config import LABELS_LIST, MODEL
+
+with open('train_dataset.pickle','rb') as f_p:
+    train_dataset = pickle.load(f_p)
+
+with open('eval_dataset_small.pickle','rb') as f_p:
+    eval_dataset_small = pickle.load(f_p)
+
+with open('eval_dataset_full.pickle','rb') as f_p:
+    eval_dataset_full = pickle.load(f_p)
+
+with open('test_dataset.pickle','rb') as f_p:
+    test_dataset = pickle.load(f_p)
+
+
+from transformers import AutoModelForSequenceClassification
+
+model = AutoModelForSequenceClassification.from_pretrained('roberta-retrained/')
+
+from transformers import TrainingArguments
+
+
+training_args = TrainingArguments("test_trainer",
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        evaluation_strategy='steps',
+        #eval_steps=2_000,
+        #save_steps=2_000,
+        eval_steps=2_000,
+        save_steps=20_000,
+        num_train_epochs=1,
+        gradient_accumulation_steps=2,
+        learning_rate = 1e-6,
+        #warmup_steps=4_000,
+        warmup_steps=4,
+        load_best_model_at_end=True,
+        )
+
+import numpy as np
+from datasets import load_metric
+
+metric = load_metric("accuracy")
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels)
+
+
+from transformers import Trainer
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset_small,
+    compute_metrics=compute_metrics,
+    )
+
+
+eval_predictions = trainer.predict(eval_dataset_full).predictions.argmax(1)
+
+with open('../dev-0/out.tsv', 'w') as f_out:
+    for pred in eval_predictions:
+        f_out.write(LABELS_LIST[pred] + '\n')
+
+test_predictions = trainer.predict(test_dataset).predictions.argmax(1)
+with open('../test-A/out.tsv', 'w') as f_out:
+    for pred in test_predictions:
+        f_out.write(LABELS_LIST[pred] + '\n')
+
+#model = AutoModelForSequenceClassification.from_pretrained('roberta-retrained/')
+
+#for dataset in ('dev-0', 'test-A'):
+#    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/out.tsv','w') as f_out:
+#        for line_in in tqdm(f_in, total=150_000):
+#            _,_, text = line_in.split('\t')
+#            text = text.rstrip('\n')
+#            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
+#            outputs = model(**inputs)
+#            probs = outputs[0].softmax(1)
+#            prediction = LABELS_LIST[probs.argmax(1)]
+#            f_out.write(prediction + '\n')
+#
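
All of the scripts above import MODEL, LABELS_DICT and LABELS_LIST from a local config.py that is not part of this patch. A minimal sketch of what it presumably provides is given below; only the three names and the seven-class setup follow from the code above, while the model name and label values are assumed placeholders:

    # config.py -- hypothetical sketch; the real file is not included in this patch
    # MODEL: any sequence-classification checkpoint name ("roberta-base" is an assumed placeholder)
    MODEL = "roberta-base"
    # LABELS_LIST: the 7 class names written to out.tsv (actual values are not shown in this patch)
    LABELS_LIST = ["label_0", "label_1", "label_2", "label_3", "label_4", "label_5", "label_6"]
    # LABELS_DICT: maps each class name to the integer id used in 01_create_datasets.py
    LABELS_DICT = {label: i for i, label in enumerate(LABELS_LIST)}

The scripts appear to be meant to run in numeric order: 01/01a build the tab-separated input files, 02/02a tokenize them and pickle the datasets, 03a trains the weekday-guessing model, 03_train.py loads a weekday checkpoint (test_trainer_guess_weekday/checkpoint-6000) while resetting the classifier head weights to those of a freshly initialized model, and 04_predict.py writes out.tsv for dev-0 and test-A.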