a
This commit is contained in:
parent
cfa09497b6
commit
d6d7a4dbda
@ -1,17 +1,32 @@
|
|||||||
|
import datetime
|
||||||
from config import LABELS_DICT
|
from config import LABELS_DICT
|
||||||
|
|
||||||
# Column names written as the first line of every generated file.
_FEATURE_HEADER = 'text\tyear_cont\tdate\tday_of_year\tday_of_month\tmonth\tyear\tweekday\tlabel\n'


def _open_source(directory, stem):
    """Open the input file ``<directory>/<stem>.csv`` for reading.

    When the ``.csv`` file does not exist, ``<directory>/<stem>.tsv`` is
    opened instead.
    NOTE(review): the guess-date script in this change set reads ``in.tsv`` /
    ``expected.tsv`` -- confirm which extension the data files really carry.

    Raises:
        FileNotFoundError: if neither file exists.
    """
    try:
        return open(f'{directory}/{stem}.csv', 'r')
    except FileNotFoundError:
        return open(f'{directory}/{stem}.tsv', 'r')


def _feature_row(line_in, label):
    """Build one output row from one input line.

    ``line_in`` must consist of exactly three tab-separated fields, unpacked
    as ``year_cont``, ``date`` (parsed with ``%Y%m%d``) and ``text``.
    ``label`` is the already-stringified value of the last column.

    Returns:
        The tab-joined, newline-terminated row in the column order of
        ``_FEATURE_HEADER``.

    Raises:
        ValueError: if the line does not have three fields or the date does
            not parse.
    """
    year_cont, date, text = line_in.rstrip('\n').split('\t')
    d = datetime.datetime.strptime(date, "%Y%m%d")
    fields = (
        text,
        year_cont,
        date,
        str(d.timetuple().tm_yday),  # day_of_year
        str(d.day),                  # day_of_month
        str(d.month),
        str(d.year),
        str(d.weekday()),            # Monday == 0
        label,
    )
    return '\t'.join(fields) + '\n'


def main(label_ids=None):
    """Write ``huggingface_format_year.csv`` for test-A, train and dev-0.

    Args:
        label_ids: mapping from a line of the ``expected`` file (without its
            newline) to the value written in the ``label`` column. Defaults
            to ``LABELS_DICT`` from ``config``.
    """
    if label_ids is None:
        label_ids = LABELS_DICT

    # test-A has no expected file; every row gets the placeholder label '0'.
    with _open_source('../test-A', 'in') as f_in, \
            open('../test-A/huggingface_format_year.csv', 'w') as f_hf:
        f_hf.write(_FEATURE_HEADER)
        for line_in in f_in:
            f_hf.write(_feature_row(line_in, '0'))

    for dataset in 'train', 'dev-0':
        with _open_source(f'../{dataset}', 'in') as f_in, \
                _open_source(f'../{dataset}', 'expected') as f_exp, \
                open(f'../{dataset}/huggingface_format_year.csv', 'w') as f_hf:
            f_hf.write(_FEATURE_HEADER)
            # zip() stops at the shorter of the two files.
            for line_in, line_exp in zip(f_in, f_exp):
                label = str(label_ids[line_exp.rstrip('\n')])
                f_hf.write(_feature_row(line_in, label))


if __name__ == "__main__":
    main()
|
||||||
|
|
||||||
|
17
roberta_temp/01a_create_guess_date_datasets.py
Normal file
17
roberta_temp/01a_create_guess_date_datasets.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
import datetime
|
||||||
|
# Column names written as the first line of every generated file.
_GUESS_DAY_HEADER = 'text\tyear_cont\tdate\tday_of_year\tday_of_month\tmonth\tyear\tweekday\tlabel\n'


def _guess_day_row(line_in):
    """Build one output row from one input line.

    ``line_in`` must consist of exactly three tab-separated fields, unpacked
    as ``year_cont``, ``date`` (parsed with ``%Y%m%d``) and ``text``.
    The ``label`` column repeats the weekday (``datetime.weekday()``,
    Monday == 0).

    Raises:
        ValueError: if the line does not have three fields or the date does
            not parse.
    """
    year_cont, date, text = line_in.rstrip('\n').split('\t')
    d = datetime.datetime.strptime(date, "%Y%m%d")
    weekday = str(d.weekday())
    fields = (
        text,
        year_cont,
        date,
        str(d.timetuple().tm_yday),  # day_of_year
        str(d.day),                  # day_of_month
        str(d.month),
        str(d.year),
        weekday,
        weekday,                     # label
    )
    return '\t'.join(fields) + '\n'


def main():
    """Write ``huggingface_guess_day.csv`` for the train and dev-0 sets."""
    for dataset in 'train', 'dev-0':
        with open(f'../{dataset}/in.tsv') as f_in, \
                open(f'../{dataset}/expected.tsv') as f_exp, \
                open(f'../{dataset}/huggingface_guess_day.csv', 'w') as f_hf:
            f_hf.write(_GUESS_DAY_HEADER)
            # The expected file's content is not used; zipping with it only
            # limits the output to the shorter of the two files.
            for line_in, _ in zip(f_in, f_exp):
                f_hf.write(_guess_day_row(line_in))


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -2,23 +2,39 @@ import pickle
|
|||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
from config import MODEL
|
from config import MODEL
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
dataset = load_dataset('csv', sep='\t', data_files={'train': ['../train/huggingface_format_year.tsv'], 'test': ['../dev-0/huggingface_format_year.tsv']})
|
dataset = load_dataset('csv', sep='\t', data_files={'train': ['../train/huggingface_format_year.csv'], 'test': ['../dev-0/huggingface_format_year.csv']})
|
||||||
test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format_year.tsv')
|
test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format_year.csv')
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
||||||
|
|
||||||
def _spread_over_tokens(value, input_ids, pad_id):
    """Repeat ``value`` for every token before the first ``pad_id``, then pad with 0.

    A row without any ``pad_id`` (nothing was padded) gets ``value`` at every
    position instead of raising ``ValueError``.
    """
    try:
        n_tokens = input_ids.index(pad_id)
    except ValueError:
        n_tokens = len(input_ids)
    return [value] * n_tokens + [0] * (len(input_ids) - n_tokens)


def tokenize_function(examples):
    """Tokenize a batch and add token-level copies of its date features.

    Args:
        examples: batch mapping column names to lists; must provide ``text``,
            ``date``, ``day_of_month``, ``day_of_year``, ``month``, ``year``,
            ``weekday`` and ``year_cont``.

    Returns:
        The output of the module-level ``tokenizer`` with one extra entry per
        feature column. Each entry holds, for every row, a list as long as
        that row's ``input_ids``: the feature value on the positions before
        the first padding token and 0 on the rest. ``year`` is stored
        relative to 1995.
    """
    t = tokenizer(examples["text"], padding="max_length", truncation=True)
    # NOTE(review): the padding id used to be the literal 1; the tokenizer's
    # own pad_token_id is used instead -- confirm it is 1 for MODEL.
    pad_id = tokenizer.pad_token_id
    # Shift the year without writing back into the caller's batch.
    years = [x - 1995 for x in examples['year']]
    for column in 'date', 'day_of_month', 'day_of_year', 'month', 'year', 'weekday', 'year_cont':
        values = years if column == 'year' else examples[column]
        t[column] = [
            _spread_over_tokens(value, input_ids, pad_id)
            for value, input_ids in zip(values, t['input_ids'])
        ]
    return t
|
||||||
|
|
||||||
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
|
||||||
test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
|
test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
|
||||||
|
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
||||||
|
|
||||||
|
|
||||||
|
#for d in ('train', 'test'):
|
||||||
|
# for i in tqdm(range(len(tokenized_datasets[d]))):
|
||||||
|
# tokenized_datasets[d][i][column] = [tokenized_datasets[d][i][column] ] * 512 #len(tokenized_datasets[d][i]['input_ids'])
|
||||||
|
#
|
||||||
|
#d = 'train'
|
||||||
|
#for column in tqdm(('date', 'day_of_month', 'day_of_year', 'month', 'year', 'year_cont')):
|
||||||
|
# for i in tqdm(range(len(test_tokenized_datasets[d]))):
|
||||||
|
# test_tokenized_datasets[d][i][column] = [test_tokenized_datasets[d][i][column] ] * 512 #len(test_tokenized_datasets[d][i]['input_ids'])
|
||||||
|
|
||||||
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
|
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
|
||||||
eval_dataset_full = tokenized_datasets["test"]
|
eval_dataset_full = tokenized_datasets["test"]
|
||||||
eval_dataset_small = tokenized_datasets["test"].select(range(2000))
|
eval_dataset_small = tokenized_datasets["test"].select(range(2000))
|
||||||
test_dataset = test_tokenized_datasets["train"]
|
test_dataset = test_tokenized_datasets["train"]
|
||||||
|
|
||||||
|
|
||||||
with open('train_dataset.pickle','wb') as f_p:
|
with open('train_dataset.pickle','wb') as f_p:
|
||||||
pickle.dump(train_dataset, f_p)
|
pickle.dump(train_dataset, f_p)
|
||||||
|
|
||||||
|
53
roberta_temp/02a_load_guess_date_datasets.py
Normal file
53
roberta_temp/02a_load_guess_date_datasets.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
import pickle
|
||||||
|
from datasets import load_dataset
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
from config import MODEL
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# Train/dev files are read as tab-separated text through the 'csv' loader.
_GUESS_DAY_FILES = {
    'train': ['../train/huggingface_guess_day.csv'],
    'test': ['../dev-0/huggingface_guess_day.csv'],
}
dataset = load_dataset('csv', sep='\t', data_files=_GUESS_DAY_FILES)

# NOTE(review): this path still ends in ``.tsv`` although the year-format
# script in this change set writes ``../test-A/huggingface_format_year.csv``
# -- confirm the file being loaded here is the intended one.
test_dataset = load_dataset(
    'csv',
    sep='\t',
    data_files='../test-A/huggingface_format_year.tsv',
)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
||||||
|
|
||||||
|
def _spread_over_tokens(value, input_ids, pad_id):
    """Repeat ``value`` for every token before the first ``pad_id``, then pad with 0.

    A row without any ``pad_id`` (nothing was padded) gets ``value`` at every
    position instead of raising ``ValueError``.
    """
    try:
        n_tokens = input_ids.index(pad_id)
    except ValueError:
        n_tokens = len(input_ids)
    return [value] * n_tokens + [0] * (len(input_ids) - n_tokens)


def tokenize_function(examples):
    """Tokenize a batch and add token-level copies of its date features.

    Args:
        examples: batch mapping column names to lists; must provide ``text``
            and ``year``. The columns ``date``, ``day_of_month``,
            ``day_of_year``, ``month``, ``weekday`` and ``year_cont`` are
            used when present and skipped when missing.

    Returns:
        The output of the module-level ``tokenizer`` with one extra entry per
        available feature column. Each entry holds, for every row, a list as
        long as that row's ``input_ids``: the feature value on the positions
        before the first padding token and 0 on the rest. ``year`` is stored
        relative to 1995.

    Raises:
        KeyError: if the batch has no ``year`` column.
    """
    t = tokenizer(examples["text"], padding="max_length", truncation=True)
    # NOTE(review): the padding id used to be the literal 1; the tokenizer's
    # own pad_token_id is used instead -- confirm it is 1 for MODEL.
    pad_id = tokenizer.pad_token_id
    # Shift the year without writing back into the caller's batch.
    years = [x - 1995 for x in examples['year']]
    for column in 'date', 'day_of_month', 'day_of_year', 'month', 'year', 'weekday', 'year_cont':
        try:
            values = years if column == 'year' else examples[column]
        except KeyError:
            # Best effort: a column the input file does not have is left out.
            continue
        t[column] = [
            _spread_over_tokens(value, input_ids, pad_id)
            for value, input_ids in zip(values, t['input_ids'])
        ]
    return t
|
||||||
|
|
||||||
|
# Run the tokenizer over the test-A data first, then over train/dev.
test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Final splits: shuffled training data, the full dev set, its first 2000
# rows for quick evaluation, and the test-A data (loaded under the default
# "train" key because a single file path was given).
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset_full = tokenized_datasets["test"]
eval_dataset_small = eval_dataset_full.select(range(2000))
test_dataset = test_tokenized_datasets["train"]

# Persist every split to its own pickle file in the working directory.
for _pickle_path, _split in (
    ('train_dataset.pickle', train_dataset),
    ('eval_dataset_small.pickle', eval_dataset_small),
    ('eval_dataset_full.pickle', eval_dataset_full),
    ('test_dataset.pickle', test_dataset),
):
    with open(_pickle_path, 'wb') as f_p:
        pickle.dump(_split, f_p)
|
||||||
|
|
||||||
|
|
@ -16,7 +16,15 @@ with open('test_dataset.pickle','rb') as f_p:
|
|||||||
|
|
||||||
from transformers import AutoModelForSequenceClassification
|
from transformers import AutoModelForSequenceClassification
|
||||||
|
|
||||||
import torch

# Model built directly from MODEL; it only donates classifier weights below.
model_clean = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
# Model restored from the checkpoint directory of an earlier run.
model = AutoModelForSequenceClassification.from_pretrained(
    'test_trainer_guess_weekday/checkpoint-6000',
    num_labels=7,
)

with torch.no_grad():
    # Replace the classifier weights of the restored model with those of the
    # model built from MODEL.
    # NOTE(review): only ``weight`` is swapped; if these layers also carry a
    # ``bias`` it keeps the checkpoint's values -- confirm that is intended.
    for _layer in ('dense', 'out_proj'):
        getattr(model.classifier, _layer).weight = getattr(model_clean.classifier, _layer).weight

# The donor model is no longer needed.
del model_clean
|
||||||
|
|
||||||
from transformers import TrainingArguments
|
from transformers import TrainingArguments
|
||||||
|
|
||||||
@ -27,9 +35,9 @@ training_args = TrainingArguments("test_trainer",
|
|||||||
evaluation_strategy='steps',
|
evaluation_strategy='steps',
|
||||||
#eval_steps=2_000,
|
#eval_steps=2_000,
|
||||||
#save_steps=2_000,
|
#save_steps=2_000,
|
||||||
eval_steps=20_000,
|
eval_steps=2_000,
|
||||||
save_steps=20_000,
|
save_steps=20_000,
|
||||||
num_train_epochs=1,
|
num_train_epochs=5,
|
||||||
gradient_accumulation_steps=2,
|
gradient_accumulation_steps=2,
|
||||||
learning_rate = 1e-6,
|
learning_rate = 1e-6,
|
||||||
#warmup_steps=4_000,
|
#warmup_steps=4_000,
|
||||||
|
70
roberta_temp/03a_train_guess_day.py
Normal file
70
roberta_temp/03a_train_guess_day.py
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
import pickle
|
||||||
|
from config import LABELS_LIST, MODEL
|
||||||
|
|
||||||
|
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # Only load pickle files produced by this project's own scripts.
    with open(path, 'rb') as f_p:
        return pickle.load(f_p)


train_dataset = _unpickle('train_dataset.pickle')
eval_dataset_small = _unpickle('eval_dataset_small.pickle')
eval_dataset_full = _unpickle('eval_dataset_full.pickle')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
from datasets import load_metric

# Sequence classifier with seven output labels, initialised from MODEL.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)

# Keyword options for the run; outputs go to the "test_trainer" directory.
_TRAINING_OPTIONS = dict(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=2_000,
    save_steps=20_000,
    num_train_epochs=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-6,
    warmup_steps=4,
    load_best_model_at_end=True,
)
training_args = TrainingArguments("test_trainer", **_TRAINING_OPTIONS)

# Accuracy metric used by compute_metrics.
metric = load_metric("accuracy")
|
||||||
|
|
||||||
|
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the arg-max over the last axis of the logits.

    Returns whatever ``metric.compute`` returns for those predictions.
    """
    scores, references = eval_pred
    predicted = np.asarray(scores).argmax(axis=-1)
    return metric.compute(predictions=predicted, references=references)
|
||||||
|
|
||||||
|
|
||||||
|
from transformers import Trainer

# Train on the training split, evaluating on the small dev subset.
_trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': eval_dataset_small,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**_trainer_options)

trainer.train()
trainer.save_model("./roberta-retrained")
trainer.evaluate()

# Predict the full dev set and write one label name per line.
_dev_output = trainer.predict(eval_dataset_full)
eval_predictions = _dev_output.predictions.argmax(1)

with open('../dev-0/out.tsv', 'w') as f_out:
    f_out.writelines(LABELS_LIST[pred] + '\n' for pred in eval_predictions)
|
||||||
|
|
85
roberta_temp/04_predict.py
Normal file
85
roberta_temp/04_predict.py
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
import pickle
|
||||||
|
from config import LABELS_LIST, MODEL
|
||||||
|
|
||||||
|
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # Only load pickle files produced by this project's own scripts.
    with open(path, 'rb') as f_p:
        return pickle.load(f_p)


train_dataset = _unpickle('train_dataset.pickle')
eval_dataset_small = _unpickle('eval_dataset_small.pickle')
eval_dataset_full = _unpickle('eval_dataset_full.pickle')
test_dataset = _unpickle('test_dataset.pickle')
|
||||||
|
|
||||||
|
|
||||||
|
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
from datasets import load_metric

# Classifier restored from the local 'roberta-retrained/' directory.
model = AutoModelForSequenceClassification.from_pretrained('roberta-retrained/')

# Keyword options handed to the Trainer; outputs go to "test_trainer".
_TRAINING_OPTIONS = dict(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=2_000,
    save_steps=20_000,
    num_train_epochs=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-6,
    warmup_steps=4,
    load_best_model_at_end=True,
)
training_args = TrainingArguments("test_trainer", **_TRAINING_OPTIONS)

# Accuracy metric used by compute_metrics.
metric = load_metric("accuracy")
|
||||||
|
|
||||||
|
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the arg-max over the last axis of the logits.

    Returns whatever ``metric.compute`` returns for those predictions.
    """
    scores, references = eval_pred
    predicted = np.asarray(scores).argmax(axis=-1)
    return metric.compute(predictions=predicted, references=references)
|
||||||
|
|
||||||
|
|
||||||
|
from transformers import Trainer


def _write_label_names(path, predictions):
    """Write the ``LABELS_LIST`` entry of every prediction to ``path``, one per line."""
    with open(path, 'w') as f_out:
        f_out.writelines(LABELS_LIST[pred] + '\n' for pred in predictions)


# The Trainer is only used for prediction here; train() is never called.
_trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': eval_dataset_small,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**_trainer_options)

# Full dev set first, then test-A; each gets its own out.tsv.
eval_predictions = trainer.predict(eval_dataset_full).predictions.argmax(1)
_write_label_names('../dev-0/out.tsv', eval_predictions)

test_predictions = trainer.predict(test_dataset).predictions.argmax(1)
_write_label_names('../test-A/out.tsv', test_predictions)
|
||||||
|
|
||||||
|
#model = AutoModelForSequenceClassification.from_pretrained('roberta-retrained/')
|
||||||
|
|
||||||
|
#for dataset in ('dev-0', 'test-A'):
|
||||||
|
# with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/out.tsv','w') as f_out:
|
||||||
|
# for line_in in tqdm(f_in, total=150_000):
|
||||||
|
# _,_, text = line_in.split('\t')
|
||||||
|
# text = text.rstrip('\n')
|
||||||
|
# inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
|
||||||
|
# outputs = model(**inputs)
|
||||||
|
# probs = outputs[0].softmax(1)
|
||||||
|
# prediction = LABELS_LIST[probs.argmax(1)]
|
||||||
|
# f_out.write(prediction + '\n')
|
||||||
|
#
|
Loading…
Reference in New Issue
Block a user