Hugging Face HerBERT base
parent bee7eaa312
commit 488bcd58af
dev-0/out.tsv (21140 lines changed)
File diff suppressed because it is too large.
herbert/01_create_datasets.py (new file, 16 lines)
@@ -0,0 +1,16 @@
# Convert the challenge TSV files into the flat column format expected by
# the Hugging Face `datasets` csv loader.

# test-A has no labels, so its output gets only a `text` column.
with open('../test-A/in.tsv', 'r') as f_in, open('../test-A/huggingface_format.tsv', 'w') as f_hf:
    f_hf.write('text\n')
    for line_in in f_in:
        # Flatten any tab-separated input fields into a single text column.
        text = line_in.replace('\t', ' ')
        f_hf.write(text)

# train and dev-0 have gold labels in expected.tsv, so pair each input
# line with its label.
for dataset in 'train', 'dev-0':
    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/expected.tsv') as f_exp, open(f'../{dataset}/huggingface_format.tsv', 'w') as f_hf:
        f_hf.write('text\tlabel\n')
        for line_in, line_exp in zip(f_in, f_exp):
            label = line_exp.rstrip('\n')
            text = line_in.replace('\t', ' ').rstrip('\n')
            f_hf.write(text + '\t' + label + '\n')
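Not part of the commit, but a quick way to verify the conversion: since the script replaces every tab inside the text with a space, each data row should have exactly as many columns as its header. The sketch below assumes the three output paths from the script above.

# Editor's sketch (hypothetical, not in the commit): sanity-check the
# generated files before handing them to `datasets`.
for path in ('../test-A/huggingface_format.tsv',
             '../train/huggingface_format.tsv',
             '../dev-0/huggingface_format.tsv'):
    with open(path) as f:
        header = next(f).rstrip('\n')
        n_cols = header.count('\t') + 1
        bad = sum(1 for line in f if line.count('\t') + 1 != n_cols)
        print(f'{path}: header={header!r}, malformed rows={bad}')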
herbert/02_load_dataset.py (new file, 28 lines)
@@ -0,0 +1,28 @@
import pickle

from datasets import load_dataset
from transformers import AutoTokenizer

from config import TOKENIZER

# Extra keyword arguments to the `csv` builder (here `sep`) are forwarded
# to pandas.read_csv, so the tab-separated files parse correctly.
dataset = load_dataset('csv', sep='\t',
                       data_files={'train': ['../train/huggingface_format.tsv'],
                                   'test': ['../dev-0/huggingface_format.tsv']})
test_dataset = load_dataset('csv', sep='\t', data_files='../test-A/huggingface_format.tsv')

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"]
# load_dataset puts a single unnamed file under the default 'train' split,
# so the unlabelled test-A data lives under that key.
test_dataset = test_tokenized_datasets["train"]

with open('train_dataset.pickle', 'wb') as f_p:
    pickle.dump(train_dataset, f_p)

with open('eval_dataset.pickle', 'wb') as f_p:
    pickle.dump(eval_dataset, f_p)

with open('test_dataset.pickle', 'wb') as f_p:
    pickle.dump(test_dataset, f_p)
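An aside on the pickles: a pickled Dataset is tied to the `datasets` version that wrote it. If the round-trip into 03_train.py ever breaks across upgrades, the library's own on-disk format is a drop-in replacement. A minimal sketch under that assumption (paths illustrative, not in the commit):

# Hypothetical alternative to the pickle round-trip.
train_dataset.save_to_disk('train_dataset')       # instead of pickle.dump

# later, e.g. at the top of 03_train.py:
from datasets import load_from_disk
train_dataset = load_from_disk('train_dataset')   # instead of pickle.load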
herbert/03_train.py (new file, 73 lines)
@@ -0,0 +1,73 @@
import pickle

import numpy as np
from datasets import load_metric
from scipy.special import softmax
from transformers import (AutoModelForSequenceClassification, Trainer,
                          TrainingArguments)

from config import MODEL

# Restore the tokenized datasets pickled by 02_load_dataset.py.
with open('train_dataset.pickle', 'rb') as f_p:
    train_dataset = pickle.load(f_p)

with open('eval_dataset.pickle', 'rb') as f_p:
    eval_dataset = pickle.load(f_p)

with open('test_dataset.pickle', 'rb') as f_p:
    test_dataset = pickle.load(f_p)

# Binary sequence-classification head on top of the pretrained HerBERT model.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

training_args = TrainingArguments(
    "test_trainer",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=2_000,
    # load_best_model_at_end needs checkpoints aligned with evaluations,
    # so save on the same schedule as eval_steps.
    save_steps=2_000,
    gradient_accumulation_steps=10,
    learning_rate=2e-6,
    warmup_steps=4_000,
    num_train_epochs=10,
    load_best_model_at_end=True)
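# Editor's note (not in the original commit): with per_device_train_batch_size=4
# and gradient_accumulation_steps=10, each optimizer step accumulates
# 4 * 10 = 40 examples per device, which is the batch size that
# learning_rate and warmup_steps above effectively see.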
# F1 on hard (argmax) predictions, computed every eval_steps.
metric = load_metric("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model('./roberta_retrained')
trainer.evaluate()

# Write the probability of the positive class (index 1) for each example.
eval_predictions = trainer.predict(eval_dataset).predictions

with open('../dev-0/out.tsv', 'w') as f_out:
    for pred in eval_predictions:
        pred = softmax(pred)[1]
        f_out.write(str(pred) + '\n')

test_predictions = trainer.predict(test_dataset).predictions

with open('../test-A/out.tsv', 'w') as f_out:
    for pred in test_predictions:
        pred = softmax(pred)[1]
        f_out.write(str(pred) + '\n')
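One relationship worth spelling out: training monitors F1 on hard argmax labels, while out.tsv stores the soft probability of class 1. For two classes these agree at a 0.5 threshold, since softmax(logits)[1] > 0.5 exactly when logits[1] > logits[0]. A tiny demonstration (not in the commit):

# Editor's sketch: for binary logits, argmax and a 0.5 threshold on the
# positive-class softmax probability pick the same label.
import numpy as np
from scipy.special import softmax

logits = np.array([1.3, -0.2])
assert int(np.argmax(logits)) == int(softmax(logits)[1] > 0.5)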