roberta no year from scratch
This commit is contained in:
parent
d6d7a4dbda
commit
e5f134bba4
52470
dev-0/out.tsv
52470
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
17
roberta_no_year_from_scratch/01_create_datasets.py
Normal file
17
roberta_no_year_from_scratch/01_create_datasets.py
Normal file
@ -0,0 +1,17 @@
|
||||
from config import LABELS_DICT
|
||||
|
||||
def extract_text(line_in):
    """Return the third tab-separated column of one ``in.tsv`` line.

    The trailing newline is stripped and the first two columns are discarded.

    Raises:
        ValueError: if the line does not have exactly three tab-separated
            columns.
    """
    _, _, text = line_in.rstrip('\n').split('\t')
    return text


def write_unlabelled(in_path, out_path):
    """Write a one-column TSV (header ``text``) with the text of every input line."""
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(in_path, encoding='utf-8') as f_in, \
            open(out_path, 'w', encoding='utf-8') as f_hf:
        f_hf.write('text\n')
        for line_in in f_in:
            f_hf.write(extract_text(line_in) + '\n')


def write_labelled(in_path, expected_path, out_path, labels):
    """Write a two-column TSV (header ``text``, ``label``).

    Line *i* of ``in_path`` is paired with line *i* of ``expected_path``; the
    expected class name is mapped to its integer id through ``labels``.

    Raises:
        KeyError: if an expected class name is missing from ``labels``.
        ValueError: if an input line does not have exactly three columns.
    """
    with open(in_path, encoding='utf-8') as f_in, \
            open(expected_path, encoding='utf-8') as f_exp, \
            open(out_path, 'w', encoding='utf-8') as f_hf:
        f_hf.write('text\tlabel\n')
        for line_in, line_exp in zip(f_in, f_exp):
            label = labels[line_exp.rstrip('\n')]
            f_hf.write(f'{extract_text(line_in)}\t{label}\n')


def main():
    """Create ``huggingface_format.tsv`` next to every ``in.tsv`` of the challenge."""
    # test-A has no expected.tsv, so it only gets the text column.
    write_unlabelled('../test-A/in.tsv', '../test-A/huggingface_format.tsv')

    for dataset in ('train', 'dev-0'):
        write_labelled(
            f'../{dataset}/in.tsv',
            f'../{dataset}/expected.tsv',
            f'../{dataset}/huggingface_format.tsv',
            LABELS_DICT,
        )


if __name__ == '__main__':
    main()
|
||||
|
32
roberta_no_year_from_scratch/02_load_dataset.py
Normal file
32
roberta_no_year_from_scratch/02_load_dataset.py
Normal file
@ -0,0 +1,32 @@
|
||||
import pickle
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoTokenizer, RobertaTokenizer
|
||||
from config import MODEL
|
||||
|
||||
import csv

# The huggingface_format.tsv files are written verbatim, without any CSV
# quoting, so they must also be read without quote handling. With the default
# quoting a text that starts with a double quote is parsed as a quoted field:
# its quotes are dropped and an unbalanced quote can swallow the following
# rows. 04_predict.py feeds the raw text to the model, so QUOTE_NONE also
# keeps training and prediction inputs consistent.
dataset = load_dataset(
    'csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    data_files={
        'train': ['../train/huggingface_format.tsv'],
        'test': ['../dev-0/huggingface_format.tsv'],
    },
)
test_dataset = load_dataset(
    'csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    data_files='../test-A/huggingface_format.tsv',
)
|
||||
|
||||
# Tokenizer matching the checkpoint named in config.MODEL.
tokenizer = RobertaTokenizer.from_pretrained(MODEL)


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples``.

    Sequences are padded to the maximum length and truncated when too long.
    Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)
|
||||
|
||||
# Upper bound on the number of dev examples used for the quick evaluation set.
EVAL_SMALL_SIZE = 2000

tokenized_datasets = dataset.map(tokenize_function, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset_full = tokenized_datasets["test"]
# Clamp the subset size so a dev split with fewer than EVAL_SMALL_SIZE rows
# does not make select() fail with an out-of-range index.
eval_dataset_small = eval_dataset_full.select(
    range(min(EVAL_SMALL_SIZE, len(eval_dataset_full)))
)
# test-A was loaded without a split name, so it is read back under "train".
test_dataset = test_tokenized_datasets["train"]
|
||||
|
||||
# Persist every prepared split as a pickle in the working directory;
# 03_train.py loads these four files back by the same names.
for filename, split in (
    ('train_dataset.pickle', train_dataset),
    ('eval_dataset_small.pickle', eval_dataset_small),
    ('eval_dataset_full.pickle', eval_dataset_full),
    ('test_dataset.pickle', test_dataset),
):
    with open(filename, 'wb') as f_p:
        pickle.dump(split, f_p)
|
78
roberta_no_year_from_scratch/03_train.py
Normal file
78
roberta_no_year_from_scratch/03_train.py
Normal file
@ -0,0 +1,78 @@
|
||||
import pickle
|
||||
from config import LABELS_LIST, MODEL
|
||||
|
||||
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Splits prepared and pickled by 02_load_dataset.py.
train_dataset = _unpickle('train_dataset.pickle')
eval_dataset_small = _unpickle('eval_dataset_small.pickle')
eval_dataset_full = _unpickle('eval_dataset_full.pickle')
test_dataset = _unpickle('test_dataset.pickle')
|
||||
|
||||
|
||||
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
|
||||
|
||||
# Train from scratch: only the architecture/hyper-parameters of the MODEL
# checkpoint are reused, never its weights. Fetching just the configuration
# avoids downloading and loading the pretrained weights only to discard them.
# The configuration is taken from the checkpoint rather than from a bare
# RobertaConfig() so that it matches the tokenizer used to build the datasets.
model_config = RobertaConfig.from_pretrained(MODEL, num_labels=len(LABELS_LIST))
# Instantiating from a config (not from_pretrained) gives randomly
# initialised weights.
model = RobertaForSequenceClassification(model_config)
|
||||
|
||||
from transformers import TrainingArguments
|
||||
|
||||
|
||||
# Keyword options for the Trainer, kept in one mapping so they read as a table.
_trainer_options = dict(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='steps',
    # Evaluate and checkpoint on the same cadence (2_000 was used previously).
    eval_steps=20_000,
    save_steps=20_000,
    num_train_epochs=20,
    gradient_accumulation_steps=2,
    learning_rate=1e-6,
    # 4_000 warmup steps was the previously tried alternative.
    warmup_steps=4,
    load_best_model_at_end=True,
)

# "test_trainer" is the first positional argument, as in the original call.
training_args = TrainingArguments("test_trainer", **_trainer_options)
|
||||
|
||||
import numpy as np
|
||||
from datasets import load_metric
|
||||
|
||||
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions.

    Accuracy is computed directly with numpy, so no metric script has to be
    fetched at start-up and the deprecated ``load_metric`` helper is not
    needed.

    Args:
        eval_pred: a pair ``(logits, labels)``; ``logits`` holds one score per
            class along its last axis, ``labels`` the reference class ids.

    Returns:
        ``{"accuracy": value}`` with ``value`` the fraction of predictions
        equal to their label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}
|
||||
|
||||
|
||||
from transformers import Trainer
|
||||
|
||||
import os

from transformers.trainer_utils import get_last_checkpoint

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset_small,
    compute_metrics=compute_metrics,
)

# Resume only when a checkpoint is actually present. Passing
# resume_from_checkpoint=True unconditionally fails on the very first run,
# when the output directory holds no checkpoint yet.
_output_dir = training_args.output_dir
_last_checkpoint = (
    get_last_checkpoint(_output_dir) if os.path.isdir(_output_dir) else None
)
trainer.train(resume_from_checkpoint=_last_checkpoint)
trainer.save_model("./roberta-retrained")
trainer.evaluate()


def write_predictions(dataset, out_path):
    """Predict ``dataset`` with the trainer and write one label name per line."""
    predictions = trainer.predict(dataset).predictions.argmax(1)
    with open(out_path, 'w') as f_out:
        for pred in predictions:
            f_out.write(LABELS_LIST[pred] + '\n')


write_predictions(eval_dataset_full, '../dev-0/out.tsv')
write_predictions(test_dataset, '../test-A/out.tsv')
|
24
roberta_no_year_from_scratch/04_predict.py
Normal file
24
roberta_no_year_from_scratch/04_predict.py
Normal file
@ -0,0 +1,24 @@
|
||||
import pickle
|
||||
from config import LABELS_LIST, MODEL
|
||||
from transformers import AutoTokenizer
|
||||
from tqdm import tqdm
|
||||
|
||||
device = 'cpu'
|
||||
|
||||
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
import torch

model = AutoModelForSequenceClassification.from_pretrained('test_trainer/checkpoint-620000/')
# Keep the model on the same device the inputs are moved to, and make sure
# dropout is disabled for inference.
model.to(device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL)

for dataset in ('dev-0', 'test-A'):
    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/out.tsv', 'w') as f_out:
        for line_in in tqdm(f_in, total=150_000):
            # Only the third column (the text) is used.
            _, _, text = line_in.rstrip('\n').split('\t')
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
            # No gradients are needed for prediction; skipping autograd
            # bookkeeping saves memory and time on every example.
            with torch.no_grad():
                logits = model(**inputs)[0]
            # argmax over raw logits equals argmax over their softmax;
            # .item() turns the one-element tensor into a plain int index.
            prediction = LABELS_LIST[logits.argmax(1).item()]
            f_out.write(prediction + '\n')
|
||||
|
18
roberta_no_year_from_scratch/config.py
Normal file
18
roberta_no_year_from_scratch/config.py
Normal file
@ -0,0 +1,18 @@
|
||||
# Class names; the position of a name in this list is its integer class id.
LABELS_LIST = [
    'news',
    'sport',
    'business',
    'opinion',
    'culture',
    'lifestyle',
    'removed',
]

# Inverse mapping (class name -> class id), derived from LABELS_LIST so the
# two can never get out of sync.
LABELS_DICT = {label: index for index, label in enumerate(LABELS_LIST)}

# Name of the checkpoint whose tokenizer is used.
MODEL = 'roberta-base'
|
0
roberta_no_year_from_scratch/logs
Normal file
0
roberta_no_year_from_scratch/logs
Normal file
50216
test-A/out.tsv
50216
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user