hf_roberta_base_classification

This commit is contained in:
Jakub Pokrywka 2021-12-29 12:31:58 +01:00
parent 9ec36ba822
commit c51a389af7
7 changed files with 297809 additions and 297442 deletions

298268
dev-0/out.tsv

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,60 @@
import datetime
import calendar
def to_fractional_year(d: datetime.datetime) -> float:
    """Express a timestamp as a fractional year (a number like ``1939.781``).

    The integer part is the calendar year of ``d``. The fractional part is the
    share of that year already elapsed: whole days since 1 January plus the
    time of day (hours, minutes, seconds), divided by the length of the year
    (366 days in leap years, 365 otherwise).
    """
    fields = d.timetuple()
    seconds_into_day = 60 * 60 * fields.tm_hour + 60 * fields.tm_min + fields.tm_sec
    fraction_of_day = seconds_into_day / (24 * 60 * 60)
    # tm_yday is 1-based, so 1 January contributes zero elapsed days.
    elapsed_days = fields.tm_yday - 1 + fraction_of_day
    year_length = 366 if calendar.isleap(d.year) else 365
    return d.year + (elapsed_days / year_length)
def fractional_to_date(fractional):
    """Map a fractional year (e.g. ``1939.781``) to midnight of its calendar day.

    The integer part selects the year; the fractional part is scaled by the
    number of days in that year and truncated to a whole day offset from
    1 January. Any time-of-day information is discarded.
    """
    year = int(fractional)
    year_length = 366 if calendar.isleap(year) else 365
    # Small nudge applied before truncation so float round-off (333.99999...
    # instead of 334) does not land on the previous day.
    rounding_guard = 0.0001
    elapsed_days = int(year_length * (fractional % 1) + rounding_guard)
    return datetime.datetime(year, 1, 1) + datetime.timedelta(days=elapsed_days)
# Sample timestamps (month ends, year boundaries, a leap day); nothing in the
# code below reads this tuple.
dates = (
    datetime.datetime(1825, 10, 30),
    datetime.datetime(1825, 10, 31),
    datetime.datetime(1900, 1, 1),
    datetime.datetime(1900, 12, 1),
    datetime.datetime(1900, 12, 31),
    datetime.datetime(1930, 2, 28),
    datetime.datetime(1932, 2, 29),
)

# Header row shared by every generated file.
_HF_HEADER = 'year_cont\tyear\tmonth\tday\tweekday\tday_of_year\ttext\n'

# Labelled splits: pair each input line with its expected fractional year and
# prepend the date fields derived from it.
for split in 'train', 'dev-0':
    with open(f'../{split}/in.tsv') as f_in, \
            open(f'../{split}/expected.tsv') as f_exp, \
            open(f'./{split}_huggingface_format.csv', 'w') as f_hf:
        f_hf.write(_HF_HEADER)
        for line_in, line_exp in zip(f_in, f_exp):
            year_cont = float(line_exp.rstrip())
            date = fractional_to_date(year_cont)
            label_fields = (
                year_cont,
                date.year,
                date.month,
                date.day,
                date.weekday(),
                date.timetuple().tm_yday,
            )
            # line_in still carries its own trailing newline.
            f_hf.write('\t'.join(str(value) for value in label_fields) + '\t' + line_in)

# Unlabelled split: same column layout, with zeros standing in for the labels.
for split in ('test-A',):
    with open(f'../{split}/in.tsv') as f_in, \
            open(f'./{split}_huggingface_format.csv', 'w') as f_hf:
        f_hf.write(_HF_HEADER)
        for line_in in f_in:
            f_hf.write('0\t0\t0\t0\t0\t0\t' + line_in)

View File

@ -0,0 +1,75 @@
from config import MODEL, TEST
import pickle
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from config import *
# Tokenizer loaded for the checkpoint named by MODEL (imported from config).
tokenizer = AutoTokenizer.from_pretrained(MODEL)
def tokenize_function(examples):
    """Tokenize ``examples["text"]`` with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 40 tokens. The
    tokenizer's output is returned unchanged.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=40,
    )
def get_dataset_dict(dataset):
    """Read a tab-separated dataset file into a dict of column lists.

    The first line of the file is treated as a header and skipped. Every other
    line must hold seven tab-separated fields, in this order:
    ``year_cont, year, month, day, weekday, day_of_year, text``.

    Args:
        dataset: Path of the file to read.

    Returns:
        A dict mapping each column name to a list with one entry per data line.
        ``year_cont`` is kept as the raw string; ``year`` is stored as an
        integer offset from ``MIN_YEAR``; ``month``, ``day``, ``weekday`` and
        ``day_of_year`` are integers; ``text`` has trailing whitespace removed.

    Raises:
        ValueError: If a data line has fewer than seven fields or a numeric
            field cannot be parsed as an integer.
    """
    columns = {
        name: []
        for name in ('year_cont', 'year', 'month', 'day', 'weekday', 'day_of_year', 'text')
    }
    with open(dataset) as f_in:
        next(f_in, None)  # skip the header row; tolerate an empty file
        for line in f_in:
            # Strip only the line terminator before splitting: a bare rstrip()
            # would also eat the tab in front of an empty text field and leave
            # six columns. maxsplit=6 keeps tabs inside the text in one field.
            yc, y, m, day, w, dy, t = line.rstrip('\r\n').split('\t', 6)
            columns['year_cont'].append(yc)
            columns['year'].append(int(y) - MIN_YEAR)
            columns['month'].append(int(m))
            columns['day'].append(int(day))
            columns['weekday'].append(int(w))
            columns['day_of_year'].append(int(dy))
            columns['text'].append(t.rstrip())
    return columns
def _tokenized_split(path):
    """Load one generated CSV and tokenize its text column in batches."""
    return Dataset.from_dict(get_dataset_dict(path)).map(tokenize_function, batched=True)


train_dataset = _tokenized_split('train_huggingface_format.csv').shuffle(seed=42)
eval_dataset_full = _tokenized_split('dev-0_huggingface_format.csv')
# Fixed 2000-example sample of the dev set.
eval_dataset_small = eval_dataset_full.shuffle(seed=42).select(range(2000))
test_dataset_A = _tokenized_split('test-A_huggingface_format.csv')

if TEST:
    # Shrink every split to a small prefix when the TEST flag is set.
    train_dataset = train_dataset.select(range(25))
    eval_dataset_full = eval_dataset_full.select(range(400))
    eval_dataset_small = eval_dataset_small.select(range(50))
    test_dataset_A = test_dataset_A.select(range(200))

# Min-max scaler fitted on the training split's year offsets only.
scalers = {
    'year': MinMaxScaler().fit(np.array(train_dataset['year']).reshape(-1, 1)),
}
def add_scaled(example):
    """Add a ``<factor>_scaled`` field to ``example`` for each scaled factor.

    Only ``'year'`` is processed. The value is passed through the matching
    fitted scaler in the module-level ``scalers`` dict and stored as a plain
    Python scalar. ``example`` is modified in place and returned.
    """
    for factor in ('year',):
        column = np.array(example[factor]).reshape(-1, 1)
        scaled = scalers[factor].transform(column)
        example[factor + '_scaled'] = scaled.reshape(1, -1)[0].item()
    return example
# Attach the scaled year to every split.
train_dataset = train_dataset.map(add_scaled)
eval_dataset_full = eval_dataset_full.map(add_scaled)
eval_dataset_small = eval_dataset_small.map(add_scaled)
test_dataset_A = test_dataset_A.map(add_scaled)

# Persist the prepared splits and the fitted scalers as pickle files.
for pickle_name, payload in (
    ('train_dataset.pickle', train_dataset),
    ('eval_dataset_small.pickle', eval_dataset_small),
    ('eval_dataset_full.pickle', eval_dataset_full),
    ('test_dataset_A.pickle', test_dataset_A),
    ('scalers.pickle', scalers),
):
    with open(pickle_name, 'wb') as f_p:
        pickle.dump(payload, f_p)

View File

@ -0,0 +1,151 @@
from config import *
import pickle
from datasets import load_dataset
from transformers import AutoTokenizer, RobertaModel, RobertaTokenizer
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import Adam
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
import os
import pickle
from regressor_head import RegressorHead
from classification_head import YearClassificationHead
# Make sure the checkpoint directory exists. exist_ok=True tolerates a
# directory left by an earlier run, while real failures (e.g. permission
# denied) surface here instead of later, when the first checkpoint is written.
os.makedirs('roberta_year_prediction', exist_ok=True)
def pickle_model_save(name):
    """Pickle the module-level ``model`` to ``roberta_year_prediction/<name>``.

    The whole model object is pickled, overwriting any existing file of the
    same name.
    """
    target = f'roberta_year_prediction/{name}'
    with open(target, 'wb') as sink:
        pickle.dump(model, sink)
if TEST:
    # When the TEST flag is set, evaluate far more often than the config default.
    STEPS_EVAL = 10
    WARMUP_STEPS = 10

# Load the prepared datasets from their pickle files.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('train_dataset.pickle', 'rb') as f_p:
    train_dataset = pickle.load(f_p)
with open('eval_dataset_small.pickle', 'rb') as f_p:
    eval_dataset_small = pickle.load(f_p)
with open('eval_dataset_full.pickle', 'rb') as f_p:
    eval_dataset_full = pickle.load(f_p)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
eval_dataloader_small = DataLoader(eval_dataset_small, batch_size=BATCH_SIZE)
eval_dataloader_full = DataLoader(eval_dataset_full, batch_size=BATCH_SIZE)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = RobertaModel.from_pretrained('roberta-base')
#model = RobertaModel(model.config)
# The head is stored under the attribute name `regressor_head`, which the
# evaluation and training code uses to call it.
# Move it to the selected device rather than a hard-coded 'cuda', so the
# script also starts on machines without a GPU.
# NOTE(review): 768 is presumably the encoder's hidden size — confirm against
# YearClassificationHead.
model.regressor_head = YearClassificationHead(768, MIN_YEAR, MAX_YEAR).to(device)
model.to(device)

optimizer = Adam(model.parameters(), lr=LR)
num_training_steps = NUM_EPOCHS * len(train_dataloader)
# Learning-rate scheduling is currently disabled:
#lr_scheduler = get_scheduler(
#    "linear",
#    optimizer=optimizer,
#    num_warmup_steps=WARMUP_STEPS,
#    num_training_steps=num_training_steps
#)
progress_bar = tqdm(range(num_training_steps))
model.train()
def transform_batch(batch):
    """Reduce a DataLoader batch to model inputs and extract the labels.

    ``batch`` is modified in place: ``input_ids`` and ``attention_mask`` are
    each stacked into one tensor, have their two axes swapped, and are moved to
    the module-level ``device``; every other key is deleted.

    Args:
        batch: Mapping with at least ``'input_ids'``, ``'attention_mask'``
            and ``'year'``.

    Returns:
        ``(batch, labels)``, where ``labels`` is ``batch['year']`` moved to
        ``device``.
    """
    # presumably the collated columns arrive as a list of per-position tensors,
    # so stack + permute yields (batch, sequence) — TODO confirm
    batch['input_ids'] = torch.stack(batch['input_ids']).permute(1, 0).to(device)
    batch['attention_mask'] = torch.stack(batch['attention_mask']).permute(1, 0).to(device)
    labels = batch['year'].to(device)
    # Tensor.to() returns the moved tensor instead of working in place, so the
    # former bare `.to(device)` statements did nothing and were removed.
    for key in set(batch.keys()) - {'input_ids', 'attention_mask'}:
        del batch[key]
    return batch, labels
def eval(full = False):
    """Compute the mean per-example loss on a dev set and print it.

    The model is put in evaluation mode for the pass, gradients are disabled,
    and the model is switched back to training mode before returning.

    NOTE: this function shadows the built-in ``eval``; the name is kept because
    the training loop calls it.

    Args:
        full: Use ``eval_dataloader_full`` when true, otherwise
            ``eval_dataloader_small``.

    Returns:
        The summed loss divided by the number of evaluated examples.
    """
    model.eval()
    dataloader = eval_dataloader_full if full else eval_dataloader_small
    eval_loss = 0.0
    items_passed = 0
    with torch.no_grad():
        for batch in dataloader:
            batch, labels = transform_batch(batch)
            # Count examples via the labels: len() of the batch dict would be
            # the number of columns, not the batch size.
            items_passed += len(labels)
            outputs = model(**batch)[0]
            outputs = model.regressor_head(outputs)
            # criterion sums over the batch (reduction='sum'), hence the
            # division by the example count below.
            loss = criterion(outputs.squeeze(), labels)
            eval_loss += loss.item()
    eval_loss = (eval_loss / items_passed)
    print(f'eval loss full={full}: {eval_loss:.5f}', end = '\n')
    model.train()
    return eval_loss
#criterion = torch.nn.MSELoss(reduction='sum').to(device)
# Losses are summed per batch and divided by the example count when reported.
criterion = torch.nn.CrossEntropyLoss(reduction='sum').to(device)

best_eval_loss = 9999
epochs_without_progress = 0

for epoch in range(NUM_EPOCHS):
    train_loss = 0.0
    items_passed = 0
    for i, batch in enumerate(train_dataloader):
        batch, labels = transform_batch(batch)
        # Count examples via the labels: len() of the batch dict would be the
        # number of columns, not the batch size.
        items_passed += len(labels)
        outputs = model(**batch)[0]
        outputs = model.regressor_head(outputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        train_loss += loss.item()
        progress_bar.update(1)
        optimizer.step()
        #lr_scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()
        if i % STEPS_EVAL == 0 and i > 1 :
            # Report the running mean train loss since the last report, then
            # reset the accumulators and evaluate on the small dev sample.
            print(f' epoch {epoch} train loss: {(train_loss / items_passed):.5f}', end = '\t')
            items_passed = 0
            train_loss = 0.0
            eval(full = False)

    # End of epoch: full evaluation and checkpointing.
    eval_loss = eval(full=True)
    pickle_model_save(f'epoch_{epoch}')
    pickle_model_save(f'epoch_last')
    if eval_loss < best_eval_loss:
        pickle_model_save(f'epoch_best')
        print('\nsaving best model')
        best_eval_loss = eval_loss
    else:
        # NOTE(review): this counter is never reset after an improvement, so it
        # counts all non-improving epochs, not consecutive ones — confirm intent.
        epochs_without_progress += 1
    print(f'epochs_witohut_progress: {epochs_without_progress}')
    if epochs_without_progress > EARLY_STOPPING:
        print('early stopping')
        break
    print(f'best_eval_loss: {best_eval_loss:5f}', end = '\n')

View File

@ -0,0 +1,70 @@
import pickle
import torch
from transformers import AutoTokenizer, RobertaModel, RobertaTokenizer
from regressor_head import RegressorHead
from classification_head import YearClassificationHead
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from config import *
# Load the prepared evaluation and test datasets.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('eval_dataset_full.pickle', 'rb') as f_p:
    eval_dataset_full = pickle.load(f_p)
with open('test_dataset_A.pickle', 'rb') as f_p:
    test_dataset = pickle.load(f_p)

# Prefer the GPU but fall back to the CPU, as the training script does.
# NOTE(review): a checkpoint pickled from a CUDA model may still fail to
# unpickle on a CPU-only machine — confirm before relying on the fallback.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the best checkpoint and switch it to evaluation mode.
with open('./roberta_year_prediction/epoch_best', 'rb') as f:
    model = pickle.load(f)
model.eval()
model.to(device)
# LeakyReLU with a negative slope of 0.0: negative inputs are multiplied by zero.
lrelu = torch.nn.LeakyReLU(0.0)


def hard_clip(t):
    """Clip every element of ``t`` into the closed interval [0, 1].

    Built from two applications of ``lrelu``: the first floors values at 0,
    the second, applied to ``1 - value``, caps them at 1.
    """
    floored = lrelu(t)
    return -lrelu(-floored + 1) + 1
# Load the scalers saved by the dataset-preparation step.
with open('scalers.pickle', 'rb') as scaler_file:
    scalers = pickle.load(scaler_file)
def transform_batch(batch):
    """Reduce a DataLoader batch to model inputs and extract the labels.

    ``batch`` is modified in place: ``input_ids`` and ``attention_mask`` are
    each stacked into one tensor, have their two axes swapped, and are moved to
    the module-level ``device``; every other key is deleted.

    Args:
        batch: Mapping with at least ``'input_ids'``, ``'attention_mask'``
            and ``'year'``.

    Returns:
        ``(batch, labels)``, where ``labels`` is ``batch['year']`` moved to
        ``device``.
    """
    # presumably the collated columns arrive as a list of per-position tensors,
    # so stack + permute yields (batch, sequence) — TODO confirm
    batch['input_ids'] = torch.stack(batch['input_ids']).permute(1, 0).to(device)
    batch['attention_mask'] = torch.stack(batch['attention_mask']).permute(1, 0).to(device)
    labels = batch['year'].to(device)
    # Tensor.to() returns the moved tensor instead of working in place, so the
    # former bare `.to(device)` statements did nothing and were removed.
    for key in set(batch.keys()) - {'input_ids', 'attention_mask'}:
        del batch[key]
    return batch, labels
def predict(dataset, out_f):
    """Predict a year for every example in ``dataset`` and write one per line.

    Each batch goes through the module-level ``model`` and its
    ``regressor_head``. The index of the highest score along dimension 1 is
    taken as the predicted class, and ``MIN_YEAR`` is added to turn it back
    into a calendar year.

    Args:
        dataset: Dataset whose batches provide ``input_ids``,
            ``attention_mask`` and ``year``.
        out_f: Path of the output file; it is overwritten.
    """
    eval_dataloader = DataLoader(dataset, batch_size=20)
    outputs = []
    # Inference only: disable autograd so no graph state is kept per batch.
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            batch, _ = transform_batch(batch)
            o = model(**batch)[0]
            o = model.regressor_head(o)
            outputs.extend(torch.argmax(o, 1).tolist())
    with open(out_f, 'w') as f_out:
        f_out.writelines(f'{class_index + MIN_YEAR}\n' for class_index in outputs)
# Write predictions for the dev set and the test set to their out.tsv files.
predict(eval_dataset_full, '../dev-0/out.tsv')
predict(test_dataset, '../test-A/out.tsv')

View File

@ -0,0 +1,11 @@
# Checkpoint name/path handed to AutoTokenizer.from_pretrained.
#MODEL = '../MODELS/without_date/checkpoint-395000'
MODEL = 'roberta-base'
# Batch size of the training and evaluation DataLoaders in the training script.
BATCH_SIZE = 90
# Training stops once the count of non-improving epochs exceeds this value.
EARLY_STOPPING = 3
# Only referenced by the commented-out learning-rate scheduler.
WARMUP_STEPS = 5_000
# Learning rate passed to the Adam optimizer.
LR=1e-5
# Upper bound on the number of training epochs.
NUM_EPOCHS = 20
# Evaluate on the small dev sample every this many training steps.
STEPS_EVAL = 500
# When True, datasets are cut to small prefixes and STEPS_EVAL/WARMUP_STEPS are set to 10.
TEST=False
# Subtracted from each year to form the class label; added back to predictions.
MIN_YEAR=1996
# Passed together with MIN_YEAR to YearClassificationHead.
MAX_YEAR=2019

File diff suppressed because it is too large Load Diff