From 82f05fb08858a00635139f91b9451206fd6ec189 Mon Sep 17 00:00:00 2001
From: jakubknczny
Date: Sun, 20 Jun 2021 15:19:39 +0200
Subject: [PATCH] add bert

---
 .gitignore |   4 +++
 bert.py    | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 bert.py

diff --git a/.gitignore b/.gitignore
index 1c18d74..9056fbf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,7 @@
 *.o
 .DS_Store
 .token
+mlruns
+results
+logs
+.idea
\ No newline at end of file

diff --git a/bert.py b/bert.py
new file mode 100644
index 0000000..9d8f296
--- /dev/null
+++ b/bert.py
@@ -0,0 +1,104 @@
+import torch
+from transformers.file_utils import is_tf_available, is_torch_available
+from transformers import BertTokenizerFast, BertForSequenceClassification
+from transformers import Trainer, TrainingArguments
+import numpy as np
+import random
+from sklearn.metrics import accuracy_score
+import pandas as pd
+
+
+class Dataset(torch.utils.data.Dataset):
+    def __init__(self, encodings, labels):
+        self.encodings = encodings
+        self.labels = labels
+
+    def __getitem__(self, idx):
+        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
+        item["labels"] = torch.tensor(self.labels[idx])
+        return item
+
+    def __len__(self):
+        return len(self.labels)
+
+
+def set_seed(seed: int):  # seed every RNG in play so runs are reproducible
+    random.seed(seed)
+    np.random.seed(seed)
+    if is_torch_available():
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+    if is_tf_available():
+        import tensorflow as tf
+
+        tf.random.set_seed(seed)
+
+
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = pred.predictions.argmax(-1)  # highest-scoring class per example
+    acc = accuracy_score(labels, preds)
+    return {
+        'accuracy': acc,
+    }
+
+
+def get_prediction(text):
+    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
+    outputs = model(**inputs)
+    probs = outputs[0].softmax(1)  # logits -> class probabilities
+    return probs.argmax()
+
+
+set_seed(1)
+
+train_texts = \
+    pd.read_csv('train/in.tsv.xz', compression='xz', sep='\t', header=None, error_bad_lines=False, quoting=3)[0].tolist()
+train_labels = pd.read_csv('train/expected.tsv', sep='\t', header=None, quoting=3)[0].tolist()
+dev_texts = pd.read_csv('dev-0/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)[0].tolist()
+dev_labels = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None, quoting=3)[0].tolist()
+# test_texts = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
+
+model_name = "bert-base-uncased"
+max_length = 512  # BERT's maximum input length
+tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
+
+train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
+valid_encodings = tokenizer(dev_texts, truncation=True, padding=True, max_length=max_length)
+
+train_dataset = Dataset(train_encodings, train_labels)
+valid_dataset = Dataset(valid_encodings, dev_labels)
+
+model = BertForSequenceClassification.from_pretrained(
+    model_name, num_labels=len(pd.unique(train_labels))).to("cuda")
+
+training_args = TrainingArguments(
+    output_dir='./results',          # output directory
+    num_train_epochs=3,              # total number of training epochs
+    per_device_train_batch_size=1,   # batch size per device during training
+    per_device_eval_batch_size=1,    # batch size for evaluation
+    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
+    weight_decay=0.01,               # strength of weight decay
+    logging_dir='./logs',            # directory for storing logs
+    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
+    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
+    logging_steps=200,               # log & evaluate every `logging_steps` steps
+    save_steps=200,                  # save on the same schedule so best-model tracking sees each eval
+    evaluation_strategy="steps",     # evaluate each `logging_steps`
+)
+
+trainer = Trainer(
+    model=model,                         # the instantiated Transformers model to be trained
+    args=training_args,                  # training arguments, defined above
+    train_dataset=train_dataset,         # training dataset
+    eval_dataset=valid_dataset,          # evaluation dataset
+    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
+)
+
+trainer.train()
+
+trainer.evaluate()
+
+model_path = "bert-base-uncased"  # fine-tuned weights go to a local directory of this name
+model.save_pretrained(model_path)
+tokenizer.save_pretrained(model_path)
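
Follow-up sketch (not part of the patch above): bert.py saves the fine-tuned model but never calls get_prediction(), and the test-A read stays commented out. Reloading the checkpoint and writing test predictions could look roughly like this; the out.tsv path and the one-label-per-line output format are assumptions, not something the patch specifies:

    import pandas as pd
    import torch
    from transformers import BertTokenizerFast, BertForSequenceClassification

    model_path = "bert-base-uncased"  # local directory written by save_pretrained() in bert.py
    tokenizer = BertTokenizerFast.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path).to("cuda")
    model.eval()  # disable dropout for deterministic predictions

    test_texts = pd.read_csv('test-A/in.tsv.xz', compression='xz', sep='\t',
                             header=None, quoting=3)[0].tolist()

    preds = []
    with torch.no_grad():  # no gradients needed at inference time
        for text in test_texts:
            inputs = tokenizer(text, truncation=True, max_length=512,
                               return_tensors="pt").to("cuda")
            logits = model(**inputs).logits
            preds.append(int(logits.argmax(dim=-1)))  # index of the highest-scoring class

    pd.Series(preds).to_csv('test-A/out.tsv', sep='\t', header=False, index=False)

Tokenizing the whole file in one batch (as bert.py does for train/dev) would be faster; the per-example loop just keeps memory use flat for long inputs.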