import random

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers.file_utils import is_tf_available, is_torch_available


class Dataset(torch.utils.data.Dataset):
    """Wraps tokenizer encodings and labels for use with the Trainer API."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)


def set_seed(seed: int):
    """Seed the Python, NumPy, PyTorch and TensorFlow RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    if is_tf_available():
        import tensorflow as tf
        tf.random.set_seed(seed)


def compute_metrics(pred):
    """Compute accuracy from the Trainer's EvalPrediction object."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
    }


def get_prediction(text):
    """Tokenize a single text and return the index of the most probable class."""
    inputs = tokenizer(text, padding=True, truncation=True,
                       max_length=max_length, return_tensors="pt").to("cuda")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return probs.argmax()


set_seed(1)

train_texts = \
    pd.read_csv('train/in.tsv.xz', compression='xz', sep='\t', header=None,
                error_bad_lines=False, quoting=3)[0].tolist()
train_labels = pd.read_csv('train/expected.tsv', sep='\t', header=None, quoting=3)[0].tolist()

dev_texts = pd.read_csv('dev-0/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)[0].tolist()
dev_labels = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None, quoting=3)[0].tolist()

# test_texts = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)

model_name = "bert-base-uncased"
max_length = 512

tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(dev_texts, truncation=True, padding=True, max_length=max_length)

train_dataset = Dataset(train_encodings, train_labels)
valid_dataset = Dataset(valid_encodings, dev_labels)

model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=len(pd.unique(train_labels))).to("cuda")

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=1,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (the default metric is loss,
                                     # but you can set `metric_for_best_model` to accuracy or another metric)
    logging_steps=200,               # log & save weights every `logging_steps` steps
    evaluation_strategy="steps",     # evaluate every `logging_steps` steps
)

trainer = Trainer(
    model=model,                     # the instantiated Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    eval_dataset=valid_dataset,      # evaluation dataset
    compute_metrics=compute_metrics, # the callback that computes metrics of interest
)

trainer.train()
trainer.evaluate()

model_path = "bert-base-uncased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
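

# --- Optional: writing predictions for dev-0 / test-A ---
# A minimal sketch, not part of the original run. The commented-out `test_texts`
# line above suggests predictions are also wanted for the test set; this assumes
# the same Gonito-style layout (an `in.tsv.xz` with the text in column 0, and an
# `out.tsv` expected to hold one predicted label per line). The helper name
# `write_predictions` and the output paths are illustrative assumptions, not part
# of the original script; it reuses the `get_prediction` helper defined above.
def write_predictions(in_path, out_path):
    texts = pd.read_csv(in_path, compression='xz', sep='\t', header=None, quoting=3)[0].tolist()
    with open(out_path, 'w') as f:
        for text in texts:
            # get_prediction returns a 0-d tensor; .item() converts it to a plain int
            f.write(str(get_prediction(text).item()) + '\n')

write_predictions('dev-0/in.tsv.xz', 'dev-0/out.tsv')
write_predictions('test-A/in.tsv.xz', 'test-A/out.tsv')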