import random

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from transformers import (
    BertForSequenceClassification,
    BertTokenizerFast,
    Trainer,
    TrainingArguments,
    is_tf_available,
    is_torch_available,
)


class Dataset(torch.utils.data.Dataset):
    """Wraps tokenizer encodings and labels for the Trainer."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # Labels must be scalar tensors for sequence classification,
        # not shape-[1] tensors.
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def set_seed(seed: int):
    """Seed every available framework for reproducible runs."""
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}


def get_prediction(text):
    """Classify a single text with the fine-tuned model."""
    inputs = tokenizer(text, padding=True, truncation=True,
                       max_length=max_length, return_tensors="pt").to(device)
    outputs = model(**inputs)
    return outputs.logits.softmax(dim=1).argmax().item()


set_seed(1)

device = "cuda" if torch.cuda.is_available() else "cpu"

SAMPLES = 2000
# on_bad_lines="skip" replaces the deprecated error_bad_lines=False
# (pandas >= 1.3); quoting=3 (csv.QUOTE_NONE) keeps raw quote characters.
train_texts = pd.read_csv("train/in.tsv.xz", compression="xz", sep="\t",
                          header=None, on_bad_lines="skip",
                          quoting=3)[0][:SAMPLES].tolist()
train_labels = pd.read_csv("train/expected.tsv", sep="\t", header=None,
                           quoting=3)[0][:SAMPLES].tolist()
dev_texts = pd.read_csv("dev-0/in.tsv.xz", compression="xz", sep="\t",
                        header=None, quoting=3)[0].tolist()
dev_labels = pd.read_csv("dev-0/expected.tsv", sep="\t", header=None,
                         quoting=3)[0].tolist()

model_name = "bert-base-uncased"
max_length = 512

tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
train_encodings = tokenizer(train_texts, truncation=True, padding=True,
                            max_length=max_length)
valid_encodings = tokenizer(dev_texts, truncation=True, padding=True,
                            max_length=max_length)

train_dataset = Dataset(train_encodings, train_labels)
valid_dataset = Dataset(valid_encodings, dev_labels)

model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=len(pd.unique(train_labels))).to(device)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.005,
    logging_dir="./logs",
    logging_steps=250,
    evaluation_strategy="steps",
    eval_steps=250,
    # load_best_model_at_end requires matching eval and save strategies,
    # with save_steps a multiple of eval_steps.
    load_best_model_at_end=True,
    save_strategy="steps",
    save_steps=250,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

model_path = "bert-base-uncased-2k"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
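
# A minimal inference sketch: reload the saved checkpoint and score the dev
# set with get_prediction(). The output path "dev-0/out.tsv" is an assumed
# convention mirroring the one-label-per-line expected.tsv layout, not a
# requirement of the script above.
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model.eval()

with torch.no_grad():
    dev_preds = [get_prediction(text) for text in dev_texts]

print("dev accuracy:", accuracy_score(dev_labels, dev_preds))

# One predicted label per line, aligned with dev-0/in.tsv.xz.
pd.Series(dev_preds).to_csv("dev-0/out.tsv", sep="\t",
                            header=False, index=False)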