diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb
new file mode 100644
index 0000000..7fec515
--- /dev/null
+++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/.ipynb_checkpoints/Untitled1-checkpoint.ipynb b/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
new file mode 100644
index 0000000..7fec515
--- /dev/null
+++ b/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/bert.py b/bert.py
new file mode 100644
index 0000000..dfa6b83
--- /dev/null
+++ b/bert.py
@@ -0,0 +1,112 @@
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+import random
+import torch
+
+PATHS = ['train/in.tsv', 'train/expected.tsv', 'dev-0/in.tsv', 'test-A/in.tsv', './dev-0/out.tsv', './test-A/out.tsv']
+PRE_TRAINED = 'roberta-base'
+
+
+def get_data(path):
+    """Read a UTF-8 text file and return its lines without line endings.
+
+    Line terminators are stripped so they reach neither the tokenizer (as part
+    of the text) nor the label parser.
+    """
+    with open(path, encoding='utf-8') as f:
+        return [line.rstrip('\n') for line in f]
+
+
+class IMDbDataset(torch.utils.data.Dataset):
+    """Map-style dataset pairing tokenizer encodings with integer labels."""
+
+    def __init__(self, encodings, labels):
+        # encodings: mapping of feature name to per-example values, as returned
+        # by calling the tokenizer on a list of texts.
+        self.encodings = encodings
+        self.labels = labels
+
+    def __getitem__(self, idx):
+        """Return the features of example ``idx`` as tensors, plus 'labels'."""
+        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+        item['labels'] = torch.tensor(self.labels[idx])
+        return item
+
+    def __len__(self):
+        return len(self.labels)
+
+
+def prepare(data_train_X, data_train_Y):
+    """Tokenize the training texts and load the pre-trained classifier.
+
+    Args:
+        data_train_X: input texts, one per example.
+        data_train_Y: labels parseable by ``int()``, aligned with data_train_X.
+
+    Returns:
+        Tuple ``(train_dataset, model)``.
+
+    Raises:
+        ValueError: if there are no examples or the two inputs differ in length.
+    """
+    texts = list(data_train_X)
+    raw_labels = list(data_train_Y)
+    # Check before int() parsing and before the (expensive) model load: zip()
+    # would silently drop the unmatched tail of the longer input.
+    if len(texts) != len(raw_labels):
+        raise ValueError(f'got {len(texts)} texts but {len(raw_labels)} labels')
+    if not texts:
+        raise ValueError('training data is empty')
+    labels = [int(label) for label in raw_labels]
+
+    tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED)
+    model = AutoModelForSequenceClassification.from_pretrained(PRE_TRAINED, num_labels=2)
+    encoded_input = tokenizer(texts, truncation=True, padding=True)
+    train_dataset = IMDbDataset(encoded_input, labels)
+
+    return train_dataset, model
+
+
+def trainer(train_dataset, model):
+    """Fine-tune ``model`` on ``train_dataset`` and return the fitted Trainer."""
+    training_args = TrainingArguments(
+        output_dir='./results',          # output directory
+        num_train_epochs=3,              # total number of training epochs
+        per_device_train_batch_size=16,  # batch size per device during training
+        per_device_eval_batch_size=64,   # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+        logging_steps=10,
+    )
+
+    # Named hf_trainer so the local does not shadow this function's own name.
+    hf_trainer = Trainer(
+        model=model,                     # the instantiated Transformers model to be trained
+        args=training_args,              # training arguments, defined above
+        train_dataset=train_dataset,     # training dataset
+    )
+
+    hf_trainer.train()
+    return hf_trainer
+
+
+def main():
+    """Load the training split, fine-tune the classifier and save it."""
+    # data
+    X_train = get_data(PATHS[0])
+    y_train = get_data(PATHS[1])
+    # NOTE(review): the dev-0/test-A inputs and out.tsv paths in PATHS are not
+    # read here any more -- no inference step exists yet to consume them.
+
+    # prepare
+    train_dataset, model = prepare(X_train, y_train)
+
+    # trainer
+    fitted = trainer(train_dataset, model)
+    # Save the final weights to output_dir so the trained model is not lost
+    # when the process exits.
+    fitted.save_model()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/logs/1624271570.744457/events.out.tfevents.1624271570.POZ-PIOTRU.20712.1 b/logs/1624271570.744457/events.out.tfevents.1624271570.POZ-PIOTRU.20712.1
new file mode 100644
index 0000000..602b05f
Binary files /dev/null and b/logs/1624271570.744457/events.out.tfevents.1624271570.POZ-PIOTRU.20712.1 differ
diff --git a/logs/1624272912.687533/events.out.tfevents.1624272912.POZ-PIOTRU.4624.1 b/logs/1624272912.687533/events.out.tfevents.1624272912.POZ-PIOTRU.4624.1
new file mode 100644
index 0000000..c4e7584
Binary files /dev/null and b/logs/1624272912.687533/events.out.tfevents.1624272912.POZ-PIOTRU.4624.1 differ
diff --git a/logs/1624273024.1357148/events.out.tfevents.1624273024.POZ-PIOTRU.19528.1 b/logs/1624273024.1357148/events.out.tfevents.1624273024.POZ-PIOTRU.19528.1
new file mode 100644
index 0000000..34a8d8e
Binary files /dev/null and b/logs/1624273024.1357148/events.out.tfevents.1624273024.POZ-PIOTRU.19528.1 differ
diff --git a/logs/events.out.tfevents.1624271570.POZ-PIOTRU.20712.0 b/logs/events.out.tfevents.1624271570.POZ-PIOTRU.20712.0
new file mode 100644
index 0000000..c561e3a
Binary files /dev/null and b/logs/events.out.tfevents.1624271570.POZ-PIOTRU.20712.0 differ
diff --git a/logs/events.out.tfevents.1624272912.POZ-PIOTRU.4624.0 b/logs/events.out.tfevents.1624272912.POZ-PIOTRU.4624.0
new file mode 100644
index 0000000..e8c8560
Binary files /dev/null and b/logs/events.out.tfevents.1624272912.POZ-PIOTRU.4624.0 differ
diff --git a/logs/events.out.tfevents.1624273023.POZ-PIOTRU.19528.0 b/logs/events.out.tfevents.1624273023.POZ-PIOTRU.19528.0
new file mode 100644
index 0000000..0370bff
Binary files /dev/null and b/logs/events.out.tfevents.1624273023.POZ-PIOTRU.19528.0 differ
diff --git a/runs/Jun21_11-25-51_POZ-PIOTRU/1624267551.4343011/events.out.tfevents.1624267551.POZ-PIOTRU.14680.1 b/runs/Jun21_11-25-51_POZ-PIOTRU/1624267551.4343011/events.out.tfevents.1624267551.POZ-PIOTRU.14680.1
new file mode 100644
index 0000000..5a21884
Binary files /dev/null and b/runs/Jun21_11-25-51_POZ-PIOTRU/1624267551.4343011/events.out.tfevents.1624267551.POZ-PIOTRU.14680.1 differ
diff --git a/runs/Jun21_11-25-51_POZ-PIOTRU/events.out.tfevents.1624267551.POZ-PIOTRU.14680.0 b/runs/Jun21_11-25-51_POZ-PIOTRU/events.out.tfevents.1624267551.POZ-PIOTRU.14680.0
new file mode 100644
index 0000000..c226bfe
Binary files /dev/null and b/runs/Jun21_11-25-51_POZ-PIOTRU/events.out.tfevents.1624267551.POZ-PIOTRU.14680.0 differ
diff --git a/runs/Jun21_11-54-36_POZ-PIOTRU/1624269277.2385628/events.out.tfevents.1624269277.POZ-PIOTRU.20144.1 b/runs/Jun21_11-54-36_POZ-PIOTRU/1624269277.2385628/events.out.tfevents.1624269277.POZ-PIOTRU.20144.1
new file mode 100644
index 0000000..cd970ad
Binary files /dev/null and b/runs/Jun21_11-54-36_POZ-PIOTRU/1624269277.2385628/events.out.tfevents.1624269277.POZ-PIOTRU.20144.1 differ
diff --git a/runs/Jun21_11-54-36_POZ-PIOTRU/events.out.tfevents.1624269277.POZ-PIOTRU.20144.0 b/runs/Jun21_11-54-36_POZ-PIOTRU/events.out.tfevents.1624269277.POZ-PIOTRU.20144.0
new file mode 100644
index 0000000..fe432a6
Binary files /dev/null and b/runs/Jun21_11-54-36_POZ-PIOTRU/events.out.tfevents.1624269277.POZ-PIOTRU.20144.0 differ