# Setup

## Requirements

In [None]:
# Install every dependency with a single pip invocation so the packages are
# resolved together. `%pip` (rather than `!pip`) installs into the environment
# of the running kernel.
%pip install -q torch datasets transformers scikit-learn evaluate accelerate sentencepiece protobuf sacrebleu py7zr


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Imports

In [None]:
import json
import os
from pathlib import Path
from typing import Dict, List, Optional

import torch
from datasets import load_dataset
from google.colab import drive
from transformers import T5Tokenizer

## Loading data

In [None]:
# Load the `emotion` dataset; its 'train', 'validation' and 'test' splits are
# read by the conversion cell that follows.
loaded_data = load_dataset('emotion')

# Create the output directory from Python (no dependency on a shell `mkdir`)
# and derive every split path from one place.
DATA_DIR = Path('data')
DATA_DIR.mkdir(parents=True, exist_ok=True)

train_path = DATA_DIR / 'train.json'
valid_path = DATA_DIR / 'valid.json'
test_path = DATA_DIR / 'test.json'

# Target lists for the converted records of each split.
data_train, data_valid, data_test = [], [], []



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Convert each split into a list of plain records holding an integer `label`
# and the original `text`. `max_size` optionally caps the number of rows taken
# from a split (None keeps every row).
for source_data, dataset, max_size in [
  (loaded_data['train'], data_train, None),
  (loaded_data['validation'], data_valid, None),
  (loaded_data['test'], data_test, None),
]:
  rows = []
  for i, data in enumerate(source_data):
    if max_size is not None and i >= max_size:
      break
    rows.append({
      'label': int(data['label']),
      'text': data['text'],
    })
  # Replace the list contents in place instead of appending, so running this
  # cell more than once does not duplicate rows in the target list.
  dataset[:] = rows

print(f'Train: {len(data_train):6d}')
print(f'Valid: {len(data_valid):6d}')
print(f'Test: {len(data_test):6d}')

Train:  16000
Valid:   2000
Test:   2000


In [None]:
# Integer class id -> emotion name; the position in the tuple is the class id.
MAP_LABEL_TRANSLATION = dict(enumerate((
    'sadness',
    'joy',
    'love',
    'anger',
    'fear',
    'surprise',
)))

In [None]:
def save_as_translations(
    original_save_path: Path,
    data_to_save: List[Dict],
    label_map: Optional[Dict[int, str]] = None,
) -> None:
    """Write ``data_to_save`` as JSON lines with each label replaced by its name.

    The output file is created in the same directory as ``original_save_path``
    and is named ``'s2s-'`` followed by the original file name. One JSON object
    is written per line.

    The records in ``data_to_save`` are not modified: each one is copied before
    its ``'label'`` value is replaced, so the caller's data keeps its original
    labels and the function can be called repeatedly on the same list.

    Args:
        original_save_path: Path of the original file; only its parent
            directory and file name are used to build the output path.
        data_to_save: Records to write. Each must contain a ``'label'`` key
            whose value is a key of the label map.
        label_map: Mapping from stored label to the label written to the file.
            Defaults to the module-level ``MAP_LABEL_TRANSLATION``.

    Raises:
        KeyError: If a record has no ``'label'`` key or its label is missing
            from the label map.
    """
    mapping = MAP_LABEL_TRANSLATION if label_map is None else label_map
    file_name = 's2s-' + original_save_path.name
    file_path = original_save_path.parent / file_name

    print(f'Saving into: {file_path}')
    with open(file_path, 'wt', encoding='utf-8') as f_write:
        for data_line in data_to_save:
            # Build a new dict (key order preserved) rather than overwriting
            # the label on the caller's record.
            translated = {**data_line, 'label': mapping[data_line['label']]}
            f_write.write(f'{json.dumps(translated)}\n')

In [None]:
# Write every split twice: first the records as they are (one JSON object per
# line), then again through save_as_translations(). The plain file is written
# before save_as_translations() is called.
splits_to_save = (
  (train_path, data_train),
  (valid_path, data_valid),
  (test_path, data_test),
)
for file_path, data_to_save in splits_to_save:
  print(f'Saving into: {file_path}')
  with open(file_path, 'wt') as f_write:
    f_write.writelines(f'{json.dumps(record)}\n' for record in data_to_save)

  save_as_translations(file_path, data_to_save)

Saving into: data/train.json
Saving into: data/s2s-train.json
Saving into: data/valid.json
Saving into: data/s2s-valid.json
Saving into: data/test.json
Saving into: data/s2s-test.json


In [None]:
# Preview the first 10 lines of the training file.
!head -n 10 data/train.json

{"label": 0, "text": "i didnt feel humiliated"}
{"label": 0, "text": "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake"}
{"label": 3, "text": "im grabbing a minute to post i feel greedy wrong"}
{"label": 2, "text": "i am ever feeling nostalgic about the fireplace i will know that it is still on the property"}
{"label": 3, "text": "i am feeling grouchy"}
{"label": 0, "text": "ive been feeling a little burdened lately wasnt sure why that was"}
{"label": 5, "text": "ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny"}
{"label": 4, "text": "i feel as confused about life as a teenager or as jaded as a year old man"}
{"label": 1, "text": "i have been with petronas for years i feel that petronas has performed well and made a huge profit"}
{"label": 2, "text": "i feel romantic too"}


In [None]:
# Preview the first 10 lines of the training file written by save_as_translations().
!head -n 10 data/s2s-train.json

{"label": "sadness", "text": "i didnt feel humiliated"}
{"label": "sadness", "text": "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake"}
{"label": "anger", "text": "im grabbing a minute to post i feel greedy wrong"}
{"label": "love", "text": "i am ever feeling nostalgic about the fireplace i will know that it is still on the property"}
{"label": "anger", "text": "i am feeling grouchy"}
{"label": "sadness", "text": "ive been feeling a little burdened lately wasnt sure why that was"}
{"label": "surprise", "text": "ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny"}
{"label": "fear", "text": "i feel as confused about life as a teenager or as jaded as a year old man"}
{"label": "joy", "text": "i have been with petronas for years i feel that petronas has performed well and made a huge profit"}
{"label": "love", "text": "i feel romantic too"}


In [None]:
# Create tiny datasets for debugging purposes: the first 250 and the last 250
# records of each split, written to data/<split>-500.json.
for file_name in ["train", "valid", "test"]:
  print(f"=== {file_name} ===")
  # splitlines() does not produce the trailing empty element that split('\n')
  # yields for a newline-terminated file; that empty element would occupy one
  # of the 250 tail slots and leave only 499 records in the sample.
  all_text = Path(f"data/{file_name}.json").read_text().splitlines()
  if len(all_text) <= 500:
    # Too short for distinct head and tail slices: keep every record once
    # instead of writing overlapping rows twice.
    text = all_text
  else:
    text = all_text[:250] + all_text[-250:]
  # Terminate every record with a newline, matching the full-size files.
  Path(f"data/{file_name}-500.json").write_text("".join(f"{line}\n" for line in text))

=== train ===
=== valid ===
=== test ===


In [None]:
# Count the lines of every file in data/ as a sanity check on the written files.
!wc -l data/*

   2000 data/s2s-test.json
  16000 data/s2s-train.json
   2000 data/s2s-valid.json
    499 data/test-500.json
   2000 data/test.json
    499 data/train-500.json
  16000 data/train.json
    499 data/valid-500.json
   2000 data/valid.json
  41497 total


# GPU Info

In [None]:
# Print the nvidia-smi report for this runtime.
!nvidia-smi

Sun Feb 12 23:30:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P0    26W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Export TOKENIZERS_PARALLELISM=true in the kernel's environment before the
# training cell is run.
os.environ.update({'TOKENIZERS_PARALLELISM': 'true'})

# Run

In [None]:
# Fetch run_glue.py (the script launched by the training cell) from the master
# branch of the project repository into the working directory.
!wget -O 'run_glue.py' 'https://git.wmi.amu.edu.pl/s444465/projekt-glebokie/raw/branch/master/run_glue.py'

--2023-02-12 23:30:18--  https://git.wmi.amu.edu.pl/s444465/projekt-glebokie/raw/branch/master/run_glue.py
Resolving git.wmi.amu.edu.pl (git.wmi.amu.edu.pl)... 150.254.78.40
Connecting to git.wmi.amu.edu.pl (git.wmi.amu.edu.pl)|150.254.78.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30601 (30K) [text/plain]
Saving to: ‘run_glue.py’


2023-02-12 23:30:18 (982 KB/s) - ‘run_glue.py’ saved [30601/30601]



In [None]:
# Fetch roberta.py from the master branch of the project repository into the
# working directory (presumably imported by run_glue.py; not visible here).
!wget -O 'roberta.py' 'https://git.wmi.amu.edu.pl/s444465/projekt-glebokie/raw/branch/master/roberta.py'

--2023-02-12 23:30:18--  https://git.wmi.amu.edu.pl/s444465/projekt-glebokie/raw/branch/master/roberta.py
Resolving git.wmi.amu.edu.pl (git.wmi.amu.edu.pl)... 150.254.78.40
Connecting to git.wmi.amu.edu.pl (git.wmi.amu.edu.pl)|150.254.78.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12783 (12K) [text/plain]
Saving to: ‘roberta.py’


2023-02-12 23:30:18 (263 MB/s) - ‘roberta.py’ saved [12783/12783]



In [None]:
# Fetch gpt2.py from the master branch of the project repository into the
# working directory (presumably imported by run_glue.py; not visible here).
!wget -O 'gpt2.py' 'https://git.wmi.amu.edu.pl/s444465/projekt-glebokie/raw/branch/master/gpt2.py'

--2023-02-12 23:30:18--  https://git.wmi.amu.edu.pl/s444465/projekt-glebokie/raw/branch/master/gpt2.py
Resolving git.wmi.amu.edu.pl (git.wmi.amu.edu.pl)... 150.254.78.40
Connecting to git.wmi.amu.edu.pl (git.wmi.amu.edu.pl)|150.254.78.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8017 (7.8K) [text/plain]
Saving to: ‘gpt2.py’


2023-02-12 23:30:19 (1.42 GB/s) - ‘gpt2.py’ saved [8017/8017]



In [None]:
# Ask PyTorch to release the cached GPU memory held by this kernel process.
# NOTE(review): the training cell launches a separate `python` process, so this
# only matters if the kernel itself has allocated GPU memory; confirm it is needed.
torch.cuda.empty_cache()

In [None]:
# Run the downloaded run_glue.py with training, evaluation and prediction all
# enabled (--do_train / --do_eval / --do_predict) on the numeric-label files in
# data/. It starts from the `gpt2` checkpoint, trains for 1 epoch with batch
# size 8 per device, and writes to out/emotion/gpt2, overwriting earlier output.
# NOTE(review): --custom_model gpt2_hidden is interpreted inside run_glue.py,
# which is not visible here; confirm what it selects.
! python run_glue.py \
  --cache_dir .cache_training \
  --model_name_or_path gpt2 \
  --custom_model gpt2_hidden \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --do_train \
  --do_eval \
  --do_predict \
  --max_seq_length 128 \
  --num_train_epochs 1 \
  --metric_for_best_model accuracy \
  --greater_is_better True \
  --overwrite_output_dir \
  --output_dir out/emotion/gpt2

2023-02-12 23:30:29.286531: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-02-12 23:30:29.287316: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=Tr

# Save model

In [None]:
# Mount Google Drive and copy the training outputs to MyDrive/models/emotion.
drive.mount('/content/drive')
# Create the destination first: if `models` did not exist, `cp -r` would copy
# the `emotion` directory *as* `models` instead of creating `models/emotion`.
!mkdir -p /content/drive/MyDrive/models
!cp -r /content/out/emotion /content/drive/MyDrive/models

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
