{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"A100","machine_shape":"hm"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"code","source":["import locale\n","\n","# Force every caller that asks for the preferred encoding to get UTF-8.\n","# The replacement keeps the optional `do_setlocale` parameter of the real function,\n","# so code calling locale.getpreferredencoding(False) does not raise TypeError.\n","locale.getpreferredencoding = lambda do_setlocale=True: \"UTF-8\""],"metadata":{"id":"0cKJSrCDIC5c","executionInfo":{"status":"ok","timestamp":1687337064661,"user_tz":-120,"elapsed":5,"user":{"displayName":"Michał Ulaniuk","userId":"07769450445479269606"}}},"execution_count":1,"outputs":[]},{"cell_type":"code","source":["# Versions reported by the last saved run: transformers 4.30.2, torch 2.0.1+cu118, accelerate 0.20.3.\n","# %pip (instead of !pip) installs into the environment of the running kernel.\n","%pip install transformers torch accelerate"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"TVWZUBhyPfpa","outputId":"21e76e7e-6d88-41f1-8367-ec2f5862bfd0","executionInfo":{"status":"ok","timestamp":1687337068501,"user_tz":-120,"elapsed":3844,"user":{"displayName":"Michał Ulaniuk","userId":"07769450445479269606"}}},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.30.2)\n","Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n","Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (0.20.3)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n","Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.1)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) 
(6.0)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n","Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.13.3)\n","Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.3.1)\n","Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n","Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n","Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n","Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n","Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n","Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n","Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.5)\n","Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n","Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.4.0)\n","Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n","Requirement already satisfied: certifi>=2017.4.17 in 
/usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n","Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n"]}]},{"cell_type":"code","source":["import pandas as pd"],"metadata":{"id":"2NPC0SFrzVQS","executionInfo":{"status":"ok","timestamp":1687337068860,"user_tz":-120,"elapsed":365,"user":{"displayName":"Michał Ulaniuk","userId":"07769450445479269606"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","execution_count":4,"metadata":{"id":"LdRQU2xnOrst","executionInfo":{"status":"ok","timestamp":1687337077900,"user_tz":-120,"elapsed":9042,"user":{"displayName":"Michał Ulaniuk","userId":"07769450445479269606"}}},"outputs":[],"source":["from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed\n","\n","# Hub checkpoint used for both the model and its tokenizer.\n","# Alternative checkpoint: 'sdadas/polish-gpt2-medium'.\n","MODEL_CHECKPOINT = 'flax-community/papuGaPT2'\n","\n","model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)\n","tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)\n","\n","# Use the end-of-sequence token as the padding token.\n","tokenizer.pad_token = tokenizer.eos_token"]},{"cell_type":"markdown","source":["# Wczytanie danych do finetuningu\n","Dane stworzyliśmy ręcznie oraz za pomocą ChatGPT."],"metadata":{"id":"IY2e11OjS54T"}},{"cell_type":"code","source":["from google.colab import drive\n","\n","drive.mount('/content/gdrive/', force_remount=True)\n","working_dir = '/content/gdrive/My 
Drive/empatia/'"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pSSQJy4zTDDr","outputId":"b8b98736-cc1a-4df5-9912-5ba2c5727749","executionInfo":{"status":"ok","timestamp":1687337080648,"user_tz":-120,"elapsed":2761,"user":{"displayName":"Michał Ulaniuk","userId":"07769450445479269606"}}},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/gdrive/\n"]}]},{"cell_type":"code","source":["# Read the two dialog CSV files from the working directory.\n","dialogs_df = pd.read_csv(working_dir + 'data/dialogs.csv')\n","dialogs2_df = pd.read_csv(working_dir + 'data/dialogs2.csv')\n","\n","# ignore_index=True renumbers the rows 0..n-1; without it each frame keeps its own\n","# row labels and the combined index contains duplicates.\n","dialogs_df = pd.concat([dialogs_df, dialogs2_df], ignore_index=True)\n","\n","# Drop rows with a missing question or answer: string + NaN yields NaN, which would\n","# put a non-string entry into `texts`.\n","dialogs_df = dialogs_df.dropna(subset=['question', 'answer']).reset_index(drop=True)\n","\n","# One training string per row: the question line followed by the answer line.\n","texts = 'question: ' + dialogs_df['question'] + \"\\nanswer: \" + dialogs_df['answer']\n","texts = texts.tolist()\n","\n","print(texts[10])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tD7U4Qa5UhEf","outputId":"5b15a372-fb35-47a7-c005-588924925204","executionInfo":{"status":"ok","timestamp":1687337080649,"user_tz":-120,"elapsed":18,"user":{"displayName":"Michał Ulaniuk","userId":"07769450445479269606"}}},"execution_count":6,"outputs":[{"output_type":"stream","name":"stdout","text":["question: powodzenia w szkole.\n","answer: Dziękuję bardzo.\n"]}]},{"cell_type":"code","source":["dialogs_df.sample(5)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"xRYVKkMw0EQd","executionInfo":{"status":"ok","timestamp":1687337080650,"user_tz":-120,"elapsed":14,"user":{"displayName":"Michał Ulaniuk","userId":"07769450445479269606"}},"outputId":"b53239ec-cf14-4da1-b9c2-ba910d3f5bc7"},"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" question \\\n","405 Szkoda, że nie mogę pracować mniej. Czuję si... \n","548 Tak, to było o wiele prostsze. Cieszyliśmy się... \n","564 Dowiedziałem się więc czegoś, co bardzo mnie z... \n","142 moja wina, miałem obowiązki do zrobienia. \n","384 brzmi jakby to była bliska gra. 
\n","\n"," answer \n","405 Próbowałem tego, czego naprawdę potrzebuję, je... \n","548 życie było proste wtedy nie było! bardzo ładny. \n","564 Moje dziecko wyszło za moimi plecami i wymknęł... \n","142 w porządku. \n","384 dlatego była to tak świetna gra. "],"text/html":["\n","
\n"," | question | \n","answer | \n","
---|---|---|
405 | \n","Szkoda, że nie mogę pracować mniej. Czuję si... | \n","Próbowałem tego, czego naprawdę potrzebuję, je... | \n","
548 | \n","Tak, to było o wiele prostsze. Cieszyliśmy się... | \n","życie było proste wtedy nie było! bardzo ładny. | \n","
564 | \n","Dowiedziałem się więc czegoś, co bardzo mnie z... | \n","Moje dziecko wyszło za moimi plecami i wymknęł... | \n","
142 | \n","moja wina, miałem obowiązki do zrobienia. | \n","w porządku. | \n","
384 | \n","brzmi jakby to była bliska gra. | \n","dlatego była to tak świetna gra. | \n","