{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"T4","authorship_tag":"ABX9TyNr6vkmxwcIjQs5T29RhomZ"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"d502a36179314d89a33811fd6501b195":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_2147ee5d3bf644a7b938175c162f0389","IPY_MODEL_d3b624210c8e4f4b8082e84b3a660137","IPY_MODEL_f33dde8e63eb4f02bbb0f7a56dc53c6c"],"layout":"IPY_MODEL_0c3fa63e1598448081c4d89c9fa48359"}},"2147ee5d3bf644a7b938175c162f0389":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_adf721931ab34250989179521fcec956","placeholder":"​","style":"IPY_MODEL_c6aa731c30344e4886accb4fa0eaf1a5","value":"Map: 
100%"}},"d3b624210c8e4f4b8082e84b3a660137":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_8b752a2e2ad248ff92edae699da24399","max":65749,"min":0,"orientation":"horizontal","style":"IPY_MODEL_82c3c2a482fc42f6abcd1352198b19d7","value":65749}},"f33dde8e63eb4f02bbb0f7a56dc53c6c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f42a6a01002d412db0f5e02cde71e2fe","placeholder":"​","style":"IPY_MODEL_75a6a234e4aa4eed8ca35d489af7d89c","value":" 65749/65749 [00:10<00:00, 6445.88 
examples/s]"}},"0c3fa63e1598448081c4d89c9fa48359":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"adf721931ab34250989179521fcec956":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c6aa731c30344e4886accb4fa0eaf1a5":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"8b752a2e2ad248ff92edae699da24399":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"82c3c2a482fc42f6abcd1352198b19d7":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"f42a6a01002d412db0f5e02cde71e2fe":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"75a6a234e4aa4eed8ca35d489af7d89c":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"63af2ab9157d4cbf885f0707d3181dc2":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_3b7882580e9d49c788f86bf26fce8a40","IPY_MODEL_1e009e37b1d8485fa3d124c9edbf662d","IPY_MODEL_75526ccab9954b15bd08c11015bcddfe"],"layout":"IPY_MODEL_3623928df1d849b8aa8caec3a530b7d
c"}},"3b7882580e9d49c788f86bf26fce8a40":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_59ac5b7bed6644e6ac0f5ae0b0ef6b8f","placeholder":"​","style":"IPY_MODEL_6f5df399f34c48b5ab201cfb61aa9712","value":"Map: 100%"}},"1e009e37b1d8485fa3d124c9edbf662d":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_7459d4b778f5434a9ca58852b1e9fa67","max":1600,"min":0,"orientation":"horizontal","style":"IPY_MODEL_cb1b5ea7ffc242949bbbdbec8ff3e061","value":1600}},"75526ccab9954b15bd08c11015bcddfe":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_522b1b1f1e424ca99036d0467a5246a6","placeholder":"​","style":"IPY_MODEL_f6e5f11380e44f78a03e90eb13661692","value":" 1600/1600 [00:00<00:00, 1889.08 
examples/s]"}},"3623928df1d849b8aa8caec3a530b7dc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"59ac5b7bed6644e6ac0f5ae0b0ef6b8f":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null
,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6f5df399f34c48b5ab201cfb61aa9712":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"7459d4b778f5434a9ca58852b1e9fa67":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cb1b5ea7ffc242949bbbdbec8ff3e061":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"522b1b1f1e424ca99036d0467a5246a6":{"model
_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f6e5f11380e44f78a03e90eb13661692":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"cells":[{"cell_type":"markdown","source":["Instalacja 'datasets' i 'transformers'"],"metadata":{"id":"lpCiZfrEPO94"}},{"cell_type":"code","source":["!pip install datasets\n","!pip install transformers"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2vMqRhtRDe2L","executionInfo":{"status":"ok","timestamp":1704971960885,"user_tz":-60,"elapsed":11301,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}},"outputId":"44068e37-b6a5-43de-a85b-ecdb4ee89867"},"execution_count":15,"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement 
already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.16.1)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.13.1)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.23.5)\n","Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (10.0.1)\n","Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n","Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.7)\n","Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n","Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n","Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.1)\n","Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n","Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.15)\n","Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n","Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.1)\n","Requirement already satisfied: huggingface-hub>=0.19.4 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.20.2)\n","Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.2)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n","Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)\n","Requirement already satisfied: 
multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n","Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n","Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n","Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n","Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.19.4->datasets) (4.5.0)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.6)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.7)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2023.11.17)\n","Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3.post1)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n","Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.35.2)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n","Requirement already satisfied: 
huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.20.2)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n","Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.0)\n","Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.1)\n","Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n","Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.11.17)\n"]}]},{"cell_type":"code","source":["from datasets import 
load_dataset\n","import torch\n","from transformers import AutoTokenizer, RobertaForSequenceClassification, RobertaTokenizerFast, TrainingArguments, Trainer\n","from sklearn.metrics import accuracy_score, precision_recall_fscore_support"],"metadata":{"id":"Tm6o96r0DjyR","executionInfo":{"status":"ok","timestamp":1704971960885,"user_tz":-60,"elapsed":4,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}}},"execution_count":16,"outputs":[]},{"cell_type":"markdown","source":["Ładowanie i przetwarzanie zbioru danych"],"metadata":{"id":"fFfCDO3DaweH"}},{"cell_type":"code","source":["def load_and_process_dataset(validation_size=1600, seed=42):\n","    \"\"\"Load SST-2 and rearrange it into train/validation/test splits.\n","\n","    The 'idx' column is dropped, the original 'validation' split replaces the\n","    original 'test' split, and `validation_size` rows are held out of 'train'\n","    to form a new 'validation' split.\n","\n","    Args:\n","        validation_size: Size of the new validation split, forwarded as\n","            `test_size` to `train_test_split`.\n","        seed: Seed forwarded to `train_test_split` so that the\n","            train/validation split is the same on every run.\n","\n","    Returns:\n","        The dataset dict with 'train', 'test' and 'validation' splits.\n","    \"\"\"\n","    dataset = load_dataset(\"sst2\")\n","    # remove_columns returns a new object rather than modifying in place,\n","    # so the result must be assigned back or 'idx' is silently kept.\n","    dataset = dataset.remove_columns('idx')\n","    # Replace the original test split with the original validation split.\n","    # NOTE(review): presumably because the original SST-2 test split is unlabeled - confirm.\n","    del dataset['test']\n","    dataset['test'] = dataset['validation']\n","    del dataset['validation']\n","    split_dataset = dataset['train'].train_test_split(test_size=validation_size, seed=seed)\n","    dataset['train'] = split_dataset['train']\n","    dataset['validation'] = split_dataset['test']\n","    return dataset"],"metadata":{"id":"aBoO_QjBIBWo","executionInfo":{"status":"ok","timestamp":1704971960885,"user_tz":-60,"elapsed":3,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}}},"execution_count":17,"outputs":[]},{"cell_type":"code","source":["def compute_metrics(pred):\n","    \"\"\"Compute accuracy, F1, precision and recall for binary predictions.\n","\n","    Args:\n","        pred: Object exposing `label_ids` and `predictions`; the predicted\n","            class is the argmax over the last axis of `predictions`.\n","\n","    Returns:\n","        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.\n","    \"\"\"\n","    labels = pred.label_ids\n","    preds = pred.predictions.argmax(-1)\n","    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')\n","    acc = accuracy_score(labels, preds)\n","    return {\n","        'accuracy': acc,\n","        'f1': f1,\n","        'precision': precision,\n","        'recall': recall\n","    }"],"metadata":{"id":"I5RWPHXSIdAe","executionInfo":{"status":"ok","timestamp":1704971960885,"user_tz":-60,"elapsed":3,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}}},"execution_count":18,"outputs":[]},{"cell_type":"code","source":["dataset = 
load_and_process_dataset()\n","dataset"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ZigpgiICII4M","executionInfo":{"status":"ok","timestamp":1704971962662,"user_tz":-60,"elapsed":1780,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}},"outputId":"df5ec756-089e-4f27-cf25-c55b86f76830"},"execution_count":19,"outputs":[{"output_type":"execute_result","data":{"text/plain":["DatasetDict({\n"," train: Dataset({\n"," features: ['idx', 'sentence', 'label'],\n"," num_rows: 65749\n"," })\n"," test: Dataset({\n"," features: ['idx', 'sentence', 'label'],\n"," num_rows: 872\n"," })\n"," validation: Dataset({\n"," features: ['idx', 'sentence', 'label'],\n"," num_rows: 1600\n"," })\n","})"]},"metadata":{},"execution_count":19}]},{"cell_type":"code","source":["train = dataset['train']\n","validation = dataset['validation']\n","test = dataset['test']"],"metadata":{"id":"sEIRpLvlIL1d","executionInfo":{"status":"ok","timestamp":1704971962662,"user_tz":-60,"elapsed":3,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}}},"execution_count":20,"outputs":[]},{"cell_type":"code","source":["model = RobertaForSequenceClassification.from_pretrained('roberta-base')\n","# model_max_length is the tokenizer argument that bounds sequence length when\n","# tokenizing with truncation=True; a bare max_length is not a tokenizer init argument.\n","tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=512)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"48_A8N38IsWV","executionInfo":{"status":"ok","timestamp":1704971965756,"user_tz":-60,"elapsed":3096,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}},"outputId":"5512a0c2-7b8b-47ef-abbe-d2c71843161f"},"execution_count":21,"outputs":[{"output_type":"stream","name":"stderr","text":["Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']\n","You should probably TRAIN this model on a down-stream 
task to be able to use it for predictions and inference.\n"]}]},{"cell_type":"code","source":["def tokenization(batched_text):\n"," return tokenizer(batched_text['sentence'], padding = True, truncation=True)\n","\n","\n","train_data = train.map(tokenization, batched = True, batch_size = len(train))\n","val_data = validation.map(tokenization, batched = True, batch_size = len(validation))\n","test_data = test.map(tokenization, batched = True, batch_size = len(test))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":81,"referenced_widgets":["d502a36179314d89a33811fd6501b195","2147ee5d3bf644a7b938175c162f0389","d3b624210c8e4f4b8082e84b3a660137","f33dde8e63eb4f02bbb0f7a56dc53c6c","0c3fa63e1598448081c4d89c9fa48359","adf721931ab34250989179521fcec956","c6aa731c30344e4886accb4fa0eaf1a5","8b752a2e2ad248ff92edae699da24399","82c3c2a482fc42f6abcd1352198b19d7","f42a6a01002d412db0f5e02cde71e2fe","75a6a234e4aa4eed8ca35d489af7d89c","63af2ab9157d4cbf885f0707d3181dc2","3b7882580e9d49c788f86bf26fce8a40","1e009e37b1d8485fa3d124c9edbf662d","75526ccab9954b15bd08c11015bcddfe","3623928df1d849b8aa8caec3a530b7dc","59ac5b7bed6644e6ac0f5ae0b0ef6b8f","6f5df399f34c48b5ab201cfb61aa9712","7459d4b778f5434a9ca58852b1e9fa67","cb1b5ea7ffc242949bbbdbec8ff3e061","522b1b1f1e424ca99036d0467a5246a6","f6e5f11380e44f78a03e90eb13661692"]},"id":"nvfdwixrI21L","executionInfo":{"status":"ok","timestamp":1704971977027,"user_tz":-60,"elapsed":11276,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}},"outputId":"ab84b527-c537-4c7e-f429-9d60c0e525aa"},"execution_count":22,"outputs":[{"output_type":"display_data","data":{"text/plain":["Map: 0%| | 0/65749 [00:00=4.34,<4.35'"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":963},"id":"SV1ZYaTaJP8y","executionInfo":{"status":"ok","timestamp":1704972008185,"user_tz":-60,"elapsed":31161,"user":{"displayName":"Marcin 
Rostkowski","userId":"16749256502154511679"}},"outputId":"76b7d9ad-4432-4eb1-ff2c-03f7d4b6c078"},"execution_count":25,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting transformers[torch]<4.35,>=4.34\n"," Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.7/7.7 MB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (3.13.1)\n","Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (0.20.2)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (1.23.5)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (23.2)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (6.0.1)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (2023.6.3)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (2.31.0)\n","Collecting tokenizers<0.15,>=0.14 (from transformers[torch]<4.35,>=4.34)\n"," Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m31.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (0.4.1)\n","Requirement already satisfied: tqdm>=4.27 in 
/usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (4.66.1)\n","Requirement already satisfied: torch!=1.12.0,>=1.10 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (2.1.0+cu121)\n","Requirement already satisfied: accelerate>=0.20.3 in /usr/local/lib/python3.10/dist-packages (from transformers[torch]<4.35,>=4.34) (0.26.0)\n","Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.20.3->transformers[torch]<4.35,>=4.34) (5.9.5)\n","Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers[torch]<4.35,>=4.34) (2023.6.0)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers[torch]<4.35,>=4.34) (4.5.0)\n","Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch]<4.35,>=4.34)\n"," Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch!=1.12.0,>=1.10->transformers[torch]<4.35,>=4.34) (1.12)\n","Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch!=1.12.0,>=1.10->transformers[torch]<4.35,>=4.34) (3.2.1)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch!=1.12.0,>=1.10->transformers[torch]<4.35,>=4.34) (3.1.2)\n","Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch!=1.12.0,>=1.10->transformers[torch]<4.35,>=4.34) (2.1.0)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]<4.35,>=4.34) 
(3.3.2)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]<4.35,>=4.34) (3.6)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]<4.35,>=4.34) (2.0.7)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers[torch]<4.35,>=4.34) (2023.11.17)\n","Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch!=1.12.0,>=1.10->transformers[torch]<4.35,>=4.34) (2.1.3)\n","Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch!=1.12.0,>=1.10->transformers[torch]<4.35,>=4.34) (1.3.0)\n","Installing collected packages: huggingface-hub, tokenizers, transformers\n"," Attempting uninstall: huggingface-hub\n"," Found existing installation: huggingface-hub 0.20.2\n"," Uninstalling huggingface-hub-0.20.2:\n"," Successfully uninstalled huggingface-hub-0.20.2\n"," Attempting uninstall: tokenizers\n"," Found existing installation: tokenizers 0.15.0\n"," Uninstalling tokenizers-0.15.0:\n"," Successfully uninstalled tokenizers-0.15.0\n"," Attempting uninstall: transformers\n"," Found existing installation: transformers 4.35.2\n"," Uninstalling transformers-4.35.2:\n"," Successfully uninstalled transformers-4.35.2\n","\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n","datasets 2.16.1 requires huggingface-hub>=0.19.4, but you have huggingface-hub 0.17.3 which is incompatible.\u001b[0m\u001b[31m\n","\u001b[0mSuccessfully installed huggingface-hub-0.17.3 tokenizers-0.14.1 transformers-4.34.1\n"]},{"output_type":"display_data","data":{"application/vnd.colab-display-data+json":{"pip_warning":{"packages":["huggingface_hub","transformers"]}}},"metadata":{}}]},{"cell_type":"code","source":["training_args = TrainingArguments(\n"," output_dir = './results',\n"," num_train_epochs=3,\n"," per_device_train_batch_size = 4,\n"," gradient_accumulation_steps = 16,\n"," per_device_eval_batch_size= 8,\n"," evaluation_strategy = \"epoch\",\n"," disable_tqdm = False,\n"," load_best_model_at_end=False,\n"," warmup_steps=500,\n"," weight_decay=0.01,\n"," logging_steps = 8,\n"," fp16 = True,\n"," logging_dir='./logs',\n"," dataloader_num_workers = 2,\n"," run_name = 'roberta-classification',\n"," optim=\"adamw_torch\"\n",")"],"metadata":{"id":"Bh4KGnanJcMd","executionInfo":{"status":"ok","timestamp":1704972008186,"user_tz":-60,"elapsed":10,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}}},"execution_count":26,"outputs":[]},{"cell_type":"code","source":["trainer = Trainer(\n"," model=model,\n"," args=training_args,\n"," compute_metrics=compute_metrics,\n"," train_dataset=train_data,\n"," eval_dataset=val_data,\n",")"],"metadata":{"id":"QjXxYbhAKb79","executionInfo":{"status":"ok","timestamp":1704972008579,"user_tz":-60,"elapsed":403,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}}},"execution_count":27,"outputs":[]},{"cell_type":"code","source":["trainer.train()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":204},"id":"NdWbmHzAKhxg","executionInfo":{"status":"ok","timestamp":1704974525766,"user_tz":-60,"elapsed":2517188,"user":{"displayName":"Marcin 
Rostkowski","userId":"16749256502154511679"}},"outputId":"07e09dc7-2780-4baa-ee90-7de9620b2428"},"execution_count":28,"outputs":[{"output_type":"display_data","data":{"text/plain":[""],"text/html":["\n","
\n"," \n"," \n"," [3081/3081 41:54, Epoch 2/3]\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
EpochTraining LossValidation LossAccuracyF1PrecisionRecall
00.2075000.2096510.9237500.9340540.9094740.960000
10.2172000.1712520.9437500.9499440.9510020.948889
20.0673000.1730040.9393750.9461410.9456160.946667

"]},"metadata":{}},{"output_type":"execute_result","data":{"text/plain":["TrainOutput(global_step=3081, training_loss=0.18958045851048694, metrics={'train_runtime': 2517.0617, 'train_samples_per_second': 78.364, 'train_steps_per_second': 1.224, 'total_flos': 6788946644810280.0, 'train_loss': 0.18958045851048694, 'epoch': 3.0})"]},"metadata":{},"execution_count":28}]},{"cell_type":"code","source":["print(model)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"nDIdkph4MWVk","executionInfo":{"status":"ok","timestamp":1704974525766,"user_tz":-60,"elapsed":6,"user":{"displayName":"Marcin Rostkowski","userId":"16749256502154511679"}},"outputId":"8a548188-0554-4b39-fe79-47a3625cd356"},"execution_count":29,"outputs":[{"output_type":"stream","name":"stdout","text":["RobertaForSequenceClassification(\n"," (roberta): RobertaModel(\n"," (embeddings): RobertaEmbeddings(\n"," (word_embeddings): Embedding(50265, 768, padding_idx=1)\n"," (position_embeddings): Embedding(514, 768, padding_idx=1)\n"," (token_type_embeddings): Embedding(1, 768)\n"," (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," )\n"," (encoder): RobertaEncoder(\n"," (layer): ModuleList(\n"," (0-11): 12 x RobertaLayer(\n"," (attention): RobertaAttention(\n"," (self): RobertaSelfAttention(\n"," (query): Linear(in_features=768, out_features=768, bias=True)\n"," (key): Linear(in_features=768, out_features=768, bias=True)\n"," (value): Linear(in_features=768, out_features=768, bias=True)\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," )\n"," (output): RobertaSelfOutput(\n"," (dense): Linear(in_features=768, out_features=768, bias=True)\n"," (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," )\n"," )\n"," (intermediate): RobertaIntermediate(\n"," (dense): Linear(in_features=768, out_features=3072, bias=True)\n"," (intermediate_act_fn): GELUActivation()\n"," )\n"," 
(output): RobertaOutput(\n"," (dense): Linear(in_features=3072, out_features=768, bias=True)\n"," (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," )\n"," )\n"," )\n"," )\n"," )\n"," (classifier): RobertaClassificationHead(\n"," (dense): Linear(in_features=768, out_features=768, bias=True)\n"," (dropout): Dropout(p=0.1, inplace=False)\n"," (out_proj): Linear(in_features=768, out_features=2, bias=True)\n"," )\n",")\n"]}]},{"cell_type":"code","source":["trainer.evaluate()"],"metadata":{"id":"RZLJraiWfCPG"},"execution_count":null,"outputs":[]}]}