",
+ "text/html": "\n \n \n
\n [ 2/762 : < :, Epoch 0.00/3]\n
\n \n \n \n Step | \n Training Loss | \n
\n \n \n \n
"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Saving model checkpoint to data/bert-finetuned-subjqa/checkpoint-254\n",
+ "Configuration saved in data/bert-finetuned-subjqa/checkpoint-254/config.json\n",
+ "Model weights saved in data/bert-finetuned-subjqa/checkpoint-254/pytorch_model.bin\n",
+ "tokenizer config file saved in data/bert-finetuned-subjqa/checkpoint-254/tokenizer_config.json\n",
+ "Special tokens file saved in data/bert-finetuned-subjqa/checkpoint-254/special_tokens_map.json\n",
+ "Saving model checkpoint to data/bert-finetuned-subjqa/checkpoint-508\n",
+ "Configuration saved in data/bert-finetuned-subjqa/checkpoint-508/config.json\n",
+ "Model weights saved in data/bert-finetuned-subjqa/checkpoint-508/pytorch_model.bin\n",
+ "tokenizer config file saved in data/bert-finetuned-subjqa/checkpoint-508/tokenizer_config.json\n",
+ "Special tokens file saved in data/bert-finetuned-subjqa/checkpoint-508/special_tokens_map.json\n",
+ "Saving model checkpoint to data/bert-finetuned-subjqa/checkpoint-762\n",
+ "Configuration saved in data/bert-finetuned-subjqa/checkpoint-762/config.json\n",
+ "Model weights saved in data/bert-finetuned-subjqa/checkpoint-762/pytorch_model.bin\n",
+ "tokenizer config file saved in data/bert-finetuned-subjqa/checkpoint-762/tokenizer_config.json\n",
+ "Special tokens file saved in data/bert-finetuned-subjqa/checkpoint-762/special_tokens_map.json\n",
+ "\n",
+ "\n",
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": "TrainOutput(global_step=762, training_loss=0.7420042443463183, metrics={'train_runtime': 336.3364, 'train_samples_per_second': 18.107, 'train_steps_per_second': 2.266, 'total_flos': 1193472936391680.0, 'train_loss': 0.7420042443463183, 'epoch': 3.0})"
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from transformers import TrainingArguments\n",
+ "from transformers import Trainer\n",
+ "\n",
+ "\n",
+ "args = TrainingArguments(\n",
+ " output_dir=\"data/bert-finetuned-subjqa\",\n",
+ " overwrite_output_dir=True,\n",
+ " evaluation_strategy=\"no\",\n",
+ " save_strategy=\"epoch\",\n",
+ " learning_rate=2e-5,\n",
+ " num_train_epochs=3,\n",
+ " weight_decay=0.01,\n",
+ " fp16=True,\n",
+ ")\n",
+ "trainer = Trainer(\n",
+ " model=model,\n",
+ " args=args,\n",
+ " train_dataset=train_dataset,\n",
+ " eval_dataset=validation_dataset,\n",
+ " tokenizer=tokenizer,\n",
+ ")\n",
+ "trainer.train()"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `BertForQuestionAnswering.forward`, you can safely ignore this message.\n",
+ "***** Running Prediction *****\n",
+ " Num examples = 327\n",
+ " Batch size = 8\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": "",
+ "text/html": "\n \n "
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": " 0%| | 0/265 [00:00, ?it/s]",
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "99e405e6118f4a57bdd65b16b75fbc55"
+ },
+ "application/json": {
+ "n": 0,
+ "total": 265,
+ "elapsed": 0.010312318801879883,
+ "ncols": null,
+ "nrows": 12,
+ "prefix": "",
+ "ascii": false,
+ "unit": "it",
+ "unit_scale": false,
+ "rate": null,
+ "bar_format": null,
+ "postfix": null,
+ "unit_divisor": 1000,
+ "initial": 0,
+ "colour": null
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "QUESTION:\t How was the hotel?\n",
+ "PREDICTED: Great setting at the end of the wharf\n",
+ "ACTUAL: ['excellent hotels']\n",
+ "QUESTION:\t How is the hotel?\n",
+ "PREDICTED: My wife and I took two trips to San Fran in 2004 and stayed at the best western both times\n",
+ "ACTUAL: ['The hotel location was great']\n",
+ "QUESTION:\t Is it value for money?\n",
+ "PREDICTED: excellent value for money\n",
+ "ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.']\n",
+ "{'exact_match': 13.584905660377359, 'f1': 30.577012885313252}\n"
+ ]
+ }
+ ],
+ "source": [
+ "predictions, _, _ = trainer.predict(validation_dataset)\n",
+ "start_logits, end_logits = predictions\n",
+ "_=compute_metrics(start_logits, end_logits, validation_dataset, subjqa[\"validation\"])"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673\n",
+ "Model config RobertaConfig {\n",
+ " \"_name_or_path\": \"deepset/roberta-base-squad2\",\n",
+ " \"architectures\": [\n",
+ " \"RobertaForQuestionAnswering\"\n",
+ " ],\n",
+ " \"attention_probs_dropout_prob\": 0.1,\n",
+ " \"bos_token_id\": 0,\n",
+ " \"classifier_dropout\": null,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"gradient_checkpointing\": false,\n",
+ " \"hidden_act\": \"gelu\",\n",
+ " \"hidden_dropout_prob\": 0.1,\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"language\": \"english\",\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"max_position_embeddings\": 514,\n",
+ " \"model_type\": \"roberta\",\n",
+ " \"name\": \"Roberta\",\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"pad_token_id\": 1,\n",
+ " \"position_embedding_type\": \"absolute\",\n",
+ " \"transformers_version\": \"4.21.1\",\n",
+ " \"type_vocab_size\": 1,\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 50265\n",
+ "}\n",
+ "\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/vocab.json from cache at /home/karo/.cache/huggingface/transformers/81c80edb4c6cefa5cae64ccfdb34b3b309ecaf60da99da7cd1c17e24a5d36eb5.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/merges.txt from cache at /home/karo/.cache/huggingface/transformers/b87d46371731376b11768b7839b1a5938a4f77d6bd2d9b683f167df0026af432.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer.json from cache at None\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/added_tokens.json from cache at None\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/special_tokens_map.json from cache at /home/karo/.cache/huggingface/transformers/c9d2c178fac8d40234baa1833a3b1903d393729bf93ea34da247c07db24900d0.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer_config.json from cache at /home/karo/.cache/huggingface/transformers/e8a600814b69e3ee74bb4a7398cc6fef9812475010f16a6c9f151b2c2772b089.451739a2f3b82c3375da0dfc6af295bedc4567373b171f514dd09a4cc4b31513\n",
+ "loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673\n",
+ "Model config RobertaConfig {\n",
+ " \"_name_or_path\": \"deepset/roberta-base-squad2\",\n",
+ " \"architectures\": [\n",
+ " \"RobertaForQuestionAnswering\"\n",
+ " ],\n",
+ " \"attention_probs_dropout_prob\": 0.1,\n",
+ " \"bos_token_id\": 0,\n",
+ " \"classifier_dropout\": null,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"gradient_checkpointing\": false,\n",
+ " \"hidden_act\": \"gelu\",\n",
+ " \"hidden_dropout_prob\": 0.1,\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"language\": \"english\",\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"max_position_embeddings\": 514,\n",
+ " \"model_type\": \"roberta\",\n",
+ " \"name\": \"Roberta\",\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"pad_token_id\": 1,\n",
+ " \"position_embedding_type\": \"absolute\",\n",
+ " \"transformers_version\": \"4.21.1\",\n",
+ " \"type_vocab_size\": 1,\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 50265\n",
+ "}\n",
+ "\n",
+ "loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673\n",
+ "Model config RobertaConfig {\n",
+ " \"_name_or_path\": \"deepset/roberta-base-squad2\",\n",
+ " \"architectures\": [\n",
+ " \"RobertaForQuestionAnswering\"\n",
+ " ],\n",
+ " \"attention_probs_dropout_prob\": 0.1,\n",
+ " \"bos_token_id\": 0,\n",
+ " \"classifier_dropout\": null,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"gradient_checkpointing\": false,\n",
+ " \"hidden_act\": \"gelu\",\n",
+ " \"hidden_dropout_prob\": 0.1,\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"language\": \"english\",\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"max_position_embeddings\": 514,\n",
+ " \"model_type\": \"roberta\",\n",
+ " \"name\": \"Roberta\",\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"pad_token_id\": 1,\n",
+ " \"position_embedding_type\": \"absolute\",\n",
+ " \"transformers_version\": \"4.21.1\",\n",
+ " \"type_vocab_size\": 1,\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 50265\n",
+ "}\n",
+ "\n",
+ "loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673\n",
+ "Model config RobertaConfig {\n",
+ " \"_name_or_path\": \"deepset/roberta-base-squad2\",\n",
+ " \"architectures\": [\n",
+ " \"RobertaForQuestionAnswering\"\n",
+ " ],\n",
+ " \"attention_probs_dropout_prob\": 0.1,\n",
+ " \"bos_token_id\": 0,\n",
+ " \"classifier_dropout\": null,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"gradient_checkpointing\": false,\n",
+ " \"hidden_act\": \"gelu\",\n",
+ " \"hidden_dropout_prob\": 0.1,\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"language\": \"english\",\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"max_position_embeddings\": 514,\n",
+ " \"model_type\": \"roberta\",\n",
+ " \"name\": \"Roberta\",\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"pad_token_id\": 1,\n",
+ " \"position_embedding_type\": \"absolute\",\n",
+ " \"transformers_version\": \"4.21.1\",\n",
+ " \"type_vocab_size\": 1,\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 50265\n",
+ "}\n",
+ "\n",
+ "loading weights file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/pytorch_model.bin from cache at /home/karo/.cache/huggingface/transformers/eac3273a8097dda671e3bea1db32c616e74f36a306c65b4858171c98d6db83e9.084aa7284f3a51fa1c8f0641aa04c47d366fbd18711f29d0a995693cfdbc9c9e\n",
+ "All model checkpoint weights were used when initializing RobertaForQuestionAnswering.\n",
+ "\n",
+ "All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at deepset/roberta-base-squad2.\n",
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": " 0%| | 0/1 [00:00, ?ba/s]",
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "5e883ae315504badb1cd5c1bdf3a957d"
+ },
+ "application/json": {
+ "n": 0,
+ "total": 1,
+ "elapsed": 0.009602069854736328,
+ "ncols": null,
+ "nrows": 12,
+ "prefix": "",
+ "ascii": false,
+ "unit": "ba",
+ "unit_scale": false,
+ "rate": null,
+ "bar_format": null,
+ "postfix": null,
+ "unit_divisor": 1000,
+ "initial": 0,
+ "colour": null
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RobertaTokenizerFast\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": " 0%| | 0/265 [00:00, ?it/s]",
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "0b25cfa4cc1243459d19da7f8f3a0ed6"
+ },
+ "application/json": {
+ "n": 0,
+ "total": 265,
+ "elapsed": 0.00951838493347168,
+ "ncols": null,
+ "nrows": 12,
+ "prefix": "",
+ "ascii": false,
+ "unit": "it",
+ "unit_scale": false,
+ "rate": null,
+ "bar_format": null,
+ "postfix": null,
+ "unit_divisor": 1000,
+ "initial": 0,
+ "colour": null
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "QUESTION:\t How was the hotel?\n",
+ "PREDICTED: No complaints - really happy with what this hotel offered for the price\n",
+ "ACTUAL: ['excellent hotels']\n",
+ "QUESTION:\t How is the hotel?\n",
+ "PREDICTED: clean and neat\n",
+ "ACTUAL: ['The hotel location was great']\n",
+ "QUESTION:\t Is it value for money?\n",
+ "PREDICTED: excellent value for money\n",
+ "ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.']\n",
+ "{'exact_match': 11.69811320754717, 'f1': 37.040671268633204}\n"
+ ]
+ }
+ ],
+ "source": [
+ "trained_checkpoint = \"deepset/roberta-base-squad2\"\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)\n",
+ "trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(device)\n",
+ "predict_from_trained()"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673\n",
+ "Model config RobertaConfig {\n",
+ " \"_name_or_path\": \"deepset/roberta-base-squad2\",\n",
+ " \"architectures\": [\n",
+ " \"RobertaForQuestionAnswering\"\n",
+ " ],\n",
+ " \"attention_probs_dropout_prob\": 0.1,\n",
+ " \"bos_token_id\": 0,\n",
+ " \"classifier_dropout\": null,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"gradient_checkpointing\": false,\n",
+ " \"hidden_act\": \"gelu\",\n",
+ " \"hidden_dropout_prob\": 0.1,\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"language\": \"english\",\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"max_position_embeddings\": 514,\n",
+ " \"model_type\": \"roberta\",\n",
+ " \"name\": \"Roberta\",\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"pad_token_id\": 1,\n",
+ " \"position_embedding_type\": \"absolute\",\n",
+ " \"transformers_version\": \"4.21.1\",\n",
+ " \"type_vocab_size\": 1,\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 50265\n",
+ "}\n",
+ "\n",
+ "loading weights file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/pytorch_model.bin from cache at /home/karo/.cache/huggingface/transformers/eac3273a8097dda671e3bea1db32c616e74f36a306c65b4858171c98d6db83e9.084aa7284f3a51fa1c8f0641aa04c47d366fbd18711f29d0a995693cfdbc9c9e\n",
+ "All model checkpoint weights were used when initializing RobertaForQuestionAnswering.\n",
+ "\n",
+ "All the weights of RobertaForQuestionAnswering were initialized from the model checkpoint at deepset/roberta-base-squad2.\n",
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForQuestionAnswering for predictions without further training.\n",
+ "loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673\n",
+ "Model config RobertaConfig {\n",
+ " \"_name_or_path\": \"deepset/roberta-base-squad2\",\n",
+ " \"architectures\": [\n",
+ " \"RobertaForQuestionAnswering\"\n",
+ " ],\n",
+ " \"attention_probs_dropout_prob\": 0.1,\n",
+ " \"bos_token_id\": 0,\n",
+ " \"classifier_dropout\": null,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"gradient_checkpointing\": false,\n",
+ " \"hidden_act\": \"gelu\",\n",
+ " \"hidden_dropout_prob\": 0.1,\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"language\": \"english\",\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"max_position_embeddings\": 514,\n",
+ " \"model_type\": \"roberta\",\n",
+ " \"name\": \"Roberta\",\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"pad_token_id\": 1,\n",
+ " \"position_embedding_type\": \"absolute\",\n",
+ " \"transformers_version\": \"4.21.1\",\n",
+ " \"type_vocab_size\": 1,\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 50265\n",
+ "}\n",
+ "\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/vocab.json from cache at /home/karo/.cache/huggingface/transformers/81c80edb4c6cefa5cae64ccfdb34b3b309ecaf60da99da7cd1c17e24a5d36eb5.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/merges.txt from cache at /home/karo/.cache/huggingface/transformers/b87d46371731376b11768b7839b1a5938a4f77d6bd2d9b683f167df0026af432.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer.json from cache at None\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/added_tokens.json from cache at None\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/special_tokens_map.json from cache at /home/karo/.cache/huggingface/transformers/c9d2c178fac8d40234baa1833a3b1903d393729bf93ea34da247c07db24900d0.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0\n",
+ "loading file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/tokenizer_config.json from cache at /home/karo/.cache/huggingface/transformers/e8a600814b69e3ee74bb4a7398cc6fef9812475010f16a6c9f151b2c2772b089.451739a2f3b82c3375da0dfc6af295bedc4567373b171f514dd09a4cc4b31513\n",
+ "loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673\n",
+ "Model config RobertaConfig {\n",
+ " \"_name_or_path\": \"deepset/roberta-base-squad2\",\n",
+ " \"architectures\": [\n",
+ " \"RobertaForQuestionAnswering\"\n",
+ " ],\n",
+ " \"attention_probs_dropout_prob\": 0.1,\n",
+ " \"bos_token_id\": 0,\n",
+ " \"classifier_dropout\": null,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"gradient_checkpointing\": false,\n",
+ " \"hidden_act\": \"gelu\",\n",
+ " \"hidden_dropout_prob\": 0.1,\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"language\": \"english\",\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"max_position_embeddings\": 514,\n",
+ " \"model_type\": \"roberta\",\n",
+ " \"name\": \"Roberta\",\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"pad_token_id\": 1,\n",
+ " \"position_embedding_type\": \"absolute\",\n",
+ " \"transformers_version\": \"4.21.1\",\n",
+ " \"type_vocab_size\": 1,\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 50265\n",
+ "}\n",
+ "\n",
+ "loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /home/karo/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673\n",
+ "Model config RobertaConfig {\n",
+ " \"_name_or_path\": \"deepset/roberta-base-squad2\",\n",
+ " \"architectures\": [\n",
+ " \"RobertaForQuestionAnswering\"\n",
+ " ],\n",
+ " \"attention_probs_dropout_prob\": 0.1,\n",
+ " \"bos_token_id\": 0,\n",
+ " \"classifier_dropout\": null,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"gradient_checkpointing\": false,\n",
+ " \"hidden_act\": \"gelu\",\n",
+ " \"hidden_dropout_prob\": 0.1,\n",
+ " \"hidden_size\": 768,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 3072,\n",
+ " \"language\": \"english\",\n",
+ " \"layer_norm_eps\": 1e-05,\n",
+ " \"max_position_embeddings\": 514,\n",
+ " \"model_type\": \"roberta\",\n",
+ " \"name\": \"Roberta\",\n",
+ " \"num_attention_heads\": 12,\n",
+ " \"num_hidden_layers\": 12,\n",
+ " \"pad_token_id\": 1,\n",
+ " \"position_embedding_type\": \"absolute\",\n",
+ " \"transformers_version\": \"4.21.1\",\n",
+ " \"type_vocab_size\": 1,\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 50265\n",
+ "}\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": " 0%| | 0/2 [00:00, ?ba/s]",
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "562ecfd319d34d7b904fb8ace5b715d8"
+ },
+ "application/json": {
+ "n": 0,
+ "total": 2,
+ "elapsed": 0.009464502334594727,
+ "ncols": null,
+ "nrows": 12,
+ "prefix": "",
+ "ascii": false,
+ "unit": "ba",
+ "unit_scale": false,
+ "rate": null,
+ "bar_format": null,
+ "postfix": null,
+ "unit_divisor": 1000,
+ "initial": 0,
+ "colour": null
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RobertaTokenizerFast\n",
+ "RobertaTokenizerFast\n",
+ "1666 1995\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": " 0%| | 0/1 [00:00, ?ba/s]",
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "0a5a6b6a04854ee2ab05b0b01c71db44"
+ },
+ "application/json": {
+ "n": 0,
+ "total": 1,
+ "elapsed": 0.009068012237548828,
+ "ncols": null,
+ "nrows": 12,
+ "prefix": "",
+ "ascii": false,
+ "unit": "ba",
+ "unit_scale": false,
+ "rate": null,
+ "bar_format": null,
+ "postfix": null,
+ "unit_divisor": 1000,
+ "initial": 0,
+ "colour": null
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RobertaTokenizerFast\n",
+ "265 321\n"
+ ]
+ }
+ ],
+ "source": [
+ "model_ckpt = \"deepset/roberta-base-squad2\"\n",
+ "\n",
+ "model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
+ "\n",
+ "roberta_train_dataset = subjqa[\"train\"].map(\n",
+ " preprocess_train_data,\n",
+ " batched=True,\n",
+ " remove_columns=subjqa[\"train\"].column_names,\n",
+ ")\n",
+ "\n",
+ "print(len(subjqa[\"train\"]), len(roberta_train_dataset))\n",
+ "roberta_validation_dataset = subjqa[\"validation\"].map(\n",
+ " preprocess_validation_examples,\n",
+ " batched=True,\n",
+ " remove_columns=subjqa[\"validation\"].column_names,\n",
+ ")\n",
+ "print(len(subjqa[\"validation\"]), len(roberta_validation_dataset))\n"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "PyTorch: setting up devices\n",
+ "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n",
+ "Using cuda_amp half precision backend\n",
+ "The following columns in the training set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: offset_mapping, overflow_to_sample_mapping. If offset_mapping, overflow_to_sample_mapping are not expected by `RobertaForQuestionAnswering.forward`, you can safely ignore this message.\n",
+ "/home/karo/nlp-project/venv/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+ " warnings.warn(\n",
+ "***** Running training *****\n",
+ " Num examples = 1995\n",
+ " Num Epochs = 3\n",
+ " Instantaneous batch size per device = 8\n",
+ " Total train batch size (w. parallel, distributed & accumulation) = 8\n",
+ " Gradient Accumulation steps = 1\n",
+ " Total optimization steps = 750\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": "",
+ "text/html": "\n \n \n
\n [ 2/750 : < :, Epoch 0.00/3]\n
\n \n \n \n Step | \n Training Loss | \n
\n \n \n \n
"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Saving model checkpoint to data/roberta-finetuned-subjqa/checkpoint-250\n",
+ "Configuration saved in data/roberta-finetuned-subjqa/checkpoint-250/config.json\n",
+ "Model weights saved in data/roberta-finetuned-subjqa/checkpoint-250/pytorch_model.bin\n",
+ "tokenizer config file saved in data/roberta-finetuned-subjqa/checkpoint-250/tokenizer_config.json\n",
+ "Special tokens file saved in data/roberta-finetuned-subjqa/checkpoint-250/special_tokens_map.json\n",
+ "Saving model checkpoint to data/roberta-finetuned-subjqa/checkpoint-500\n",
+ "Configuration saved in data/roberta-finetuned-subjqa/checkpoint-500/config.json\n",
+ "Model weights saved in data/roberta-finetuned-subjqa/checkpoint-500/pytorch_model.bin\n",
+ "tokenizer config file saved in data/roberta-finetuned-subjqa/checkpoint-500/tokenizer_config.json\n",
+ "Special tokens file saved in data/roberta-finetuned-subjqa/checkpoint-500/special_tokens_map.json\n",
+ "Saving model checkpoint to data/roberta-finetuned-subjqa/checkpoint-750\n",
+ "Configuration saved in data/roberta-finetuned-subjqa/checkpoint-750/config.json\n",
+ "Model weights saved in data/roberta-finetuned-subjqa/checkpoint-750/pytorch_model.bin\n",
+ "tokenizer config file saved in data/roberta-finetuned-subjqa/checkpoint-750/tokenizer_config.json\n",
+ "Special tokens file saved in data/roberta-finetuned-subjqa/checkpoint-750/special_tokens_map.json\n",
+ "\n",
+ "\n",
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": "TrainOutput(global_step=750, training_loss=0.4630465749104818, metrics={'train_runtime': 337.0904, 'train_samples_per_second': 17.755, 'train_steps_per_second': 2.225, 'total_flos': 1172895816798720.0, 'train_loss': 0.4630465749104818, 'epoch': 3.0})"
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "args = TrainingArguments(\n",
+ " output_dir=\"data/roberta-finetuned-subjqa\",\n",
+ " overwrite_output_dir=True,\n",
+ " evaluation_strategy=\"no\",\n",
+ " save_strategy=\"epoch\",\n",
+ " learning_rate=2e-5,\n",
+ " num_train_epochs=3,\n",
+ " weight_decay=0.01,\n",
+ " fp16=True,\n",
+ ")\n",
+ "trainer = Trainer(\n",
+ " model=model,\n",
+ " args=args,\n",
+ " train_dataset=roberta_train_dataset,\n",
+ " eval_dataset=roberta_validation_dataset,\n",
+ " tokenizer=tokenizer,\n",
+ ")\n",
+ "trainer.train()"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "The following columns in the test set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `RobertaForQuestionAnswering.forward`, you can safely ignore this message.\n",
+ "***** Running Prediction *****\n",
+ " Num examples = 321\n",
+ " Batch size = 8\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": "",
+ "text/html": "\n \n "
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": " 0%| | 0/265 [00:00, ?it/s]",
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "f5095780d2c947c9ad51f61f804a3484"
+ },
+ "application/json": {
+ "n": 0,
+ "total": 265,
+ "elapsed": 0.009457588195800781,
+ "ncols": null,
+ "nrows": 12,
+ "prefix": "",
+ "ascii": false,
+ "unit": "it",
+ "unit_scale": false,
+ "rate": null,
+ "bar_format": null,
+ "postfix": null,
+ "unit_divisor": 1000,
+ "initial": 0,
+ "colour": null
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "QUESTION:\t How was the hotel?\n",
+ "PREDICTED: No complaints\n",
+ "ACTUAL: ['excellent hotels']\n",
+ "QUESTION:\t How is the hotel?\n",
+ "PREDICTED: The hotel location was great\n",
+ "ACTUAL: ['The hotel location was great']\n",
+ "QUESTION:\t Is it value for money?\n",
+ "PREDICTED: excellent value for money\n",
+ "ACTUAL: ['And very reasonably priced. Overall, excellent value for money and highly recommended.']\n",
+ "{'exact_match': 29.81132075471698, 'f1': 46.72268395172683}\n"
+ ]
+ }
+ ],
+ "source": [
+ "predictions, _, _ = trainer.predict(roberta_validation_dataset)\n",
+ "start_logits, end_logits = predictions\n",
+ "_=compute_metrics(start_logits, end_logits, roberta_validation_dataset, subjqa[\"validation\"])"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/Konopka_Raport.docx b/Konopka_Raport.docx
new file mode 100644
index 0000000..7b728f9
Binary files /dev/null and b/Konopka_Raport.docx differ
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bb6aa4e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+# Project - Machine Learning Methods in Natural Language Processing
+
+## Requirements
+
+For the assessment of the project, please prepare:
+
+- a link to the repository with the project files (if one exists)
+- a demonstration of the project
+- a short report containing:
+  - the goal of the project / problem definition (what problem was being solved)
+  - a description of the data used (how the data was collected, whether and how it was
+    preprocessed, how many examples the training and test sets contain)
+  - a description of the methods used (which models were used)
+  - a table with the evaluation results
+  - any conclusions
+
+A report template can be found in the file Wzór raportu.docx
+
+## Goal
+
+The goal of the project was to build a model that answers subjective questions based on a
+given context.
+
+## Dataset
+
+The data comes from the prepared [SubjQA](https://huggingface.co/datasets/subjqa) corpus, in
+its tripadvisor and restaurants configurations. It is a collection of subjective questions
+such as "How do you like the soup?". The dataset is built from web texts containing hotel
+and restaurant reviews, together with questions and answers. Not all questions have an
+answer. The corpus comes split into train, test and validation sets:
+ - tripadvisor contains train: 1165, test: 230, validation: 512 records.
+ - restaurants contains train: 1400, test: 267, validation: 266 records.
+
+Since the test split is not needed for training, the two datasets were merged, the test
+split was folded into train, and records without an answer were removed, which yields a
+corpus of (see the sketch after this list):
+ - train: 1666
+ - validation: 265
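+
+A minimal sketch of this merge, assuming the Hugging Face `datasets` API; the exact layout
+of SubjQA's `answers` field is an assumption here:
+
+```python
+from datasets import load_dataset, concatenate_datasets
+
+# Load both SubjQA domains used in the project.
+trip = load_dataset("subjqa", "tripadvisor")
+rest = load_dataset("subjqa", "restaurants")
+
+# Fold the test splits into train and merge the two domains.
+train = concatenate_datasets([trip["train"], trip["test"], rest["train"], rest["test"]])
+validation = concatenate_datasets([trip["validation"], rest["validation"]])
+
+# Keep only records that have an annotated answer span.
+has_answer = lambda ex: len(ex["answers"]["text"]) > 0
+train = train.filter(has_answer)
+validation = validation.filter(has_answer)
+```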
+
+The tokenized context must have a bounded length, here 384 tokens, so the corpus was
+processed with a sliding window using stride=128 and max_length=384, which brought the
+total size of the dataset to (a tokenization sketch follows this list):
+ - train: 2030 features
+ - validation: 327 features
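+
+A minimal sketch of the sliding-window tokenization, assuming a SQuAD-style
+`question`/`context` schema (the notebook's preprocessing functions follow this pattern):
+
+```python
+# One long context yields several overlapping 384-token features.
+inputs = tokenizer(
+    examples["question"],
+    examples["context"],
+    max_length=384,
+    stride=128,
+    truncation="only_second",        # truncate only the context, never the question
+    return_overflowing_tokens=True,  # emit one feature per 384-token window
+    return_offsets_mapping=True,     # char offsets, used to map answers back to text
+    padding="max_length",
+)
+```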
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d51fef4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+jupyter
+--extra-index-url https://download.pytorch.org/whl/cu116
+torch==1.12.1+cu116
+torchaudio==0.12.1+cu116
+torchvision==0.13.1+cu116
+transformers==4.21.1
+datasets==2.4.0
+matplotlib
+pandas
+evaluate
\ No newline at end of file