{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[{"sourceId":8587424,"sourceType":"datasetVersion","datasetId":5135632}],"dockerImageVersionId":30716,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"from transformers import (\n AutoModelForSeq2SeqLM,\n AutoTokenizer,\n DataCollatorForSeq2Seq,\n Seq2SeqTrainer,\n Seq2SeqTrainingArguments,\n pipeline,\n)\n\nfrom datasets import load_dataset\n\nmodel_name = \"google/umt5-small\"","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:18:55.032642Z","iopub.execute_input":"2024-06-03T11:18:55.033345Z","iopub.status.idle":"2024-06-03T11:19:13.773777Z","shell.execute_reply.started":"2024-06-03T11:18:55.033313Z","shell.execute_reply":"2024-06-03T11:19:13.772989Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stderr","text":"2024-06-03 11:19:02.256736: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n2024-06-03 11:19:02.256864: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n2024-06-03 11:19:02.368948: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n","output_type":"stream"}]},{"cell_type":"code","source":"dataset = load_dataset('csv', data_files='/kaggle/input/ngl-data/nlg_data.csv', 
split='train').train_test_split(test_size=0.1, seed=42)\ndataset","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:19:13.775364Z","iopub.execute_input":"2024-06-03T11:19:13.775904Z","iopub.status.idle":"2024-06-03T11:19:14.356839Z","shell.execute_reply.started":"2024-06-03T11:19:13.775878Z","shell.execute_reply":"2024-06-03T11:19:14.355976Z"},"trusted":true},"execution_count":2,"outputs":[{"output_type":"display_data","data":{"text/plain":"Generating train split: 0 examples [00:00, ? examples/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"fdd37b65a44d42b2931bdc0db8229fa7"}},"metadata":{}},{"execution_count":2,"output_type":"execute_result","data":{"text/plain":"DatasetDict({\n train: Dataset({\n features: ['mr', 'ref'],\n num_rows: 18564\n })\n test: Dataset({\n features: ['mr', 'ref'],\n num_rows: 2063\n })\n})"},"metadata":{}}]},{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained(model_name)\n\n\ndef tokenize_samples(samples, max_length=128, ignore_index=-100):\n    \"\"\"Tokenize a batch of meaning representations and their reference texts.\n\n    Args:\n        samples: Batch mapping with an \"mr\" list (model inputs) and a \"ref\" list (targets).\n        max_length: Length every input and target sequence is padded or truncated to.\n        ignore_index: Label value written in place of padding token ids.\n\n    Returns:\n        The tokenizer output for the prompts, extended with a \"labels\" entry.\n    \"\"\"\n    inputs = [f\"generate text: {mr}\" for mr in samples[\"mr\"]]\n\n    tokenized_inputs = tokenizer(\n        inputs,\n        max_length=max_length,\n        padding=\"max_length\",\n        truncation=True,\n    )\n\n    labels = tokenizer(\n        text_target=samples[\"ref\"],\n        max_length=max_length,\n        padding=\"max_length\",\n        truncation=True,\n    )\n\n    # Replace padding ids in the targets so padded positions are excluded from the loss.\n    pad_token_id = tokenizer.pad_token_id\n    tokenized_inputs[\"labels\"] = [\n        [ignore_index if token_id == pad_token_id else token_id for token_id in label]\n        for label in labels[\"input_ids\"]\n    ]\n    return tokenized_inputs\n\n\ntokenized_dataset = dataset.map(\n tokenize_samples,\n batched=True,\n remove_columns=[\"mr\", 
\"ref\"],\n)\n\ntokenized_dataset","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:19:14.357803Z","iopub.execute_input":"2024-06-03T11:19:14.358052Z","iopub.status.idle":"2024-06-03T11:19:24.614600Z","shell.execute_reply.started":"2024-06-03T11:19:14.358030Z","shell.execute_reply":"2024-06-03T11:19:24.613696Z"},"trusted":true},"execution_count":3,"outputs":[{"output_type":"display_data","data":{"text/plain":"tokenizer_config.json: 0%| | 0.00/6.84k [00:00","text/html":"Tracking run with wandb version 0.17.0"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","text/html":"Run data is saved locally in /kaggle/working/wandb/run-20240603_111947-zd4tutif"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","text/html":"Syncing run /kaggle/working to Weights & Biases (docs)
"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","text/html":" View project at https://wandb.ai/filnow42/huggingface"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","text/html":" View run at https://wandb.ai/filnow42/huggingface/runs/zd4tutif"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"","text/html":"\n
\n \n \n [6963/6963 38:47, Epoch 3/3]\n
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
EpochTraining LossValidation Loss
10.7329000.331611
20.3731000.246366
30.3269000.231167

"},"metadata":{}},{"name":"stderr","text":"There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].\n","output_type":"stream"},{"execution_count":6,"output_type":"execute_result","data":{"text/plain":"TrainOutput(global_step=6963, training_loss=1.0388871652717377, metrics={'train_runtime': 2359.6292, 'train_samples_per_second': 23.602, 'train_steps_per_second': 2.951, 'total_flos': 7499132383002624.0, 'train_loss': 1.0388871652717377, 'epoch': 3.0})"},"metadata":{}}]},{"cell_type":"code","source":"# Data-to-text generation is a generic seq2seq task, so use 'text2text-generation';\n# the 'summarization' task warns whenever max_length exceeds the (short) input length.\n# max_new_tokens=128 mirrors the 128-token cap applied to the targets during tokenization.\nnlg = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_new_tokens=128)","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:58:53.891542Z","iopub.execute_input":"2024-06-03T11:58:53.891952Z","iopub.status.idle":"2024-06-03T11:58:53.897775Z","shell.execute_reply.started":"2024-06-03T11:58:53.891924Z","shell.execute_reply":"2024-06-03T11:58:53.896741Z"},"trusted":true},"execution_count":7,"outputs":[]},{"cell_type":"code","source":"nlg('generate text: dish[tatar], price[50], ingredient[wolowina]')[0]['generated_text']","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:58:53.898928Z","iopub.execute_input":"2024-06-03T11:58:53.899234Z","iopub.status.idle":"2024-06-03T11:59:05.979970Z","shell.execute_reply.started":"2024-06-03T11:58:53.899195Z","shell.execute_reply":"2024-06-03T11:59:05.978805Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"'Nie mamy tatar w menu. Cena wynosi 50. 
Składnik to owoce.'"},"metadata":{}}]},{"cell_type":"code","source":"nlg('generate text: payment_methods[gotowka], price[150], addresses[ulica Dluga 5]')[0]['generated_text']","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:59:05.981291Z","iopub.execute_input":"2024-06-03T11:59:05.981585Z","iopub.status.idle":"2024-06-03T11:59:06.533378Z","shell.execute_reply.started":"2024-06-03T11:59:05.981559Z","shell.execute_reply":"2024-06-03T11:59:06.532379Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"'Nie obsługujemy płatności gotowka. Cena wynosi 150. Oczywiście, dostarczymy na ulica Dluga 5.'"},"metadata":{}}]},{"cell_type":"code","source":"nlg('generate text: dish[tiramisu], ingredient[mleko], allergy[laktoza]')[0]['generated_text']","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:59:06.537427Z","iopub.execute_input":"2024-06-03T11:59:06.538123Z","iopub.status.idle":"2024-06-03T11:59:06.938435Z","shell.execute_reply.started":"2024-06-03T11:59:06.538081Z","shell.execute_reply":"2024-06-03T11:59:06.937299Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"'Nie mamy tiramisu w menu. Składnik mleko jest dostępny. Nie zawiera alergenu laktoza.'"},"metadata":{}}]},{"cell_type":"code","source":"nlg('generate text: time[dziesiata]')[0]['generated_text']","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:59:06.939929Z","iopub.execute_input":"2024-06-03T11:59:06.940331Z","iopub.status.idle":"2024-06-03T11:59:07.132913Z","shell.execute_reply.started":"2024-06-03T11:59:06.940292Z","shell.execute_reply":"2024-06-03T11:59:07.131901Z"},"trusted":true},"execution_count":11,"outputs":[{"name":"stderr","text":"Your max_length is set to 20, but your input_length is only 10. 
Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)\n","output_type":"stream"},{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"'Zamknięte o dziesiata.'"},"metadata":{}}]},{"cell_type":"code","source":"nlg('generate text: dish[spaghetti], ingredient[ser]')[0]['generated_text']","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:59:07.134067Z","iopub.execute_input":"2024-06-03T11:59:07.134671Z","iopub.status.idle":"2024-06-03T11:59:07.405347Z","shell.execute_reply.started":"2024-06-03T11:59:07.134642Z","shell.execute_reply":"2024-06-03T11:59:07.404117Z"},"trusted":true},"execution_count":12,"outputs":[{"name":"stderr","text":"Your max_length is set to 20, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)\n","output_type":"stream"},{"execution_count":12,"output_type":"execute_result","data":{"text/plain":"'Nie mamy spaghetti w menu. Składnik ser jest dostępny.'"},"metadata":{}}]},{"cell_type":"code","source":"nlg('generate text: dish[pierogi], ingredient[kozi ser]')[0]['generated_text']","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:59:07.407270Z","iopub.execute_input":"2024-06-03T11:59:07.410442Z","iopub.status.idle":"2024-06-03T11:59:07.697634Z","shell.execute_reply.started":"2024-06-03T11:59:07.410396Z","shell.execute_reply":"2024-06-03T11:59:07.695355Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stderr","text":"Your max_length is set to 20, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. 
summarizer('...', max_length=8)\n","output_type":"stream"},{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"'Nie mamy pierogi w menu. Składnik to koti ser.'"},"metadata":{}}]},{"cell_type":"code","source":"nlg('generate text: time[23:00], adres[ul Krótka 256]')[0]['generated_text']","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:59:07.698906Z","iopub.execute_input":"2024-06-03T11:59:07.699269Z","iopub.status.idle":"2024-06-03T11:59:08.138934Z","shell.execute_reply.started":"2024-06-03T11:59:07.699233Z","shell.execute_reply":"2024-06-03T11:59:08.137833Z"},"trusted":true},"execution_count":14,"outputs":[{"execution_count":14,"output_type":"execute_result","data":{"text/plain":"'Zamknięte o 23:00. Nie dostarczamy na ulica Krótka 256.'"},"metadata":{}}]},{"cell_type":"code","source":"model.save_pretrained(\"/kaggle/working\")","metadata":{"execution":{"iopub.status.busy":"2024-06-03T11:59:08.140399Z","iopub.execute_input":"2024-06-03T11:59:08.140718Z","iopub.status.idle":"2024-06-03T11:59:11.078579Z","shell.execute_reply.started":"2024-06-03T11:59:08.140689Z","shell.execute_reply":"2024-06-03T11:59:11.077378Z"},"trusted":true},"execution_count":15,"outputs":[]},{"cell_type":"code","source":"from kaggle_secrets import UserSecretsClient\nuser_secrets = UserSecretsClient()\nsecret_value_0 = user_secrets.get_secret(\"huggingface-write\")","metadata":{"execution":{"iopub.status.busy":"2024-06-03T12:03:34.283930Z","iopub.execute_input":"2024-06-03T12:03:34.284674Z","iopub.status.idle":"2024-06-03T12:03:34.468881Z","shell.execute_reply.started":"2024-06-03T12:03:34.284637Z","shell.execute_reply":"2024-06-03T12:03:34.467812Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import 
login\nlogin(secret_value_0)","metadata":{"execution":{"iopub.status.busy":"2024-06-03T12:03:38.979682Z","iopub.execute_input":"2024-06-03T12:03:38.980042Z","iopub.status.idle":"2024-06-03T12:03:39.119457Z","shell.execute_reply.started":"2024-06-03T12:03:38.980011Z","shell.execute_reply":"2024-06-03T12:03:39.118367Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\nToken is valid (permission: write).\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n","output_type":"stream"}]},{"cell_type":"code","source":"# Trainer.push_to_hub() takes a commit message as its first positional argument, not a\n# repository id, so push the fine-tuned model and its tokenizer to the target repo explicitly.\nmodel.push_to_hub(\"filnow/nlg-umt5-pol\")\ntokenizer.push_to_hub(\"filnow/nlg-umt5-pol\")","metadata":{"execution":{"iopub.status.busy":"2024-06-03T12:03:45.289755Z","iopub.execute_input":"2024-06-03T12:03:45.290131Z","iopub.status.idle":"2024-06-03T12:04:24.555639Z","shell.execute_reply.started":"2024-06-03T12:03:45.290099Z","shell.execute_reply":"2024-06-03T12:04:24.554427Z"},"trusted":true},"execution_count":20,"outputs":[{"output_type":"display_data","data":{"text/plain":"events.out.tfevents.1717413574.743112a2decd.34.0: 0%| | 0.00/9.10k [00:00