import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, disable_caching, DatasetDict
from transformers import RobertaForMaskedLM, RobertaConfig, RobertaTokenizer, DataCollatorForLanguageModeling
disable_caching()
train_data = load_dataset("/work/s452638/datasets/the-stack-python", split="train")
train_data
Dataset({
    features: ['hexsha', 'size', 'ext', 'lang', 'max_stars_repo_path', 'max_stars_repo_name', 'max_stars_repo_head_hexsha', 'max_stars_repo_licenses', 'max_stars_count', 'max_stars_repo_stars_event_min_datetime', 'max_stars_repo_stars_event_max_datetime', 'max_issues_repo_path', 'max_issues_repo_name', 'max_issues_repo_head_hexsha', 'max_issues_repo_licenses', 'max_issues_count', 'max_issues_repo_issues_event_min_datetime', 'max_issues_repo_issues_event_max_datetime', 'max_forks_repo_path', 'max_forks_repo_name', 'max_forks_repo_head_hexsha', 'max_forks_repo_licenses', 'max_forks_count', 'max_forks_repo_forks_event_min_datetime', 'max_forks_repo_forks_event_max_datetime', 'content', 'avg_line_length', 'max_line_length', 'alphanum_fraction'],
    num_rows: 12962249
})
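With almost 13 million Python files in this split, tokenizing everything interactively is costly. A minimal sketch of carving out a smaller, reproducible subset for debugging the pipeline; the 100 000-row sample size is an arbitrary assumption, not part of the original run:

# Optional: shuffle with a fixed seed and keep an illustrative 100k-file sample while prototyping.
small_train = train_data.shuffle(seed=42).select(range(100_000))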
valid_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/valid.jsonl')
valid_data
DatasetDict({
    train: Dataset({
        features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition'],
        num_rows: 13914
    })
})
test_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/test.jsonl')
test_data
DatasetDict({
    train: Dataset({
        features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition'],
        num_rows: 14918
    })
})
train_data = train_data.rename_column('content', 'code')
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)
tokenizer
RobertaTokenizer(name_or_path='microsoft/codebert-base', vocab_size=50265, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
    0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
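As a quick sanity check, the tokenizer can be exercised on a short, made-up snippet before committing to the full map; this cell is purely illustrative:

example = "def add(a, b):\n    return a + b"
encoded = tokenizer(example)
print(tokenizer.convert_ids_to_tokens(encoded['input_ids']))  # BPE tokens wrapped in <s> ... </s>
print(tokenizer.decode(encoded['input_ids']))                 # round-trips back to the snippet plus special tokens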
def tokenize_function(examples):
    # Pad/truncate every file to the model's 512-token context; masking is applied later by the data collator.
    return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512)

# Drop all original columns so that only the tokenizer outputs reach the data collator.
train_data = train_data.map(tokenize_function, batched=True, remove_columns=train_data.column_names, desc='Running tokenizer')
valid_data = valid_data.map(tokenize_function, batched=True, remove_columns=valid_data['train'].column_names, desc='Running tokenizer')
test_data = test_data.map(tokenize_function, batched=True, remove_columns=test_data['train'].column_names, desc='Running tokenizer')
Running tokenizer: 0%| | 1000/12962249 [00:17<61:47:17, 58.27 examples/s]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[7], line 4, in <module>: train_data.map(tokenize_function, ...)
File ~/magisterka/magisterka_env/lib/python3.8/site-packages/datasets/arrow_dataset.py, in Dataset.map / Dataset._map_single / apply_function_on_filtered_inputs
Cell In[7], line 2, in tokenize_function: tokenizer(examples['code'], ...)
File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/tokenization_utils_base.py, in PreTrainedTokenizerBase.__call__ / _call_one / batch_encode_plus
File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/tokenization_utils.py, in PreTrainedTokenizer._batch_encode_plus / get_input_ids / tokenize
File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/models/roberta/tokenization_roberta.py, in RobertaTokenizer._tokenize
File ~/magisterka/magisterka_env/lib/python3.8/site-packages/regex/regex.py, in findall
KeyboardInterrupt:
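At roughly 58 examples/s, the slow Python tokenizer (is_fast=False above) would need weeks to cover the 12 962 249 training files, which is why the cell was interrupted. Below is a hedged sketch of a faster variant, assuming the Rust-backed fast tokenizer is available for microsoft/codebert-base and that multiprocessing is acceptable on this machine; the worker count is an arbitrary choice:

from transformers import AutoTokenizer

# Fast (Rust-backed) tokenizer for the same checkpoint as above.
fast_tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base', use_fast=True)

def fast_tokenize_function(examples):
    # Same settings as tokenize_function, only the backend changes.
    return fast_tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512)

# batched=True plus num_proc spreads the work over several worker processes.
train_data = train_data.map(fast_tokenize_function, batched=True, num_proc=8,
                            remove_columns=train_data.column_names, desc='Running fast tokenizer')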
# Collect the tokenized splits under one DatasetDict so they can be addressed uniformly below.
tokenized_datasets = DatasetDict({
    'train': train_data,
    'valid': valid_data['train'],
    'test': test_data['train'],
})
tokenized_datasets
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
data_collator
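To see the dynamic masking in action, one can collate a couple of tokenized examples and inspect the result; this inspection cell is illustrative and assumes the tokenized splits built above:

sample_batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
print(sample_batch['input_ids'].shape)                # torch.Size([2, 512])
print((sample_batch['labels'] != -100).sum().item())  # positions selected for masking; all others are ignored by the loss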
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 1
# The sampler's generator has to be a CPU generator even when training on GPU; shuffling only matters for the training split.
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, shuffle=True, collate_fn=data_collator, generator=torch.Generator())
valid_dataloader = DataLoader(tokenized_datasets['valid'], batch_size=batch_size, shuffle=False, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=batch_size, shuffle=False, collate_fn=data_collator)
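RobertaForMaskedLM and RobertaConfig are imported at the top but not used yet; the following is a minimal sketch of how a training step could consume these dataloaders, where the config sizes, learning rate, and single demonstrated step are illustrative assumptions rather than settings from the original run:

# RoBERTa sized to the tokenizer's vocabulary; 514 position embeddings cover 512 tokens plus RoBERTa's offset.
config = RobertaConfig(vocab_size=tokenizer.vocab_size, max_position_embeddings=514)
model = RobertaForMaskedLM(config).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)  # assumed learning rate

model.train()
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)   # labels come from the MLM collator, so outputs.loss is the masked-LM loss
    outputs.loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    break  # one step shown; a full run would loop over epochs and evaluate on valid_dataloader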