msc-patryk-bartkowiak/code/the_stack_test.ipynb


import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, disable_caching, DatasetDict
from transformers import RobertaForMaskedLM, RobertaConfig, RobertaTokenizer, DataCollatorForLanguageModeling

disable_caching()
train_data = load_dataset("/work/s452638/datasets/the-stack-python", split="train")
train_data
Dataset({
    features: ['hexsha', 'size', 'ext', 'lang', 'max_stars_repo_path', 'max_stars_repo_name', 'max_stars_repo_head_hexsha', 'max_stars_repo_licenses', 'max_stars_count', 'max_stars_repo_stars_event_min_datetime', 'max_stars_repo_stars_event_max_datetime', 'max_issues_repo_path', 'max_issues_repo_name', 'max_issues_repo_head_hexsha', 'max_issues_repo_licenses', 'max_issues_count', 'max_issues_repo_issues_event_min_datetime', 'max_issues_repo_issues_event_max_datetime', 'max_forks_repo_path', 'max_forks_repo_name', 'max_forks_repo_head_hexsha', 'max_forks_repo_licenses', 'max_forks_count', 'max_forks_repo_forks_event_min_datetime', 'max_forks_repo_forks_event_max_datetime', 'content', 'avg_line_length', 'max_line_length', 'alphanum_fraction'],
    num_rows: 12962249
})
valid_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/valid.jsonl')
valid_data
DatasetDict({
    train: Dataset({
        features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition'],
        num_rows: 13914
    })
})
test_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/test.jsonl')
test_data
DatasetDict({
    train: Dataset({
        features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition'],
        num_rows: 14918
    })
})
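# Note (not in the original notebook): load_dataset('json', data_files=...) always returns a
# DatasetDict with a single 'train' split, whatever the file is called, which is why the
# CodeSearchNet validation and test sets are reached as valid_data['train'] and
# test_data['train'] further down.
print(valid_data['train'].num_rows, test_data['train'].num_rows)  # 13914 14918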
train_data = train_data.rename_column('content', 'code')
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)
tokenizer
RobertaTokenizer(name_or_path='microsoft/codebert-base', vocab_size=50265, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
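# Quick sanity check (illustrative, not in the original notebook): tokenize a short made-up
# snippet to see the special tokens, truncation and padding settings used for the datasets below.
sample = "def add(a, b):\n    return a + b"
encoded = tokenizer(sample, truncation=True, padding='max_length', max_length=512)
print(len(encoded['input_ids']))                                    # 512 after padding
print(tokenizer.convert_ids_to_tokens(encoded['input_ids'][:10]))   # begins with the '<s>' token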
def tokenize_function(examples):
    return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')

train_data = train_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='Running tokenizer')
valid_data = valid_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='Running tokenizer')
test_data = test_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='Running tokenizer')
Running tokenizer:   0%|          | 1000/12962249 [00:17<61:47:17, 58.27 examples/s]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[7], line 4
      1 def tokenize_function(examples):
      2     return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
----> 4 train_data = train_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='Running tokenizer')
      5 valid_data = valid_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='Running tokenizer')
      6 test_data = test_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='Running tokenizer')

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/datasets/arrow_dataset.py:602, in transmit_tasks.<locals>.wrapper(*args, **kwargs)
    600     self: "Dataset" = kwargs.pop("self")
    601 # apply actual function
--> 602 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
    603 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
    604 for dataset in datasets:
    605     # Remove task templates if a column mapping of the template is no longer valid

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/datasets/arrow_dataset.py:567, in transmit_format.<locals>.wrapper(*args, **kwargs)
    560 self_format = {
    561     "type": self._format_type,
    562     "format_kwargs": self._format_kwargs,
    563     "columns": self._format_columns,
    564     "output_all_columns": self._output_all_columns,
    565 }
    566 # apply actual function
--> 567 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
    568 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
    569 # re-apply format to the output

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/datasets/arrow_dataset.py:3161, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
   3155 if transformed_dataset is None:
   3156     with hf_tqdm(
   3157         unit=" examples",
   3158         total=pbar_total,
   3159         desc=desc or "Map",
   3160     ) as pbar:
-> 3161         for rank, done, content in Dataset._map_single(**dataset_kwargs):
   3162             if done:
   3163                 shards_done += 1

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/datasets/arrow_dataset.py:3552, in Dataset._map_single(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)
   3548 indices = list(
   3549     range(*(slice(i, i + batch_size).indices(shard.num_rows)))
   3550 )  # Something simpler?
   3551 try:
-> 3552     batch = apply_function_on_filtered_inputs(
   3553         batch,
   3554         indices,
   3555         check_same_num_examples=len(shard.list_indexes()) > 0,
   3556         offset=offset,
   3557     )
   3558 except NumExamplesMismatchError:
   3559     raise DatasetTransformationNotAllowedError(
   3560         "Using `.map` in batched mode on a dataset with attached indexes is allowed only if it doesn't create or remove existing examples. You can first run `.drop_index() to remove your index and then re-add it."
   3561     ) from None

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/datasets/arrow_dataset.py:3421, in Dataset._map_single.<locals>.apply_function_on_filtered_inputs(pa_inputs, indices, check_same_num_examples, offset)
   3419 if with_rank:
   3420     additional_args += (rank,)
-> 3421 processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
   3422 if isinstance(processed_inputs, LazyDict):
   3423     processed_inputs = {
   3424         k: v for k, v in processed_inputs.data.items() if k not in processed_inputs.keys_to_format
   3425     }

Cell In[7], line 2, in tokenize_function(examples)
      1 def tokenize_function(examples):
----> 2     return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3055, in PreTrainedTokenizerBase.__call__(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
   3053     if not self._in_target_context_manager:
   3054         self._switch_to_input_mode()
-> 3055     encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
   3056 if text_target is not None:
   3057     self._switch_to_target_mode()

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3142, in PreTrainedTokenizerBase._call_one(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)
   3137         raise ValueError(
   3138             f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
   3139             f" {len(text_pair)}."
   3140         )
   3141     batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-> 3142     return self.batch_encode_plus(
   3143         batch_text_or_text_pairs=batch_text_or_text_pairs,
   3144         add_special_tokens=add_special_tokens,
   3145         padding=padding,
   3146         truncation=truncation,
   3147         max_length=max_length,
   3148         stride=stride,
   3149         is_split_into_words=is_split_into_words,
   3150         pad_to_multiple_of=pad_to_multiple_of,
   3151         return_tensors=return_tensors,
   3152         return_token_type_ids=return_token_type_ids,
   3153         return_attention_mask=return_attention_mask,
   3154         return_overflowing_tokens=return_overflowing_tokens,
   3155         return_special_tokens_mask=return_special_tokens_mask,
   3156         return_offsets_mapping=return_offsets_mapping,
   3157         return_length=return_length,
   3158         verbose=verbose,
   3159         split_special_tokens=split_special_tokens,
   3160         **kwargs,
   3161     )
   3162 else:
   3163     return self.encode_plus(
   3164         text=text,
   3165         text_pair=text_pair,
   (...)
   3182         **kwargs,
   3183     )

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3338, in PreTrainedTokenizerBase.batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)
   3328 # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
   3329 padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
   3330     padding=padding,
   3331     truncation=truncation,
   (...)
   3335     **kwargs,
   3336 )
-> 3338 return self._batch_encode_plus(
   3339     batch_text_or_text_pairs=batch_text_or_text_pairs,
   3340     add_special_tokens=add_special_tokens,
   3341     padding_strategy=padding_strategy,
   3342     truncation_strategy=truncation_strategy,
   3343     max_length=max_length,
   3344     stride=stride,
   3345     is_split_into_words=is_split_into_words,
   3346     pad_to_multiple_of=pad_to_multiple_of,
   3347     return_tensors=return_tensors,
   3348     return_token_type_ids=return_token_type_ids,
   3349     return_attention_mask=return_attention_mask,
   3350     return_overflowing_tokens=return_overflowing_tokens,
   3351     return_special_tokens_mask=return_special_tokens_mask,
   3352     return_offsets_mapping=return_offsets_mapping,
   3353     return_length=return_length,
   3354     verbose=verbose,
   3355     split_special_tokens=split_special_tokens,
   3356     **kwargs,
   3357 )

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/tokenization_utils.py:882, in PreTrainedTokenizer._batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)
    879 else:
    880     ids, pair_ids = ids_or_pair_ids
--> 882 first_ids = get_input_ids(ids)
    883 second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
    884 input_ids.append((first_ids, second_ids))

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/tokenization_utils.py:849, in PreTrainedTokenizer._batch_encode_plus.<locals>.get_input_ids(text)
    847 def get_input_ids(text):
    848     if isinstance(text, str):
--> 849         tokens = self.tokenize(text, **kwargs)
    850         return self.convert_tokens_to_ids(tokens)
    851     elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/tokenization_utils.py:695, in PreTrainedTokenizer.tokenize(self, text, **kwargs)
    693         tokenized_text.append(token)
    694     else:
--> 695         tokenized_text.extend(self._tokenize(token))
    696 # ["This", " is", " something", "<special_token_1>", "else"]
    697 return tokenized_text

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/transformers/models/roberta/tokenization_roberta.py:270, in RobertaTokenizer._tokenize(self, text)
    268 """Tokenize a string."""
    269 bpe_tokens = []
--> 270 for token in re.findall(self.pat, text):
    271     token = "".join(
    272         self.byte_encoder[b] for b in token.encode("utf-8")
    273     )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
    274     bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))

File ~/magisterka/magisterka_env/lib/python3.8/site-packages/regex/regex.py:338, in findall(pattern, string, flags, pos, endpos, overlapped, concurrent, timeout, ignore_unused, **kwargs)
    333 """Return a list of all matches in the string. The matches may be overlapped
    334 if overlapped is True. If one or more groups are present in the pattern,
    335 return a list of groups; this will be a list of tuples if the pattern has
    336 more than one group. Empty matches are included in the result."""
    337 pat = _compile(pattern, flags, ignore_unused, kwargs, True)
--> 338 return pat.findall(string, pos, endpos, overlapped, concurrent, timeout)

KeyboardInterrupt: 
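# The map above was interrupted by hand: at roughly 58 examples/s the slow (pure-Python)
# RobertaTokenizer would need about 62 hours for all 12,962,249 training rows. A sketch of a
# faster variant, assuming the Rust-backed fast tokenizer and a smaller test subset are
# acceptable (subset size and num_proc are illustrative):
from transformers import RobertaTokenizerFast

fast_tokenizer = RobertaTokenizerFast.from_pretrained('microsoft/codebert-base')

def fast_tokenize_function(examples):
    return fast_tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512)

small_train = train_data.select(range(100_000))      # work on a subset while testing
small_train = small_train.map(
    fast_tokenize_function,
    batched=True,
    num_proc=4,                                       # parallel worker processes
    remove_columns=small_train.column_names,          # drop every raw column, keep only token ids
    desc='Running fast tokenizer',
)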
# the .map calls above assign the tokenized data back to train_data / valid_data / test_data
train_data
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
data_collator
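# Illustrative check (not in the original notebook): the MLM collator samples ~15% of the
# non-special tokens, replaces most of them with <mask> (some stay or become random tokens),
# and sets labels to -100 at every unsampled position so only masked tokens contribute to the loss.
example_batch = [tokenizer("print('hello world')", truncation=True, padding='max_length', max_length=16)]
collated = data_collator(example_batch)
print(collated['input_ids'].shape)   # torch.Size([1, 16])
print(collated['labels'][0])         # -100 everywhere except the sampled positions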
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 1
# valid_data and test_data are DatasetDicts whose only split is named 'train' (see above);
# DataLoader generators must be CPU generators, since the sampler draws indices on the CPU
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=data_collator, generator=torch.Generator())
valid_dataloader = DataLoader(valid_data['train'], batch_size=batch_size, shuffle=False, collate_fn=data_collator)
test_dataloader = DataLoader(test_data['train'], batch_size=batch_size, shuffle=False, collate_fn=data_collator)
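# RobertaForMaskedLM is imported at the top but never used; a minimal sketch of a single MLM
# forward pass with the pretrained CodeBERT weights, assuming the tokenized columns
# (input_ids, attention_mask) are the only ones left in the dataset so the collator receives
# clean features:
model = RobertaForMaskedLM.from_pretrained('microsoft/codebert-base').to(device)
model.train()

batch = next(iter(train_dataloader))
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)   # the collator supplies input_ids, attention_mask and labels
print(outputs.loss)        # masked-language-modelling loss for this single batch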