from datasets import load_dataset, disable_caching
from transformers import RobertaTokenizer

# Disable the datasets cache so intermediate transform results are not
# written to the persistent cache directory.
disable_caching()

def visible_print(text):
    """Print a banner-style message so progress markers stand out in long job logs."""
    print('\n\n')
    print('=' * 100)
    print(text)
    print('=' * 100)
    print('\n\n')


if __name__ == '__main__':
    # Load the dataset: the local Stack (Python) dump for training, plus the
    # CodeSearchNet Python validation and test splits from JSONL files.
    train_data = load_dataset('/work/s452638/datasets/the-stack-python', split='train')
    valid_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/valid.jsonl')['train']
    test_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/test.jsonl')['train']

    visible_print('Loaded data')
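
    # Note: load_dataset('json', ...) without a split argument returns a
    # DatasetDict whose single default split is named 'train', hence the
    # ['train'] indexing on the valid/test loads above.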

    # Rename the columns: The Stack stores source code under 'content', while
    # CodeSearchNet already uses 'code'.
    train_data = train_data.rename_column('content', 'code')

    # Remove all the columns except the code
    train_columns = train_data.column_names
    valid_columns = valid_data.column_names
    test_columns = test_data.column_names

    train_columns.remove('code')
    valid_columns.remove('code')
    test_columns.remove('code')

    train_data = train_data.remove_columns(train_columns)
    valid_data = valid_data.remove_columns(valid_columns)
    test_data = test_data.remove_columns(test_columns)

    visible_print('Removed unnecessary columns')
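
    # Aside: recent datasets releases offer an equivalent one-liner for the
    # column pruning above (noted only as an option; the installed version
    # may predate it):
    #   train_data = train_data.select_columns(['code'])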

    # Tokenize the data with the CodeBERT tokenizer, truncating/padding every
    # example to a fixed length of 512 tokens.
    tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)

    def tokenize_function(examples):
        # No return_tensors here: Dataset.map stores plain lists in Arrow anyway,
        # so materialising torch tensors per batch would be wasted work.
        return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512)

    train_data = train_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Train] Running tokenizer', num_proc=8)
    valid_data = valid_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Valid] Running tokenizer', num_proc=8)
    test_data = test_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Test] Running tokenizer', num_proc=8)

    visible_print('Tokenized data')
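
    # Illustrative sanity check (not part of the original pipeline): after the
    # map, every example should hold fixed-length input_ids/attention_mask lists.
    assert len(train_data[0]['input_ids']) == 512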

    # Save the tokenized data
    train_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/train')
    valid_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/valid')
    test_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/test')

    visible_print('Saved tokenized data')
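
    # Usage sketch: downstream training code can restore the splits with
    # datasets.load_from_disk (shown here purely as an illustration).
    from datasets import load_from_disk
    reloaded_train = load_from_disk('/work/s452638/datasets/the-stack-python-tokenized/train')
    visible_print(f'Reload check: {len(reloaded_train):,} tokenized training examples')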