msc-patryk-bartkowiak/code/preprocess_data.py

from datasets import load_dataset, disable_caching
from transformers import RobertaTokenizer

disable_caching()


def visible_print(text):
    print('\n\n')
    print('=' * 100)
    print(text)
    print('=' * 100)
    print('\n\n')


if __name__ == '__main__':
    # Load the datasets: The Stack (Python) for training, CodeSearchNet (Python) for validation and testing
    train_data = load_dataset('/work/s452638/datasets/the-stack-python', split='train')
    valid_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/valid.jsonl')['train']
    test_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/test.jsonl')['train']
    visible_print('Loaded data')
    # Rename the training split's 'content' column to 'code' to match the CodeSearchNet splits
    train_data = train_data.rename_column('content', 'code')

    # Drop every column except 'code'
    train_columns = train_data.column_names
    valid_columns = valid_data.column_names
    test_columns = test_data.column_names
    train_columns.remove('code')
    valid_columns.remove('code')
    test_columns.remove('code')
    train_data = train_data.remove_columns(train_columns)
    valid_data = valid_data.remove_columns(valid_columns)
    test_data = test_data.remove_columns(test_columns)
    visible_print('Removed unnecessary columns')
    # Tokenize the data with the CodeBERT tokenizer, truncating/padding every example to 512 tokens
    tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)

    def tokenize_function(examples):
        # return_tensors='pt' is not strictly needed here: Dataset.map stores the
        # output as Arrow lists either way, and padding to a fixed max_length keeps
        # every row the same shape.
        return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')

    train_data = train_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Train] Running tokenizer', num_proc=8)
    valid_data = valid_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Valid] Running tokenizer', num_proc=8)
    test_data = test_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Test] Running tokenizer', num_proc=8)
    visible_print('Tokenized data')
    # Save the tokenized data
    train_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/train')
    valid_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/valid')
    test_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/test')
    visible_print('Saved tokenized data')
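
# Usage note (a sketch, not part of the original script): the saved splits can be
# reloaded for training with datasets.load_from_disk, and set_format('torch')
# turns the stored input_ids/attention_mask lists back into PyTorch tensors:
#
#     from datasets import load_from_disk
#     train_data = load_from_disk('/work/s452638/datasets/the-stack-python-tokenized/train')
#     train_data.set_format('torch', columns=['input_ids', 'attention_mask'])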