"""Build a class-balanced train/valid/test split of the tweets hate-speech dataset.

Downloads `tweets_hate_speech_detection`, balances the classes, samples
disjoint validation/test sets, and writes each split as JSON-lines, both
with integer labels and with human-readable translated labels.
"""
import json
import logging
import random
from pathlib import Path
from typing import Dict, List

from datasets import load_dataset

logger = logging.getLogger(__name__)

# Mapping from integer class ids to human-readable label strings.
MAP_LABEL_TRANSLATION = {
    0: 'no hate',
    1: 'hate',
}


def save_as_translations(original_save_path: Path, data_to_save: List[Dict]) -> None:
    """Write *data_to_save* next to *original_save_path* with labels translated.

    The output file is named ``translations-<original name>`` and contains one
    JSON object per line, with the integer ``label`` replaced by its string
    form from MAP_LABEL_TRANSLATION.
    """
    file_name = 'translations-' + original_save_path.name
    file_path = original_save_path.parent / file_name

    print(f'Saving into: {file_path}')
    with open(file_path, 'wt') as f_write:
        for data_line in data_to_save:
            # Copy instead of mutating the caller's dict so the original
            # integer labels remain intact for any later use.
            translated = {**data_line, 'label': MAP_LABEL_TRANSLATION[data_line['label']]}
            f_write.write(f'{json.dumps(translated)}\n')


def main() -> None:
    """Download the dataset, balance classes, build splits, and save them."""
    # Downloading data
    loaded_data = load_dataset("tweets_hate_speech_detection")
    logger.info(f'Loaded dataset hate tweets: {loaded_data}')

    # Equalization of class division: keep every hate example plus the
    # first 2500 no-hate examples.
    hate = [item for item in loaded_data["train"] if item["label"] == 1]
    no_hate = [item for item in loaded_data["train"] if item["label"] == 0][:2500]
    data_train, data_valid, data_test = hate + no_hate, [], []
    logger.info(f'Train: {len(data_train):6d}')

    # Sample 500 test examples not already used for training.
    # random.choice draws WITH replacement, so also guard against adding
    # the same example to the test set twice.
    while len(data_test) != 500:
        x = random.choice(loaded_data["train"])
        if x not in data_train and x not in data_test:
            data_test.append(x)
    logger.info(f'Test: {len(data_test):6d}')

    # Sample 500 validation examples disjoint from train and test.
    # BUG FIX: the original check was `x not in (data_train, data_test)`,
    # which compares x against the two list objects themselves (always
    # true for a dict), not against their elements — so validation data
    # could overlap train/test. Check each collection explicitly.
    while len(data_valid) != 500:
        x = random.choice(loaded_data["train"])
        if x not in data_train and x not in data_test and x not in data_valid:
            data_valid.append(x)
    logger.info(f'Valid: {len(data_valid):6d}')

    save_path = Path('data/')
    # Ensure the output directory exists on a fresh checkout; otherwise
    # open() below raises FileNotFoundError.
    save_path.mkdir(parents=True, exist_ok=True)
    save_train_path = save_path / 'train.json'
    save_valid_path = save_path / 'valid.json'
    save_test_path = save_path / 'test.json'

    for file_path, data_to_save in [
        (save_train_path, data_train),
        (save_valid_path, data_valid),
        (save_test_path, data_test),
    ]:
        print(f'Saving into: {file_path}')
        with open(file_path, 'wt') as f_write:
            for data_line in data_to_save:
                f_write.write(f'{json.dumps(data_line)}\n')
        # Also emit the split with human-readable labels.
        save_as_translations(file_path, data_to_save)
if __name__ == '__main__':
    # Entry point when run as a script.
    main()