import json
import logging
import random
from pathlib import Path
from typing import Dict, List

from datasets import load_dataset

logger = logging.getLogger(__name__)

MAP_LABEL_TRANSLATION = {
    0: 'no hate',
    1: 'hate',
}


def save_as_translations(original_save_path: Path, data_to_save: List[Dict]) -> None:
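    """Save a copy of the records with numeric labels mapped to their textual
    form (MAP_LABEL_TRANSLATION), prefixing the file name with 'translations-'."""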
    file_name = 'translations-' + original_save_path.name
    file_path = original_save_path.parent / file_name

    print(f'Saving into: {file_path}')
    with open(file_path, 'wt') as f_write:
        for data_line in data_to_save:
            label = data_line['label']
            new_label = MAP_LABEL_TRANSLATION[label]
            data_line['label'] = new_label
            data_line_str = json.dumps(data_line)
            f_write.write(f'{data_line_str}\n')


def main() -> None:
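    """Download the hate-speech tweets dataset, balance the classes and write
    train/validation/test splits as JSON-lines files under data/."""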
    # Download the data
    loaded_data = load_dataset("tweets_hate_speech_detection")
    logger.info(f'Loaded dataset hate tweets: {loaded_data}')

    # Balance the class distribution
    hate = [item for item in loaded_data["train"] if item["label"] == 1]
    no_hate = [item for item in loaded_data["train"] if item["label"] == 0][:2500]

    data_train, data_valid, data_test = hate + no_hate, [], []
    logger.info(f'Train: {len(data_train):6d}')

    # Draw random, non-overlapping examples for the test and validation splits
    while len(data_test) != 500:
        x = random.choice(loaded_data["train"])
        if x not in data_train and x not in data_test:
            data_test.append(x)

    logger.info(f'Test: {len(data_test):6d}')

    while len(data_valid) != 500:
        x = random.choice(loaded_data["train"])
        if x not in data_train and x not in data_test and x not in data_valid:
            data_valid.append(x)

    logger.info(f'Valid: {len(data_valid):6d}')

    save_path = Path('data/')
    save_path.mkdir(parents=True, exist_ok=True)  # make sure the output directory exists
    save_train_path = save_path / 'train.json'
    save_valid_path = save_path / 'valid.json'
    save_test_path = save_path / 'test.json'

    for file_path, data_to_save in [
        (save_train_path, data_train),
        (save_valid_path, data_valid),
        (save_test_path, data_test),
    ]:
        print(f'Saving into: {file_path}')
        with open(file_path, 'wt') as f_write:
            for data_line in data_to_save:
                data_line_str = json.dumps(data_line)
                f_write.write(f'{data_line_str}\n')

        save_as_translations(file_path, data_to_save)


if __name__ == '__main__':
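    # Basic logging setup so the logger.info calls above are visible;
    # the level chosen here is an assumption, adjust as needed.
    logging.basicConfig(level=logging.INFO)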
    main()