import lzma
import json

# Number of parsed examples kept before splitting, and the share of those
# that goes to the training file (the remainder becomes validation data).
MAX_EXAMPLES = 10000
TRAIN_FRACTION = 0.8


def preprocess_train_data(X, y):
    """Build one text example per (input line, expected word) pair.

    Each element of ``X`` is a tab-separated line; the fields at 0-based
    positions 6 and 7 are used as the left and right context. Literal
    ``\\n`` sequences (a backslash followed by ``n``) inside the contexts
    are replaced with a space. The corresponding element of ``y`` is
    stripped and inserted between the two contexts, surrounded by single
    spaces.

    Args:
        X: Iterable of tab-separated lines with at least 8 fields.
        y: Iterable of strings paired positionally with ``X``.

    Returns:
        A list of ``{'text': ...}`` dicts, one per pair. Pairing uses
        ``zip``, so the result is as long as the shorter input.

    Raises:
        IndexError: If a line has fewer than 8 tab-separated fields.
    """
    parsed_data = []
    for line, masked in zip(X, y):
        # Strip only the line terminator before splitting. A bare strip()
        # would also eat leading/trailing tabs, so an empty first or last
        # column would shift the field indices (or raise IndexError).
        fields = line.rstrip('\r\n').split('\t')
        # Trailing whitespace of the final column is still trimmed, which
        # is what stripping the whole line did for well-formed rows.
        fields[-1] = fields[-1].rstrip()

        left = fields[6].replace('\\n', ' ')
        right = fields[7].replace('\\n', ' ')
        masked = masked.strip()

        parsed_data.append({'text': f'{left} {masked} {right}'})
    return parsed_data


def main():
    """Read the training files, build examples and write the train/val JSON."""
    with lzma.open('train/in.tsv.xz', mode='rt', encoding='utf-8') as f:
        X = f.readlines()

    with open('train/expected.tsv', mode='rt', encoding='utf-8') as f:
        y = f.readlines()

    data = preprocess_train_data(X, y)
    data = data[:MAX_EXAMPLES]

    # Sequential (unshuffled) split: first part for training, rest for
    # validation.
    split_index = int(len(data) * TRAIN_FRACTION)
    train_data = data[:split_index]
    val_data = data[split_index:]

    with open('train/train.json', mode='wt', encoding='utf-8') as f:
        json.dump(train_data, f)

    with open('train/val.json', mode='wt', encoding='utf-8') as f:
        json.dump(val_data, f)


if __name__ == '__main__':
    main()