challenging-america-word-ga.../prepare_dataset.py

import lzma
import json


def preprocess_train_data(X, y):
    """Join each line's left context, the expected (masked) word, and right context."""
    parsed_data = []
    for line, masked in zip(X, y):
        fields = line.strip().split('\t')
        # Columns 6 and 7 hold the left and right context; embedded newlines are escaped as '\n'.
        left = fields[6].replace('\\n', ' ')
        right = fields[7].replace('\\n', ' ')
        masked = masked.strip()
        text = left + f' {masked} ' + right
        parsed_data.append({'text': text})
    return parsed_data


with lzma.open('train/in.tsv.xz', mode='rt', encoding='utf-8') as f:
    X = f.readlines()
with open('train/expected.tsv', mode='rt', encoding='utf-8') as f:
    y = f.readlines()

data = preprocess_train_data(X, y)
# Keep only the first 10k examples and split them 80/20 into train/validation.
data = data[:10000]
train_data = data[:int(len(data) * 0.8)]
val_data = data[int(len(data) * 0.8):]

with open('train/train.json', mode='wt', encoding='utf-8') as f:
    json.dump(train_data, f)
with open('train/val.json', mode='wt', encoding='utf-8') as f:
    json.dump(val_data, f)
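
# Quick sanity check (a minimal sketch, not part of the original script): reload the
# emitted JSON and confirm the record count and the shape of one example before
# feeding the files to any downstream training code.
with open('train/train.json', mode='rt', encoding='utf-8') as f:
    reloaded = json.load(f)
print(f'{len(reloaded)} training records; first text: {reloaded[0]["text"][:80]!r}')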