37 lines
894 B
Python
37 lines
894 B
Python
|
import json
import lzma
from itertools import islice
|
||
|
|
||
|
|
||
|
def preprocess_train_data(X, y):
    """Build training texts by putting each expected word back into its gap.

    Every row of ``X`` is a tab-separated line whose columns 6 and 7
    (0-based) hold the text to the left and to the right of the gap. The
    entry of ``y`` at the same position is the word that fills the gap.
    Escaped newlines (a backslash followed by ``n``) inside the two
    contexts are replaced with a single space.

    Args:
        X: Iterable of raw tab-separated lines.
        y: Iterable of expected words, paired with ``X`` position by
            position. Surplus items of the longer iterable are ignored.

    Returns:
        A list with one ``{'text': ...}`` dict per pair, where the text is
        the left context, the word and the right context joined by single
        spaces.

    Raises:
        IndexError: If a row has fewer than eight tab-separated columns.
    """
    parsed_data = []

    for line, masked in zip(X, y):
        # Remove only the line terminator. A bare strip() would also eat
        # leading/trailing tabs, shifting the column indices (or raising
        # IndexError) whenever the first or the last column is empty.
        fields = line.rstrip('\r\n').split('\t')

        left = fields[6].replace('\\n', ' ')
        # Trailing whitespace of the right context is trimmed before the
        # escaped newlines are expanded, so regular rows produce exactly
        # the same text as before.
        right = fields[7].rstrip().replace('\\n', ' ')
        word = masked.strip()

        parsed_data.append({'text': f'{left} {word} {right}'})

    return parsed_data
|
||
|
|
||
|
|
||
|
# Build a small train/validation corpus from the gap-filling training data
# and store both parts as JSON lists of {'text': ...} records.

# Only the first MAX_EXAMPLES rows are used, so reading stops there instead
# of decompressing and preprocessing the whole training set.
MAX_EXAMPLES = 10000
# Share of the kept rows written to train.json; the rest goes to val.json.
TRAIN_FRACTION = 0.8

with lzma.open('train/in.tsv.xz', mode='rt', encoding='utf-8') as f:
    X = list(islice(f, MAX_EXAMPLES))

with open('train/expected.tsv', mode='rt', encoding='utf-8') as f:
    y = list(islice(f, MAX_EXAMPLES))

data = preprocess_train_data(X, y)

# The split is positional (no shuffling): leading rows train, trailing rows
# validate.
split_at = int(len(data) * TRAIN_FRACTION)
train_data = data[:split_at]
val_data = data[split_at:]

with open('train/train.json', mode='wt', encoding='utf-8') as f:
    json.dump(train_data, f)

with open('train/val.json', mode='wt', encoding='utf-8') as f:
    json.dump(val_data, f)
|