# 1.7 KiB
# 1.7 KiB
import lzma
# Builds train/gpt2-parsed.csv: a single-column CSV (header "text") with one
# quoted row per training example. Each row is
#     "<last 100 chars of field 6> <expected> <first 50 chars of field 7>"
# where fields come from the tab-separated, xz-compressed train/in.tsv.xz and
# <expected> is the matching line of train/expected.tsv.
# NOTE(review): fields 6 and 7 are presumably the text before and after the
# expected token -- confirm against the dataset description.


def _clean(text, filler):
    """Return *text* stripped of characters that would break the CSV row.

    Real newlines and literal backslash-n sequences are replaced by *filler*;
    double quotes are removed outright because each row is written wrapped in
    a single pair of quotes with no escaping. The replacement order is
    significant: removing a quote can bring a backslash and an ``n`` together,
    and that sequence must still be replaced afterwards.
    """
    return text.replace('\n', filler).replace('"', '').replace(r'\n', filler)


# All three files are managed by one `with` so they are closed even when a
# malformed row raises (e.g. IndexError on a line with fewer than 8 fields).
# The inputs are opened first, so a missing input does not truncate the output.
with lzma.open('train/in.tsv.xz', mode='rt', encoding='utf-8') as f, \
        open('train/expected.tsv', 'r', encoding='utf-8') as e, \
        open('train/gpt2-parsed.csv', 'w', encoding='utf-8') as file:
    file.write("text\n")
    # zip() stops at the shorter input, so surplus lines in either file are
    # silently ignored.
    for line, expected in zip(f, e):
        separated = line.split('\t')
        # Slice before cleaning so the 100/50 limits apply to the raw text.
        first_part = _clean(separated[6][-100:], ' ')
        second_part = _clean(separated[7][:50], ' ')
        # The expected value is glued between the two parts, so its newline
        # markers are dropped rather than turned into spaces.
        expected = _clean(expected, '')
        file.write(f'"{first_part} {expected} {second_part}"\n')