challenging-america-word-ga.../parser.ipynb

1.7 KiB

import lzma

def _scrub(text, filler):
    """Return *text* with newlines replaced by *filler* and double quotes removed.

    The replacement order matters and mirrors the original script: real
    newline characters first, then double quotes, then the literal
    two-character sequence backslash + ``n`` (which may only appear once a
    quote sitting between the backslash and the ``n`` has been removed).
    """
    return text.replace('\n', filler).replace('"', '').replace(r'\n', filler)


def format_row(line, expected):
    """Build one quoted output row from an input TSV line and its expected word.

    The row is made of the last 100 characters of column 6, the expected
    word, and the first 50 characters of column 7 (0-based column indices),
    joined by single spaces and wrapped in double quotes.

    Args:
        line: One raw line of the tab-separated input, trailing newline
            included.
        expected: The matching raw line of the expected-words file.

    Returns:
        The row text, terminated by a newline.

    Raises:
        IndexError: If *line* has fewer than eight tab-separated columns.
    """
    columns = line.split('\t')
    # Slicing happens before cleaning, so the 100/50 limits apply to the raw
    # text. Column 7 normally carries the line's trailing newline, which
    # becomes a trailing space when the column is shorter than 50 characters.
    before = _scrub(columns[6][-100:], ' ')
    after = _scrub(columns[7][:50], ' ')
    word = _scrub(expected, '')
    # Double quotes were stripped above, so wrapping the row in quotes cannot
    # produce an unbalanced quoted field.
    return f'"{before} {word} {after}"\n'


def write_parsed_csv(in_path='train/in.tsv.xz',
                     expected_path='train/expected.tsv',
                     out_path='train/gpt2-parsed.csv'):
    """Convert the xz-compressed TSV plus expected words into a one-column CSV.

    Writes a ``text`` header followed by one row per pair of input lines.
    Pairing uses ``zip``, so processing stops at the end of the shorter file.

    All three files are managed by one ``with`` statement so that every
    handle is closed even if a malformed line raises part-way through.
    """
    with lzma.open(in_path, mode='rt', encoding='utf-8') as source, \
            open(expected_path, 'r', encoding='utf-8') as answers, \
            open(out_path, 'w', encoding='utf-8') as out:
        out.write("text\n")
        for line, expected in zip(source, answers):
            out.write(format_row(line, expected))


# A notebook cell and a directly executed script both run with
# __name__ == "__main__", so the conversion still happens on execution.
if __name__ == "__main__":
    write_parsed_csv()