60 lines
1.7 KiB
Plaintext
60 lines
1.7 KiB
Plaintext
{
|
|
"cells": [
|
|
{
 "cell_type": "code",
 "execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
  "import lzma\n",
  "\n",
  "\n",
  "def clean(text, newline_replacement):\n",
  "    \"\"\"Remove line breaks and double quotes from text.\n",
  "\n",
  "    Real newline characters and literal backslash-n sequences are both\n",
  "    replaced by newline_replacement; double quotes are dropped because the\n",
  "    output row is wrapped in double quotes when written.\n",
  "    \"\"\"\n",
  "    text = text.replace('\\n', newline_replacement)\n",
  "    text = text.replace('\"', '')\n",
  "    return text.replace(r'\\n', newline_replacement)\n",
  "\n",
  "\n",
  "# Open all three files in a single with-statement so every handle is closed\n",
  "# even if a malformed row raises part-way through the loop.\n",
  "with lzma.open('train/in.tsv.xz', mode='rt', encoding='utf-8') as f, \\\n",
  "        open('train/expected.tsv', 'r', encoding='utf-8') as e, \\\n",
  "        open('train/gpt2-parsed.csv', 'w', encoding='utf-8') as file:\n",
  "    file.write(\"text\\n\")\n",
  "    for line, expected in zip(f, e):\n",
  "        separated = line.split('\\t')\n",
  "        # The last 100 characters of column 6 and the first 50 characters of\n",
  "        # column 7 are written on either side of the expected value.\n",
  "        first_part = clean(separated[6][-100:], ' ')\n",
  "        second_part = clean(separated[7][:50], ' ')\n",
  "        expected = clean(expected, '')\n",
  "\n",
  "        file.write('\"' + first_part + \" \" + expected + \" \" + second_part + '\"\\n')"
 ]
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "modelowanie",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.10"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|