challenging-america-word-ga.../parser.ipynb

60 lines
1.7 KiB
Plaintext
Raw Normal View History

2023-06-15 15:30:16 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import lzma\n",
"\n",
"# Character budgets for the two context fragments placed around the expected word.\n",
"FIRST_PART_CHARS = 100   # tail of column 6\n",
"SECOND_PART_CHARS = 50   # head of column 7\n",
"\n",
"\n",
"def _clean(text, newline_substitute):\n",
"    \"\"\"Return `text` without double quotes and without line breaks.\n",
"\n",
"    Real newline characters and literal backslash-n pairs are both replaced by\n",
"    `newline_substitute`. Double quotes are removed because every output row is\n",
"    wrapped in double quotes by hand below.\n",
"    \"\"\"\n",
"    return (\n",
"        text.replace('\\n', newline_substitute)\n",
"        .replace('\"', '')\n",
"        .replace(r'\\n', newline_substitute)\n",
"    )\n",
"\n",
"\n",
"# All three files are opened in one `with` so each is closed even if a row fails\n",
"# to parse (e.g. a line with fewer than eight tab-separated columns).\n",
"with lzma.open('train/in.tsv.xz', mode='rt', encoding='utf-8') as f, \\\n",
"        open('train/expected.tsv', 'r', encoding='utf-8') as e, \\\n",
"        open('train/gpt2-parsed.csv', 'w', encoding='utf-8') as file:\n",
"    file.write(\"text\\n\")  # single-column CSV header\n",
"    # zip() pairs input row i with expected row i and stops at the shorter file.\n",
"    for line, expected in zip(f, e):\n",
"        separated = line.split('\\t')\n",
"        first_part = _clean(separated[6][-FIRST_PART_CHARS:], ' ')\n",
"        second_part = _clean(separated[7][:SECOND_PART_CHARS], ' ')\n",
"        expected = _clean(expected, '')\n",
"\n",
"        file.write('\"' + first_part + \" \" + expected + \" \" + second_part + '\"\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "modelowanie",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}