gpt2 finetuning

This commit is contained in:
Jakub Adamski 2023-06-15 15:30:16 +02:00
parent fbe7c00390
commit a87a4d2c53
5 changed files with 17999 additions and 17936 deletions

.gitignore

@@ -9,4 +9,6 @@
dev-0/in.tsv
test-A/in.tsv
train/in.tsv
train/in.tsv
gpt2-parsed.csv

File diff suppressed because it is too large


@@ -1,7 +1,9 @@
-description: trigram nn
+description: gpt2 finetuning
 tags:
 - neural-network
-- trigram
+- gpt2
 params:
   top: 50
 unwanted-params:
+- model-file
+- vocab-file

parser.ipynb (new file)

@@ -0,0 +1,59 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import lzma\n",
"\n",
"# open the xz-compressed training inputs and the expected (gap) words\n",
"f = lzma.open('train/in.tsv.xz', mode='rt', encoding='utf-8')\n",
"e = open('train/expected.tsv', 'r', encoding='utf-8')\n",
"\n",
"with open('train/gpt2-parsed.csv', 'w', encoding='utf-8') as file:\n",
"    file.write(\"text\\n\")  # single-column CSV header\n",
"    for line, expected in zip(f, e):\n",
"        separated = line.split('\\t')\n",
"        # field 6 of the TSV holds the text before the gap, field 7 the text after it;\n",
"        # keep only the last 100 / first 50 characters around the gap\n",
"        first_part = separated[6][-100:].replace('\\n', ' ')\n",
"        second_part = separated[7][:50].replace('\\n', ' ')\n",
"        expected = expected.replace('\\n', '')\n",
"\n",
"        # strip double quotes so the quoted CSV field stays well-formed\n",
"        first_part = first_part.replace('\"', '')\n",
"        second_part = second_part.replace('\"', '')\n",
"        expected = expected.replace('\"', '')\n",
"\n",
"        # also drop literal backslash-n sequences left in the raw text\n",
"        first_part = first_part.replace(r'\\n', ' ')\n",
"        second_part = second_part.replace(r'\\n', ' ')\n",
"        expected = expected.replace(r'\\n', '')\n",
"\n",
"        file.write('\"' + first_part + \" \" + expected + \" \" + second_part + '\"\\n')\n",
"\n",
"f.close()\n",
"e.close()"
]
}
]
}
],
"metadata": {
"kernelspec": {
"display_name": "modelowanie",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large
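
The large diffs suppressed above presumably contain the finetuning notebook and its regenerated outputs; only the data-preparation step is visible in this view. As a rough, hypothetical sketch (not the code from the suppressed files), the train/gpt2-parsed.csv produced by parser.ipynb, a single quoted "text" column holding left context, expected word and right context, could be fed into causal-LM finetuning with the Hugging Face transformers and datasets libraries:

# Hypothetical sketch only: finetune GPT-2 on the parsed CSV.
# Assumes the Hugging Face `transformers` and `datasets` packages;
# the commit's actual training code is in the suppressed diffs.
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token
model = AutoModelForCausalLM.from_pretrained("gpt2")

# The CSV written by parser.ipynb has a single "text" column.
dataset = load_dataset("csv", data_files={"train": "train/gpt2-parsed.csv"})

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=128)

tokenized = dataset["train"].map(tokenize, batched=True, remove_columns=["text"])

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="gpt2-finetuned",
        num_train_epochs=1,
        per_device_train_batch_size=8,
    ),
    train_dataset=tokenized,
    # mlm=False gives plain causal language modeling (next-token prediction)
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
trainer.train()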