gpt2 finetuning
This commit is contained in:
parent
fbe7c00390
commit
a87a4d2c53
2
.gitignore
vendored
2
.gitignore
vendored
@ -10,3 +10,5 @@
|
|||||||
dev-0/in.tsv
|
dev-0/in.tsv
|
||||||
test-A/in.tsv
|
test-A/in.tsv
|
||||||
train/in.tsv
|
train/in.tsv
|
||||||
|
|
||||||
|
gpt2-parsed.csv
|
21038
dev-0/out.tsv
21038
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
@ -1,7 +1,9 @@
|
|||||||
description: trigram nn
|
description: gpt2 finetuning
|
||||||
tags:
|
tags:
|
||||||
- neural-network
|
- neural-network
|
||||||
- trigram
|
- gpt2
|
||||||
|
params:
|
||||||
|
top: 50
|
||||||
unwanted-params:
|
unwanted-params:
|
||||||
- model-file
|
- model-file
|
||||||
- vocab-file
|
- vocab-file
|
||||||
|
59
parser.ipynb
Normal file
59
parser.ipynb
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import lzma\n",
|
||||||
|
"\n",
|
||||||
|
"f = lzma.open('train/in.tsv.xz', mode='rt', encoding='utf-8')\n",
|
||||||
|
"e = open('train/expected.tsv', 'r', encoding='utf-8')\n",
|
||||||
|
"\n",
|
||||||
|
"with open('train/gpt2-parsed.csv', 'w', encoding='utf-8') as file:\n",
|
||||||
|
" file.write(\"text\\n\")\n",
|
||||||
|
" for line, expected in zip(f, e):\n",
|
||||||
|
" separated = line.split('\\t')\n",
|
||||||
|
" first_part = separated[6][-100:].replace('\\n', ' ')\n",
|
||||||
|
" second_part = separated[7][:50].replace('\\n', ' ')\n",
|
||||||
|
" expected = expected.replace('\\n', '')\n",
|
||||||
|
"\n",
|
||||||
|
" first_part = first_part.replace('\"', '')\n",
|
||||||
|
" second_part = second_part.replace('\"', '')\n",
|
||||||
|
" expected = expected.replace('\"', '')\n",
|
||||||
|
"\n",
|
||||||
|
" first_part = first_part.replace(r'\\n', ' ')\n",
|
||||||
|
" second_part = second_part.replace(r'\\n', ' ')\n",
|
||||||
|
" expected = expected.replace(r'\\n', '')\n",
|
||||||
|
"\n",
|
||||||
|
" file.write('\"' + first_part + \" \" + expected + \" \" + second_part + '\"\\n')\n",
|
||||||
|
"\n",
|
||||||
|
"f.close()\n",
|
||||||
|
"e.close()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "modelowanie",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.10"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
14828
test-A/out.tsv
14828
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user