gpt2 finetuning
This commit is contained in:
parent
fbe7c00390
commit
a87a4d2c53
2
.gitignore
vendored
2
.gitignore
vendored
@ -10,3 +10,5 @@
|
||||
dev-0/in.tsv
|
||||
test-A/in.tsv
|
||||
train/in.tsv
|
||||
|
||||
gpt2-parsed.csv
|
21038
dev-0/out.tsv
21038
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
@ -1,7 +1,9 @@
|
||||
description: trigram nn
|
||||
description: gpt2 finetuning
|
||||
tags:
|
||||
- neural-network
|
||||
- trigram
|
||||
- gpt2
|
||||
params:
|
||||
top: 50
|
||||
unwanted-params:
|
||||
- model-file
|
||||
- vocab-file
|
||||
|
59
parser.ipynb
Normal file
59
parser.ipynb
Normal file
@ -0,0 +1,59 @@
|
||||
{
|
||||
"cells": [
|
||||
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import lzma\n",
"\n",
"IN_PATH = 'train/in.tsv.xz'\n",
"EXPECTED_PATH = 'train/expected.tsv'\n",
"OUT_PATH = 'train/gpt2-parsed.csv'\n",
"\n",
"\n",
"def clean(text, newline_replacement):\n",
"    \"\"\"Sanitize one field for the quoted, single-column CSV.\n",
"\n",
"    Real newlines and literal backslash-n sequences are replaced with\n",
"    newline_replacement; double quotes are dropped because every row is\n",
"    written wrapped in double quotes.\n",
"    \"\"\"\n",
"    text = text.replace('\\n', newline_replacement)\n",
"    text = text.replace('\"', '')\n",
"    return text.replace(r'\\n', newline_replacement)\n",
"\n",
"\n",
"# Context managers close every file even if a row raises (e.g. a line with\n",
"# fewer than 8 tab-separated columns); inputs are opened before the output\n",
"# so a missing input does not leave a truncated CSV behind.\n",
"with lzma.open(IN_PATH, mode='rt', encoding='utf-8') as f:\n",
"    with open(EXPECTED_PATH, 'r', encoding='utf-8') as e:\n",
"        with open(OUT_PATH, 'w', encoding='utf-8') as file:\n",
"            # Header of the single CSV column.\n",
"            file.write(\"text\\n\")\n",
"            for line, expected in zip(f, e):\n",
"                separated = line.split('\\t')\n",
"                # Keep the last 100 characters of column 6 and the first 50\n",
"                # of column 7; the expected value is placed between them.\n",
"                first_part = clean(separated[6][-100:], ' ')\n",
"                second_part = clean(separated[7][:50], ' ')\n",
"                expected = clean(expected, '')\n",
"\n",
"                file.write('\"' + first_part + \" \" + expected + \" \" + second_part + '\"\\n')"
]
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "modelowanie",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.10"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
14828
test-A/out.tsv
14828
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user