s444354 larger_set
This commit is contained in:
parent
6a6204e613
commit
f69ed316f2
40000
dev-0/out.tsv
40000
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
23126
dev-1/out.tsv
23126
dev-1/out.tsv
File diff suppressed because it is too large
Load Diff
63
run.ipynb
63
run.ipynb
@ -2,7 +2,7 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": 1,
|
||||||
"id": "f844d81d",
|
"id": "f844d81d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -17,7 +17,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": 13,
|
||||||
"id": "e18dcd2f",
|
"id": "e18dcd2f",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -28,7 +28,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": 3,
|
||||||
"id": "23006157",
|
"id": "23006157",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -44,7 +44,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 29,
|
"execution_count": 11,
|
||||||
"id": "7fc3427f",
|
"id": "7fc3427f",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -57,42 +57,49 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": 15,
|
||||||
"id": "b92a45ce",
|
"id": "e27d685b",
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"df = df[['Text', 'Begin']]\n",
|
|
||||||
"X_train = df['Text']\n",
|
|
||||||
"y_train = df['Begin']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 23,
|
|
||||||
"id": "7c6d4186",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"name": "stdout",
|
||||||
"text/plain": [
|
"output_type": "stream",
|
||||||
"Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
|
"text": [
|
||||||
" ('linearregression', LinearRegression())])"
|
"[Pipeline] ... (step 1 of 2) Processing tfidfvectorizer, total= 1.3min\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 23,
|
{
|
||||||
"metadata": {},
|
"ename": "KeyboardInterrupt",
|
||||||
"output_type": "execute_result"
|
"evalue": "",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_21668/3545253539.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0my_train\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Begin'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmake_pipeline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mTfidfVectorizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mLinearRegression\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||||
|
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\sklearn\\pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m 392\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_final_estimator\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[1;34m\"passthrough\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 393\u001b[0m \u001b[0mfit_params_last_step\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfit_params_steps\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 394\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_final_estimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params_last_step\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 395\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 396\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 704\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 705\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[1;33m<\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 706\u001b[1;33m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msparse_lsqr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_centered\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 707\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcoef_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 708\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_residues\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\scipy\\sparse\\linalg\\isolve\\lsqr.py\u001b[0m in \u001b[0;36mlsqr\u001b[1;34m(A, b, damp, atol, btol, conlim, iter_lim, show, calc_var, x0)\u001b[0m\n\u001b[0;32m 411\u001b[0m \u001b[1;31m# beta*u = a*v - alfa*u,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 412\u001b[0m \u001b[1;31m# alfa*v = A'*u - beta*v.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 413\u001b[1;33m \u001b[0mu\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mA\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmatvec\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0malfa\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mu\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 414\u001b[0m \u001b[0mbeta\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlinalg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnorm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 415\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\scipy\\sparse\\linalg\\interface.py\u001b[0m in \u001b[0;36mmatvec\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 230\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'dimension mismatch'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 231\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 232\u001b[1;33m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_matvec\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 233\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 234\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmatrix\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\scipy\\sparse\\linalg\\interface.py\u001b[0m in \u001b[0;36m_matvec\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 528\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 529\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_matvec\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 530\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__matvec_impl\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 531\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 532\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_rmatvec\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_base.py\u001b[0m in \u001b[0;36mmatvec\u001b[1;34m(b)\u001b[0m\n\u001b[0;32m 694\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 695\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mmatvec\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 696\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_offset_scale\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 697\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 698\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mrmatvec\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\scipy\\sparse\\base.py\u001b[0m in \u001b[0;36mdot\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 357\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 358\u001b[0m \"\"\"\n\u001b[1;32m--> 359\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mother\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 360\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 361\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mpower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\scipy\\sparse\\base.py\u001b[0m in \u001b[0;36m__mul__\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 465\u001b[0m \u001b[1;31m# Fast path for the most common case\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 466\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mother\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mN\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 467\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_mul_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mother\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 468\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mother\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mN\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 469\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_mul_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mother\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mM\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\scipy\\sparse\\compressed.py\u001b[0m in \u001b[0;36m_mul_vector\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 476\u001b[0m \u001b[1;31m# csr_matvec or csc_matvec\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 477\u001b[0m \u001b[0mfn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_sparsetools\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m'_matvec'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 478\u001b[1;33m \u001b[0mfn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mM\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mN\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindptr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindices\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mother\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 479\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 480\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
"df = df[['Text', 'Begin']]\n",
|
||||||
|
"X_train = df['Text']\n",
|
||||||
|
"y_train = df['Begin']\n",
|
||||||
"model = make_pipeline(TfidfVectorizer(), LinearRegression())\n",
|
"model = make_pipeline(TfidfVectorizer(), LinearRegression())\n",
|
||||||
"model.fit(X_train, y_train)"
|
"model.fit(X_train, y_train)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 30,
|
"execution_count": null,
|
||||||
"id": "7497ecb0",
|
"id": "7497ecb0",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
17
run.py
17
run.py
@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
# In[16]:
|
# In[1]:
|
||||||
|
|
||||||
|
|
||||||
import lzma
|
import lzma
|
||||||
@ -12,14 +12,14 @@ from sklearn.metrics import mean_squared_error
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
# In[18]:
|
# In[13]:
|
||||||
|
|
||||||
|
|
||||||
with lzma.open('train/train.tsv.xz', 'rt', encoding="utf-8") as f:
|
with lzma.open('train/train.tsv.xz', 'rt', encoding="utf-8") as f:
|
||||||
df = pd.read_csv(f, sep='\t', names=['Begin', 'End', 'Title', 'Publisher', 'Text'])
|
df = pd.read_csv(f, sep='\t', names=['Begin', 'End', 'Title', 'Publisher', 'Text'])
|
||||||
|
|
||||||
|
|
||||||
# In[19]:
|
# In[3]:
|
||||||
|
|
||||||
|
|
||||||
def readFile(filename):
|
def readFile(filename):
|
||||||
@ -31,7 +31,7 @@ def readFile(filename):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
# In[29]:
|
# In[11]:
|
||||||
|
|
||||||
|
|
||||||
def predict(filename, predictions):
|
def predict(filename, predictions):
|
||||||
@ -40,22 +40,17 @@ def predict(filename, predictions):
|
|||||||
f.write(str(p) + "\n")
|
f.write(str(p) + "\n")
|
||||||
|
|
||||||
|
|
||||||
# In[22]:
|
# In[15]:
|
||||||
|
|
||||||
|
|
||||||
df = df[['Text', 'Begin']]
|
df = df[['Text', 'Begin']]
|
||||||
X_train = df['Text']
|
X_train = df['Text']
|
||||||
y_train = df['Begin']
|
y_train = df['Begin']
|
||||||
|
|
||||||
|
|
||||||
# In[23]:
|
|
||||||
|
|
||||||
|
|
||||||
model = make_pipeline(TfidfVectorizer(), LinearRegression())
|
model = make_pipeline(TfidfVectorizer(), LinearRegression())
|
||||||
model.fit(X_train, y_train)
|
model.fit(X_train, y_train)
|
||||||
|
|
||||||
|
|
||||||
# In[30]:
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
filenames=[('dev-0/in.tsv',"dev-0/out.tsv"), ('dev-1/in.tsv', "dev-1/out.tsv"), ('test-A/in.tsv', 'test-A/out.tsv')]
|
filenames=[('dev-0/in.tsv',"dev-0/out.tsv"), ('dev-1/in.tsv', "dev-1/out.tsv"), ('test-A/in.tsv', 'test-A/out.tsv')]
|
||||||
|
28440
test-A/out.tsv
28440
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user