new prediction

This commit is contained in:
Karol Idaszak 2022-05-25 08:52:17 +02:00
parent 4b92825d62
commit 7fdacb38a6
6 changed files with 31578 additions and 77400 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -109,7 +109,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "a19d253a",
"metadata": {},
"outputs": [
@ -1119,7 +1119,7 @@
" ...]"
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -1130,7 +1130,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"id": "645a328e",
"metadata": {},
"outputs": [
@ -2140,13 +2140,13 @@
" ...]"
]
},
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train = lzma.open(\"train/train.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"# X_train = lzma.open(\"train/train.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"Y_train = [i.split('\\t')[:2] for i in X_train]\n",
"Y_train = [sum([float(a) for a in i])/2 for i in Y_train]\n",
"Y_train"
@ -2154,7 +2154,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"id": "c9a393b0",
"metadata": {},
"outputs": [
@ -3164,7 +3164,7 @@
" ...]"
]
},
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@ -3176,7 +3176,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"id": "8fe1f0fb",
"metadata": {},
"outputs": [],
@ -3190,7 +3190,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"id": "dda7ac37",
"metadata": {},
"outputs": [],
@ -3200,22 +3200,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "e95cac9e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 16,
"id": "b70295cf",
"metadata": {},
"outputs": [],
"source": [
"t = reg.predict(X_dev0)\n",
"out = pd.DataFrame(t)\n",
"out.to_csv('outdev1.tsv', sep='\\t', index=False, header=False)"
"out.to_csv('dev-0/out.tsv', sep='\\t', index=False, header=False)"
]
},
{
@ -3304,25 +3296,9 @@
"execution_count": 19,
"id": "f87b88b6",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "Found input variables with inconsistent numbers of samples: [20000, 11563]",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Input \u001b[1;32mIn [19]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mreg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscore\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_dev1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_expected_dev1\u001b[49m\u001b[43m)\u001b[49m)\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\sklearn\\base.py:706\u001b[0m, in \u001b[0;36mRegressorMixin.score\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 703\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m r2_score\n\u001b[0;32m 705\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpredict(X)\n\u001b[1;32m--> 706\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mr2_score\u001b[49m\u001b[43m(\u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_weight\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\sklearn\\metrics\\_regression.py:789\u001b[0m, in \u001b[0;36mr2_score\u001b[1;34m(y_true, y_pred, sample_weight, multioutput)\u001b[0m\n\u001b[0;32m 702\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mr2_score\u001b[39m(y_true, y_pred, \u001b[38;5;241m*\u001b[39m, sample_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, multioutput\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muniform_average\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m 703\u001b[0m \u001b[38;5;124;03m\"\"\":math:`R^2` (coefficient of determination) regression score function.\u001b[39;00m\n\u001b[0;32m 704\u001b[0m \n\u001b[0;32m 705\u001b[0m \u001b[38;5;124;03m Best possible score is 1.0 and it can be negative (because the\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 787\u001b[0m \u001b[38;5;124;03m -3.0\u001b[39;00m\n\u001b[0;32m 788\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 789\u001b[0m y_type, y_true, y_pred, multioutput \u001b[38;5;241m=\u001b[39m \u001b[43m_check_reg_targets\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 790\u001b[0m \u001b[43m \u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmultioutput\u001b[49m\n\u001b[0;32m 791\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 792\u001b[0m check_consistent_length(y_true, y_pred, sample_weight)\n\u001b[0;32m 794\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _num_samples(y_pred) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\sklearn\\metrics\\_regression.py:94\u001b[0m, in \u001b[0;36m_check_reg_targets\u001b[1;34m(y_true, y_pred, multioutput, dtype)\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_check_reg_targets\u001b[39m(y_true, y_pred, multioutput, dtype\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumeric\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m 61\u001b[0m \u001b[38;5;124;03m\"\"\"Check that y_true and y_pred belong to the same regression task.\u001b[39;00m\n\u001b[0;32m 62\u001b[0m \n\u001b[0;32m 63\u001b[0m \u001b[38;5;124;03m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 92\u001b[0m \u001b[38;5;124;03m the dtype argument passed to check_array.\u001b[39;00m\n\u001b[0;32m 93\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 94\u001b[0m \u001b[43mcheck_consistent_length\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 95\u001b[0m y_true \u001b[38;5;241m=\u001b[39m check_array(y_true, ensure_2d\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, dtype\u001b[38;5;241m=\u001b[39mdtype)\n\u001b[0;32m 96\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m check_array(y_pred, ensure_2d\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, dtype\u001b[38;5;241m=\u001b[39mdtype)\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\sklearn\\utils\\validation.py:332\u001b[0m, in \u001b[0;36mcheck_consistent_length\u001b[1;34m(*arrays)\u001b[0m\n\u001b[0;32m 330\u001b[0m uniques \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39munique(lengths)\n\u001b[0;32m 331\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(uniques) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m--> 332\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 333\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFound input variables with inconsistent numbers of samples: \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 334\u001b[0m \u001b[38;5;241m%\u001b[39m [\u001b[38;5;28mint\u001b[39m(l) \u001b[38;5;28;01mfor\u001b[39;00m l \u001b[38;5;129;01min\u001b[39;00m lengths]\n\u001b[0;32m 335\u001b[0m )\n",
"\u001b[1;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [20000, 11563]"
]
}
],
"outputs": [],
"source": [
"print(reg.score(X_dev1, y_expected_dev1))"
"# print(reg.score(X_dev1, y_expected_dev1))"
]
},
{
@ -3330,24 +3306,9 @@
"execution_count": 20,
"id": "1abd7196",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "Found input variables with inconsistent numbers of samples: [20000, 11563]",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Input \u001b[1;32mIn [20]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mmean_squared_error\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_expected_dev1\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_dev1\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msquared\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\sklearn\\metrics\\_regression.py:438\u001b[0m, in \u001b[0;36mmean_squared_error\u001b[1;34m(y_true, y_pred, sample_weight, multioutput, squared)\u001b[0m\n\u001b[0;32m 378\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mmean_squared_error\u001b[39m(\n\u001b[0;32m 379\u001b[0m y_true, y_pred, \u001b[38;5;241m*\u001b[39m, sample_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, multioutput\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muniform_average\u001b[39m\u001b[38;5;124m\"\u001b[39m, squared\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m 380\u001b[0m ):\n\u001b[0;32m 381\u001b[0m \u001b[38;5;124;03m\"\"\"Mean squared error regression loss.\u001b[39;00m\n\u001b[0;32m 382\u001b[0m \n\u001b[0;32m 383\u001b[0m \u001b[38;5;124;03m Read more in the :ref:`User Guide <mean_squared_error>`.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 436\u001b[0m \u001b[38;5;124;03m 0.825...\u001b[39;00m\n\u001b[0;32m 437\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 438\u001b[0m y_type, y_true, y_pred, multioutput \u001b[38;5;241m=\u001b[39m \u001b[43m_check_reg_targets\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 439\u001b[0m \u001b[43m \u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmultioutput\u001b[49m\n\u001b[0;32m 440\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 441\u001b[0m check_consistent_length(y_true, y_pred, sample_weight)\n\u001b[0;32m 442\u001b[0m output_errors \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39maverage((y_true \u001b[38;5;241m-\u001b[39m y_pred) \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m \u001b[38;5;241m2\u001b[39m, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, weights\u001b[38;5;241m=\u001b[39msample_weight)\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\sklearn\\metrics\\_regression.py:94\u001b[0m, in \u001b[0;36m_check_reg_targets\u001b[1;34m(y_true, y_pred, multioutput, dtype)\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_check_reg_targets\u001b[39m(y_true, y_pred, multioutput, dtype\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumeric\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m 61\u001b[0m \u001b[38;5;124;03m\"\"\"Check that y_true and y_pred belong to the same regression task.\u001b[39;00m\n\u001b[0;32m 62\u001b[0m \n\u001b[0;32m 63\u001b[0m \u001b[38;5;124;03m Parameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 92\u001b[0m \u001b[38;5;124;03m the dtype argument passed to check_array.\u001b[39;00m\n\u001b[0;32m 93\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 94\u001b[0m \u001b[43mcheck_consistent_length\u001b[49m\u001b[43m(\u001b[49m\u001b[43my_true\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_pred\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 95\u001b[0m y_true \u001b[38;5;241m=\u001b[39m check_array(y_true, ensure_2d\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, dtype\u001b[38;5;241m=\u001b[39mdtype)\n\u001b[0;32m 96\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m check_array(y_pred, ensure_2d\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, dtype\u001b[38;5;241m=\u001b[39mdtype)\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python310\\site-packages\\sklearn\\utils\\validation.py:332\u001b[0m, in \u001b[0;36mcheck_consistent_length\u001b[1;34m(*arrays)\u001b[0m\n\u001b[0;32m 330\u001b[0m uniques \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39munique(lengths)\n\u001b[0;32m 331\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(uniques) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m--> 332\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 333\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFound input variables with inconsistent numbers of samples: \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 334\u001b[0m \u001b[38;5;241m%\u001b[39m [\u001b[38;5;28mint\u001b[39m(l) \u001b[38;5;28;01mfor\u001b[39;00m l \u001b[38;5;129;01min\u001b[39;00m lengths]\n\u001b[0;32m 335\u001b[0m )\n",
"\u001b[1;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [20000, 11563]"
]
}
],
"outputs": [],
"source": [
"mean_squared_error(y_expected_dev1, reg.predict(X_dev1), squared=False)\n"
"# mean_squared_error(y_expected_dev1, reg.predict(X_dev1), squared=False)\n"
]
},
{

11563
outdev0.tsv

File diff suppressed because it is too large Load Diff

20000
outdev1.tsv

File diff suppressed because it is too large Load Diff

14220
outtest.tsv

File diff suppressed because it is too large Load Diff