feat: data statistics and dataset division

This commit is contained in:
Filip Patyk 2023-03-21 23:45:10 +01:00
parent 212a1d6e38
commit a2e7b94789
1 changed files with 106 additions and 9 deletions

View File

@ -2,18 +2,19 @@
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from IPython.display import display,Markdown"
"from IPython.display import display,Markdown\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -27,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -47,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -330,7 +331,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -448,7 +449,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -479,11 +480,107 @@
"source": [
"# DataFrame.info() prints its summary itself and returns None, so wrapping it\n",
"# in display() would only add a stray \"None\" output below the report.\n",
"dataset.info()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### STD"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"y_train std: 0.49939397301167954\n",
"y_val std: 0.4997839588710888\n",
"y_test std: 0.4998194469400359\n"
]
},
{
"data": {
"text/markdown": [
"### MEAN"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"y_train mean: 0.475249178684782\n",
"y_val mean: 0.4835189309576837\n",
"y_test mean: 0.4846325167037862\n"
]
},
{
"data": {
"text/markdown": [
"### Count"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"y_train count: 35918\n",
"y_val count: 4490\n",
"y_test count: 4490\n"
]
}
],
"source": [
"# Split the dataset into train / validation / test subsets in an 8:1:1 ratio.\n",
"# A fixed seed makes both shuffles (and the statistics printed below) reproducible\n",
"# across kernel restarts.\n",
"SEED = 42\n",
"X_train, X_val_test, y_train, y_valtest = train_test_split(\n",
"    dataset[\"text\"], dataset[\"Value\"], test_size=0.2, shuffle=True, random_state=SEED\n",
")\n",
"# Halve the 20% hold-out: the first returned half is used as test, the second as validation.\n",
"X_test, X_val, y_test, y_val = train_test_split(\n",
"    X_val_test, y_valtest, test_size=0.5, shuffle=True, random_state=SEED\n",
")\n",
"# NOTE(review): the target looks like a binary label - if so, consider passing\n",
"# stratify=<labels> to both calls to keep the class ratio equal across splits. TODO confirm.\n",
"\n",
"# Report std / mean / count of the target for every split so that any imbalance\n",
"# between the subsets is visible at a glance.\n",
"target_splits = {\"y_train\": y_train, \"y_val\": y_val, \"y_test\": y_test}\n",
"for heading, stat in ((\"STD\", \"std\"), (\"MEAN\", \"mean\"), (\"Count\", \"count\")):\n",
"    display(Markdown(f\"### {heading}\"))\n",
"    for split_name, target in target_splits.items():\n",
"        # getattr(...)() calls Series.std() / .mean() / .count() by name.\n",
"        print(f\"{split_name} {stat}: {getattr(target, stat)()}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "deep",
"display_name": "dl",
"language": "python",
"name": "python3"
},
@ -502,7 +599,7 @@
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "1e61067c2f2e27a88e433eed08bcab15943261b719f4667f6d0d352911f3557f"
"hash": "6e9239598a6712340c2b580c5c929949b8a813e86738fb7cf0a67c11d0863b74"
}
}
},