diff --git a/dane.ipynb b/dane.ipynb index 36eae4a..53b8a44 100644 --- a/dane.ipynb +++ b/dane.ipynb @@ -2,18 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", - "from IPython.display import display,Markdown" + "from IPython.display import display,Markdown\n", + "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -27,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -330,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -448,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -479,11 +480,107 @@ "source": [ "display(dataset.info())" ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### STD" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "y_train std: 0.49939397301167954\n", + "y_val std: 0.4997839588710888\n", + "y_test std: 0.4998194469400359\n" + ] + }, + { + "data": { + "text/markdown": [ + "### MEAN" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "y_train mean: 0.475249178684782\n", + "y_val mean: 0.4835189309576837\n", + "y_test mean: 0.4846325167037862\n" + ] + }, + { + "data": { + "text/markdown": [ + "### Count" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "y_train count: 35918\n", + "y_val count: 4490\n", + "y_test count: 4490\n" + ] + } + ], + "source": [ + "# Split into train/val/test at 8:1:1; the fixed seed makes the split reproducible and stratifying on the label keeps class balance equal across splits\n", + "X_train, X_val_test, y_train, y_valtest = train_test_split(dataset[\"text\"],dataset[\"Value\"],test_size=0.2, shuffle=True, random_state=42, stratify=dataset[\"Value\"])\n", + "X_test, X_val, y_test, y_val = train_test_split(X_val_test,y_valtest,test_size=0.5, shuffle=True, random_state=42, stratify=y_valtest)\n", + "display(Markdown(\"### STD\"))\n", + "print(f\"y_train std: {y_train.std()}\")\n", + "print(f\"y_val std: {y_val.std()}\")\n", + "print(f\"y_test std: {y_test.std()}\")\n", + "\n", + "display(Markdown(\"### MEAN\"))\n", + "print(f\"y_train mean: {y_train.mean()}\")\n", + "print(f\"y_val mean: {y_val.mean()}\")\n", + "print(f\"y_test mean: {y_test.mean()}\")\n", + "\n", + "display(Markdown(\"### Count\"))\n", + "print(f\"y_train count: {y_train.count()}\")\n", + "print(f\"y_val count: {y_val.count()}\")\n", + "print(f\"y_test count: {y_test.count()}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "deep", + "display_name": "dl", "language": "python", "name": "python3" }, @@ -502,7 +599,7 @@ "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "1e61067c2f2e27a88e433eed08bcab15943261b719f4667f6d0d352911f3557f" + "hash": "6e9239598a6712340c2b580c5c929949b8a813e86738fb7cf0a67c11d0863b74" } } },