{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "academic-calvin",
   "metadata": {},
   "source": [
    "### Skrypt do ściągnięcia zbioru danych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "compound-politics",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the notebook's dependencies. %pip (unlike !pip) installs into the environment of\n",
    "# the running kernel, so the packages are importable from the cells below.\n",
    "%pip install --user kaggle pandas numpy seaborn\n",
    "%pip install -U scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hundred-limitation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the dataset archive through the Kaggle command-line client.\n",
    "print(\"Downloading dataset from Kaggle...\")\n",
    "!kaggle datasets download -d harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows\n",
    "print(\"Done.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "provincial-circuit",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the downloaded archive into the working directory.\n",
    "# -o overwrites existing files without prompting, so the cell can be re-run safely.\n",
    "!echo \"Unzipping archive\"\n",
    "!unzip -o imdb-dataset-of-top-1000-movies-and-tv-shows.zip\n",
    "!echo \"Done.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "armed-brisbane",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Read imdb_top_1000.csv from the working directory into the DataFrame used by later cells.\n",
    "data = pd.read_csv(\"imdb_top_1000.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "nominated-grenada",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1001 imdb_top_1000.csv\n"
     ]
    }
   ],
   "source": [
    "# Dataset size: number of lines in the CSV file, as counted by wc -l\n",
    "!wc -l imdb_top_1000.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "generic-success",
   "metadata": {},
   "source": [
    "## Usuwanie kolumn\n",
    "- Poster_Link: kolumna zawierająca linki do plakatów promujących film\n",
    "- Overview: kolumna zawierająca recenzje poszczególnych filmów"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "compliant-synthesis",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the Poster_Link and Overview columns. Reassigning (instead of dropping in place)\n",
    "# together with errors=\"ignore\" keeps the cell re-runnable once the columns are already gone.\n",
    "data = data.drop(columns=[\"Poster_Link\", \"Overview\"], errors=\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "reserved-whole",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower-case the text fields.\n",
    "text_columns = [\"Series_Title\", \"Genre\", \"Director\", \"Star1\", \"Star2\", \"Star3\", \"Star4\"]\n",
    "for column in text_columns:\n",
    "    data[column] = data[column].str.lower()\n",
    "\n",
    "# Gross is text with comma separators: strip the commas and convert to a number, turning\n",
    "# anything unparsable (including missing values) into NaN. astype(str) keeps the cell\n",
    "# re-runnable once Gross is already numeric, when the .str accessor would otherwise fail.\n",
    "gross_text = data[\"Gross\"].astype(str).str.replace(\",\", \"\", regex=False)\n",
    "data[\"Gross\"] = pd.to_numeric(gross_text, errors=\"coerce\")\n",
    "\n",
    "# Replace missing values with '' in the non-numeric columns only. A blanket replace across\n",
    "# all columns would turn numeric columns that have gaps into string columns.\n",
    "numeric_columns = data.select_dtypes(include=\"number\").columns\n",
    "non_numeric_columns = data.columns.difference(numeric_columns)\n",
    "data[non_numeric_columns] = data[non_numeric_columns].fillna(\"\")\n",
    "\n",
    "# Drop the rows without a usable Gross value; .copy() gives later cells an independent frame.\n",
    "data = data.dropna(subset=[\"Gross\"]).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "given-sodium",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Series_Title</th>\n",
       "      <th>Released_Year</th>\n",
       "      <th>Certificate</th>\n",
       "      <th>Runtime</th>\n",
       "      <th>Genre</th>\n",
       "      <th>IMDB_Rating</th>\n",
       "      <th>Meta_score</th>\n",
       "      <th>Director</th>\n",
       "      <th>Star1</th>\n",
       "      <th>Star2</th>\n",
       "      <th>Star3</th>\n",
       "      <th>Star4</th>\n",
       "      <th>No_of_Votes</th>\n",
       "      <th>Gross</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>831.000000</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>831</td>\n",
       "      <td>8.310000e+02</td>\n",
       "      <td>8.310000e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>831</td>\n",
       "      <td>95</td>\n",
       "      <td>14</td>\n",
       "      <td>133</td>\n",
       "      <td>182</td>\n",
       "      <td>NaN</td>\n",
       "      <td>64</td>\n",
       "      <td>472</td>\n",
       "      <td>556</td>\n",
       "      <td>704</td>\n",
       "      <td>737</td>\n",
       "      <td>782</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>a streetcar named desire</td>\n",
       "      <td>2014</td>\n",
       "      <td>U</td>\n",
       "      <td>101 min</td>\n",
       "      <td>drama</td>\n",
       "      <td>NaN</td>\n",
       "      <td></td>\n",
       "      <td>steven spielberg</td>\n",
       "      <td>tom hanks</td>\n",
       "      <td>emma watson</td>\n",
       "      <td>rupert grint</td>\n",
       "      <td>michael caine</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>1</td>\n",
       "      <td>31</td>\n",
       "      <td>200</td>\n",
       "      <td>21</td>\n",
       "      <td>75</td>\n",
       "      <td>NaN</td>\n",
       "      <td>81</td>\n",
       "      <td>13</td>\n",
       "      <td>12</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.946931</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.152499e+05</td>\n",
       "      <td>6.803475e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.283204</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.436443e+05</td>\n",
       "      <td>1.097500e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.600000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.508800e+04</td>\n",
       "      <td>1.305000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.700000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.143000e+04</td>\n",
       "      <td>3.253559e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.900000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.867340e+05</td>\n",
       "      <td>2.353089e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.100000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.457210e+05</td>\n",
       "      <td>8.075089e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.300000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.343110e+06</td>\n",
       "      <td>9.366622e+08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Series_Title Released_Year Certificate  Runtime  Genre  \\\n",
       "count                        831           831         831      831    831   \n",
       "unique                       831            95          14      133    182   \n",
       "top     a streetcar named desire          2014           U  101 min  drama   \n",
       "freq                           1            31         200       21     75   \n",
       "mean                         NaN           NaN         NaN      NaN    NaN   \n",
       "std                          NaN           NaN         NaN      NaN    NaN   \n",
       "min                          NaN           NaN         NaN      NaN    NaN   \n",
       "25%                          NaN           NaN         NaN      NaN    NaN   \n",
       "50%                          NaN           NaN         NaN      NaN    NaN   \n",
       "75%                          NaN           NaN         NaN      NaN    NaN   \n",
       "max                          NaN           NaN         NaN      NaN    NaN   \n",
       "\n",
       "        IMDB_Rating Meta_score          Director      Star1        Star2  \\\n",
       "count    831.000000        831               831        831          831   \n",
       "unique          NaN         64               472        556          704   \n",
       "top             NaN             steven spielberg  tom hanks  emma watson   \n",
       "freq            NaN         81                13         12            7   \n",
       "mean       7.946931        NaN               NaN        NaN          NaN   \n",
       "std        0.283204        NaN               NaN        NaN          NaN   \n",
       "min        7.600000        NaN               NaN        NaN          NaN   \n",
       "25%        7.700000        NaN               NaN        NaN          NaN   \n",
       "50%        7.900000        NaN               NaN        NaN          NaN   \n",
       "75%        8.100000        NaN               NaN        NaN          NaN   \n",
       "max        9.300000        NaN               NaN        NaN          NaN   \n",
       "\n",
       "               Star3          Star4   No_of_Votes         Gross  \n",
       "count            831            831  8.310000e+02  8.310000e+02  \n",
       "unique           737            782           NaN           NaN  \n",
       "top     rupert grint  michael caine           NaN           NaN  \n",
       "freq               5              4           NaN           NaN  \n",
       "mean             NaN            NaN  3.152499e+05  6.803475e+07  \n",
       "std              NaN            NaN  3.436443e+05  1.097500e+08  \n",
       "min              NaN            NaN  2.508800e+04  1.305000e+03  \n",
       "25%              NaN            NaN  7.143000e+04  3.253559e+06  \n",
       "50%              NaN            NaN  1.867340e+05  2.353089e+07  \n",
       "75%              NaN            NaN  4.457210e+05  8.075089e+07  \n",
       "max              NaN            NaN  2.343110e+06  9.366622e+08  "
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
    ],
   "source": [
    "# Summary statistics for all columns, numeric and non-numeric (include='all').\n",
    "data.describe(include='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "effective-treasury",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "IMDB_Rating           7.9\n",
       "No_of_Votes      186734.0\n",
       "Gross          23530892.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Median of the numeric columns. numeric_only=True is required on pandas >= 2.0, where the\n",
    "# default no longer skips text columns and the call raises TypeError instead.\n",
    "data.median(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "egyptian-sacramento",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(831, 14)"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of the frame at this point in the notebook.\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "intended-christmas",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(601, 14)\n",
      "(115, 14)\n",
      "(115, 14)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "import sklearn\n",
    "\n",
    "# Split sizes in rows: 230 rows are held out of training, 115 of which become the dev set.\n",
    "HOLDOUT_ROWS = 230\n",
    "DEV_ROWS = 115\n",
    "SPLIT_SEED = 1\n",
    "\n",
    "# First carve the hold-out off the full frame, then divide the hold-out into test and dev.\n",
    "data_train, data_holdout = train_test_split(data, test_size=HOLDOUT_ROWS, random_state=SPLIT_SEED)\n",
    "data_test, data_dev = train_test_split(data_holdout, test_size=DEV_ROWS, random_state=SPLIT_SEED)\n",
    "\n",
    "for subset in (data_train, data_test, data_dev):\n",
    "    print(subset.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "little-gravity",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1913477537437604"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Size of the test split relative to the training split (both have the same columns,\n",
    "# so the row-count ratio equals the element-count ratio).\n",
    "len(data_test) / len(data_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "executive-canada",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Series_Title</th>\n",
       "      <th>Released_Year</th>\n",
       "      <th>Certificate</th>\n",
       "      <th>Runtime</th>\n",
       "      <th>Genre</th>\n",
       "      <th>IMDB_Rating</th>\n",
       "      <th>Meta_score</th>\n",
       "      <th>Director</th>\n",
       "      <th>Star1</th>\n",
       "      <th>Star2</th>\n",
       "      <th>Star3</th>\n",
       "      <th>Star4</th>\n",
       "      <th>No_of_Votes</th>\n",
       "      <th>Gross</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>601</td>\n",
       "      <td>601</td>\n",
       "      <td>601</td>\n",
       "      <td>601</td>\n",
       "      <td>601</td>\n",
       "      <td>601.000000</td>\n",
       "      <td>601</td>\n",
       "      <td>601</td>\n",
       "      <td>601</td>\n",
       "      <td>601</td>\n",
       "      <td>601</td>\n",
       "      <td>601</td>\n",
       "      <td>6.010000e+02</td>\n",
       "      <td>6.010000e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>601</td>\n",
       "      <td>90</td>\n",
       "      <td>13</td>\n",
       "      <td>121</td>\n",
       "      <td>162</td>\n",
       "      <td>NaN</td>\n",
       "      <td>59</td>\n",
       "      <td>378</td>\n",
       "      <td>438</td>\n",
       "      <td>530</td>\n",
       "      <td>556</td>\n",
       "      <td>577</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>what ever happened to baby jane?</td>\n",
       "      <td>2014</td>\n",
       "      <td>U</td>\n",
       "      <td>101 min</td>\n",
       "      <td>drama</td>\n",
       "      <td>NaN</td>\n",
       "      <td></td>\n",
       "      <td>martin scorsese</td>\n",
       "      <td>clint eastwood</td>\n",
       "      <td>emma watson</td>\n",
       "      <td>joe pesci</td>\n",
       "      <td>michael caine</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>1</td>\n",
       "      <td>22</td>\n",
       "      <td>143</td>\n",
       "      <td>17</td>\n",
       "      <td>53</td>\n",
       "      <td>NaN</td>\n",
       "      <td>53</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.947920</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.174649e+05</td>\n",
       "      <td>6.775699e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.280238</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.407094e+05</td>\n",
       "      <td>1.095511e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.600000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.508800e+04</td>\n",
       "      <td>1.305000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.700000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.846300e+04</td>\n",
       "      <td>3.151130e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.900000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.897160e+05</td>\n",
       "      <td>2.365000e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.100000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.622520e+05</td>\n",
       "      <td>7.891296e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.200000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.303232e+06</td>\n",
       "      <td>8.583730e+08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            Series_Title Released_Year Certificate  Runtime  \\\n",
       "count                                601           601         601      601   \n",
       "unique                               601            90          13      121   \n",
       "top     what ever happened to baby jane?          2014           U  101 min   \n",
       "freq                                   1            22         143       17   \n",
       "mean                                 NaN           NaN         NaN      NaN   \n",
       "std                                  NaN           NaN         NaN      NaN   \n",
       "min                                  NaN           NaN         NaN      NaN   \n",
       "25%                                  NaN           NaN         NaN      NaN   \n",
       "50%                                  NaN           NaN         NaN      NaN   \n",
       "75%                                  NaN           NaN         NaN      NaN   \n",
       "max                                  NaN           NaN         NaN      NaN   \n",
       "\n",
       "        Genre  IMDB_Rating Meta_score         Director           Star1  \\\n",
       "count     601   601.000000        601              601             601   \n",
       "unique    162          NaN         59              378             438   \n",
       "top     drama          NaN             martin scorsese  clint eastwood   \n",
       "freq       53          NaN         53               10              10   \n",
       "mean      NaN     7.947920        NaN              NaN             NaN   \n",
       "std       NaN     0.280238        NaN              NaN             NaN   \n",
       "min       NaN     7.600000        NaN              NaN             NaN   \n",
       "25%       NaN     7.700000        NaN              NaN             NaN   \n",
       "50%       NaN     7.900000        NaN              NaN             NaN   \n",
       "75%       NaN     8.100000        NaN              NaN             NaN   \n",
       "max       NaN     9.200000        NaN              NaN             NaN   \n",
       "\n",
       "              Star2      Star3          Star4   No_of_Votes         Gross  \n",
       "count           601        601            601  6.010000e+02  6.010000e+02  \n",
       "unique          530        556            577           NaN           NaN  \n",
       "top     emma watson  joe pesci  michael caine           NaN           NaN  \n",
       "freq              5          4              4           NaN           NaN  \n",
       "mean            NaN        NaN            NaN  3.174649e+05  6.775699e+07  \n",
       "std             NaN        NaN            NaN  3.407094e+05  1.095511e+08  \n",
       "min             NaN        NaN            NaN  2.508800e+04  1.305000e+03  \n",
       "25%             NaN        NaN            NaN  6.846300e+04  3.151130e+06  \n",
       "50%             NaN        NaN            NaN  1.897160e+05  2.365000e+07  \n",
       "75%             NaN        NaN            NaN  4.622520e+05  7.891296e+07  \n",
       "max             NaN        NaN            NaN  2.303232e+06  8.583730e+08  "
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
    ],
   "source": [
    "# Summary statistics for all columns of the training split (include=\"all\").\n",
    "data_train.describe(include=\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "alert-campus",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Series_Title</th>\n",
       "      <th>Released_Year</th>\n",
       "      <th>Certificate</th>\n",
       "      <th>Runtime</th>\n",
       "      <th>Genre</th>\n",
       "      <th>IMDB_Rating</th>\n",
       "      <th>Meta_score</th>\n",
       "      <th>Director</th>\n",
       "      <th>Star1</th>\n",
       "      <th>Star2</th>\n",
       "      <th>Star3</th>\n",
       "      <th>Star4</th>\n",
       "      <th>No_of_Votes</th>\n",
       "      <th>Gross</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115.000000</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>1.150000e+02</td>\n",
       "      <td>1.150000e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>115</td>\n",
       "      <td>57</td>\n",
       "      <td>10</td>\n",
       "      <td>64</td>\n",
       "      <td>59</td>\n",
       "      <td>NaN</td>\n",
       "      <td>44</td>\n",
       "      <td>105</td>\n",
       "      <td>100</td>\n",
       "      <td>113</td>\n",
       "      <td>109</td>\n",
       "      <td>114</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>queen</td>\n",
       "      <td>2013</td>\n",
       "      <td>U</td>\n",
       "      <td>102 min</td>\n",
       "      <td>drama</td>\n",
       "      <td>NaN</td>\n",
       "      <td></td>\n",
       "      <td>frank darabont</td>\n",
       "      <td>al pacino</td>\n",
       "      <td>emma watson</td>\n",
       "      <td>carrie fisher</td>\n",
       "      <td>lucy liu</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>30</td>\n",
       "      <td>7</td>\n",
       "      <td>14</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.947826</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.140691e+05</td>\n",
       "      <td>6.622925e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.313259</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.647432e+05</td>\n",
       "      <td>9.085320e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.600000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.669700e+04</td>\n",
       "      <td>1.095000e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.700000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.206000e+04</td>\n",
       "      <td>4.232562e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.900000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.654650e+05</td>\n",
       "      <td>2.602096e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.100000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.268040e+05</td>\n",
       "      <td>7.556908e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.300000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.343110e+06</td>\n",
       "      <td>3.808433e+08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Series_Title Released_Year Certificate  Runtime  Genre  IMDB_Rating  \\\n",
       "count           115           115         115      115    115   115.000000   \n",
       "unique          115            57          10       64     59          NaN   \n",
       "top           queen          2013           U  102 min  drama          NaN   \n",
       "freq              1             7          30        7     14          NaN   \n",
       "mean            NaN           NaN         NaN      NaN    NaN     7.947826   \n",
       "std             NaN           NaN         NaN      NaN    NaN     0.313259   \n",
       "min             NaN           NaN         NaN      NaN    NaN     7.600000   \n",
       "25%             NaN           NaN         NaN      NaN    NaN     7.700000   \n",
       "50%             NaN           NaN         NaN      NaN    NaN     7.900000   \n",
       "75%             NaN           NaN         NaN      NaN    NaN     8.100000   \n",
       "max             NaN           NaN         NaN      NaN    NaN     9.300000   \n",
       "\n",
       "       Meta_score        Director      Star1        Star2          Star3  \\\n",
       "count         115             115        115          115            115   \n",
       "unique         44             105        100          113            109   \n",
       "top                frank darabont  al pacino  emma watson  carrie fisher   \n",
       "freq           16               2          4            2              2   \n",
       "mean          NaN             NaN        NaN          NaN            NaN   \n",
       "std           NaN             NaN        NaN          NaN            NaN   \n",
       "min           NaN             NaN        NaN          NaN            NaN   \n",
       "25%           NaN             NaN        NaN          NaN            NaN   \n",
       "50%           NaN             NaN        NaN          NaN            NaN   \n",
       "75%           NaN             NaN        NaN          NaN            NaN   \n",
       "max           NaN             NaN        NaN          NaN            NaN   \n",
       "\n",
       "           Star4   No_of_Votes         Gross  \n",
       "count        115  1.150000e+02  1.150000e+02  \n",
       "unique       114           NaN           NaN  \n",
       "top     lucy liu           NaN           NaN  \n",
       "freq           2           NaN           NaN  \n",
       "mean         NaN  3.140691e+05  6.622925e+07  \n",
       "std          NaN  3.647432e+05  9.085320e+07  \n",
       "min          NaN  2.669700e+04  1.095000e+04  \n",
       "25%          NaN  7.206000e+04  4.232562e+06  \n",
       "50%          NaN  1.654650e+05  2.602096e+07  \n",
       "75%          NaN  4.268040e+05  7.556908e+07  \n",
       "max          NaN  2.343110e+06  3.808433e+08  "
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics for every column of data_test: count/unique/top/freq\n",
    "# for non-numeric columns, mean/std/min/quartiles/max for numeric ones.\n",
    "data_test.describe(include=\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "little-mathematics",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Series_Title</th>\n",
       "      <th>Released_Year</th>\n",
       "      <th>Certificate</th>\n",
       "      <th>Runtime</th>\n",
       "      <th>Genre</th>\n",
       "      <th>IMDB_Rating</th>\n",
       "      <th>Meta_score</th>\n",
       "      <th>Director</th>\n",
       "      <th>Star1</th>\n",
       "      <th>Star2</th>\n",
       "      <th>Star3</th>\n",
       "      <th>Star4</th>\n",
       "      <th>No_of_Votes</th>\n",
       "      <th>Gross</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115.000000</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>1.150000e+02</td>\n",
       "      <td>1.150000e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>115</td>\n",
       "      <td>56</td>\n",
       "      <td>8</td>\n",
       "      <td>72</td>\n",
       "      <td>71</td>\n",
       "      <td>NaN</td>\n",
       "      <td>42</td>\n",
       "      <td>101</td>\n",
       "      <td>104</td>\n",
       "      <td>115</td>\n",
       "      <td>115</td>\n",
       "      <td>112</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>mr. smith goes to washington</td>\n",
       "      <td>2004</td>\n",
       "      <td>UA</td>\n",
       "      <td>120 min</td>\n",
       "      <td>drama</td>\n",
       "      <td>NaN</td>\n",
       "      <td></td>\n",
       "      <td>billy wilder</td>\n",
       "      <td>johnny depp</td>\n",
       "      <td>charlize theron</td>\n",
       "      <td>joel edgerton</td>\n",
       "      <td>kevin bacon</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>28</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.940870</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.048547e+05</td>\n",
       "      <td>7.129188e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.269143</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.400764e+05</td>\n",
       "      <td>1.275242e+08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.600000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.522900e+04</td>\n",
       "      <td>3.600000e+03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.700000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.123350e+04</td>\n",
       "      <td>3.425538e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.900000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.820090e+05</td>\n",
       "      <td>2.018666e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.100000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.148195e+05</td>\n",
       "      <td>8.406197e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.800000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.067042e+06</td>\n",
       "      <td>9.366622e+08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        Series_Title Released_Year Certificate  Runtime  \\\n",
       "count                            115           115         115      115   \n",
       "unique                           115            56           8       72   \n",
       "top     mr. smith goes to washington          2004          UA  120 min   \n",
       "freq                               1             6          28        5   \n",
       "mean                             NaN           NaN         NaN      NaN   \n",
       "std                              NaN           NaN         NaN      NaN   \n",
       "min                              NaN           NaN         NaN      NaN   \n",
       "25%                              NaN           NaN         NaN      NaN   \n",
       "50%                              NaN           NaN         NaN      NaN   \n",
       "75%                              NaN           NaN         NaN      NaN   \n",
       "max                              NaN           NaN         NaN      NaN   \n",
       "\n",
       "        Genre  IMDB_Rating Meta_score      Director        Star1  \\\n",
       "count     115   115.000000        115           115          115   \n",
       "unique     71          NaN         42           101          104   \n",
       "top     drama          NaN             billy wilder  johnny depp   \n",
       "freq        8          NaN         12             3            3   \n",
       "mean      NaN     7.940870        NaN           NaN          NaN   \n",
       "std       NaN     0.269143        NaN           NaN          NaN   \n",
       "min       NaN     7.600000        NaN           NaN          NaN   \n",
       "25%       NaN     7.700000        NaN           NaN          NaN   \n",
       "50%       NaN     7.900000        NaN           NaN          NaN   \n",
       "75%       NaN     8.100000        NaN           NaN          NaN   \n",
       "max       NaN     8.800000        NaN           NaN          NaN   \n",
       "\n",
       "                  Star2          Star3        Star4   No_of_Votes  \\\n",
       "count               115            115          115  1.150000e+02   \n",
       "unique              115            115          112           NaN   \n",
       "top     charlize theron  joel edgerton  kevin bacon           NaN   \n",
       "freq                  1              1            2           NaN   \n",
       "mean                NaN            NaN          NaN  3.048547e+05   \n",
       "std                 NaN            NaN          NaN  3.400764e+05   \n",
       "min                 NaN            NaN          NaN  2.522900e+04   \n",
       "25%                 NaN            NaN          NaN  8.123350e+04   \n",
       "50%                 NaN            NaN          NaN  1.820090e+05   \n",
       "75%                 NaN            NaN          NaN  4.148195e+05   \n",
       "max                 NaN            NaN          NaN  2.067042e+06   \n",
       "\n",
       "               Gross  \n",
       "count   1.150000e+02  \n",
       "unique           NaN  \n",
       "top              NaN  \n",
       "freq             NaN  \n",
       "mean    7.129188e+07  \n",
       "std     1.275242e+08  \n",
       "min     3.600000e+03  \n",
       "25%     3.425538e+06  \n",
       "50%     2.018666e+07  \n",
       "75%     8.406197e+07  \n",
       "max     9.366622e+08  "
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics for every column of data_dev: count/unique/top/freq\n",
    "# for non-numeric columns, mean/std/min/quartiles/max for numeric ones.\n",
    "data_dev.describe(include=\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "sufficient-parade",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write each split to its own CSV file (UTF-8 encoded, index column omitted).\n",
    "split_exports = (\n",
    "    (data_test, \"data_test.csv\"),\n",
    "    (data_dev, \"data_dev.csv\"),\n",
    "    (data_train, \"data_train.csv\"),\n",
    ")\n",
    "for split_frame, csv_path in split_exports:\n",
    "    split_frame.to_csv(csv_path, encoding=\"utf-8\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "accompanied-virtue",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}