{ "cells": [ { "cell_type": "markdown", "id": "academic-calvin", "metadata": {}, "source": [ "### Skrypt do ściagnięcia zbiory danych" ] }, { "cell_type": "code", "execution_count": null, "id": "compound-politics", "metadata": {}, "outputs": [], "source": [ "!pip install --user kaggle \n", "!pip install --user pandas\n", "!pip install --user numpy\n", "!pip install --user seaborn\n", "!pip install -U scikit-learn" ] }, { "cell_type": "code", "execution_count": null, "id": "hundred-limitation", "metadata": {}, "outputs": [], "source": [ "!echo \"Downloading dataset from Kaggle...\"\n", "!kaggle datasets download -d harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows\n", "!echo \"Done.\"" ] }, { "cell_type": "code", "execution_count": null, "id": "provincial-circuit", "metadata": {}, "outputs": [], "source": [ "!echo \"Unzipping archive\"\n", "!files=$(unzip imdb-dataset-of-top-1000-movies-and-tv-shows.zip | tail -n +2 | cut -d ' ' -f 4)\n", "!echo \"Done.\"" ] }, { "cell_type": "code", "execution_count": 81, "id": "armed-brisbane", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "data=pd.read_csv('imdb_top_1000.csv')\n", "# data" ] }, { "cell_type": "code", "execution_count": 82, "id": "nominated-grenada", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1001 imdb_top_1000.csv\n" ] } ], "source": [ "#Wielkosc zbioru\n", "!wc -l imdb_top_1000.csv" ] }, { "cell_type": "markdown", "id": "generic-success", "metadata": {}, "source": [ "## Usuwanie kolumn\n", "- Poster_Link: kolumna zawierająca linki do plakatów promujących film\n", "- Overview: kolumna zawierająca recenzje poszczególnych filmów" ] }, { "cell_type": "code", "execution_count": 83, "id": "compliant-synthesis", "metadata": {}, "outputs": [], "source": [ "data.drop(columns=[\"Poster_Link\"], inplace=True)\n", "data.drop(columns=[\"Overview\"], inplace=True)\n", "\n", "# data" ] }, { "cell_type": "code", 
"execution_count": 84, "id": "reserved-whole", "metadata": {}, "outputs": [], "source": [ "# Lowercase na polach tekstowych\n", "data[\"Series_Title\"] = data[\"Series_Title\"].str.lower()\n", "data[\"Genre\"] = data[\"Genre\"].str.lower()\n", "data[\"Director\"] = data[\"Director\"].str.lower()\n", "data[\"Star1\"] = data[\"Star1\"].str.lower()\n", "data[\"Star2\"] = data[\"Star2\"].str.lower()\n", "data[\"Star3\"] = data[\"Star3\"].str.lower()\n", "data[\"Star4\"] = data[\"Star4\"].str.lower()\n", "\n", "# Usunięcie Nan i string to int \n", "data = data.replace(np.nan, '', regex=True)\n", "data[\"Gross\"] = data[\"Gross\"].str.replace(',', '')\n", "data[\"Gross\"] = pd.to_numeric(data[\"Gross\"], errors='coerce')\n", "\n", "data = data.dropna()" ] }, { "cell_type": "code", "execution_count": 86, "id": "given-sodium", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Series_Title</th>\n", " <th>Released_Year</th>\n", " <th>Certificate</th>\n", " <th>Runtime</th>\n", " <th>Genre</th>\n", " <th>IMDB_Rating</th>\n", " <th>Meta_score</th>\n", " <th>Director</th>\n", " <th>Star1</th>\n", " <th>Star2</th>\n", " <th>Star3</th>\n", " <th>Star4</th>\n", " <th>No_of_Votes</th>\n", " <th>Gross</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>831</td>\n", " <td>831</td>\n", " <td>831</td>\n", " <td>831</td>\n", " <td>831</td>\n", " <td>831.000000</td>\n", " <td>831</td>\n", " <td>831</td>\n", " <td>831</td>\n", " <td>831</td>\n", " <td>831</td>\n", " <td>831</td>\n", " <td>8.310000e+02</td>\n", " <td>8.310000e+02</td>\n", " </tr>\n", " <tr>\n", " 
<th>unique</th>\n", " <td>831</td>\n", " <td>95</td>\n", " <td>14</td>\n", " <td>133</td>\n", " <td>182</td>\n", " <td>NaN</td>\n", " <td>64</td>\n", " <td>472</td>\n", " <td>556</td>\n", " <td>704</td>\n", " <td>737</td>\n", " <td>782</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>top</th>\n", " <td>a streetcar named desire</td>\n", " <td>2014</td>\n", " <td>U</td>\n", " <td>101 min</td>\n", " <td>drama</td>\n", " <td>NaN</td>\n", " <td></td>\n", " <td>steven spielberg</td>\n", " <td>tom hanks</td>\n", " <td>emma watson</td>\n", " <td>rupert grint</td>\n", " <td>michael caine</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>freq</th>\n", " <td>1</td>\n", " <td>31</td>\n", " <td>200</td>\n", " <td>21</td>\n", " <td>75</td>\n", " <td>NaN</td>\n", " <td>81</td>\n", " <td>13</td>\n", " <td>12</td>\n", " <td>7</td>\n", " <td>5</td>\n", " <td>4</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.946931</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>3.152499e+05</td>\n", " <td>6.803475e+07</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.283204</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>3.436443e+05</td>\n", " <td>1.097500e+08</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.600000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.508800e+04</td>\n", " <td>1.305000e+03</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.700000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.143000e+04</td>\n", " <td>3.253559e+06</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.900000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1.867340e+05</td>\n", " <td>2.353089e+07</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>8.100000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>4.457210e+05</td>\n", " <td>8.075089e+07</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>9.300000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.343110e+06</td>\n", " <td>9.366622e+08</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Series_Title Released_Year Certificate Runtime Genre \\\n", "count 831 831 831 831 831 \n", "unique 831 95 14 133 182 \n", "top a streetcar named desire 2014 U 101 min drama \n", "freq 1 31 200 21 75 \n", "mean NaN NaN NaN NaN NaN \n", "std NaN NaN NaN NaN NaN \n", "min NaN NaN NaN NaN NaN \n", "25% NaN NaN NaN NaN NaN \n", "50% NaN NaN NaN NaN NaN \n", "75% NaN NaN NaN NaN NaN \n", "max NaN NaN NaN NaN NaN \n", "\n", " IMDB_Rating Meta_score Director Star1 Star2 \\\n", "count 831.000000 831 831 831 831 \n", "unique NaN 64 472 556 704 \n", "top NaN steven spielberg tom hanks emma watson \n", "freq NaN 81 13 12 7 \n", "mean 7.946931 NaN NaN 
# %%
# Summary statistics for every column, text and numeric alike.
data.describe(include='all')

# %%
# Median of the numeric columns. `numeric_only=True` is required on
# pandas >= 2.0, where calling median() on a frame that still contains text
# columns raises TypeError instead of silently skipping them.
data.median(numeric_only=True)

# %%
data.shape

# %%
from sklearn.model_selection import train_test_split
import sklearn  # NOTE(review): not referenced in the cells visible here; kept in case it is needed elsewhere

HOLDOUT_SIZE = 230  # rows withheld from training; shared by the test and dev splits
DEV_SIZE = 115      # rows of the holdout that go to the dev split

# First cut: training rows vs. a holdout. The holdout gets its own name so the
# second cut does not overwrite a variable that is still being read.
data_train, data_holdout = train_test_split(data, test_size=HOLDOUT_SIZE, random_state=1)
# Second cut: the holdout is divided into the test and dev splits.
data_test, data_dev = train_test_split(data_holdout, test_size=DEV_SIZE, random_state=1)

# Every row must end up in exactly one split.
assert len(data_train) + len(data_test) + len(data_dev) == len(data)

print(data_train.shape)
print(data_test.shape)
print(data_dev.shape)

# %%
# Test-to-train row ratio. All splits have the same columns, so this equals
# the ratio of the frames' `.size` values.
len(data_test) / len(data_train)
<td></td>\n", " <td>martin scorsese</td>\n", " <td>clint eastwood</td>\n", " <td>emma watson</td>\n", " <td>joe pesci</td>\n", " <td>michael caine</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>freq</th>\n", " <td>1</td>\n", " <td>22</td>\n", " <td>143</td>\n", " <td>17</td>\n", " <td>53</td>\n", " <td>NaN</td>\n", " <td>53</td>\n", " <td>10</td>\n", " <td>10</td>\n", " <td>5</td>\n", " <td>4</td>\n", " <td>4</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.947920</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>3.174649e+05</td>\n", " <td>6.775699e+07</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.280238</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>3.407094e+05</td>\n", " <td>1.095511e+08</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.600000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.508800e+04</td>\n", " <td>1.305000e+03</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.700000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>6.846300e+04</td>\n", " <td>3.151130e+06</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.900000</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1.897160e+05</td>\n", " <td>2.365000e+07</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>8.100000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>4.622520e+05</td>\n", " <td>7.891296e+07</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>9.200000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.303232e+06</td>\n", " <td>8.583730e+08</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Series_Title Released_Year Certificate Runtime \\\n", "count 601 601 601 601 \n", "unique 601 90 13 121 \n", "top what ever happened to baby jane? 
# %%
# Summary statistics of the training split, all columns included.
train_summary = data_train.describe(include="all")
train_summary
<th>IMDB_Rating</th>\n", " <th>Meta_score</th>\n", " <th>Director</th>\n", " <th>Star1</th>\n", " <th>Star2</th>\n", " <th>Star3</th>\n", " <th>Star4</th>\n", " <th>No_of_Votes</th>\n", " <th>Gross</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115.000000</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>1.150000e+02</td>\n", " <td>1.150000e+02</td>\n", " </tr>\n", " <tr>\n", " <th>unique</th>\n", " <td>115</td>\n", " <td>57</td>\n", " <td>10</td>\n", " <td>64</td>\n", " <td>59</td>\n", " <td>NaN</td>\n", " <td>44</td>\n", " <td>105</td>\n", " <td>100</td>\n", " <td>113</td>\n", " <td>109</td>\n", " <td>114</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>top</th>\n", " <td>queen</td>\n", " <td>2013</td>\n", " <td>U</td>\n", " <td>102 min</td>\n", " <td>drama</td>\n", " <td>NaN</td>\n", " <td></td>\n", " <td>frank darabont</td>\n", " <td>al pacino</td>\n", " <td>emma watson</td>\n", " <td>carrie fisher</td>\n", " <td>lucy liu</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>freq</th>\n", " <td>1</td>\n", " <td>7</td>\n", " <td>30</td>\n", " <td>7</td>\n", " <td>14</td>\n", " <td>NaN</td>\n", " <td>16</td>\n", " <td>2</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.947826</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>3.140691e+05</td>\n", " <td>6.622925e+07</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>0.313259</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>3.647432e+05</td>\n", " <td>9.085320e+07</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.600000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.669700e+04</td>\n", " <td>1.095000e+04</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.700000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.206000e+04</td>\n", " <td>4.232562e+06</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.900000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1.654650e+05</td>\n", " <td>2.602096e+07</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>8.100000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>4.268040e+05</td>\n", " <td>7.556908e+07</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>9.300000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.343110e+06</td>\n", " <td>3.808433e+08</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n", "count 
# %%
# Summary statistics of the test split, all columns included.
test_summary = data_test.describe(include="all")
test_summary
<th>Runtime</th>\n", " <th>Genre</th>\n", " <th>IMDB_Rating</th>\n", " <th>Meta_score</th>\n", " <th>Director</th>\n", " <th>Star1</th>\n", " <th>Star2</th>\n", " <th>Star3</th>\n", " <th>Star4</th>\n", " <th>No_of_Votes</th>\n", " <th>Gross</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115.000000</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>1.150000e+02</td>\n", " <td>1.150000e+02</td>\n", " </tr>\n", " <tr>\n", " <th>unique</th>\n", " <td>115</td>\n", " <td>56</td>\n", " <td>8</td>\n", " <td>72</td>\n", " <td>71</td>\n", " <td>NaN</td>\n", " <td>42</td>\n", " <td>101</td>\n", " <td>104</td>\n", " <td>115</td>\n", " <td>115</td>\n", " <td>112</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>top</th>\n", " <td>mr. smith goes to washington</td>\n", " <td>2004</td>\n", " <td>UA</td>\n", " <td>120 min</td>\n", " <td>drama</td>\n", " <td>NaN</td>\n", " <td></td>\n", " <td>billy wilder</td>\n", " <td>johnny depp</td>\n", " <td>charlize theron</td>\n", " <td>joel edgerton</td>\n", " <td>kevin bacon</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>freq</th>\n", " <td>1</td>\n", " <td>6</td>\n", " <td>28</td>\n", " <td>5</td>\n", " <td>8</td>\n", " <td>NaN</td>\n", " <td>12</td>\n", " <td>3</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.940870</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>3.048547e+05</td>\n", " <td>7.129188e+07</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.269143</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>3.400764e+05</td>\n", " <td>1.275242e+08</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.600000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.522900e+04</td>\n", " <td>3.600000e+03</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.700000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>8.123350e+04</td>\n", " <td>3.425538e+06</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>7.900000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1.820090e+05</td>\n", " <td>2.018666e+07</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>8.100000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>4.148195e+05</td>\n", " <td>8.406197e+07</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>8.800000</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.067042e+06</td>\n", " <td>9.366622e+08</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 
# %%
# Summary statistics of the dev split, all columns included.
dev_summary = data_dev.describe(include="all")
dev_summary

# %%
# Write each split to its own UTF-8 CSV file, without the index column.
for split_frame, csv_name in (
    (data_test, "data_test.csv"),
    (data_dev, "data_dev.csv"),
    (data_train, "data_train.csv"),
):
    split_frame.to_csv(csv_name, encoding="utf-8", index=False)
"execution_count": null, "id": "accompanied-virtue", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 5 }