1417 lines
47 KiB
Plaintext
1417 lines
47 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "academic-calvin",
|
|
"metadata": {},
|
|
"source": [
|
"### Skrypt do ściągnięcia zbioru danych"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "compound-politics",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Use the %pip magic so the packages are installed for the interpreter that runs this kernel\n",
"%pip install --user kaggle\n",
"%pip install --user pandas\n",
"%pip install --user numpy\n",
"%pip install --user seaborn\n",
"%pip install -U scikit-learn"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "hundred-limitation",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!echo \"Downloading dataset from Kaggle...\"\n",
|
|
"!kaggle datasets download -d harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows\n",
|
|
"!echo \"Done.\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "provincial-circuit",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!echo \"Unzipping archive\"\n",
|
"# -o overwrites files left by an earlier run without prompting; -q keeps the cell output quiet\n",
"!unzip -o -q imdb-dataset-of-top-1000-movies-and-tv-shows.zip\n",
|
|
"!echo \"Done.\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 81,
|
|
"id": "armed-brisbane",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"data=pd.read_csv('imdb_top_1000.csv')\n",
|
|
"# data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 82,
|
|
"id": "nominated-grenada",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1001 imdb_top_1000.csv\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"#Wielkosc zbioru\n",
|
|
"!wc -l imdb_top_1000.csv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "generic-success",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Usuwanie kolumn\n",
|
|
"- Poster_Link: kolumna zawierająca linki do plakatów promujących film\n",
|
|
"- Overview: kolumna zawierająca recenzje poszczególnych filmów"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 83,
|
|
"id": "compliant-synthesis",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Drop the poster-link and overview columns with one in-place call\n",
"data.drop(columns=[\"Poster_Link\", \"Overview\"], inplace=True)\n",
|
|
"\n",
|
|
"# data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 84,
|
|
"id": "reserved-whole",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Lowercase every free-text column\n",
"for text_column in (\"Series_Title\", \"Genre\", \"Director\", \"Star1\", \"Star2\", \"Star3\", \"Star4\"):\n",
"    data[text_column] = data[text_column].str.lower()\n",
|
|
"\n",
|
"# Missing values: blank out the non-numeric columns only, so numeric columns such as\n",
"# Meta_score keep a numeric dtype instead of turning into object columns holding ''.\n",
"text_cols = data.select_dtypes(exclude=\"number\").columns\n",
"data[text_cols] = data[text_cols].fillna('')\n",
"\n",
"# Gross is read as text containing ',' separators; strip them and convert to numbers.\n",
"# Values that cannot be parsed (including the blanks filled in above) become NaN.\n",
"data[\"Gross\"] = pd.to_numeric(data[\"Gross\"].str.replace(',', '', regex=False), errors='coerce')\n",
"\n",
"# Drop rows without a usable Gross value; NaN in other numeric columns is left in place.\n",
"data = data.dropna(subset=[\"Gross\"])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 86,
|
|
"id": "given-sodium",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Series_Title</th>\n",
|
|
" <th>Released_Year</th>\n",
|
|
" <th>Certificate</th>\n",
|
|
" <th>Runtime</th>\n",
|
|
" <th>Genre</th>\n",
|
|
" <th>IMDB_Rating</th>\n",
|
|
" <th>Meta_score</th>\n",
|
|
" <th>Director</th>\n",
|
|
" <th>Star1</th>\n",
|
|
" <th>Star2</th>\n",
|
|
" <th>Star3</th>\n",
|
|
" <th>Star4</th>\n",
|
|
" <th>No_of_Votes</th>\n",
|
|
" <th>Gross</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831.000000</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>831</td>\n",
|
|
" <td>8.310000e+02</td>\n",
|
|
" <td>8.310000e+02</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>831</td>\n",
|
|
" <td>95</td>\n",
|
|
" <td>14</td>\n",
|
|
" <td>133</td>\n",
|
|
" <td>182</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>64</td>\n",
|
|
" <td>472</td>\n",
|
|
" <td>556</td>\n",
|
|
" <td>704</td>\n",
|
|
" <td>737</td>\n",
|
|
" <td>782</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>a streetcar named desire</td>\n",
|
|
" <td>2014</td>\n",
|
|
" <td>U</td>\n",
|
|
" <td>101 min</td>\n",
|
|
" <td>drama</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td></td>\n",
|
|
" <td>steven spielberg</td>\n",
|
|
" <td>tom hanks</td>\n",
|
|
" <td>emma watson</td>\n",
|
|
" <td>rupert grint</td>\n",
|
|
" <td>michael caine</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>31</td>\n",
|
|
" <td>200</td>\n",
|
|
" <td>21</td>\n",
|
|
" <td>75</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>81</td>\n",
|
|
" <td>13</td>\n",
|
|
" <td>12</td>\n",
|
|
" <td>7</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.946931</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.152499e+05</td>\n",
|
|
" <td>6.803475e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.283204</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.436443e+05</td>\n",
|
|
" <td>1.097500e+08</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.600000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.508800e+04</td>\n",
|
|
" <td>1.305000e+03</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.700000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.143000e+04</td>\n",
|
|
" <td>3.253559e+06</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.900000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.867340e+05</td>\n",
|
|
" <td>2.353089e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.100000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.457210e+05</td>\n",
|
|
" <td>8.075089e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.300000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.343110e+06</td>\n",
|
|
" <td>9.366622e+08</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Series_Title Released_Year Certificate Runtime Genre \\\n",
|
|
"count 831 831 831 831 831 \n",
|
|
"unique 831 95 14 133 182 \n",
|
|
"top a streetcar named desire 2014 U 101 min drama \n",
|
|
"freq 1 31 200 21 75 \n",
|
|
"mean NaN NaN NaN NaN NaN \n",
|
|
"std NaN NaN NaN NaN NaN \n",
|
|
"min NaN NaN NaN NaN NaN \n",
|
|
"25% NaN NaN NaN NaN NaN \n",
|
|
"50% NaN NaN NaN NaN NaN \n",
|
|
"75% NaN NaN NaN NaN NaN \n",
|
|
"max NaN NaN NaN NaN NaN \n",
|
|
"\n",
|
|
" IMDB_Rating Meta_score Director Star1 Star2 \\\n",
|
|
"count 831.000000 831 831 831 831 \n",
|
|
"unique NaN 64 472 556 704 \n",
|
|
"top NaN steven spielberg tom hanks emma watson \n",
|
|
"freq NaN 81 13 12 7 \n",
|
|
"mean 7.946931 NaN NaN NaN NaN \n",
|
|
"std 0.283204 NaN NaN NaN NaN \n",
|
|
"min 7.600000 NaN NaN NaN NaN \n",
|
|
"25% 7.700000 NaN NaN NaN NaN \n",
|
|
"50% 7.900000 NaN NaN NaN NaN \n",
|
|
"75% 8.100000 NaN NaN NaN NaN \n",
|
|
"max 9.300000 NaN NaN NaN NaN \n",
|
|
"\n",
|
|
" Star3 Star4 No_of_Votes Gross \n",
|
|
"count 831 831 8.310000e+02 8.310000e+02 \n",
|
|
"unique 737 782 NaN NaN \n",
|
|
"top rupert grint michael caine NaN NaN \n",
|
|
"freq 5 4 NaN NaN \n",
|
|
"mean NaN NaN 3.152499e+05 6.803475e+07 \n",
|
|
"std NaN NaN 3.436443e+05 1.097500e+08 \n",
|
|
"min NaN NaN 2.508800e+04 1.305000e+03 \n",
|
|
"25% NaN NaN 7.143000e+04 3.253559e+06 \n",
|
|
"50% NaN NaN 1.867340e+05 2.353089e+07 \n",
|
|
"75% NaN NaN 4.457210e+05 8.075089e+07 \n",
|
|
"max NaN NaN 2.343110e+06 9.366622e+08 "
|
|
]
|
|
},
|
|
"execution_count": 86,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data.describe(include='all')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 74,
|
|
"id": "effective-treasury",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"IMDB_Rating 7.9\n",
|
|
"No_of_Votes 186734.0\n",
|
|
"Gross 23530892.0\n",
|
|
"dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 74,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Medians of the numeric columns only; text columns have no median and raise in recent pandas\n",
"data.median(numeric_only=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 87,
|
|
"id": "egyptian-sacramento",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(831, 14)"
|
|
]
|
|
},
|
|
"execution_count": 87,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 88,
|
|
"id": "intended-christmas",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(601, 14)\n",
|
|
"(115, 14)\n",
|
|
"(115, 14)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"import sklearn\n",
|
|
"\n",
|
|
"data_train, data_test = train_test_split(data, test_size=230, random_state=1)\n",
|
|
"data_test, data_dev = train_test_split(data_test, test_size=115, random_state=1)\n",
|
|
"print(data_train.shape)\n",
|
|
"print(data_test.shape)\n",
|
|
"print(data_dev.shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 76,
|
|
"id": "little-gravity",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.1913477537437604"
|
|
]
|
|
},
|
|
"execution_count": 76,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data_test.size/data_train.size"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 89,
|
|
"id": "executive-canada",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Series_Title</th>\n",
|
|
" <th>Released_Year</th>\n",
|
|
" <th>Certificate</th>\n",
|
|
" <th>Runtime</th>\n",
|
|
" <th>Genre</th>\n",
|
|
" <th>IMDB_Rating</th>\n",
|
|
" <th>Meta_score</th>\n",
|
|
" <th>Director</th>\n",
|
|
" <th>Star1</th>\n",
|
|
" <th>Star2</th>\n",
|
|
" <th>Star3</th>\n",
|
|
" <th>Star4</th>\n",
|
|
" <th>No_of_Votes</th>\n",
|
|
" <th>Gross</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601.000000</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>601</td>\n",
|
|
" <td>6.010000e+02</td>\n",
|
|
" <td>6.010000e+02</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>601</td>\n",
|
|
" <td>90</td>\n",
|
|
" <td>13</td>\n",
|
|
" <td>121</td>\n",
|
|
" <td>162</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>59</td>\n",
|
|
" <td>378</td>\n",
|
|
" <td>438</td>\n",
|
|
" <td>530</td>\n",
|
|
" <td>556</td>\n",
|
|
" <td>577</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>what ever happened to baby jane?</td>\n",
|
|
" <td>2014</td>\n",
|
|
" <td>U</td>\n",
|
|
" <td>101 min</td>\n",
|
|
" <td>drama</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td></td>\n",
|
|
" <td>martin scorsese</td>\n",
|
|
" <td>clint eastwood</td>\n",
|
|
" <td>emma watson</td>\n",
|
|
" <td>joe pesci</td>\n",
|
|
" <td>michael caine</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>22</td>\n",
|
|
" <td>143</td>\n",
|
|
" <td>17</td>\n",
|
|
" <td>53</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>53</td>\n",
|
|
" <td>10</td>\n",
|
|
" <td>10</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.947920</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.174649e+05</td>\n",
|
|
" <td>6.775699e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.280238</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.407094e+05</td>\n",
|
|
" <td>1.095511e+08</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.600000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.508800e+04</td>\n",
|
|
" <td>1.305000e+03</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.700000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.846300e+04</td>\n",
|
|
" <td>3.151130e+06</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.900000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.897160e+05</td>\n",
|
|
" <td>2.365000e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.100000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.622520e+05</td>\n",
|
|
" <td>7.891296e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.200000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.303232e+06</td>\n",
|
|
" <td>8.583730e+08</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Series_Title Released_Year Certificate Runtime \\\n",
|
|
"count 601 601 601 601 \n",
|
|
"unique 601 90 13 121 \n",
|
|
"top what ever happened to baby jane? 2014 U 101 min \n",
|
|
"freq 1 22 143 17 \n",
|
|
"mean NaN NaN NaN NaN \n",
|
|
"std NaN NaN NaN NaN \n",
|
|
"min NaN NaN NaN NaN \n",
|
|
"25% NaN NaN NaN NaN \n",
|
|
"50% NaN NaN NaN NaN \n",
|
|
"75% NaN NaN NaN NaN \n",
|
|
"max NaN NaN NaN NaN \n",
|
|
"\n",
|
|
" Genre IMDB_Rating Meta_score Director Star1 \\\n",
|
|
"count 601 601.000000 601 601 601 \n",
|
|
"unique 162 NaN 59 378 438 \n",
|
|
"top drama NaN martin scorsese clint eastwood \n",
|
|
"freq 53 NaN 53 10 10 \n",
|
|
"mean NaN 7.947920 NaN NaN NaN \n",
|
|
"std NaN 0.280238 NaN NaN NaN \n",
|
|
"min NaN 7.600000 NaN NaN NaN \n",
|
|
"25% NaN 7.700000 NaN NaN NaN \n",
|
|
"50% NaN 7.900000 NaN NaN NaN \n",
|
|
"75% NaN 8.100000 NaN NaN NaN \n",
|
|
"max NaN 9.200000 NaN NaN NaN \n",
|
|
"\n",
|
|
" Star2 Star3 Star4 No_of_Votes Gross \n",
|
|
"count 601 601 601 6.010000e+02 6.010000e+02 \n",
|
|
"unique 530 556 577 NaN NaN \n",
|
|
"top emma watson joe pesci michael caine NaN NaN \n",
|
|
"freq 5 4 4 NaN NaN \n",
|
|
"mean NaN NaN NaN 3.174649e+05 6.775699e+07 \n",
|
|
"std NaN NaN NaN 3.407094e+05 1.095511e+08 \n",
|
|
"min NaN NaN NaN 2.508800e+04 1.305000e+03 \n",
|
|
"25% NaN NaN NaN 6.846300e+04 3.151130e+06 \n",
|
|
"50% NaN NaN NaN 1.897160e+05 2.365000e+07 \n",
|
|
"75% NaN NaN NaN 4.622520e+05 7.891296e+07 \n",
|
|
"max NaN NaN NaN 2.303232e+06 8.583730e+08 "
|
|
]
|
|
},
|
|
"execution_count": 89,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data_train.describe(include=\"all\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 78,
|
|
"id": "alert-campus",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Series_Title</th>\n",
|
|
" <th>Released_Year</th>\n",
|
|
" <th>Certificate</th>\n",
|
|
" <th>Runtime</th>\n",
|
|
" <th>Genre</th>\n",
|
|
" <th>IMDB_Rating</th>\n",
|
|
" <th>Meta_score</th>\n",
|
|
" <th>Director</th>\n",
|
|
" <th>Star1</th>\n",
|
|
" <th>Star2</th>\n",
|
|
" <th>Star3</th>\n",
|
|
" <th>Star4</th>\n",
|
|
" <th>No_of_Votes</th>\n",
|
|
" <th>Gross</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115.000000</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>1.150000e+02</td>\n",
|
|
" <td>1.150000e+02</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>115</td>\n",
|
|
" <td>57</td>\n",
|
|
" <td>10</td>\n",
|
|
" <td>64</td>\n",
|
|
" <td>59</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>44</td>\n",
|
|
" <td>105</td>\n",
|
|
" <td>100</td>\n",
|
|
" <td>113</td>\n",
|
|
" <td>109</td>\n",
|
|
" <td>114</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>queen</td>\n",
|
|
" <td>2013</td>\n",
|
|
" <td>U</td>\n",
|
|
" <td>102 min</td>\n",
|
|
" <td>drama</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td></td>\n",
|
|
" <td>frank darabont</td>\n",
|
|
" <td>al pacino</td>\n",
|
|
" <td>emma watson</td>\n",
|
|
" <td>carrie fisher</td>\n",
|
|
" <td>lucy liu</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>7</td>\n",
|
|
" <td>30</td>\n",
|
|
" <td>7</td>\n",
|
|
" <td>14</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>16</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.947826</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.140691e+05</td>\n",
|
|
" <td>6.622925e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.313259</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.647432e+05</td>\n",
|
|
" <td>9.085320e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.600000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.669700e+04</td>\n",
|
|
" <td>1.095000e+04</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.700000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.206000e+04</td>\n",
|
|
" <td>4.232562e+06</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.900000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.654650e+05</td>\n",
|
|
" <td>2.602096e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.100000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.268040e+05</td>\n",
|
|
" <td>7.556908e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.300000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.343110e+06</td>\n",
|
|
" <td>3.808433e+08</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n",
|
|
"count 115 115 115 115 115 115.000000 \n",
|
|
"unique 115 57 10 64 59 NaN \n",
|
|
"top queen 2013 U 102 min drama NaN \n",
|
|
"freq 1 7 30 7 14 NaN \n",
|
|
"mean NaN NaN NaN NaN NaN 7.947826 \n",
|
|
"std NaN NaN NaN NaN NaN 0.313259 \n",
|
|
"min NaN NaN NaN NaN NaN 7.600000 \n",
|
|
"25% NaN NaN NaN NaN NaN 7.700000 \n",
|
|
"50% NaN NaN NaN NaN NaN 7.900000 \n",
|
|
"75% NaN NaN NaN NaN NaN 8.100000 \n",
|
|
"max NaN NaN NaN NaN NaN 9.300000 \n",
|
|
"\n",
|
|
" Meta_score Director Star1 Star2 Star3 \\\n",
|
|
"count 115 115 115 115 115 \n",
|
|
"unique 44 105 100 113 109 \n",
|
|
"top frank darabont al pacino emma watson carrie fisher \n",
|
|
"freq 16 2 4 2 2 \n",
|
|
"mean NaN NaN NaN NaN NaN \n",
|
|
"std NaN NaN NaN NaN NaN \n",
|
|
"min NaN NaN NaN NaN NaN \n",
|
|
"25% NaN NaN NaN NaN NaN \n",
|
|
"50% NaN NaN NaN NaN NaN \n",
|
|
"75% NaN NaN NaN NaN NaN \n",
|
|
"max NaN NaN NaN NaN NaN \n",
|
|
"\n",
|
|
" Star4 No_of_Votes Gross \n",
|
|
"count 115 1.150000e+02 1.150000e+02 \n",
|
|
"unique 114 NaN NaN \n",
|
|
"top lucy liu NaN NaN \n",
|
|
"freq 2 NaN NaN \n",
|
|
"mean NaN 3.140691e+05 6.622925e+07 \n",
|
|
"std NaN 3.647432e+05 9.085320e+07 \n",
|
|
"min NaN 2.669700e+04 1.095000e+04 \n",
|
|
"25% NaN 7.206000e+04 4.232562e+06 \n",
|
|
"50% NaN 1.654650e+05 2.602096e+07 \n",
|
|
"75% NaN 4.268040e+05 7.556908e+07 \n",
|
|
"max NaN 2.343110e+06 3.808433e+08 "
|
|
]
|
|
},
|
|
"execution_count": 78,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data_test.describe(include=\"all\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 79,
|
|
"id": "little-mathematics",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Series_Title</th>\n",
|
|
" <th>Released_Year</th>\n",
|
|
" <th>Certificate</th>\n",
|
|
" <th>Runtime</th>\n",
|
|
" <th>Genre</th>\n",
|
|
" <th>IMDB_Rating</th>\n",
|
|
" <th>Meta_score</th>\n",
|
|
" <th>Director</th>\n",
|
|
" <th>Star1</th>\n",
|
|
" <th>Star2</th>\n",
|
|
" <th>Star3</th>\n",
|
|
" <th>Star4</th>\n",
|
|
" <th>No_of_Votes</th>\n",
|
|
" <th>Gross</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115.000000</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>1.150000e+02</td>\n",
|
|
" <td>1.150000e+02</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>115</td>\n",
|
|
" <td>56</td>\n",
|
|
" <td>8</td>\n",
|
|
" <td>72</td>\n",
|
|
" <td>71</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>42</td>\n",
|
|
" <td>101</td>\n",
|
|
" <td>104</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>115</td>\n",
|
|
" <td>112</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>mr. smith goes to washington</td>\n",
|
|
" <td>2004</td>\n",
|
|
" <td>UA</td>\n",
|
|
" <td>120 min</td>\n",
|
|
" <td>drama</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td></td>\n",
|
|
" <td>billy wilder</td>\n",
|
|
" <td>johnny depp</td>\n",
|
|
" <td>charlize theron</td>\n",
|
|
" <td>joel edgerton</td>\n",
|
|
" <td>kevin bacon</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>6</td>\n",
|
|
" <td>28</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>8</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>12</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.940870</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.048547e+05</td>\n",
|
|
" <td>7.129188e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.269143</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.400764e+05</td>\n",
|
|
" <td>1.275242e+08</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.600000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.522900e+04</td>\n",
|
|
" <td>3.600000e+03</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.700000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.123350e+04</td>\n",
|
|
" <td>3.425538e+06</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.900000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.820090e+05</td>\n",
|
|
" <td>2.018666e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.100000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.148195e+05</td>\n",
|
|
" <td>8.406197e+07</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.800000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.067042e+06</td>\n",
|
|
" <td>9.366622e+08</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Series_Title Released_Year Certificate Runtime \\\n",
|
|
"count 115 115 115 115 \n",
|
|
"unique 115 56 8 72 \n",
|
|
"top mr. smith goes to washington 2004 UA 120 min \n",
|
|
"freq 1 6 28 5 \n",
|
|
"mean NaN NaN NaN NaN \n",
|
|
"std NaN NaN NaN NaN \n",
|
|
"min NaN NaN NaN NaN \n",
|
|
"25% NaN NaN NaN NaN \n",
|
|
"50% NaN NaN NaN NaN \n",
|
|
"75% NaN NaN NaN NaN \n",
|
|
"max NaN NaN NaN NaN \n",
|
|
"\n",
|
|
" Genre IMDB_Rating Meta_score Director Star1 \\\n",
|
|
"count 115 115.000000 115 115 115 \n",
|
|
"unique 71 NaN 42 101 104 \n",
|
|
"top drama NaN billy wilder johnny depp \n",
|
|
"freq 8 NaN 12 3 3 \n",
|
|
"mean NaN 7.940870 NaN NaN NaN \n",
|
|
"std NaN 0.269143 NaN NaN NaN \n",
|
|
"min NaN 7.600000 NaN NaN NaN \n",
|
|
"25% NaN 7.700000 NaN NaN NaN \n",
|
|
"50% NaN 7.900000 NaN NaN NaN \n",
|
|
"75% NaN 8.100000 NaN NaN NaN \n",
|
|
"max NaN 8.800000 NaN NaN NaN \n",
|
|
"\n",
|
|
" Star2 Star3 Star4 No_of_Votes \\\n",
|
|
"count 115 115 115 1.150000e+02 \n",
|
|
"unique 115 115 112 NaN \n",
|
|
"top charlize theron joel edgerton kevin bacon NaN \n",
|
|
"freq 1 1 2 NaN \n",
|
|
"mean NaN NaN NaN 3.048547e+05 \n",
|
|
"std NaN NaN NaN 3.400764e+05 \n",
|
|
"min NaN NaN NaN 2.522900e+04 \n",
|
|
"25% NaN NaN NaN 8.123350e+04 \n",
|
|
"50% NaN NaN NaN 1.820090e+05 \n",
|
|
"75% NaN NaN NaN 4.148195e+05 \n",
|
|
"max NaN NaN NaN 2.067042e+06 \n",
|
|
"\n",
|
|
" Gross \n",
|
|
"count 1.150000e+02 \n",
|
|
"unique NaN \n",
|
|
"top NaN \n",
|
|
"freq NaN \n",
|
|
"mean 7.129188e+07 \n",
|
|
"std 1.275242e+08 \n",
|
|
"min 3.600000e+03 \n",
|
|
"25% 3.425538e+06 \n",
|
|
"50% 2.018666e+07 \n",
|
|
"75% 8.406197e+07 \n",
|
|
"max 9.366622e+08 "
|
|
]
|
|
},
|
|
"execution_count": 79,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data_dev.describe(include=\"all\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 80,
|
|
"id": "sufficient-parade",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_test.to_csv(\"data_test.csv\", encoding=\"utf-8\", index=False)\n",
|
|
"data_dev.to_csv(\"data_dev.csv\", encoding=\"utf-8\", index=False)\n",
|
|
"data_train.to_csv(\"data_train.csv\", encoding=\"utf-8\", index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "accompanied-virtue",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|