1391 lines
46 KiB
Plaintext
1391 lines
46 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "injured-operations",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"### Skrypt do ściągnięcia zbioru danych"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "engaging-wholesale",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Install dependencies with %pip so they land in the environment of the running kernel\n",
|
||
|
"# (a shell-level `!pip` may belong to a different Python, and `--user` fails inside a virtualenv)\n",
|
||
|
"%pip install kaggle\n",
|
||
|
"%pip install pandas\n",
|
||
|
"%pip install numpy\n",
|
||
|
"%pip install seaborn\n",
|
||
|
"%pip install -U scikit-learn"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"id": "cleared-shower",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Downloading dataset from Kaggle...\n",
|
||
|
"/bin/bash: kaggle: command not found\n",
|
||
|
"Done.\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Download the dataset archive with the Kaggle command-line client.\n",
|
||
|
"# NOTE(review): the saved output of this cell shows `kaggle: command not found`, i.e. the CLI was\n",
|
||
|
"# not on PATH when it last ran - confirm the download really succeeds in your environment.\n",
|
||
|
"!echo \"Downloading dataset from Kaggle...\"\n",
|
||
|
"!kaggle datasets download -d harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows\n",
|
||
|
"!echo \"Done.\""
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"id": "pleased-culture",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Unzipping archive\n",
|
||
|
"Done.\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Extract the downloaded archive into the working directory.\n",
|
||
|
"# Each `!` line runs in its own shell, so a shell variable assigned here would be lost anyway;\n",
|
||
|
"# -o overwrites existing files without prompting (makes the cell re-runnable), -q hides the file listing.\n",
|
||
|
"!echo \"Unzipping archive\"\n",
|
||
|
"!unzip -o -q imdb-dataset-of-top-1000-movies-and-tv-shows.zip\n",
|
||
|
"!echo \"Done.\""
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 92,
|
||
|
"id": "extended-moderator",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"import numpy as np\n",
|
||
|
"# Read the dataset CSV (path relative to the working directory) into a DataFrame\n",
|
||
|
"data=pd.read_csv('imdb_top_1000.csv')\n",
|
||
|
"# data"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "outer-allah",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Usuwanie kolumn\n",
|
||
|
"- Poster_Link: kolumna zawierająca linki do plakatów promujących film\n",
|
||
|
"- Overview: kolumna zawierająca krótkie opisy fabuły poszczególnych filmów"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 93,
|
||
|
"id": "strange-honduras",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Drop the poster-link and overview columns in a single call\n",
|
||
|
"data = data.drop(columns=[\"Poster_Link\", \"Overview\"])\n",
|
||
|
"# data"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 95,
|
||
|
"id": "preceding-values",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Lowercase the text columns\n",
|
||
|
"for column in [\"Series_Title\", \"Genre\", \"Director\", \"Star1\", \"Star2\", \"Star3\", \"Star4\"]:\n",
|
||
|
"    data[column] = data[column].str.lower()\n",
|
||
|
"\n",
|
||
|
"# Replace NaN with '' in non-numeric columns only; a frame-wide replace would turn\n",
|
||
|
"# numeric columns such as Meta_score into mixed object columns and hide them from mean/median\n",
|
||
|
"text_columns = data.select_dtypes(exclude=\"number\").columns\n",
|
||
|
"data[text_columns] = data[text_columns].fillna('')\n",
|
||
|
"\n",
|
||
|
"# Strip the ',' separators from Gross and parse it as a number ('' becomes NaN);\n",
|
||
|
"# skipped when Gross is already numeric so the cell can be re-run safely\n",
|
||
|
"if not pd.api.types.is_numeric_dtype(data[\"Gross\"]):\n",
|
||
|
"    data[\"Gross\"] = pd.to_numeric(data[\"Gross\"].str.replace(',', '', regex=False))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 97,
|
||
|
"id": "standard-rates",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"1001 imdb_top_1000.csv\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Dataset size: line count of the CSV file (the count includes the header row)\n",
|
||
|
"!wc -l imdb_top_1000.csv"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 96,
|
||
|
"id": "experienced-nerve",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>Series_Title</th>\n",
|
||
|
" <th>Released_Year</th>\n",
|
||
|
" <th>Certificate</th>\n",
|
||
|
" <th>Runtime</th>\n",
|
||
|
" <th>Genre</th>\n",
|
||
|
" <th>IMDB_Rating</th>\n",
|
||
|
" <th>Meta_score</th>\n",
|
||
|
" <th>Director</th>\n",
|
||
|
" <th>Star1</th>\n",
|
||
|
" <th>Star2</th>\n",
|
||
|
" <th>Star3</th>\n",
|
||
|
" <th>Star4</th>\n",
|
||
|
" <th>No_of_Votes</th>\n",
|
||
|
" <th>Gross</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>count</th>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000.000000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1000</td>\n",
|
||
|
" <td>1.000000e+03</td>\n",
|
||
|
" <td>8.310000e+02</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>unique</th>\n",
|
||
|
" <td>999</td>\n",
|
||
|
" <td>100</td>\n",
|
||
|
" <td>17</td>\n",
|
||
|
" <td>140</td>\n",
|
||
|
" <td>202</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>64</td>\n",
|
||
|
" <td>548</td>\n",
|
||
|
" <td>660</td>\n",
|
||
|
" <td>841</td>\n",
|
||
|
" <td>891</td>\n",
|
||
|
" <td>939</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>top</th>\n",
|
||
|
" <td>drishyam</td>\n",
|
||
|
" <td>2014</td>\n",
|
||
|
" <td>U</td>\n",
|
||
|
" <td>130 min</td>\n",
|
||
|
" <td>drama</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td></td>\n",
|
||
|
" <td>alfred hitchcock</td>\n",
|
||
|
" <td>tom hanks</td>\n",
|
||
|
" <td>emma watson</td>\n",
|
||
|
" <td>rupert grint</td>\n",
|
||
|
" <td>michael caine</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>freq</th>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>32</td>\n",
|
||
|
" <td>234</td>\n",
|
||
|
" <td>23</td>\n",
|
||
|
" <td>85</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>157</td>\n",
|
||
|
" <td>14</td>\n",
|
||
|
" <td>12</td>\n",
|
||
|
" <td>7</td>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>mean</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.949300</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.736929e+05</td>\n",
|
||
|
" <td>6.803475e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>std</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>0.275491</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>3.273727e+05</td>\n",
|
||
|
" <td>1.097500e+08</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>min</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.600000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.508800e+04</td>\n",
|
||
|
" <td>1.305000e+03</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>25%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.700000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>5.552625e+04</td>\n",
|
||
|
" <td>3.253559e+06</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>50%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.900000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>1.385485e+05</td>\n",
|
||
|
" <td>2.353089e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>75%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>8.100000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>3.741612e+05</td>\n",
|
||
|
" <td>8.075089e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>max</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>9.300000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.343110e+06</td>\n",
|
||
|
" <td>9.366622e+08</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n",
|
||
|
"count 1000 1000 1000 1000 1000 1000.000000 \n",
|
||
|
"unique 999 100 17 140 202 NaN \n",
|
||
|
"top drishyam 2014 U 130 min drama NaN \n",
|
||
|
"freq 2 32 234 23 85 NaN \n",
|
||
|
"mean NaN NaN NaN NaN NaN 7.949300 \n",
|
||
|
"std NaN NaN NaN NaN NaN 0.275491 \n",
|
||
|
"min NaN NaN NaN NaN NaN 7.600000 \n",
|
||
|
"25% NaN NaN NaN NaN NaN 7.700000 \n",
|
||
|
"50% NaN NaN NaN NaN NaN 7.900000 \n",
|
||
|
"75% NaN NaN NaN NaN NaN 8.100000 \n",
|
||
|
"max NaN NaN NaN NaN NaN 9.300000 \n",
|
||
|
"\n",
|
||
|
" Meta_score Director Star1 Star2 Star3 \\\n",
|
||
|
"count 1000 1000 1000 1000 1000 \n",
|
||
|
"unique 64 548 660 841 891 \n",
|
||
|
"top alfred hitchcock tom hanks emma watson rupert grint \n",
|
||
|
"freq 157 14 12 7 5 \n",
|
||
|
"mean NaN NaN NaN NaN NaN \n",
|
||
|
"std NaN NaN NaN NaN NaN \n",
|
||
|
"min NaN NaN NaN NaN NaN \n",
|
||
|
"25% NaN NaN NaN NaN NaN \n",
|
||
|
"50% NaN NaN NaN NaN NaN \n",
|
||
|
"75% NaN NaN NaN NaN NaN \n",
|
||
|
"max NaN NaN NaN NaN NaN \n",
|
||
|
"\n",
|
||
|
" Star4 No_of_Votes Gross \n",
|
||
|
"count 1000 1.000000e+03 8.310000e+02 \n",
|
||
|
"unique 939 NaN NaN \n",
|
||
|
"top michael caine NaN NaN \n",
|
||
|
"freq 4 NaN NaN \n",
|
||
|
"mean NaN 2.736929e+05 6.803475e+07 \n",
|
||
|
"std NaN 3.273727e+05 1.097500e+08 \n",
|
||
|
"min NaN 2.508800e+04 1.305000e+03 \n",
|
||
|
"25% NaN 5.552625e+04 3.253559e+06 \n",
|
||
|
"50% NaN 1.385485e+05 2.353089e+07 \n",
|
||
|
"75% NaN 3.741612e+05 8.075089e+07 \n",
|
||
|
"max NaN 2.343110e+06 9.366622e+08 "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 96,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Summary statistics for every column of the cleaned frame, numeric and non-numeric\n",
|
||
|
"data.describe(include='all')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 98,
|
||
|
"id": "academic-principle",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"IMDB_Rating 7.9\n",
|
||
|
"No_of_Votes 138548.5\n",
|
||
|
"Gross 23530892.0\n",
|
||
|
"dtype: float64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 98,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Median of the numeric columns. numeric_only=True is required on pandas >= 2.0, where the\n",
|
||
|
"# default no longer skips text columns and a bare median() raises TypeError on this frame\n",
|
||
|
"data.median(numeric_only=True)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 99,
|
||
|
"id": "spatial-unemployment",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"(600, 14)\n",
|
||
|
"(200, 14)\n",
|
||
|
"(200, 14)\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"import sklearn\n",
|
||
|
"\n",
|
||
|
"# Hold out 400 rows, then split the hold-out evenly into test and dev sets\n",
|
||
|
"data_train, data_holdout = train_test_split(data, test_size=400, random_state=1)\n",
|
||
|
"data_test, data_dev = train_test_split(data_holdout, test_size=200, random_state=1)\n",
|
||
|
"for subset in (data_train, data_test, data_dev):\n",
|
||
|
"    print(subset.shape)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 100,
|
||
|
"id": "weird-webmaster",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"0.3333333333333333"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 100,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Test-to-train size ratio; both frames have the same columns, so the row ratio equals the element ratio\n",
|
||
|
"len(data_test) / len(data_train)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 101,
|
||
|
"id": "narrow-assumption",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>Series_Title</th>\n",
|
||
|
" <th>Released_Year</th>\n",
|
||
|
" <th>Certificate</th>\n",
|
||
|
" <th>Runtime</th>\n",
|
||
|
" <th>Genre</th>\n",
|
||
|
" <th>IMDB_Rating</th>\n",
|
||
|
" <th>Meta_score</th>\n",
|
||
|
" <th>Director</th>\n",
|
||
|
" <th>Star1</th>\n",
|
||
|
" <th>Star2</th>\n",
|
||
|
" <th>Star3</th>\n",
|
||
|
" <th>Star4</th>\n",
|
||
|
" <th>No_of_Votes</th>\n",
|
||
|
" <th>Gross</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>count</th>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600.000000</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>600</td>\n",
|
||
|
" <td>6.000000e+02</td>\n",
|
||
|
" <td>5.050000e+02</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>unique</th>\n",
|
||
|
" <td>599</td>\n",
|
||
|
" <td>95</td>\n",
|
||
|
" <td>17</td>\n",
|
||
|
" <td>122</td>\n",
|
||
|
" <td>163</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>61</td>\n",
|
||
|
" <td>399</td>\n",
|
||
|
" <td>448</td>\n",
|
||
|
" <td>535</td>\n",
|
||
|
" <td>545</td>\n",
|
||
|
" <td>580</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>top</th>\n",
|
||
|
" <td>drishyam</td>\n",
|
||
|
" <td>2004</td>\n",
|
||
|
" <td>U</td>\n",
|
||
|
" <td>130 min</td>\n",
|
||
|
" <td>drama</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td></td>\n",
|
||
|
" <td>david fincher</td>\n",
|
||
|
" <td>robert de niro</td>\n",
|
||
|
" <td>emma watson</td>\n",
|
||
|
" <td>rupert grint</td>\n",
|
||
|
" <td>michael caine</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>freq</th>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>22</td>\n",
|
||
|
" <td>133</td>\n",
|
||
|
" <td>17</td>\n",
|
||
|
" <td>45</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>88</td>\n",
|
||
|
" <td>8</td>\n",
|
||
|
" <td>8</td>\n",
|
||
|
" <td>6</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>mean</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.947167</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.765041e+05</td>\n",
|
||
|
" <td>6.726714e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>std</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>0.269282</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>3.219888e+05</td>\n",
|
||
|
" <td>1.076309e+08</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>min</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.600000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.508800e+04</td>\n",
|
||
|
" <td>1.305000e+03</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>25%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.700000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>5.705600e+04</td>\n",
|
||
|
" <td>3.108485e+06</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>50%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.900000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>1.414865e+05</td>\n",
|
||
|
" <td>2.447542e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>75%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>8.100000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>3.907975e+05</td>\n",
|
||
|
" <td>8.340000e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>max</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>9.200000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>1.854740e+06</td>\n",
|
||
|
" <td>9.366622e+08</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n",
|
||
|
"count 600 600 600 600 600 600.000000 \n",
|
||
|
"unique 599 95 17 122 163 NaN \n",
|
||
|
"top drishyam 2004 U 130 min drama NaN \n",
|
||
|
"freq 2 22 133 17 45 NaN \n",
|
||
|
"mean NaN NaN NaN NaN NaN 7.947167 \n",
|
||
|
"std NaN NaN NaN NaN NaN 0.269282 \n",
|
||
|
"min NaN NaN NaN NaN NaN 7.600000 \n",
|
||
|
"25% NaN NaN NaN NaN NaN 7.700000 \n",
|
||
|
"50% NaN NaN NaN NaN NaN 7.900000 \n",
|
||
|
"75% NaN NaN NaN NaN NaN 8.100000 \n",
|
||
|
"max NaN NaN NaN NaN NaN 9.200000 \n",
|
||
|
"\n",
|
||
|
" Meta_score Director Star1 Star2 Star3 \\\n",
|
||
|
"count 600 600 600 600 600 \n",
|
||
|
"unique 61 399 448 535 545 \n",
|
||
|
"top david fincher robert de niro emma watson rupert grint \n",
|
||
|
"freq 88 8 8 6 4 \n",
|
||
|
"mean NaN NaN NaN NaN NaN \n",
|
||
|
"std NaN NaN NaN NaN NaN \n",
|
||
|
"min NaN NaN NaN NaN NaN \n",
|
||
|
"25% NaN NaN NaN NaN NaN \n",
|
||
|
"50% NaN NaN NaN NaN NaN \n",
|
||
|
"75% NaN NaN NaN NaN NaN \n",
|
||
|
"max NaN NaN NaN NaN NaN \n",
|
||
|
"\n",
|
||
|
" Star4 No_of_Votes Gross \n",
|
||
|
"count 600 6.000000e+02 5.050000e+02 \n",
|
||
|
"unique 580 NaN NaN \n",
|
||
|
"top michael caine NaN NaN \n",
|
||
|
"freq 3 NaN NaN \n",
|
||
|
"mean NaN 2.765041e+05 6.726714e+07 \n",
|
||
|
"std NaN 3.219888e+05 1.076309e+08 \n",
|
||
|
"min NaN 2.508800e+04 1.305000e+03 \n",
|
||
|
"25% NaN 5.705600e+04 3.108485e+06 \n",
|
||
|
"50% NaN 1.414865e+05 2.447542e+07 \n",
|
||
|
"75% NaN 3.907975e+05 8.340000e+07 \n",
|
||
|
"max NaN 1.854740e+06 9.366622e+08 "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 101,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Summary statistics of the training split, for comparison with the full dataset\n",
|
||
|
"data_train.describe(include=\"all\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 102,
|
||
|
"id": "significant-median",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>Series_Title</th>\n",
|
||
|
" <th>Released_Year</th>\n",
|
||
|
" <th>Certificate</th>\n",
|
||
|
" <th>Runtime</th>\n",
|
||
|
" <th>Genre</th>\n",
|
||
|
" <th>IMDB_Rating</th>\n",
|
||
|
" <th>Meta_score</th>\n",
|
||
|
" <th>Director</th>\n",
|
||
|
" <th>Star1</th>\n",
|
||
|
" <th>Star2</th>\n",
|
||
|
" <th>Star3</th>\n",
|
||
|
" <th>Star4</th>\n",
|
||
|
" <th>No_of_Votes</th>\n",
|
||
|
" <th>Gross</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>count</th>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200.000000</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>2.000000e+02</td>\n",
|
||
|
" <td>1.600000e+02</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>unique</th>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>75</td>\n",
|
||
|
" <td>10</td>\n",
|
||
|
" <td>88</td>\n",
|
||
|
" <td>98</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>49</td>\n",
|
||
|
" <td>162</td>\n",
|
||
|
" <td>172</td>\n",
|
||
|
" <td>192</td>\n",
|
||
|
" <td>197</td>\n",
|
||
|
" <td>198</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>top</th>\n",
|
||
|
" <td>in america</td>\n",
|
||
|
" <td>2003</td>\n",
|
||
|
" <td>A</td>\n",
|
||
|
" <td>118 min</td>\n",
|
||
|
" <td>drama</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td></td>\n",
|
||
|
" <td>woody allen</td>\n",
|
||
|
" <td>humphrey bogart</td>\n",
|
||
|
" <td>robert downey jr.</td>\n",
|
||
|
" <td>lea thompson</td>\n",
|
||
|
" <td>mark ruffalo</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>freq</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>6</td>\n",
|
||
|
" <td>48</td>\n",
|
||
|
" <td>7</td>\n",
|
||
|
" <td>23</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>30</td>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>mean</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.949500</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.826230e+05</td>\n",
|
||
|
" <td>7.249570e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>std</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>0.290381</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>3.501372e+05</td>\n",
|
||
|
" <td>1.224538e+08</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>min</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.600000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.519800e+04</td>\n",
|
||
|
" <td>6.013000e+03</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>25%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.700000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>5.038950e+04</td>\n",
|
||
|
" <td>3.786699e+06</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>50%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.900000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>1.354640e+05</td>\n",
|
||
|
" <td>2.325044e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>75%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>8.100000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>3.851505e+05</td>\n",
|
||
|
" <td>7.603522e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>max</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>9.000000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.303232e+06</td>\n",
|
||
|
" <td>8.583730e+08</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n",
|
||
|
"count 200 200 200 200 200 200.000000 \n",
|
||
|
"unique 200 75 10 88 98 NaN \n",
|
||
|
"top in america 2003 A 118 min drama NaN \n",
|
||
|
"freq 1 6 48 7 23 NaN \n",
|
||
|
"mean NaN NaN NaN NaN NaN 7.949500 \n",
|
||
|
"std NaN NaN NaN NaN NaN 0.290381 \n",
|
||
|
"min NaN NaN NaN NaN NaN 7.600000 \n",
|
||
|
"25% NaN NaN NaN NaN NaN 7.700000 \n",
|
||
|
"50% NaN NaN NaN NaN NaN 7.900000 \n",
|
||
|
"75% NaN NaN NaN NaN NaN 8.100000 \n",
|
||
|
"max NaN NaN NaN NaN NaN 9.000000 \n",
|
||
|
"\n",
|
||
|
" Meta_score Director Star1 Star2 \\\n",
|
||
|
"count 200 200 200 200 \n",
|
||
|
"unique 49 162 172 192 \n",
|
||
|
"top woody allen humphrey bogart robert downey jr. \n",
|
||
|
"freq 30 5 4 2 \n",
|
||
|
"mean NaN NaN NaN NaN \n",
|
||
|
"std NaN NaN NaN NaN \n",
|
||
|
"min NaN NaN NaN NaN \n",
|
||
|
"25% NaN NaN NaN NaN \n",
|
||
|
"50% NaN NaN NaN NaN \n",
|
||
|
"75% NaN NaN NaN NaN \n",
|
||
|
"max NaN NaN NaN NaN \n",
|
||
|
"\n",
|
||
|
" Star3 Star4 No_of_Votes Gross \n",
|
||
|
"count 200 200 2.000000e+02 1.600000e+02 \n",
|
||
|
"unique 197 198 NaN NaN \n",
|
||
|
"top lea thompson mark ruffalo NaN NaN \n",
|
||
|
"freq 2 2 NaN NaN \n",
|
||
|
"mean NaN NaN 2.826230e+05 7.249570e+07 \n",
|
||
|
"std NaN NaN 3.501372e+05 1.224538e+08 \n",
|
||
|
"min NaN NaN 2.519800e+04 6.013000e+03 \n",
|
||
|
"25% NaN NaN 5.038950e+04 3.786699e+06 \n",
|
||
|
"50% NaN NaN 1.354640e+05 2.325044e+07 \n",
|
||
|
"75% NaN NaN 3.851505e+05 7.603522e+07 \n",
|
||
|
"max NaN NaN 2.303232e+06 8.583730e+08 "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 102,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Summary statistics of the test split, for comparison with the training split\n",
|
||
|
"data_test.describe(include=\"all\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 103,
|
||
|
"id": "blessed-socket",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>Series_Title</th>\n",
|
||
|
" <th>Released_Year</th>\n",
|
||
|
" <th>Certificate</th>\n",
|
||
|
" <th>Runtime</th>\n",
|
||
|
" <th>Genre</th>\n",
|
||
|
" <th>IMDB_Rating</th>\n",
|
||
|
" <th>Meta_score</th>\n",
|
||
|
" <th>Director</th>\n",
|
||
|
" <th>Star1</th>\n",
|
||
|
" <th>Star2</th>\n",
|
||
|
" <th>Star3</th>\n",
|
||
|
" <th>Star4</th>\n",
|
||
|
" <th>No_of_Votes</th>\n",
|
||
|
" <th>Gross</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>count</th>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200.000000</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>2.000000e+02</td>\n",
|
||
|
" <td>1.660000e+02</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>unique</th>\n",
|
||
|
" <td>200</td>\n",
|
||
|
" <td>70</td>\n",
|
||
|
" <td>10</td>\n",
|
||
|
" <td>89</td>\n",
|
||
|
" <td>91</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>47</td>\n",
|
||
|
" <td>162</td>\n",
|
||
|
" <td>176</td>\n",
|
||
|
" <td>191</td>\n",
|
||
|
" <td>195</td>\n",
|
||
|
" <td>199</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>top</th>\n",
|
||
|
" <td>clerks</td>\n",
|
||
|
" <td>2014</td>\n",
|
||
|
" <td>U</td>\n",
|
||
|
" <td>106 min</td>\n",
|
||
|
" <td>drama</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td></td>\n",
|
||
|
" <td>steven spielberg</td>\n",
|
||
|
" <td>toshirô mifune</td>\n",
|
||
|
" <td>ed harris</td>\n",
|
||
|
" <td>frances mcdormand</td>\n",
|
||
|
" <td>lucy liu</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>freq</th>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>11</td>\n",
|
||
|
" <td>57</td>\n",
|
||
|
" <td>6</td>\n",
|
||
|
" <td>17</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>39</td>\n",
|
||
|
" <td>6</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>mean</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.955500</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.563293e+05</td>\n",
|
||
|
" <td>6.607024e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>std</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>0.279931</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>3.208478e+05</td>\n",
|
||
|
" <td>1.035885e+08</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>min</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.600000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.593800e+04</td>\n",
|
||
|
" <td>6.460000e+03</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>25%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.700000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>5.946375e+04</td>\n",
|
||
|
" <td>3.392077e+06</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>50%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>7.900000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>1.256995e+05</td>\n",
|
||
|
" <td>2.249226e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>75%</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>8.100000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>3.365100e+05</td>\n",
|
||
|
" <td>7.597351e+07</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>max</th>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>9.300000</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>NaN</td>\n",
|
||
|
" <td>2.343110e+06</td>\n",
|
||
|
" <td>6.085817e+08</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n",
|
||
|
"count 200 200 200 200 200 200.000000 \n",
|
||
|
"unique 200 70 10 89 91 NaN \n",
|
||
|
"top clerks 2014 U 106 min drama NaN \n",
|
||
|
"freq 1 11 57 6 17 NaN \n",
|
||
|
"mean NaN NaN NaN NaN NaN 7.955500 \n",
|
||
|
"std NaN NaN NaN NaN NaN 0.279931 \n",
|
||
|
"min NaN NaN NaN NaN NaN 7.600000 \n",
|
||
|
"25% NaN NaN NaN NaN NaN 7.700000 \n",
|
||
|
"50% NaN NaN NaN NaN NaN 7.900000 \n",
|
||
|
"75% NaN NaN NaN NaN NaN 8.100000 \n",
|
||
|
"max NaN NaN NaN NaN NaN 9.300000 \n",
|
||
|
"\n",
|
||
|
" Meta_score Director Star1 Star2 \\\n",
|
||
|
"count 200 200 200 200 \n",
|
||
|
"unique 47 162 176 191 \n",
|
||
|
"top steven spielberg toshirô mifune ed harris \n",
|
||
|
"freq 39 6 4 3 \n",
|
||
|
"mean NaN NaN NaN NaN \n",
|
||
|
"std NaN NaN NaN NaN \n",
|
||
|
"min NaN NaN NaN NaN \n",
|
||
|
"25% NaN NaN NaN NaN \n",
|
||
|
"50% NaN NaN NaN NaN \n",
|
||
|
"75% NaN NaN NaN NaN \n",
|
||
|
"max NaN NaN NaN NaN \n",
|
||
|
"\n",
|
||
|
" Star3 Star4 No_of_Votes Gross \n",
|
||
|
"count 200 200 2.000000e+02 1.660000e+02 \n",
|
||
|
"unique 195 199 NaN NaN \n",
|
||
|
"top frances mcdormand lucy liu NaN NaN \n",
|
||
|
"freq 3 2 NaN NaN \n",
|
||
|
"mean NaN NaN 2.563293e+05 6.607024e+07 \n",
|
||
|
"std NaN NaN 3.208478e+05 1.035885e+08 \n",
|
||
|
"min NaN NaN 2.593800e+04 6.460000e+03 \n",
|
||
|
"25% NaN NaN 5.946375e+04 3.392077e+06 \n",
|
||
|
"50% NaN NaN 1.256995e+05 2.249226e+07 \n",
|
||
|
"75% NaN NaN 3.365100e+05 7.597351e+07 \n",
|
||
|
"max NaN NaN 2.343110e+06 6.085817e+08 "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 103,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"data_dev.describe(include=\"all\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "continuing-chambers",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
"# Write each split to its own CSV file (UTF-8, index column omitted).\n",
"# NOTE(review): assumes the training split is bound to `data_train`, following the\n",
"# `data_test` / `data_dev` naming used for the other splits - confirm against the split cell.\n",
"data_test.to_csv(\"crime_test.csv\", encoding=\"utf-8\", index=False)\n",
"data_dev.to_csv(\"crime_dev.csv\", encoding=\"utf-8\", index=False)\n",
"data_train.to_csv(\"crime_train.csv\", encoding=\"utf-8\", index=False)"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.7.3"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|