ium_444018/script.ipynb

1391 lines
46 KiB
Plaintext
Raw Normal View History

2022-03-21 02:06:46 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "injured-operations",
"metadata": {},
"source": [
"### Skrypt do ściągnięcia zbioru danych"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "engaging-wholesale",
"metadata": {},
"outputs": [],
"source": [
"# Install the notebook's dependencies. The %pip magic (unlike !pip) installs\n",
"# into the environment of the running kernel.\n",
"%pip install --user kaggle\n",
"%pip install --user pandas\n",
"%pip install --user numpy\n",
"%pip install --user seaborn\n",
"%pip install -U scikit-learn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cleared-shower",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading dataset from Kaggle...\n",
"/bin/bash: kaggle: command not found\n",
"Done.\n"
]
}
],
"source": [
"# Download the IMDB top-1000 dataset archive with the Kaggle CLI.\n",
"# `pip install --user` puts the `kaggle` script in ~/.local/bin, which may not be\n",
"# on PATH (a previous run failed with \"kaggle: command not found\"), so it is\n",
"# prepended here. \"Done.\" is printed only when the download succeeds.\n",
"!echo \"Downloading dataset from Kaggle...\"\n",
"!PATH=\"$HOME/.local/bin:$PATH\" kaggle datasets download -d harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows && echo \"Done.\""
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "pleased-culture",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unzipping archive\n",
"Done.\n"
]
}
],
"source": [
"# Extract the downloaded archive. -o overwrites existing files without\n",
"# prompting (so the cell can be re-run) and -q suppresses the file listing.\n",
"# \"Done.\" is printed only when unzip succeeds.\n",
"!echo \"Unzipping archive\"\n",
"!unzip -o -q imdb-dataset-of-top-1000-movies-and-tv-shows.zip && echo \"Done.\""
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "extended-moderator",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Load the IMDB top-1000 CSV into a DataFrame.\n",
"data = pd.read_csv('imdb_top_1000.csv')"
]
},
{
"cell_type": "markdown",
"id": "outer-allah",
"metadata": {},
"source": [
"## Usuwanie kolumn\n",
"- Poster_Link: kolumna zawierająca linki do plakatów promujących film\n",
"- Overview: kolumna zawierająca recenzje poszczególnych filmów"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "strange-honduras",
"metadata": {},
"outputs": [],
"source": [
"# Drop the columns that are not needed: poster links and overview text.\n",
"# Reassigning (instead of inplace=True) with errors=\"ignore\" keeps the cell\n",
"# safe to re-run after the columns have already been removed.\n",
"data = data.drop(columns=[\"Poster_Link\", \"Overview\"], errors=\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "preceding-values",
"metadata": {},
"outputs": [],
"source": [
"# Lowercase the free-text columns.\n",
"text_columns = [\"Series_Title\", \"Genre\", \"Director\", \"Star1\", \"Star2\", \"Star3\", \"Star4\"]\n",
"for column in text_columns:\n",
"    data[column] = data[column].str.lower()\n",
"\n",
"# Replace missing values with '' in the non-numeric columns only. A blanket\n",
"# NaN -> '' replacement would also turn numeric columns such as Meta_score\n",
"# into object dtype and exclude them from the numeric statistics.\n",
"non_numeric_columns = data.select_dtypes(exclude=\"number\").columns\n",
"data[non_numeric_columns] = data[non_numeric_columns].fillna('')\n",
"\n",
"# Gross is read as text with thousands separators; strip them and convert to a\n",
"# number (empty strings become NaN). The dtype guard keeps the cell re-runnable\n",
"# once Gross is already numeric.\n",
"if not pd.api.types.is_numeric_dtype(data[\"Gross\"]):\n",
"    data[\"Gross\"] = pd.to_numeric(data[\"Gross\"].str.replace(',', '', regex=False))"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "standard-rates",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1001 imdb_top_1000.csv\n"
]
}
],
"source": [
"# Dataset size: line count of the CSV file (includes the header row)\n",
"!wc -l imdb_top_1000.csv"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "experienced-nerve",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Series_Title</th>\n",
" <th>Released_Year</th>\n",
" <th>Certificate</th>\n",
" <th>Runtime</th>\n",
" <th>Genre</th>\n",
" <th>IMDB_Rating</th>\n",
" <th>Meta_score</th>\n",
" <th>Director</th>\n",
" <th>Star1</th>\n",
" <th>Star2</th>\n",
" <th>Star3</th>\n",
" <th>Star4</th>\n",
" <th>No_of_Votes</th>\n",
" <th>Gross</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1000</td>\n",
" <td>1000</td>\n",
" <td>1000</td>\n",
" <td>1000</td>\n",
" <td>1000</td>\n",
" <td>1000.000000</td>\n",
" <td>1000</td>\n",
" <td>1000</td>\n",
" <td>1000</td>\n",
" <td>1000</td>\n",
" <td>1000</td>\n",
" <td>1000</td>\n",
" <td>1.000000e+03</td>\n",
" <td>8.310000e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>999</td>\n",
" <td>100</td>\n",
" <td>17</td>\n",
" <td>140</td>\n",
" <td>202</td>\n",
" <td>NaN</td>\n",
" <td>64</td>\n",
" <td>548</td>\n",
" <td>660</td>\n",
" <td>841</td>\n",
" <td>891</td>\n",
" <td>939</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>drishyam</td>\n",
" <td>2014</td>\n",
" <td>U</td>\n",
" <td>130 min</td>\n",
" <td>drama</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td>alfred hitchcock</td>\n",
" <td>tom hanks</td>\n",
" <td>emma watson</td>\n",
" <td>rupert grint</td>\n",
" <td>michael caine</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>2</td>\n",
" <td>32</td>\n",
" <td>234</td>\n",
" <td>23</td>\n",
" <td>85</td>\n",
" <td>NaN</td>\n",
" <td>157</td>\n",
" <td>14</td>\n",
" <td>12</td>\n",
" <td>7</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.949300</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.736929e+05</td>\n",
" <td>6.803475e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.275491</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.273727e+05</td>\n",
" <td>1.097500e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.600000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.508800e+04</td>\n",
" <td>1.305000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.700000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5.552625e+04</td>\n",
" <td>3.253559e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.385485e+05</td>\n",
" <td>2.353089e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8.100000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.741612e+05</td>\n",
" <td>8.075089e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9.300000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.343110e+06</td>\n",
" <td>9.366622e+08</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n",
"count 1000 1000 1000 1000 1000 1000.000000 \n",
"unique 999 100 17 140 202 NaN \n",
"top drishyam 2014 U 130 min drama NaN \n",
"freq 2 32 234 23 85 NaN \n",
"mean NaN NaN NaN NaN NaN 7.949300 \n",
"std NaN NaN NaN NaN NaN 0.275491 \n",
"min NaN NaN NaN NaN NaN 7.600000 \n",
"25% NaN NaN NaN NaN NaN 7.700000 \n",
"50% NaN NaN NaN NaN NaN 7.900000 \n",
"75% NaN NaN NaN NaN NaN 8.100000 \n",
"max NaN NaN NaN NaN NaN 9.300000 \n",
"\n",
" Meta_score Director Star1 Star2 Star3 \\\n",
"count 1000 1000 1000 1000 1000 \n",
"unique 64 548 660 841 891 \n",
"top alfred hitchcock tom hanks emma watson rupert grint \n",
"freq 157 14 12 7 5 \n",
"mean NaN NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN NaN \n",
"min NaN NaN NaN NaN NaN \n",
"25% NaN NaN NaN NaN NaN \n",
"50% NaN NaN NaN NaN NaN \n",
"75% NaN NaN NaN NaN NaN \n",
"max NaN NaN NaN NaN NaN \n",
"\n",
" Star4 No_of_Votes Gross \n",
"count 1000 1.000000e+03 8.310000e+02 \n",
"unique 939 NaN NaN \n",
"top michael caine NaN NaN \n",
"freq 4 NaN NaN \n",
"mean NaN 2.736929e+05 6.803475e+07 \n",
"std NaN 3.273727e+05 1.097500e+08 \n",
"min NaN 2.508800e+04 1.305000e+03 \n",
"25% NaN 5.552625e+04 3.253559e+06 \n",
"50% NaN 1.385485e+05 2.353089e+07 \n",
"75% NaN 3.741612e+05 8.075089e+07 \n",
"max NaN 2.343110e+06 9.366622e+08 "
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "academic-principle",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"IMDB_Rating 7.9\n",
"No_of_Votes 138548.5\n",
"Gross 23530892.0\n",
"dtype: float64"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Median of the numeric columns only. Without numeric_only=True, recent pandas\n",
"# versions raise a TypeError because the frame also contains text columns.\n",
"data.median(numeric_only=True)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "spatial-unemployment",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(600, 14)\n",
"(200, 14)\n",
"(200, 14)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import sklearn\n",
"\n",
"# Two-stage split: hold out 400 rows first, then divide the holdout into a\n",
"# test set and a dev set of 200 rows each. The fixed random_state makes the\n",
"# split reproducible.\n",
"data_train, data_holdout = train_test_split(data, test_size=400, random_state=1)\n",
"data_test, data_dev = train_test_split(data_holdout, test_size=200, random_state=1)\n",
"for subset in (data_train, data_test, data_dev):\n",
"    print(subset.shape)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "weird-webmaster",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.3333333333333333"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_test.size/data_train.size"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "narrow-assumption",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Series_Title</th>\n",
" <th>Released_Year</th>\n",
" <th>Certificate</th>\n",
" <th>Runtime</th>\n",
" <th>Genre</th>\n",
" <th>IMDB_Rating</th>\n",
" <th>Meta_score</th>\n",
" <th>Director</th>\n",
" <th>Star1</th>\n",
" <th>Star2</th>\n",
" <th>Star3</th>\n",
" <th>Star4</th>\n",
" <th>No_of_Votes</th>\n",
" <th>Gross</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>600</td>\n",
" <td>600</td>\n",
" <td>600</td>\n",
" <td>600</td>\n",
" <td>600</td>\n",
" <td>600.000000</td>\n",
" <td>600</td>\n",
" <td>600</td>\n",
" <td>600</td>\n",
" <td>600</td>\n",
" <td>600</td>\n",
" <td>600</td>\n",
" <td>6.000000e+02</td>\n",
" <td>5.050000e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>599</td>\n",
" <td>95</td>\n",
" <td>17</td>\n",
" <td>122</td>\n",
" <td>163</td>\n",
" <td>NaN</td>\n",
" <td>61</td>\n",
" <td>399</td>\n",
" <td>448</td>\n",
" <td>535</td>\n",
" <td>545</td>\n",
" <td>580</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>drishyam</td>\n",
" <td>2004</td>\n",
" <td>U</td>\n",
" <td>130 min</td>\n",
" <td>drama</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td>david fincher</td>\n",
" <td>robert de niro</td>\n",
" <td>emma watson</td>\n",
" <td>rupert grint</td>\n",
" <td>michael caine</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>2</td>\n",
" <td>22</td>\n",
" <td>133</td>\n",
" <td>17</td>\n",
" <td>45</td>\n",
" <td>NaN</td>\n",
" <td>88</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.947167</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.765041e+05</td>\n",
" <td>6.726714e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.269282</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.219888e+05</td>\n",
" <td>1.076309e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.600000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.508800e+04</td>\n",
" <td>1.305000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.700000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5.705600e+04</td>\n",
" <td>3.108485e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.414865e+05</td>\n",
" <td>2.447542e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8.100000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.907975e+05</td>\n",
" <td>8.340000e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9.200000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.854740e+06</td>\n",
" <td>9.366622e+08</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n",
"count 600 600 600 600 600 600.000000 \n",
"unique 599 95 17 122 163 NaN \n",
"top drishyam 2004 U 130 min drama NaN \n",
"freq 2 22 133 17 45 NaN \n",
"mean NaN NaN NaN NaN NaN 7.947167 \n",
"std NaN NaN NaN NaN NaN 0.269282 \n",
"min NaN NaN NaN NaN NaN 7.600000 \n",
"25% NaN NaN NaN NaN NaN 7.700000 \n",
"50% NaN NaN NaN NaN NaN 7.900000 \n",
"75% NaN NaN NaN NaN NaN 8.100000 \n",
"max NaN NaN NaN NaN NaN 9.200000 \n",
"\n",
" Meta_score Director Star1 Star2 Star3 \\\n",
"count 600 600 600 600 600 \n",
"unique 61 399 448 535 545 \n",
"top david fincher robert de niro emma watson rupert grint \n",
"freq 88 8 8 6 4 \n",
"mean NaN NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN NaN \n",
"min NaN NaN NaN NaN NaN \n",
"25% NaN NaN NaN NaN NaN \n",
"50% NaN NaN NaN NaN NaN \n",
"75% NaN NaN NaN NaN NaN \n",
"max NaN NaN NaN NaN NaN \n",
"\n",
" Star4 No_of_Votes Gross \n",
"count 600 6.000000e+02 5.050000e+02 \n",
"unique 580 NaN NaN \n",
"top michael caine NaN NaN \n",
"freq 3 NaN NaN \n",
"mean NaN 2.765041e+05 6.726714e+07 \n",
"std NaN 3.219888e+05 1.076309e+08 \n",
"min NaN 2.508800e+04 1.305000e+03 \n",
"25% NaN 5.705600e+04 3.108485e+06 \n",
"50% NaN 1.414865e+05 2.447542e+07 \n",
"75% NaN 3.907975e+05 8.340000e+07 \n",
"max NaN 1.854740e+06 9.366622e+08 "
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_train.describe(include=\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 102,
"id": "significant-median",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Series_Title</th>\n",
" <th>Released_Year</th>\n",
" <th>Certificate</th>\n",
" <th>Runtime</th>\n",
" <th>Genre</th>\n",
" <th>IMDB_Rating</th>\n",
" <th>Meta_score</th>\n",
" <th>Director</th>\n",
" <th>Star1</th>\n",
" <th>Star2</th>\n",
" <th>Star3</th>\n",
" <th>Star4</th>\n",
" <th>No_of_Votes</th>\n",
" <th>Gross</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200.000000</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>2.000000e+02</td>\n",
" <td>1.600000e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>200</td>\n",
" <td>75</td>\n",
" <td>10</td>\n",
" <td>88</td>\n",
" <td>98</td>\n",
" <td>NaN</td>\n",
" <td>49</td>\n",
" <td>162</td>\n",
" <td>172</td>\n",
" <td>192</td>\n",
" <td>197</td>\n",
" <td>198</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>in america</td>\n",
" <td>2003</td>\n",
" <td>A</td>\n",
" <td>118 min</td>\n",
" <td>drama</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td>woody allen</td>\n",
" <td>humphrey bogart</td>\n",
" <td>robert downey jr.</td>\n",
" <td>lea thompson</td>\n",
" <td>mark ruffalo</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>48</td>\n",
" <td>7</td>\n",
" <td>23</td>\n",
" <td>NaN</td>\n",
" <td>30</td>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.949500</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.826230e+05</td>\n",
" <td>7.249570e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.290381</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.501372e+05</td>\n",
" <td>1.224538e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.600000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.519800e+04</td>\n",
" <td>6.013000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.700000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5.038950e+04</td>\n",
" <td>3.786699e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.354640e+05</td>\n",
" <td>2.325044e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8.100000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.851505e+05</td>\n",
" <td>7.603522e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.303232e+06</td>\n",
" <td>8.583730e+08</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n",
"count 200 200 200 200 200 200.000000 \n",
"unique 200 75 10 88 98 NaN \n",
"top in america 2003 A 118 min drama NaN \n",
"freq 1 6 48 7 23 NaN \n",
"mean NaN NaN NaN NaN NaN 7.949500 \n",
"std NaN NaN NaN NaN NaN 0.290381 \n",
"min NaN NaN NaN NaN NaN 7.600000 \n",
"25% NaN NaN NaN NaN NaN 7.700000 \n",
"50% NaN NaN NaN NaN NaN 7.900000 \n",
"75% NaN NaN NaN NaN NaN 8.100000 \n",
"max NaN NaN NaN NaN NaN 9.000000 \n",
"\n",
" Meta_score Director Star1 Star2 \\\n",
"count 200 200 200 200 \n",
"unique 49 162 172 192 \n",
"top woody allen humphrey bogart robert downey jr. \n",
"freq 30 5 4 2 \n",
"mean NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN \n",
"min NaN NaN NaN NaN \n",
"25% NaN NaN NaN NaN \n",
"50% NaN NaN NaN NaN \n",
"75% NaN NaN NaN NaN \n",
"max NaN NaN NaN NaN \n",
"\n",
" Star3 Star4 No_of_Votes Gross \n",
"count 200 200 2.000000e+02 1.600000e+02 \n",
"unique 197 198 NaN NaN \n",
"top lea thompson mark ruffalo NaN NaN \n",
"freq 2 2 NaN NaN \n",
"mean NaN NaN 2.826230e+05 7.249570e+07 \n",
"std NaN NaN 3.501372e+05 1.224538e+08 \n",
"min NaN NaN 2.519800e+04 6.013000e+03 \n",
"25% NaN NaN 5.038950e+04 3.786699e+06 \n",
"50% NaN NaN 1.354640e+05 2.325044e+07 \n",
"75% NaN NaN 3.851505e+05 7.603522e+07 \n",
"max NaN NaN 2.303232e+06 8.583730e+08 "
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_test.describe(include=\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "blessed-socket",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Series_Title</th>\n",
" <th>Released_Year</th>\n",
" <th>Certificate</th>\n",
" <th>Runtime</th>\n",
" <th>Genre</th>\n",
" <th>IMDB_Rating</th>\n",
" <th>Meta_score</th>\n",
" <th>Director</th>\n",
" <th>Star1</th>\n",
" <th>Star2</th>\n",
" <th>Star3</th>\n",
" <th>Star4</th>\n",
" <th>No_of_Votes</th>\n",
" <th>Gross</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200.000000</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>200</td>\n",
" <td>2.000000e+02</td>\n",
" <td>1.660000e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>200</td>\n",
" <td>70</td>\n",
" <td>10</td>\n",
" <td>89</td>\n",
" <td>91</td>\n",
" <td>NaN</td>\n",
" <td>47</td>\n",
" <td>162</td>\n",
" <td>176</td>\n",
" <td>191</td>\n",
" <td>195</td>\n",
" <td>199</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>clerks</td>\n",
" <td>2014</td>\n",
" <td>U</td>\n",
" <td>106 min</td>\n",
" <td>drama</td>\n",
" <td>NaN</td>\n",
" <td></td>\n",
" <td>steven spielberg</td>\n",
" <td>toshirô mifune</td>\n",
" <td>ed harris</td>\n",
" <td>frances mcdormand</td>\n",
" <td>lucy liu</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>11</td>\n",
" <td>57</td>\n",
" <td>6</td>\n",
" <td>17</td>\n",
" <td>NaN</td>\n",
" <td>39</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.955500</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.563293e+05</td>\n",
" <td>6.607024e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.279931</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.208478e+05</td>\n",
" <td>1.035885e+08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.600000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.593800e+04</td>\n",
" <td>6.460000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.700000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5.946375e+04</td>\n",
" <td>3.392077e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.900000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.256995e+05</td>\n",
" <td>2.249226e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8.100000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.365100e+05</td>\n",
" <td>7.597351e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9.300000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.343110e+06</td>\n",
" <td>6.085817e+08</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Series_Title Released_Year Certificate Runtime Genre IMDB_Rating \\\n",
"count 200 200 200 200 200 200.000000 \n",
"unique 200 70 10 89 91 NaN \n",
"top clerks 2014 U 106 min drama NaN \n",
"freq 1 11 57 6 17 NaN \n",
"mean NaN NaN NaN NaN NaN 7.955500 \n",
"std NaN NaN NaN NaN NaN 0.279931 \n",
"min NaN NaN NaN NaN NaN 7.600000 \n",
"25% NaN NaN NaN NaN NaN 7.700000 \n",
"50% NaN NaN NaN NaN NaN 7.900000 \n",
"75% NaN NaN NaN NaN NaN 8.100000 \n",
"max NaN NaN NaN NaN NaN 9.300000 \n",
"\n",
" Meta_score Director Star1 Star2 \\\n",
"count 200 200 200 200 \n",
"unique 47 162 176 191 \n",
"top steven spielberg toshirô mifune ed harris \n",
"freq 39 6 4 3 \n",
"mean NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN \n",
"min NaN NaN NaN NaN \n",
"25% NaN NaN NaN NaN \n",
"50% NaN NaN NaN NaN \n",
"75% NaN NaN NaN NaN \n",
"max NaN NaN NaN NaN \n",
"\n",
" Star3 Star4 No_of_Votes Gross \n",
"count 200 200 2.000000e+02 1.660000e+02 \n",
"unique 195 199 NaN NaN \n",
"top frances mcdormand lucy liu NaN NaN \n",
"freq 3 2 NaN NaN \n",
"mean NaN NaN 2.563293e+05 6.607024e+07 \n",
"std NaN NaN 3.208478e+05 1.035885e+08 \n",
"min NaN NaN 2.593800e+04 6.460000e+03 \n",
"25% NaN NaN 5.946375e+04 3.392077e+06 \n",
"50% NaN NaN 1.256995e+05 2.249226e+07 \n",
"75% NaN NaN 3.365100e+05 7.597351e+07 \n",
"max NaN NaN 2.343110e+06 6.085817e+08 "
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_dev.describe(include=\"all\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "continuing-chambers",
"metadata": {},
"outputs": [],
"source": [
"# Persist the three splits as UTF-8 CSV files without the index column.\n",
"data_test.to_csv(\"crime_test.csv\", encoding=\"utf-8\", index=False)\n",
"data_dev.to_csv(\"crime_dev.csv\", encoding=\"utf-8\", index=False)\n",
"# The training split is held in data_train; the name crime_train was never\n",
"# defined, so the original line raised a NameError.\n",
"data_train.to_csv(\"crime_train.csv\", encoding=\"utf-8\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}