przetwarzanie_jezyka_natura.../P1/imdb/main.ipynb
2023-07-04 20:28:47 +02:00

1675 lines
60 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOVIES</th>\n",
" <th>YEAR</th>\n",
" <th>GENRE</th>\n",
" <th>RATING</th>\n",
" <th>ONE-LINE</th>\n",
" <th>STARS</th>\n",
" <th>VOTES</th>\n",
" <th>RunTime</th>\n",
" <th>Gross</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Blood Red Sky</td>\n",
" <td>(2021)</td>\n",
" <td>\\nAction, Horror, Thriller</td>\n",
" <td>6.1</td>\n",
" <td>\\nA woman with a mysterious illness is forced ...</td>\n",
" <td>\\n Director:\\nPeter Thorwarth\\n| \\n Star...</td>\n",
" <td>21,062</td>\n",
" <td>121.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Masters of the Universe: Revelation</td>\n",
" <td>(2021 )</td>\n",
" <td>\\nAnimation, Action, Adventure</td>\n",
" <td>5.0</td>\n",
" <td>\\nThe war for Eternia begins again in what may...</td>\n",
" <td>\\n \\n Stars:\\nChris Wood, \\nSara...</td>\n",
" <td>17,870</td>\n",
" <td>25.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The Walking Dead</td>\n",
" <td>(20102022)</td>\n",
" <td>\\nDrama, Horror, Thriller</td>\n",
" <td>8.2</td>\n",
" <td>\\nSheriff Deputy Rick Grimes wakes up from a c...</td>\n",
" <td>\\n \\n Stars:\\nAndrew Lincoln, \\n...</td>\n",
" <td>885,805</td>\n",
" <td>44.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Rick and Morty</td>\n",
" <td>(2013 )</td>\n",
" <td>\\nAnimation, Adventure, Comedy</td>\n",
" <td>9.2</td>\n",
" <td>\\nAn animated series that follows the exploits...</td>\n",
" <td>\\n \\n Stars:\\nJustin Roiland, \\n...</td>\n",
" <td>414,849</td>\n",
" <td>23.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Army of Thieves</td>\n",
" <td>(2021)</td>\n",
" <td>\\nAction, Crime, Horror</td>\n",
" <td>NaN</td>\n",
" <td>\\nA prequel, set before the events of Army of ...</td>\n",
" <td>\\n Director:\\nMatthias Schweighöfer\\n| \\n ...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MOVIES YEAR \\\n",
"0 Blood Red Sky (2021) \n",
"1 Masters of the Universe: Revelation (2021 ) \n",
"2 The Walking Dead (20102022) \n",
"3 Rick and Morty (2013 ) \n",
"4 Army of Thieves (2021) \n",
"\n",
" GENRE RATING \\\n",
"0 \\nAction, Horror, Thriller 6.1 \n",
"1 \\nAnimation, Action, Adventure 5.0 \n",
"2 \\nDrama, Horror, Thriller 8.2 \n",
"3 \\nAnimation, Adventure, Comedy 9.2 \n",
"4 \\nAction, Crime, Horror NaN \n",
"\n",
" ONE-LINE \\\n",
"0 \\nA woman with a mysterious illness is forced ... \n",
"1 \\nThe war for Eternia begins again in what may... \n",
"2 \\nSheriff Deputy Rick Grimes wakes up from a c... \n",
"3 \\nAn animated series that follows the exploits... \n",
"4 \\nA prequel, set before the events of Army of ... \n",
"\n",
" STARS VOTES RunTime Gross \n",
"0 \\n Director:\\nPeter Thorwarth\\n| \\n Star... 21,062 121.0 NaN \n",
"1 \\n \\n Stars:\\nChris Wood, \\nSara... 17,870 25.0 NaN \n",
"2 \\n \\n Stars:\\nAndrew Lincoln, \\n... 885,805 44.0 NaN \n",
"3 \\n \\n Stars:\\nJustin Roiland, \\n... 414,849 23.0 NaN \n",
"4 \\n Director:\\nMatthias Schweighöfer\\n| \\n ... NaN NaN NaN "
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('movies.csv',header=0)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOVIES</th>\n",
" <th>YEAR</th>\n",
" <th>GENRE</th>\n",
" <th>RATING</th>\n",
" <th>ONE-LINE</th>\n",
" <th>STARS</th>\n",
" <th>VOTES</th>\n",
" <th>RunTime</th>\n",
" <th>Gross</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Blood Red Sky</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Horror, Thriller</td>\n",
" <td>6.1</td>\n",
" <td>A woman with a mysterious illness is forced in...</td>\n",
" <td>Director:Peter Thorwarth| Stars:Peri Baume...</td>\n",
" <td>21,062</td>\n",
" <td>121.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Masters of the Universe: Revelation</td>\n",
" <td>(2021 )</td>\n",
" <td>Animation, Action, Adventure</td>\n",
" <td>5.0</td>\n",
" <td>The war for Eternia begins again in what may b...</td>\n",
" <td>Stars:Chris Wood, Sarah Michelle Gellar, Lena ...</td>\n",
" <td>17,870</td>\n",
" <td>25.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The Walking Dead</td>\n",
" <td>(20102022)</td>\n",
" <td>Drama, Horror, Thriller</td>\n",
" <td>8.2</td>\n",
" <td>Sheriff Deputy Rick Grimes wakes up from a com...</td>\n",
" <td>Stars:Andrew Lincoln, Norman Reedus, Melissa M...</td>\n",
" <td>885,805</td>\n",
" <td>44.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Rick and Morty</td>\n",
" <td>(2013 )</td>\n",
" <td>Animation, Adventure, Comedy</td>\n",
" <td>9.2</td>\n",
" <td>An animated series that follows the exploits o...</td>\n",
" <td>Stars:Justin Roiland, Chris Parnell, Spencer G...</td>\n",
" <td>414,849</td>\n",
" <td>23.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Army of Thieves</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Crime, Horror</td>\n",
" <td>NaN</td>\n",
" <td>A prequel, set before the events of Army of th...</td>\n",
" <td>Director:Matthias Schweighöfer| Stars:Matt...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MOVIES YEAR \\\n",
"0 Blood Red Sky (2021) \n",
"1 Masters of the Universe: Revelation (2021 ) \n",
"2 The Walking Dead (20102022) \n",
"3 Rick and Morty (2013 ) \n",
"4 Army of Thieves (2021) \n",
"\n",
" GENRE RATING \\\n",
"0 Action, Horror, Thriller 6.1 \n",
"1 Animation, Action, Adventure 5.0 \n",
"2 Drama, Horror, Thriller 8.2 \n",
"3 Animation, Adventure, Comedy 9.2 \n",
"4 Action, Crime, Horror NaN \n",
"\n",
" ONE-LINE \\\n",
"0 A woman with a mysterious illness is forced in... \n",
"1 The war for Eternia begins again in what may b... \n",
"2 Sheriff Deputy Rick Grimes wakes up from a com... \n",
"3 An animated series that follows the exploits o... \n",
"4 A prequel, set before the events of Army of th... \n",
"\n",
" STARS VOTES RunTime Gross \n",
"0 Director:Peter Thorwarth| Stars:Peri Baume... 21,062 121.0 NaN \n",
"1 Stars:Chris Wood, Sarah Michelle Gellar, Lena ... 17,870 25.0 NaN \n",
"2 Stars:Andrew Lincoln, Norman Reedus, Melissa M... 885,805 44.0 NaN \n",
"3 Stars:Justin Roiland, Chris Parnell, Spencer G... 414,849 23.0 NaN \n",
"4 Director:Matthias Schweighöfer| Stars:Matt... NaN NaN NaN "
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['GENRE'] = df['GENRE'].str.replace('\\n','')\n",
"df['ONE-LINE'] = df['ONE-LINE'].str.replace('\\n','')\n",
"df['STARS'] = df['STARS'].str.replace('\\n','')\n",
"\n",
"df['GENRE'] = df['GENRE'].str.strip()\n",
"df['ONE-LINE'] = df['ONE-LINE'].str.strip()\n",
"df['STARS'] = df['STARS'].str.strip()\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOVIES</th>\n",
" <th>YEAR</th>\n",
" <th>GENRE</th>\n",
" <th>RATING</th>\n",
" <th>STARS</th>\n",
" <th>VOTES</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Blood Red Sky</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Horror, Thriller</td>\n",
" <td>6.1</td>\n",
" <td>Director:Peter Thorwarth| Stars:Peri Baume...</td>\n",
" <td>21,062</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Masters of the Universe: Revelation</td>\n",
" <td>(2021 )</td>\n",
" <td>Animation, Action, Adventure</td>\n",
" <td>5.0</td>\n",
" <td>Stars:Chris Wood, Sarah Michelle Gellar, Lena ...</td>\n",
" <td>17,870</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>The Walking Dead</td>\n",
" <td>(20102022)</td>\n",
" <td>Drama, Horror, Thriller</td>\n",
" <td>8.2</td>\n",
" <td>Stars:Andrew Lincoln, Norman Reedus, Melissa M...</td>\n",
" <td>885,805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Rick and Morty</td>\n",
" <td>(2013 )</td>\n",
" <td>Animation, Adventure, Comedy</td>\n",
" <td>9.2</td>\n",
" <td>Stars:Justin Roiland, Chris Parnell, Spencer G...</td>\n",
" <td>414,849</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Army of Thieves</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Crime, Horror</td>\n",
" <td>NaN</td>\n",
" <td>Director:Matthias Schweighöfer| Stars:Matt...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MOVIES YEAR \\\n",
"0 Blood Red Sky (2021) \n",
"1 Masters of the Universe: Revelation (2021 ) \n",
"2 The Walking Dead (20102022) \n",
"3 Rick and Morty (2013 ) \n",
"4 Army of Thieves (2021) \n",
"\n",
" GENRE RATING \\\n",
"0 Action, Horror, Thriller 6.1 \n",
"1 Animation, Action, Adventure 5.0 \n",
"2 Drama, Horror, Thriller 8.2 \n",
"3 Animation, Adventure, Comedy 9.2 \n",
"4 Action, Crime, Horror NaN \n",
"\n",
" STARS VOTES \n",
"0 Director:Peter Thorwarth| Stars:Peri Baume... 21,062 \n",
"1 Stars:Chris Wood, Sarah Michelle Gellar, Lena ... 17,870 \n",
"2 Stars:Andrew Lincoln, Norman Reedus, Melissa M... 885,805 \n",
"3 Stars:Justin Roiland, Chris Parnell, Spencer G... 414,849 \n",
"4 Director:Matthias Schweighöfer| Stars:Matt... NaN "
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"del df['Gross'], df['RunTime'], df['ONE-LINE']\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def get_start_year(year):\n",
" result = re.search(r'\\(([0-9]{4})\\-([0-9]{4})\\)',year)\n",
" if result:\n",
" return result.group(1)\n",
" result = re.search(r'\\(([0-9]{4})\\)',year)\n",
" if result:\n",
" return result.group(1)\n",
"\n",
"def get_end_year(year):\n",
" result = re.search(r'\\(([0-9]{4})\\-([0-9]{4})\\)',year)\n",
" if result:\n",
" return result.group(2)\n",
" result = re.search(r'\\(([0-9]{4})\\)',year)\n",
" if result:\n",
" return result.group(1)\n",
"\n",
"df.dropna(inplace=True)\n",
"df['start_year'] = df['YEAR'].apply(lambda y : get_start_year(y))\n",
"df['end_year'] = df['YEAR'].apply(lambda y : get_end_year(y))"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 Peri Baumeister, Carl Anton Koch, Alexander Sc...\n",
"6 Shailene Woodley, Joe Alwyn, Wendy Nottingham,...\n",
"10 Karen Gillan, Lena Headey, Carla Gugino, Miche...\n",
"12 Kiana Madeira, Olivia Scott Welch, Benjamin Fl...\n",
"20 Sadie Sink, Emily Rudd, Ryan Simpkins, McCabe ...\n",
"Name: stars, dtype: object"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"\n",
"def extract_director(direct):\n",
" result = re.search(r'(Director:|Directors:)(.*)\\|',direct)\n",
" if result:\n",
" return result.group(2).strip()\n",
" return ''\n",
"\n",
"def extract_stars(stars):\n",
" result = re.search(r'(Stars:|Star:)(.*)',stars)\n",
" if result:\n",
" return result.group(2).strip()\n",
" return ''\n",
"\n",
"df['directors'] = df['STARS'].apply(lambda d : extract_director(d))\n",
"df['stars'] = df['STARS'].apply(lambda s : extract_stars(s))\n",
"df.dropna(inplace=True)\n",
"df['stars'].head()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOVIES</th>\n",
" <th>YEAR</th>\n",
" <th>GENRE</th>\n",
" <th>RATING</th>\n",
" <th>STARS</th>\n",
" <th>VOTES</th>\n",
" <th>start_year</th>\n",
" <th>end_year</th>\n",
" <th>directors</th>\n",
" <th>stars</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Blood Red Sky</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Horror, Thriller</td>\n",
" <td>6.1</td>\n",
" <td>Director:Peter Thorwarth| Stars:Peri Baume...</td>\n",
" <td>21,062</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Peter Thorwarth</td>\n",
" <td>Peri Baumeister|Carl Anton Koch|Alexander Sche...</td>\n",
" <td>Action|Horror|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>The Last Letter from Your Lover</td>\n",
" <td>(2021)</td>\n",
" <td>Drama, Romance</td>\n",
" <td>6.8</td>\n",
" <td>Director:Augustine Frizzell| Stars:Shailen...</td>\n",
" <td>5,283</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Augustine Frizzell</td>\n",
" <td>Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe...</td>\n",
" <td>Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Gunpowder Milkshake</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Adventure, Thriller</td>\n",
" <td>6.0</td>\n",
" <td>Director:Navot Papushado| Stars:Karen Gill...</td>\n",
" <td>17,989</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Navot Papushado</td>\n",
" <td>Karen Gillan|Lena Headey|Carla Gugino|Michelle...</td>\n",
" <td>Action|Adventure|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Fear Street: 1994</td>\n",
" <td>(2021)</td>\n",
" <td>Drama, Horror, Mystery</td>\n",
" <td>6.2</td>\n",
" <td>Director:Leigh Janiak| Stars:Kiana Madeira...</td>\n",
" <td>50,148</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Leigh Janiak</td>\n",
" <td>Kiana Madeira|Olivia Scott Welch|Benjamin Flor...</td>\n",
" <td>Drama|Horror|Mystery</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Fear Street: 1978</td>\n",
" <td>(2021)</td>\n",
" <td>Drama, Horror, Mystery</td>\n",
" <td>6.8</td>\n",
" <td>Director:Leigh Janiak| Stars:Sadie Sink, E...</td>\n",
" <td>36,634</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Leigh Janiak</td>\n",
" <td>Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye</td>\n",
" <td>Drama|Horror|Mystery</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MOVIES YEAR GENRE \\\n",
"0 Blood Red Sky (2021) Action, Horror, Thriller \n",
"6 The Last Letter from Your Lover (2021) Drama, Romance \n",
"10 Gunpowder Milkshake (2021) Action, Adventure, Thriller \n",
"12 Fear Street: 1994 (2021) Drama, Horror, Mystery \n",
"20 Fear Street: 1978 (2021) Drama, Horror, Mystery \n",
"\n",
" RATING STARS VOTES \\\n",
"0 6.1 Director:Peter Thorwarth| Stars:Peri Baume... 21,062 \n",
"6 6.8 Director:Augustine Frizzell| Stars:Shailen... 5,283 \n",
"10 6.0 Director:Navot Papushado| Stars:Karen Gill... 17,989 \n",
"12 6.2 Director:Leigh Janiak| Stars:Kiana Madeira... 50,148 \n",
"20 6.8 Director:Leigh Janiak| Stars:Sadie Sink, E... 36,634 \n",
"\n",
" start_year end_year directors \\\n",
"0 2021 2021 Peter Thorwarth \n",
"6 2021 2021 Augustine Frizzell \n",
"10 2021 2021 Navot Papushado \n",
"12 2021 2021 Leigh Janiak \n",
"20 2021 2021 Leigh Janiak \n",
"\n",
" stars \\\n",
"0 Peri Baumeister|Carl Anton Koch|Alexander Sche... \n",
"6 Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe... \n",
"10 Karen Gillan|Lena Headey|Carla Gugino|Michelle... \n",
"12 Kiana Madeira|Olivia Scott Welch|Benjamin Flor... \n",
"20 Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye \n",
"\n",
" genres \n",
"0 Action|Horror|Thriller \n",
"6 Drama|Romance \n",
"10 Action|Adventure|Thriller \n",
"12 Drama|Horror|Mystery \n",
"20 Drama|Horror|Mystery "
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def strip_all_str(to_strip:str):\n",
" striped_str = []\n",
" for s in to_strip:\n",
" striped_str.append(s.strip())\n",
" return striped_str\n",
"\n",
"df['stars'] = df['stars'].str.split(',')\n",
"df['directors'] = df['directors'].str.split(',')\n",
"df['genres'] = df['GENRE'].str.split(',')\n",
"\n",
"df['stars'] = df['stars'].apply(lambda s : strip_all_str(s))\n",
"df['directors'] = df['directors'].apply(lambda s : strip_all_str(s))\n",
"df['genres'] = df['genres'].apply(lambda s : strip_all_str(s))\n",
"\n",
"df['stars'] = df['stars'].apply(lambda s : '|'.join(s))\n",
"df['directors'] = df['directors'].apply(lambda s : '|'.join(s))\n",
"df['genres'] = df['genres'].apply(lambda s : '|'.join(s))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOVIES</th>\n",
" <th>YEAR</th>\n",
" <th>GENRE</th>\n",
" <th>RATING</th>\n",
" <th>STARS</th>\n",
" <th>VOTES</th>\n",
" <th>start_year</th>\n",
" <th>end_year</th>\n",
" <th>directors</th>\n",
" <th>stars</th>\n",
" <th>...</th>\n",
" <th>Greg Kading</th>\n",
" <th>Griffin Gluck</th>\n",
" <th>Peri Baumeister</th>\n",
" <th>Greg Chun</th>\n",
" <th>Carlos Belloso</th>\n",
" <th>Carlos Barbosa</th>\n",
" <th>John Abraham</th>\n",
" <th>Mauricio Argüelles</th>\n",
" <th>Maurice Compte</th>\n",
" <th>John Belushi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Blood Red Sky</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Horror, Thriller</td>\n",
" <td>6.1</td>\n",
" <td>Director:Peter Thorwarth| Stars:Peri Baume...</td>\n",
" <td>21,062</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Peter Thorwarth</td>\n",
" <td>Peri Baumeister|Carl Anton Koch|Alexander Sche...</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>The Last Letter from Your Lover</td>\n",
" <td>(2021)</td>\n",
" <td>Drama, Romance</td>\n",
" <td>6.8</td>\n",
" <td>Director:Augustine Frizzell| Stars:Shailen...</td>\n",
" <td>5,283</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Augustine Frizzell</td>\n",
" <td>Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe...</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Gunpowder Milkshake</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Adventure, Thriller</td>\n",
" <td>6.0</td>\n",
" <td>Director:Navot Papushado| Stars:Karen Gill...</td>\n",
" <td>17,989</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Navot Papushado</td>\n",
" <td>Karen Gillan|Lena Headey|Carla Gugino|Michelle...</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Fear Street: 1994</td>\n",
" <td>(2021)</td>\n",
" <td>Drama, Horror, Mystery</td>\n",
" <td>6.2</td>\n",
" <td>Director:Leigh Janiak| Stars:Kiana Madeira...</td>\n",
" <td>50,148</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Leigh Janiak</td>\n",
" <td>Kiana Madeira|Olivia Scott Welch|Benjamin Flor...</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Fear Street: 1978</td>\n",
" <td>(2021)</td>\n",
" <td>Drama, Horror, Mystery</td>\n",
" <td>6.8</td>\n",
" <td>Director:Leigh Janiak| Stars:Sadie Sink, E...</td>\n",
" <td>36,634</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Leigh Janiak</td>\n",
" <td>Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 2038 columns</p>\n",
"</div>"
],
"text/plain": [
" MOVIES YEAR GENRE \\\n",
"0 Blood Red Sky (2021) Action, Horror, Thriller \n",
"6 The Last Letter from Your Lover (2021) Drama, Romance \n",
"10 Gunpowder Milkshake (2021) Action, Adventure, Thriller \n",
"12 Fear Street: 1994 (2021) Drama, Horror, Mystery \n",
"20 Fear Street: 1978 (2021) Drama, Horror, Mystery \n",
"\n",
" RATING STARS VOTES \\\n",
"0 6.1 Director:Peter Thorwarth| Stars:Peri Baume... 21,062 \n",
"6 6.8 Director:Augustine Frizzell| Stars:Shailen... 5,283 \n",
"10 6.0 Director:Navot Papushado| Stars:Karen Gill... 17,989 \n",
"12 6.2 Director:Leigh Janiak| Stars:Kiana Madeira... 50,148 \n",
"20 6.8 Director:Leigh Janiak| Stars:Sadie Sink, E... 36,634 \n",
"\n",
" start_year end_year directors \\\n",
"0 2021 2021 Peter Thorwarth \n",
"6 2021 2021 Augustine Frizzell \n",
"10 2021 2021 Navot Papushado \n",
"12 2021 2021 Leigh Janiak \n",
"20 2021 2021 Leigh Janiak \n",
"\n",
" stars ... Greg Kading \\\n",
"0 Peri Baumeister|Carl Anton Koch|Alexander Sche... ... 0 \n",
"6 Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe... ... 0 \n",
"10 Karen Gillan|Lena Headey|Carla Gugino|Michelle... ... 0 \n",
"12 Kiana Madeira|Olivia Scott Welch|Benjamin Flor... ... 0 \n",
"20 Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye ... 0 \n",
"\n",
" Griffin Gluck Peri Baumeister Greg Chun Carlos Belloso Carlos Barbosa \\\n",
"0 0 1 0 0 0 \n",
"6 0 0 0 0 0 \n",
"10 0 0 0 0 0 \n",
"12 0 0 0 0 0 \n",
"20 0 0 0 0 0 \n",
"\n",
" John Abraham Mauricio Argüelles Maurice Compte John Belushi \n",
"0 0 0 0 0 \n",
"6 0 0 0 0 \n",
"10 0 0 0 0 \n",
"12 0 0 0 0 \n",
"20 0 0 0 0 \n",
"\n",
"[5 rows x 2038 columns]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_star_dummies = df['stars'].str.get_dummies(sep='|')\n",
"df_directors_dummies = df['directors'].str.get_dummies(sep='|')\n",
"df_genres_dummies = df['genres'].str.get_dummies(sep='|')\n",
"\n",
"most_important_values = list(df_star_dummies.sum().sort_values(ascending=False).head(1000).to_dict().keys())\n",
"df_star_dummies = df_star_dummies.loc[:,most_important_values]\n",
"most_important_values = list(df_directors_dummies.sum().sort_values(ascending=False).head(1000).to_dict().keys())\n",
"df_directors_dummies = df_directors_dummies.loc[:,most_important_values]\n",
"most_important_values = list(df_genres_dummies.sum().sort_values(ascending=False).head(1000).to_dict().keys())\n",
"df_genres_dummies = df_genres_dummies.loc[:,most_important_values]\n",
"\n",
"df = pd.concat([df,df_genres_dummies,df_directors_dummies,df_star_dummies],axis=1)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MOVIES</th>\n",
" <th>YEAR</th>\n",
" <th>GENRE</th>\n",
" <th>RATING</th>\n",
" <th>STARS</th>\n",
" <th>VOTES</th>\n",
" <th>start_year</th>\n",
" <th>end_year</th>\n",
" <th>directors</th>\n",
" <th>stars</th>\n",
" <th>...</th>\n",
" <th>Greg Kading</th>\n",
" <th>Griffin Gluck</th>\n",
" <th>Peri Baumeister</th>\n",
" <th>Greg Chun</th>\n",
" <th>Carlos Belloso</th>\n",
" <th>Carlos Barbosa</th>\n",
" <th>John Abraham</th>\n",
" <th>Mauricio Argüelles</th>\n",
" <th>Maurice Compte</th>\n",
" <th>John Belushi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Blood Red Sky</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Horror, Thriller</td>\n",
" <td>6.1</td>\n",
" <td>Director:Peter Thorwarth| Stars:Peri Baume...</td>\n",
" <td>21062</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Peter Thorwarth</td>\n",
" <td>Peri Baumeister|Carl Anton Koch|Alexander Sche...</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>The Last Letter from Your Lover</td>\n",
" <td>(2021)</td>\n",
" <td>Drama, Romance</td>\n",
" <td>6.8</td>\n",
" <td>Director:Augustine Frizzell| Stars:Shailen...</td>\n",
" <td>5283</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Augustine Frizzell</td>\n",
" <td>Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe...</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Gunpowder Milkshake</td>\n",
" <td>(2021)</td>\n",
" <td>Action, Adventure, Thriller</td>\n",
" <td>6.0</td>\n",
" <td>Director:Navot Papushado| Stars:Karen Gill...</td>\n",
" <td>17989</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Navot Papushado</td>\n",
" <td>Karen Gillan|Lena Headey|Carla Gugino|Michelle...</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Fear Street: 1994</td>\n",
" <td>(2021)</td>\n",
" <td>Drama, Horror, Mystery</td>\n",
" <td>6.2</td>\n",
" <td>Director:Leigh Janiak| Stars:Kiana Madeira...</td>\n",
" <td>50148</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Leigh Janiak</td>\n",
" <td>Kiana Madeira|Olivia Scott Welch|Benjamin Flor...</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Fear Street: 1978</td>\n",
" <td>(2021)</td>\n",
" <td>Drama, Horror, Mystery</td>\n",
" <td>6.8</td>\n",
" <td>Director:Leigh Janiak| Stars:Sadie Sink, E...</td>\n",
" <td>36634</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>Leigh Janiak</td>\n",
" <td>Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 2038 columns</p>\n",
"</div>"
],
"text/plain": [
" MOVIES YEAR GENRE \\\n",
"0 Blood Red Sky (2021) Action, Horror, Thriller \n",
"6 The Last Letter from Your Lover (2021) Drama, Romance \n",
"10 Gunpowder Milkshake (2021) Action, Adventure, Thriller \n",
"12 Fear Street: 1994 (2021) Drama, Horror, Mystery \n",
"20 Fear Street: 1978 (2021) Drama, Horror, Mystery \n",
"\n",
" RATING STARS VOTES \\\n",
"0 6.1 Director:Peter Thorwarth| Stars:Peri Baume... 21062 \n",
"6 6.8 Director:Augustine Frizzell| Stars:Shailen... 5283 \n",
"10 6.0 Director:Navot Papushado| Stars:Karen Gill... 17989 \n",
"12 6.2 Director:Leigh Janiak| Stars:Kiana Madeira... 50148 \n",
"20 6.8 Director:Leigh Janiak| Stars:Sadie Sink, E... 36634 \n",
"\n",
" start_year end_year directors \\\n",
"0 2021 2021 Peter Thorwarth \n",
"6 2021 2021 Augustine Frizzell \n",
"10 2021 2021 Navot Papushado \n",
"12 2021 2021 Leigh Janiak \n",
"20 2021 2021 Leigh Janiak \n",
"\n",
" stars ... Greg Kading \\\n",
"0 Peri Baumeister|Carl Anton Koch|Alexander Sche... ... 0 \n",
"6 Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe... ... 0 \n",
"10 Karen Gillan|Lena Headey|Carla Gugino|Michelle... ... 0 \n",
"12 Kiana Madeira|Olivia Scott Welch|Benjamin Flor... ... 0 \n",
"20 Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye ... 0 \n",
"\n",
" Griffin Gluck Peri Baumeister Greg Chun Carlos Belloso Carlos Barbosa \\\n",
"0 0 1 0 0 0 \n",
"6 0 0 0 0 0 \n",
"10 0 0 0 0 0 \n",
"12 0 0 0 0 0 \n",
"20 0 0 0 0 0 \n",
"\n",
" John Abraham Mauricio Argüelles Maurice Compte John Belushi \n",
"0 0 0 0 0 \n",
"6 0 0 0 0 \n",
"10 0 0 0 0 \n",
"12 0 0 0 0 \n",
"20 0 0 0 0 \n",
"\n",
"[5 rows x 2038 columns]"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['start_year'] = df['start_year'].apply(lambda x : int(x))\n",
"df['end_year'] = df['end_year'].apply(lambda x : int(x))\n",
"df['RATING'] = df['RATING'].apply(lambda x : float(x))\n",
"\n",
"df['VOTES'] = df['VOTES'].str.replace(',','')\n",
"df['VOTES'] = df['VOTES'].apply(lambda x : int(x))\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"cleaned_df = df\n",
"del cleaned_df['MOVIES'], cleaned_df['GENRE'],cleaned_df['STARS'],cleaned_df['directors'],cleaned_df['stars'],cleaned_df['genres'],cleaned_df['YEAR']\n",
"cleaned_df.dropna(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>VOTES</th>\n",
" <th>start_year</th>\n",
" <th>end_year</th>\n",
" <th>Drama</th>\n",
" <th>Comedy</th>\n",
" <th>Documentary</th>\n",
" <th>Crime</th>\n",
" <th>Action</th>\n",
" <th>Thriller</th>\n",
" <th>Romance</th>\n",
" <th>...</th>\n",
" <th>Greg Kading</th>\n",
" <th>Griffin Gluck</th>\n",
" <th>Peri Baumeister</th>\n",
" <th>Greg Chun</th>\n",
" <th>Carlos Belloso</th>\n",
" <th>Carlos Barbosa</th>\n",
" <th>John Abraham</th>\n",
" <th>Mauricio Argüelles</th>\n",
" <th>Maurice Compte</th>\n",
" <th>John Belushi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>508</th>\n",
" <td>200206</td>\n",
" <td>2013</td>\n",
" <td>2013</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>509</th>\n",
" <td>34984</td>\n",
" <td>2018</td>\n",
" <td>2018</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>510</th>\n",
" <td>124972</td>\n",
" <td>1993</td>\n",
" <td>1993</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>512</th>\n",
" <td>21572</td>\n",
" <td>2018</td>\n",
" <td>2018</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>513</th>\n",
" <td>3082</td>\n",
" <td>2020</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9618</th>\n",
" <td>49</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9814</th>\n",
" <td>175</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9825</th>\n",
" <td>156</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9826</th>\n",
" <td>145</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9827</th>\n",
" <td>137</td>\n",
" <td>2021</td>\n",
" <td>2021</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3563 rows × 2030 columns</p>\n",
"</div>"
],
"text/plain": [
" VOTES start_year end_year Drama Comedy Documentary Crime Action \\\n",
"508 200206 2013 2013 0 1 0 0 0 \n",
"509 34984 2018 2018 1 0 0 1 0 \n",
"510 124972 1993 1993 1 0 0 0 0 \n",
"512 21572 2018 2018 1 0 0 1 0 \n",
"513 3082 2020 2020 1 0 0 0 0 \n",
"... ... ... ... ... ... ... ... ... \n",
"9618 49 2021 2021 0 1 0 0 0 \n",
"9814 175 2021 2021 0 0 1 1 0 \n",
"9825 156 2021 2021 0 0 1 1 0 \n",
"9826 145 2021 2021 0 0 1 1 0 \n",
"9827 137 2021 2021 0 0 1 1 0 \n",
"\n",
" Thriller Romance ... Greg Kading Griffin Gluck Peri Baumeister \\\n",
"508 0 0 ... 0 0 0 \n",
"509 0 0 ... 0 0 0 \n",
"510 1 0 ... 0 0 0 \n",
"512 0 0 ... 0 0 0 \n",
"513 0 0 ... 0 0 0 \n",
"... ... ... ... ... ... ... \n",
"9618 0 0 ... 0 0 0 \n",
"9814 0 0 ... 0 0 0 \n",
"9825 0 0 ... 0 0 0 \n",
"9826 0 0 ... 0 0 0 \n",
"9827 0 0 ... 0 0 0 \n",
"\n",
" Greg Chun Carlos Belloso Carlos Barbosa John Abraham \\\n",
"508 0 0 0 0 \n",
"509 0 0 0 0 \n",
"510 0 0 0 0 \n",
"512 0 0 0 0 \n",
"513 0 0 0 0 \n",
"... ... ... ... ... \n",
"9618 0 0 0 0 \n",
"9814 0 0 0 0 \n",
"9825 0 0 0 0 \n",
"9826 0 0 0 0 \n",
"9827 0 0 0 0 \n",
"\n",
" Mauricio Argüelles Maurice Compte John Belushi \n",
"508 0 0 0 \n",
"509 0 0 0 \n",
"510 0 0 0 \n",
"512 0 0 0 \n",
"513 0 0 0 \n",
"... ... ... ... \n",
"9618 0 0 0 \n",
"9814 0 0 0 \n",
"9825 0 0 0 \n",
"9826 0 0 0 \n",
"9827 0 0 0 \n",
"\n",
"[3563 rows x 2030 columns]"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Positional slice: keep everything after the first 200 rows of the cleaned frame.\n",
"# NOTE(review): the reason these 200 rows are held out is not visible here - confirm.\n",
"first_values = cleaned_df[200:]\n",
"# Take the feature columns from `first_values` itself. The previous boolean mask was\n",
"# built from `df.columns`, which only lined up while `df` had identical columns.\n",
"features = first_values.drop(columns='RATING')\n",
"# Target stays a 2-D array of shape (n_rows, 1), as before.\n",
"labels = first_values[['RATING']].values\n",
"features"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.svm import SVC,LinearSVC\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.metrics import r2_score"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"# Hold out 33% of the rows for evaluation. The fixed random_state makes the split,\n",
"# and therefore every score computed below, reproducible across kernel restarts.\n",
"train, test, train_labels, test_labels = train_test_split(\n",
"    features, labels, test_size=0.33, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-0.03440859287218978"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit an unconstrained regression tree and score it on the held-out rows.\n",
"# random_state pins the tree's internal feature shuffling so the fit is repeatable.\n",
"decision_tree = DecisionTreeRegressor(random_state=42)\n",
"decision_tree.fit(train, train_labels)\n",
"Y_pred = decision_tree.predict(test)\n",
"# Despite the `acc_` prefix this is the R^2 score (1.0 is perfect, <= 0 is no better\n",
"# than predicting the mean), not a classification accuracy.\n",
"acc_decision_tree = r2_score(test_labels, Y_pred)\n",
"acc_decision_tree"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-32814680366.75818"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ordinary least squares on the same split, scored with R^2 on the held-out rows.\n",
"# NOTE(review): the saved output shows a hugely negative R^2; with roughly 2000 mostly\n",
"# one-hot columns the unregularised fit is presumably ill-conditioned - confirm.\n",
"linreg = LinearRegression().fit(train, train_labels)\n",
"Y_pred = linreg.predict(test)\n",
"acc_log = r2_score(y_true=test_labels, y_pred=Y_pred)\n",
"acc_log"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.39096903616388345"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# L2-regularised linear regression (alpha=1), scored with R^2 on the held-out rows.\n",
"# NOTE(review): max_iter and tol only matter for Ridge's iterative solvers; with the\n",
"# default solver='auto' on dense input they may have no effect - confirm in the docs.\n",
"ridge_reg = Ridge(alpha=1, max_iter=100, tol=0.1).fit(train, train_labels)\n",
"Y_pred = ridge_reg.predict(test)\n",
"ridge_log = r2_score(y_true=test_labels, y_pred=Y_pred)\n",
"ridge_log"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}