60 KiB
60 KiB
import pandas as pd
import numpy as np
# Load the raw movie listing; header=0 tells pandas to take the column names
# from the first row of the file.
df = pd.read_csv(filepath_or_buffer='movies.csv', header=0)
df.head()
MOVIES | YEAR | GENRE | RATING | ONE-LINE | STARS | VOTES | RunTime | Gross | |
---|---|---|---|---|---|---|---|---|---|
0 | Blood Red Sky | (2021) | \nAction, Horror, Thriller | 6.1 | \nA woman with a mysterious illness is forced ... | \n Director:\nPeter Thorwarth\n| \n Star... | 21,062 | 121.0 | NaN |
1 | Masters of the Universe: Revelation | (2021– ) | \nAnimation, Action, Adventure | 5.0 | \nThe war for Eternia begins again in what may... | \n \n Stars:\nChris Wood, \nSara... | 17,870 | 25.0 | NaN |
2 | The Walking Dead | (2010–2022) | \nDrama, Horror, Thriller | 8.2 | \nSheriff Deputy Rick Grimes wakes up from a c... | \n \n Stars:\nAndrew Lincoln, \n... | 885,805 | 44.0 | NaN |
3 | Rick and Morty | (2013– ) | \nAnimation, Adventure, Comedy | 9.2 | \nAn animated series that follows the exploits... | \n \n Stars:\nJustin Roiland, \n... | 414,849 | 23.0 | NaN |
4 | Army of Thieves | (2021) | \nAction, Crime, Horror | NaN | \nA prequel, set before the events of Army of ... | \n Director:\nMatthias Schweighöfer\n| \n ... | NaN | NaN | NaN |
# Flatten the multi-line text columns: remove the embedded newlines first,
# then trim whatever whitespace is left at either end of each value.
for text_column in ('GENRE', 'ONE-LINE', 'STARS'):
    without_newlines = df[text_column].str.replace('\n', '')
    df[text_column] = without_newlines.str.strip()
df.head()
MOVIES | YEAR | GENRE | RATING | ONE-LINE | STARS | VOTES | RunTime | Gross | |
---|---|---|---|---|---|---|---|---|---|
0 | Blood Red Sky | (2021) | Action, Horror, Thriller | 6.1 | A woman with a mysterious illness is forced in... | Director:Peter Thorwarth| Stars:Peri Baume... | 21,062 | 121.0 | NaN |
1 | Masters of the Universe: Revelation | (2021– ) | Animation, Action, Adventure | 5.0 | The war for Eternia begins again in what may b... | Stars:Chris Wood, Sarah Michelle Gellar, Lena ... | 17,870 | 25.0 | NaN |
2 | The Walking Dead | (2010–2022) | Drama, Horror, Thriller | 8.2 | Sheriff Deputy Rick Grimes wakes up from a com... | Stars:Andrew Lincoln, Norman Reedus, Melissa M... | 885,805 | 44.0 | NaN |
3 | Rick and Morty | (2013– ) | Animation, Adventure, Comedy | 9.2 | An animated series that follows the exploits o... | Stars:Justin Roiland, Chris Parnell, Spencer G... | 414,849 | 23.0 | NaN |
4 | Army of Thieves | (2021) | Action, Crime, Horror | NaN | A prequel, set before the events of Army of th... | Director:Matthias Schweighöfer| Stars:Matt... | NaN | NaN | NaN |
# Remove the columns that are not used by the rest of the notebook, mutating
# df in place.
df.drop(columns=['Gross', 'RunTime', 'ONE-LINE'], inplace=True)
df.head()
MOVIES | YEAR | GENRE | RATING | STARS | VOTES | |
---|---|---|---|---|---|---|
0 | Blood Red Sky | (2021) | Action, Horror, Thriller | 6.1 | Director:Peter Thorwarth| Stars:Peri Baume... | 21,062 |
1 | Masters of the Universe: Revelation | (2021– ) | Animation, Action, Adventure | 5.0 | Stars:Chris Wood, Sarah Michelle Gellar, Lena ... | 17,870 |
2 | The Walking Dead | (2010–2022) | Drama, Horror, Thriller | 8.2 | Stars:Andrew Lincoln, Norman Reedus, Melissa M... | 885,805 |
3 | Rick and Morty | (2013– ) | Animation, Adventure, Comedy | 9.2 | Stars:Justin Roiland, Chris Parnell, Spencer G... | 414,849 |
4 | Army of Thieves | (2021) | Action, Crime, Horror | NaN | Director:Matthias Schweighöfer| Stars:Matt... | NaN |
import re
def get_start_year(year):
    """Return the starting year of a parenthesised YEAR value as a string.

    Recognised forms:
      * a range such as ``(2010-2022)`` or ``(2010–2022)`` -> ``'2010'``
      * an open-ended range such as ``(2021– )`` -> ``'2021'``
      * a single year such as ``(2021)`` -> ``'2021'``

    The range separator may be an ASCII hyphen, an en dash or an em dash; the
    rows shown in this dataset use the en dash, which a hyphen-only pattern
    never matches.

    Args:
        year: Raw YEAR cell. Non-string values (e.g. NaN) are tolerated.

    Returns:
        The four-digit start year as a string, or None when nothing matches.
    """
    if not isinstance(year, str):
        return None
    # Range form; the closing year is optional so that open-ended ranges
    # ("(2021– )") are recognised as well.
    result = re.search(
        r'\(([0-9]{4})\s*[-\u2013\u2014]\s*([0-9]{4})?\s*\)', year
    )
    if result:
        return result.group(1)
    # Single-year form.
    result = re.search(r'\(([0-9]{4})\)', year)
    if result:
        return result.group(1)
    return None
def get_end_year(year):
    """Return the final year of a parenthesised YEAR value as a string.

    Recognised forms:
      * a range such as ``(2010-2022)`` or ``(2010–2022)`` -> ``'2022'``
      * a single year such as ``(2021)`` -> ``'2021'`` (start and end coincide)
      * an open-ended range such as ``(2021– )`` -> ``None`` (no closing year)

    The range separator may be an ASCII hyphen, an en dash or an em dash; the
    rows shown in this dataset use the en dash, which a hyphen-only pattern
    never matches.

    Args:
        year: Raw YEAR cell. Non-string values (e.g. NaN) are tolerated.

    Returns:
        The four-digit end year as a string, or None when the value has no
        closing year or does not match any recognised form.
    """
    if not isinstance(year, str):
        return None
    # Range form; group 2 stays None when the range has no closing year.
    result = re.search(
        r'\(([0-9]{4})\s*[-\u2013\u2014]\s*([0-9]{4})?\s*\)', year
    )
    if result:
        return result.group(2)
    # Single-year form: the title starts and ends in the same year.
    result = re.search(r'\(([0-9]{4})\)', year)
    if result:
        return result.group(1)
    return None
# Discard incomplete rows first so the year parsers only ever see real strings
# (dropna with its defaults removes every row holding at least one NaN).
df.dropna(inplace=True)
# Passing the functions directly is equivalent to wrapping them in a lambda.
df['start_year'] = df['YEAR'].apply(get_start_year)
df['end_year'] = df['YEAR'].apply(get_end_year)
import re
def extract_director(direct):
    """Pull the director name(s) out of a combined credits string.

    The text between a ``Director:`` / ``Directors:`` label and the last ``|``
    on the same line is returned with surrounding whitespace removed.

    Args:
        direct: Credits text, e.g. ``'Director:Jane Doe| Stars:...'``.

    Returns:
        The stripped director text, or ``''`` when no labelled section
        followed by ``|`` is present.
    """
    match = re.search(r'(Director:|Directors:)(.*)\|', direct)
    return match.group(2).strip() if match else ''
def extract_stars(stars):
    """Pull the cast list out of a combined credits string.

    Everything after a ``Stars:`` / ``Star:`` label up to the end of the line
    is returned with surrounding whitespace removed.

    Args:
        stars: Credits text, e.g. ``'Director:Jane Doe| Stars:A, B'``.

    Returns:
        The stripped cast text, or ``''`` when no such label is present.
    """
    match = re.search(r'(Stars:|Star:)(.*)', stars)
    return match.group(2).strip() if match else ''
# Split the combined STARS text into separate director and cast columns.
credits = df['STARS']
df['directors'] = credits.apply(extract_director)
df['stars'] = credits.apply(extract_stars)
# Remove every row that still holds a missing value in any column.
df.dropna(inplace=True)
df['stars'].head()
0 Peri Baumeister, Carl Anton Koch, Alexander Sc... 6 Shailene Woodley, Joe Alwyn, Wendy Nottingham,... 10 Karen Gillan, Lena Headey, Carla Gugino, Miche... 12 Kiana Madeira, Olivia Scott Welch, Benjamin Fl... 20 Sadie Sink, Emily Rudd, Ryan Simpkins, McCabe ... Name: stars, dtype: object
def strip_all_str(to_strip: list) -> list:
    """Return a new list holding every element of ``to_strip`` stripped.

    The parameter used to be annotated as ``str``, although the callers in
    this file pass the lists produced by ``Series.str.split``; the annotation
    now reflects that usage.

    Args:
        to_strip: Iterable of strings (a list of name or genre fragments).

    Returns:
        A list with leading and trailing whitespace removed from each element,
        in the original order. An empty input yields an empty list.
    """
    return [fragment.strip() for fragment in to_strip]
# Turn the comma-separated text columns into '|'-separated ones with every
# item trimmed. Each pair is (destination column, source column); 'genres' is
# a new column derived from GENRE, the other two are rewritten in place.
for target_column, source_column in (
    ('stars', 'stars'),
    ('directors', 'directors'),
    ('genres', 'GENRE'),
):
    pieces = df[source_column].str.split(',')
    trimmed = pieces.apply(lambda items: strip_all_str(items))
    df[target_column] = trimmed.apply(lambda items: '|'.join(items))
df.head()
MOVIES | YEAR | GENRE | RATING | STARS | VOTES | start_year | end_year | directors | stars | genres | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Blood Red Sky | (2021) | Action, Horror, Thriller | 6.1 | Director:Peter Thorwarth| Stars:Peri Baume... | 21,062 | 2021 | 2021 | Peter Thorwarth | Peri Baumeister|Carl Anton Koch|Alexander Sche... | Action|Horror|Thriller |
6 | The Last Letter from Your Lover | (2021) | Drama, Romance | 6.8 | Director:Augustine Frizzell| Stars:Shailen... | 5,283 | 2021 | 2021 | Augustine Frizzell | Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe... | Drama|Romance |
10 | Gunpowder Milkshake | (2021) | Action, Adventure, Thriller | 6.0 | Director:Navot Papushado| Stars:Karen Gill... | 17,989 | 2021 | 2021 | Navot Papushado | Karen Gillan|Lena Headey|Carla Gugino|Michelle... | Action|Adventure|Thriller |
12 | Fear Street: 1994 | (2021) | Drama, Horror, Mystery | 6.2 | Director:Leigh Janiak| Stars:Kiana Madeira... | 50,148 | 2021 | 2021 | Leigh Janiak | Kiana Madeira|Olivia Scott Welch|Benjamin Flor... | Drama|Horror|Mystery |
20 | Fear Street: 1978 | (2021) | Drama, Horror, Mystery | 6.8 | Director:Leigh Janiak| Stars:Sadie Sink, E... | 36,634 | 2021 | 2021 | Leigh Janiak | Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye | Drama|Horror|Mystery |
# One-hot encode each '|'-separated column: one 0/1 indicator column per
# distinct value.
df_genres_dummies = df['genres'].str.get_dummies(sep='|')
df_directors_dummies = df['directors'].str.get_dummies(sep='|')
df_star_dummies = df['stars'].str.get_dummies(sep='|')

# Keep at most the 1000 most frequent values of each kind. Summing an
# indicator column counts the rows that contain the value.
most_important_values = (
    df_star_dummies.sum().sort_values(ascending=False).index[:1000].tolist()
)
df_star_dummies = df_star_dummies[most_important_values]

most_important_values = (
    df_directors_dummies.sum().sort_values(ascending=False).index[:1000].tolist()
)
df_directors_dummies = df_directors_dummies[most_important_values]

most_important_values = (
    df_genres_dummies.sum().sort_values(ascending=False).index[:1000].tolist()
)
df_genres_dummies = df_genres_dummies[most_important_values]

# NOTE(review): a name that occurs both as a director and as a star would
# produce two columns with the same label after this concat - confirm whether
# that can happen in the data.
encoded_frames = [df, df_genres_dummies, df_directors_dummies, df_star_dummies]
df = pd.concat(encoded_frames, axis=1)
df.head()
MOVIES | YEAR | GENRE | RATING | STARS | VOTES | start_year | end_year | directors | stars | ... | Greg Kading | Griffin Gluck | Peri Baumeister | Greg Chun | Carlos Belloso | Carlos Barbosa | John Abraham | Mauricio Argüelles | Maurice Compte | John Belushi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Blood Red Sky | (2021) | Action, Horror, Thriller | 6.1 | Director:Peter Thorwarth| Stars:Peri Baume... | 21,062 | 2021 | 2021 | Peter Thorwarth | Peri Baumeister|Carl Anton Koch|Alexander Sche... | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | The Last Letter from Your Lover | (2021) | Drama, Romance | 6.8 | Director:Augustine Frizzell| Stars:Shailen... | 5,283 | 2021 | 2021 | Augustine Frizzell | Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 | Gunpowder Milkshake | (2021) | Action, Adventure, Thriller | 6.0 | Director:Navot Papushado| Stars:Karen Gill... | 17,989 | 2021 | 2021 | Navot Papushado | Karen Gillan|Lena Headey|Carla Gugino|Michelle... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
12 | Fear Street: 1994 | (2021) | Drama, Horror, Mystery | 6.2 | Director:Leigh Janiak| Stars:Kiana Madeira... | 50,148 | 2021 | 2021 | Leigh Janiak | Kiana Madeira|Olivia Scott Welch|Benjamin Flor... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20 | Fear Street: 1978 | (2021) | Drama, Horror, Mystery | 6.8 | Director:Leigh Janiak| Stars:Sadie Sink, E... | 36,634 | 2021 | 2021 | Leigh Janiak | Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 2038 columns
# The parsed years are still strings at this point; convert them to integers.
for year_column in ('start_year', 'end_year'):
    df[year_column] = df[year_column].map(int)
df['RATING'] = df['RATING'].map(float)
# VOTES uses ',' as a thousands separator ("21,062"); remove it before
# converting to int.
votes_without_commas = df['VOTES'].str.replace(',', '')
df['VOTES'] = votes_without_commas.map(int)
df.head()
MOVIES | YEAR | GENRE | RATING | STARS | VOTES | start_year | end_year | directors | stars | ... | Greg Kading | Griffin Gluck | Peri Baumeister | Greg Chun | Carlos Belloso | Carlos Barbosa | John Abraham | Mauricio Argüelles | Maurice Compte | John Belushi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Blood Red Sky | (2021) | Action, Horror, Thriller | 6.1 | Director:Peter Thorwarth| Stars:Peri Baume... | 21062 | 2021 | 2021 | Peter Thorwarth | Peri Baumeister|Carl Anton Koch|Alexander Sche... | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | The Last Letter from Your Lover | (2021) | Drama, Romance | 6.8 | Director:Augustine Frizzell| Stars:Shailen... | 5283 | 2021 | 2021 | Augustine Frizzell | Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 | Gunpowder Milkshake | (2021) | Action, Adventure, Thriller | 6.0 | Director:Navot Papushado| Stars:Karen Gill... | 17989 | 2021 | 2021 | Navot Papushado | Karen Gillan|Lena Headey|Carla Gugino|Michelle... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
12 | Fear Street: 1994 | (2021) | Drama, Horror, Mystery | 6.2 | Director:Leigh Janiak| Stars:Kiana Madeira... | 50148 | 2021 | 2021 | Leigh Janiak | Kiana Madeira|Olivia Scott Welch|Benjamin Flor... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
20 | Fear Street: 1978 | (2021) | Drama, Horror, Mystery | 6.8 | Director:Leigh Janiak| Stars:Sadie Sink, E... | 36634 | 2021 | 2021 | Leigh Janiak | Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 2038 columns
# cleaned_df is another name for the same DataFrame object as df (no copy is
# made), so the drops below also change df. The column mask further down
# relies on that, because it is built from df.columns.
cleaned_df = df
cleaned_df.drop(
    columns=['MOVIES', 'GENRE', 'STARS', 'directors', 'stars', 'genres', 'YEAR'],
    inplace=True,
)
cleaned_df.dropna(inplace=True)

# Positional slice: everything from the 201st remaining row onwards.
first_values = cleaned_df[200:]

# Features are all columns except the target; labels are the RATING values as
# a 2-D array with a single column.
is_feature_column = df.columns != 'RATING'
features = first_values.loc[:, is_feature_column]
labels = first_values[['RATING']].values
features
VOTES | start_year | end_year | Drama | Comedy | Documentary | Crime | Action | Thriller | Romance | ... | Greg Kading | Griffin Gluck | Peri Baumeister | Greg Chun | Carlos Belloso | Carlos Barbosa | John Abraham | Mauricio Argüelles | Maurice Compte | John Belushi | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
508 | 200206 | 2013 | 2013 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
509 | 34984 | 2018 | 2018 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
510 | 124972 | 1993 | 1993 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
512 | 21572 | 2018 | 2018 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
513 | 3082 | 2020 | 2020 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9618 | 49 | 2021 | 2021 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9814 | 175 | 2021 | 2021 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9825 | 156 | 2021 | 2021 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9826 | 145 | 2021 | 2021 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9827 | 137 | 2021 | 2021 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3563 rows × 2030 columns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC,LinearSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score
# Hold out 33% of the rows for evaluation. The seed is fixed so that the split
# - and therefore every score computed from train/test afterwards - is the
# same on each run; without it the reported numbers change between executions.
train, test, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.33, random_state=42
)

# The tree is seeded as well, so the fitted model is reproducible too.
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree.fit(train, train_labels)
Y_pred = decision_tree.predict(test)
# Despite the "acc_" prefix this is the R^2 score returned by r2_score, not a
# classification accuracy.
acc_decision_tree = r2_score(test_labels, Y_pred)
acc_decision_tree
-0.03440859287218978
# Ordinary least-squares baseline on the same split; fit() returns the
# estimator itself, so construction and fitting can be chained.
linreg = LinearRegression().fit(train, train_labels)
Y_pred = linreg.predict(test)
# acc_log holds the R^2 score of the linear model on the held-out rows.
acc_log = r2_score(y_true=test_labels, y_pred=Y_pred)
acc_log
-32814680366.75818
# Ridge regression on the same split, with the regularisation strength and
# solver limits spelled out one per line.
ridge_reg = Ridge(
    alpha=1,
    max_iter=100,
    tol=0.1,
).fit(train, train_labels)
Y_pred = ridge_reg.predict(test)
# ridge_log holds the R^2 score of the ridge model on the held-out rows.
ridge_log = r2_score(y_true=test_labels, y_pred=Y_pred)
ridge_log
0.39096903616388345