przetwarzanie_jezyka_natura.../P1/imdb/main.ipynb
2023-07-04 20:28:47 +02:00

60 KiB
Raw Permalink Blame History

import pandas as pd
import numpy as np
# Load movies.csv into a DataFrame; row 0 of the file supplies the column
# names.
df = pd.read_csv(filepath_or_buffer='movies.csv', header=0)
# Show the first rows of the table as loaded.
df.head()
MOVIES YEAR GENRE RATING ONE-LINE STARS VOTES RunTime Gross
0 Blood Red Sky (2021) \nAction, Horror, Thriller 6.1 \nA woman with a mysterious illness is forced ... \n Director:\nPeter Thorwarth\n| \n Star... 21,062 121.0 NaN
1 Masters of the Universe: Revelation (2021 ) \nAnimation, Action, Adventure 5.0 \nThe war for Eternia begins again in what may... \n \n Stars:\nChris Wood, \nSara... 17,870 25.0 NaN
2 The Walking Dead (20102022) \nDrama, Horror, Thriller 8.2 \nSheriff Deputy Rick Grimes wakes up from a c... \n \n Stars:\nAndrew Lincoln, \n... 885,805 44.0 NaN
3 Rick and Morty (2013 ) \nAnimation, Adventure, Comedy 9.2 \nAn animated series that follows the exploits... \n \n Stars:\nJustin Roiland, \n... 414,849 23.0 NaN
4 Army of Thieves (2021) \nAction, Crime, Horror NaN \nA prequel, set before the events of Army of ... \n Director:\nMatthias Schweighöfer\n| \n ... NaN NaN NaN
# For each free-text column: remove embedded newline characters, then trim
# leading and trailing whitespace.
for text_column in ('GENRE', 'ONE-LINE', 'STARS'):
    without_newlines = df[text_column].str.replace('\n', '')
    df[text_column] = without_newlines.str.strip()

df.head()
MOVIES YEAR GENRE RATING ONE-LINE STARS VOTES RunTime Gross
0 Blood Red Sky (2021) Action, Horror, Thriller 6.1 A woman with a mysterious illness is forced in... Director:Peter Thorwarth| Stars:Peri Baume... 21,062 121.0 NaN
1 Masters of the Universe: Revelation (2021 ) Animation, Action, Adventure 5.0 The war for Eternia begins again in what may b... Stars:Chris Wood, Sarah Michelle Gellar, Lena ... 17,870 25.0 NaN
2 The Walking Dead (20102022) Drama, Horror, Thriller 8.2 Sheriff Deputy Rick Grimes wakes up from a com... Stars:Andrew Lincoln, Norman Reedus, Melissa M... 885,805 44.0 NaN
3 Rick and Morty (2013 ) Animation, Adventure, Comedy 9.2 An animated series that follows the exploits o... Stars:Justin Roiland, Chris Parnell, Spencer G... 414,849 23.0 NaN
4 Army of Thieves (2021) Action, Crime, Horror NaN A prequel, set before the events of Army of th... Director:Matthias Schweighöfer| Stars:Matt... NaN NaN NaN
# Remove the Gross, RunTime and ONE-LINE columns from the frame, in that
# order.
for unused_column in ('Gross', 'RunTime', 'ONE-LINE'):
    del df[unused_column]

df.head()
MOVIES YEAR GENRE RATING STARS VOTES
0 Blood Red Sky (2021) Action, Horror, Thriller 6.1 Director:Peter Thorwarth| Stars:Peri Baume... 21,062
1 Masters of the Universe: Revelation (2021 ) Animation, Action, Adventure 5.0 Stars:Chris Wood, Sarah Michelle Gellar, Lena ... 17,870
2 The Walking Dead (20102022) Drama, Horror, Thriller 8.2 Stars:Andrew Lincoln, Norman Reedus, Melissa M... 885,805
3 Rick and Morty (2013 ) Animation, Adventure, Comedy 9.2 Stars:Justin Roiland, Chris Parnell, Spencer G... 414,849
4 Army of Thieves (2021) Action, Crime, Horror NaN Director:Matthias Schweighöfer| Stars:Matt... NaN
import re

def get_start_year(year):
    """Return the first year of a parenthesised year label.

    A range such as ``(2010-2022)`` yields its first year; a single year
    such as ``(2021)`` yields that year.  The range separator may be an
    ASCII hyphen, an en dash or an em dash, optionally surrounded by
    whitespace, and the closing year of a range may be absent (an
    open-ended range).

    Args:
        year: Raw text of the year label.

    Returns:
        The year as a four-character string, or ``None`` when ``year`` is
        not a string or holds no recognisable year.
    """
    if not isinstance(year, str):
        # e.g. a NaN from a missing cell; re.search only accepts text.
        return None
    # Ranges are tried first so they take precedence over a lone year that
    # appears elsewhere in the same label.
    result = re.search(
        r'\(([0-9]{4})\s*[-\u2013\u2014]\s*([0-9]{4})?\s*\)', year)
    if result:
        return result.group(1)
    result = re.search(r'\(([0-9]{4})\)', year)
    if result:
        return result.group(1)
    return None

def get_end_year(year):
    """Return the last year of a parenthesised year label.

    A range such as ``(2010-2022)`` yields its second year; a single year
    such as ``(2021)`` yields that year.  The range separator may be an
    ASCII hyphen, an en dash or an em dash, optionally surrounded by
    whitespace.

    Args:
        year: Raw text of the year label.

    Returns:
        The year as a four-character string, or ``None`` when ``year`` is
        not a string, holds no recognisable year, or is a range without a
        closing year.
    """
    if not isinstance(year, str):
        # e.g. a NaN from a missing cell; re.search only accepts text.
        return None
    # Ranges are tried first so they take precedence over a lone year that
    # appears elsewhere in the same label.
    result = re.search(
        r'\(([0-9]{4})\s*[-\u2013\u2014]\s*([0-9]{4})\s*\)', year)
    if result:
        return result.group(2)
    result = re.search(r'\(([0-9]{4})\)', year)
    if result:
        return result.group(1)
    return None

# Discard every row that has a missing value in any column, then derive the
# start_year and end_year columns from the YEAR text.
df.dropna(inplace=True)
df['start_year'] = df['YEAR'].apply(get_start_year)
df['end_year'] = df['YEAR'].apply(get_end_year)
import re

def extract_director(direct):
    """Return the director part of a combined credits string.

    The text following ``Director:`` or ``Directors:`` is captured up to the
    first ``|`` separator, or up to the end of the string when no separator
    follows (a credits string that lists directors only).

    Args:
        direct: Credits text, e.g. ``'Director:Jane Doe| Stars:A, B'``.

    Returns:
        The captured text with surrounding whitespace removed, or ``''``
        when ``direct`` is not a string or has no director label.
    """
    if not isinstance(direct, str):
        return ''
    # [^|]* stops at the first separator and does not require one to exist.
    result = re.search(r'Directors?:([^|]*)', direct)
    if result:
        return result.group(1).strip()
    return ''

def extract_stars(stars):
    """Return the text that follows the ``Stars:`` / ``Star:`` label.

    Everything after the label up to the end of the line is taken and its
    surrounding whitespace is removed.  An empty string is returned when
    the label is absent.
    """
    found = re.search(r'(Stars:|Star:)(.*)', stars)
    return found.group(2).strip() if found else ''

# Split the combined STARS text into separate director and cast columns.
df['directors'] = df['STARS'].apply(extract_director)
df['stars'] = df['STARS'].apply(extract_stars)
# Discard rows that have a missing value in any column, including the
# columns derived so far.
df.dropna(inplace=True)
df['stars'].head()
0     Peri Baumeister, Carl Anton Koch, Alexander Sc...
6     Shailene Woodley, Joe Alwyn, Wendy Nottingham,...
10    Karen Gillan, Lena Headey, Carla Gugino, Miche...
12    Kiana Madeira, Olivia Scott Welch, Benjamin Fl...
20    Sadie Sink, Emily Rudd, Ryan Simpkins, McCabe ...
Name: stars, dtype: object
def strip_all_str(to_strip: list) -> list:
    """Return a new list with surrounding whitespace removed from each item.

    Args:
        to_strip: Strings to clean; the input itself is not modified.

    Returns:
        A list of the same length holding ``item.strip()`` for every item,
        in the original order.
    """
    return [s.strip() for s in to_strip]

# Turn each comma-separated text column into a '|'-separated one whose items
# have no surrounding whitespace.  'stars' and 'directors' are overwritten;
# 'genres' is a new column built from GENRE.
for target, source in (('stars', 'stars'),
                       ('directors', 'directors'),
                       ('genres', 'GENRE')):
    pieces = df[source].str.split(',')
    trimmed = pieces.apply(strip_all_str)
    df[target] = trimmed.apply(lambda items: '|'.join(items))
df.head()
MOVIES YEAR GENRE RATING STARS VOTES start_year end_year directors stars genres
0 Blood Red Sky (2021) Action, Horror, Thriller 6.1 Director:Peter Thorwarth| Stars:Peri Baume... 21,062 2021 2021 Peter Thorwarth Peri Baumeister|Carl Anton Koch|Alexander Sche... Action|Horror|Thriller
6 The Last Letter from Your Lover (2021) Drama, Romance 6.8 Director:Augustine Frizzell| Stars:Shailen... 5,283 2021 2021 Augustine Frizzell Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe... Drama|Romance
10 Gunpowder Milkshake (2021) Action, Adventure, Thriller 6.0 Director:Navot Papushado| Stars:Karen Gill... 17,989 2021 2021 Navot Papushado Karen Gillan|Lena Headey|Carla Gugino|Michelle... Action|Adventure|Thriller
12 Fear Street: 1994 (2021) Drama, Horror, Mystery 6.2 Director:Leigh Janiak| Stars:Kiana Madeira... 50,148 2021 2021 Leigh Janiak Kiana Madeira|Olivia Scott Welch|Benjamin Flor... Drama|Horror|Mystery
20 Fear Street: 1978 (2021) Drama, Horror, Mystery 6.8 Director:Leigh Janiak| Stars:Sadie Sink, E... 36,634 2021 2021 Leigh Janiak Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye Drama|Horror|Mystery
# Build 0/1 indicator columns from the '|'-separated stars, directors and
# genres columns and, for each, keep at most the 1000 values with the
# highest counts.
top_dummies = {}
for column in ('stars', 'directors', 'genres'):
    indicator = df[column].str.get_dummies(sep='|')
    frequency = indicator.sum().sort_values(ascending=False)
    most_important_values = list(frequency.head(1000).index)
    top_dummies[column] = indicator.loc[:, most_important_values]

df_star_dummies = top_dummies['stars']
df_directors_dummies = top_dummies['directors']
df_genres_dummies = top_dummies['genres']

# Append the indicator columns: genres first, then directors, then stars.
# NOTE(review): a value present in more than one of these frames yields
# duplicate column labels after the concat -- confirm this is acceptable.
df = pd.concat(
    [df, df_genres_dummies, df_directors_dummies, df_star_dummies],
    axis=1,
)

df.head()
MOVIES YEAR GENRE RATING STARS VOTES start_year end_year directors stars ... Greg Kading Griffin Gluck Peri Baumeister Greg Chun Carlos Belloso Carlos Barbosa John Abraham Mauricio Argüelles Maurice Compte John Belushi
0 Blood Red Sky (2021) Action, Horror, Thriller 6.1 Director:Peter Thorwarth| Stars:Peri Baume... 21,062 2021 2021 Peter Thorwarth Peri Baumeister|Carl Anton Koch|Alexander Sche... ... 0 0 1 0 0 0 0 0 0 0
6 The Last Letter from Your Lover (2021) Drama, Romance 6.8 Director:Augustine Frizzell| Stars:Shailen... 5,283 2021 2021 Augustine Frizzell Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe... ... 0 0 0 0 0 0 0 0 0 0
10 Gunpowder Milkshake (2021) Action, Adventure, Thriller 6.0 Director:Navot Papushado| Stars:Karen Gill... 17,989 2021 2021 Navot Papushado Karen Gillan|Lena Headey|Carla Gugino|Michelle... ... 0 0 0 0 0 0 0 0 0 0
12 Fear Street: 1994 (2021) Drama, Horror, Mystery 6.2 Director:Leigh Janiak| Stars:Kiana Madeira... 50,148 2021 2021 Leigh Janiak Kiana Madeira|Olivia Scott Welch|Benjamin Flor... ... 0 0 0 0 0 0 0 0 0 0
20 Fear Street: 1978 (2021) Drama, Horror, Mystery 6.8 Director:Leigh Janiak| Stars:Sadie Sink, E... 36,634 2021 2021 Leigh Janiak Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye ... 0 0 0 0 0 0 0 0 0 0

5 rows × 2038 columns

# Convert the year columns to int and the rating to float, cell by cell.
for numeric_column, caster in (('start_year', int),
                               ('end_year', int),
                               ('RATING', float)):
    df[numeric_column] = df[numeric_column].apply(caster)

# VOTES is text with comma thousands separators (e.g. '21,062'); remove the
# commas before converting to int.
votes_without_commas = df['VOTES'].str.replace(',', '')
df['VOTES'] = votes_without_commas.apply(int)

df.head()
MOVIES YEAR GENRE RATING STARS VOTES start_year end_year directors stars ... Greg Kading Griffin Gluck Peri Baumeister Greg Chun Carlos Belloso Carlos Barbosa John Abraham Mauricio Argüelles Maurice Compte John Belushi
0 Blood Red Sky (2021) Action, Horror, Thriller 6.1 Director:Peter Thorwarth| Stars:Peri Baume... 21062 2021 2021 Peter Thorwarth Peri Baumeister|Carl Anton Koch|Alexander Sche... ... 0 0 1 0 0 0 0 0 0 0
6 The Last Letter from Your Lover (2021) Drama, Romance 6.8 Director:Augustine Frizzell| Stars:Shailen... 5283 2021 2021 Augustine Frizzell Shailene Woodley|Joe Alwyn|Wendy Nottingham|Fe... ... 0 0 0 0 0 0 0 0 0 0
10 Gunpowder Milkshake (2021) Action, Adventure, Thriller 6.0 Director:Navot Papushado| Stars:Karen Gill... 17989 2021 2021 Navot Papushado Karen Gillan|Lena Headey|Carla Gugino|Michelle... ... 0 0 0 0 0 0 0 0 0 0
12 Fear Street: 1994 (2021) Drama, Horror, Mystery 6.2 Director:Leigh Janiak| Stars:Kiana Madeira... 50148 2021 2021 Leigh Janiak Kiana Madeira|Olivia Scott Welch|Benjamin Flor... ... 0 0 0 0 0 0 0 0 0 0
20 Fear Street: 1978 (2021) Drama, Horror, Mystery 6.8 Director:Leigh Janiak| Stars:Sadie Sink, E... 36634 2021 2021 Leigh Janiak Sadie Sink|Emily Rudd|Ryan Simpkins|McCabe Slye ... 0 0 0 0 0 0 0 0 0 0

5 rows × 2038 columns

# cleaned_df is a second name for the same frame as df (no copy is made), so
# the column removals below are visible through df as well.
cleaned_df = df
for obsolete_column in ('MOVIES', 'GENRE', 'STARS', 'directors', 'stars',
                        'genres', 'YEAR'):
    del cleaned_df[obsolete_column]
cleaned_df.dropna(inplace=True)

# Keep everything from positional row 200 onwards.
first_values = cleaned_df[200:]
# Features are all columns except RATING; labels are RATING as a
# single-column 2-D array.
is_feature_column = first_values.columns != 'RATING'
features = first_values.loc[:, is_feature_column]
labels = first_values[['RATING']].values
features
VOTES start_year end_year Drama Comedy Documentary Crime Action Thriller Romance ... Greg Kading Griffin Gluck Peri Baumeister Greg Chun Carlos Belloso Carlos Barbosa John Abraham Mauricio Argüelles Maurice Compte John Belushi
508 200206 2013 2013 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
509 34984 2018 2018 1 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
510 124972 1993 1993 1 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
512 21572 2018 2018 1 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
513 3082 2020 2020 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9618 49 2021 2021 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9814 175 2021 2021 0 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9825 156 2021 2021 0 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9826 145 2021 2021 0 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9827 137 2021 2021 0 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3563 rows × 2030 columns

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC,LinearSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score
# Fixed seed so that the train/test split and the fitted tree -- and
# therefore the reported score -- are identical on every run.
RANDOM_STATE = 42

# Hold out 33% of the rows for evaluation.
train, test, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.33, random_state=RANDOM_STATE)

decision_tree = DecisionTreeRegressor(random_state=RANDOM_STATE)
decision_tree.fit(train, train_labels)
Y_pred = decision_tree.predict(test)
# Despite the name, this is the R^2 score of the predictions on the held-out
# rows, not an accuracy.
acc_decision_tree = r2_score(test_labels, Y_pred)
acc_decision_tree
-0.03440859287218978
# Fit a linear regression on the training rows and score it on the held-out
# rows.
linreg = LinearRegression().fit(train, train_labels)
Y_pred = linreg.predict(test)
# Despite the name, acc_log holds the R^2 score of these predictions.
acc_log = r2_score(y_true=test_labels, y_pred=Y_pred)
acc_log
-32814680366.75818
# Fit a ridge regression on the same training rows and score it on the
# held-out rows.
ridge_reg = Ridge(alpha=1, max_iter=100, tol=0.1).fit(train, train_labels)
Y_pred = ridge_reg.predict(test)
# Despite the name, ridge_log holds the R^2 score of these predictions.
ridge_log = r2_score(y_true=test_labels, y_pred=Y_pred)
ridge_log
0.39096903616388345