UM_Netflix_Projekt/UM_projekt.ipynb
2021-06-30 21:26:41 +02:00

6.6 KiB

import pandas as pd
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Load the enriched Netflix catalogue. The path is relative, so the CSV must
# sit in the current working directory.
netflix = pd.read_csv('netflix_titles_enriched.csv')

# Keep only titles with a positive audience score (rows whose score is missing
# fail the comparison and are removed as well), ordered from lowest to highest.
netflix_cleaned = netflix[netflix.rottentomatoes_audience_score > 0].sort_values(
    by='rottentomatoes_audience_score')

# Divide the score by 100.
# NOTE(review): presumably this maps a 0-100 percentage onto 0-1 - confirm
# against the dataset.
netflix_cleaned['rottentomatoes_audience_score'] = (
    netflix_cleaned['rottentomatoes_audience_score'] / 100)

# DataFrame.drop returns a new frame; the result has to be assigned back,
# otherwise the columns silently stay in place.
netflix_cleaned = netflix_cleaned.drop(columns=[
    'rottentomatoes_audience_#reviews',
    'rottentomatoes_audience_review',
    'rottentomatoes_tomatometer_score',
    'rottentomatoes_critics_#reviews',
    'rottentomatoes_critic_review',
])

# Parse dates written like "September 25, 2021". Surrounding whitespace is
# stripped first; missing values become NaT; a value that does not match the
# format raises instead of being silently coerced.
netflix_cleaned['date_added'] = pd.to_datetime(
    netflix_cleaned['date_added'].str.strip(), format='%B %d, %Y')

# Lower-case every text (object dtype) column. DataFrame.update works in place
# and skips NaN, so missing values are left untouched.
netflix_cleaned.update(
    netflix_cleaned.select_dtypes(include='object').apply(lambda col: col.str.lower()))



from sklearn.preprocessing import MultiLabelBinarizer

# Select the movie rows ('type' is compared in lower case). The explicit
# .copy() makes `movies` an independent frame, so the column assignments below
# no longer trigger pandas' SettingWithCopyWarning.
movies = netflix_cleaned[netflix_cleaned.type == 'movie'].copy()

# Extract the number of minutes from strings such as "90 min". `\d+` requires
# at least one digit; a row that does not match yields NaN and makes the int32
# cast fail loudly rather than producing a bogus duration.
movies['duration'] = movies['duration'].str.extract(r'(\d+) min')[0].astype('int32')

# One-hot encode the comma-separated genre list: one 0/1 column per genre,
# aligned with the movie rows through the shared index.
mlb = MultiLabelBinarizer()
genre_lists = movies.pop('listed_in').str.split(', ')
genre_flags = pd.DataFrame(mlb.fit_transform(genre_lists),
                           columns=mlb.classes_,
                           index=movies.index)
movies = movies.join(genre_flags)

# NOTE(review): the original script called movies.drop(['movies'], axis = 1)
# here without using the result, so the generic 'movies' genre flag was never
# removed and is still selected below. The dead call is gone and the column is
# kept so the feature set is unchanged - confirm whether it should be excluded.

# Restrict the frame to the regression target plus the numeric features.
movies = movies[['release_year', 'duration',
       'rottentomatoes_audience_score',
       'action & adventure', 'anime features', 'children & family movies',
       'classic movies', 'comedies', 'cult movies', 'documentaries', 'dramas',
       'faith & spirituality', 'horror movies', 'independent movies',
       'international movies', 'lgbtq movies', 'movies', 'music & musicals',
       'romantic movies', 'sci-fi & fantasy', 'sports movies',
       'stand-up comedy', 'thrillers']]

import sklearn
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for evaluation; the fixed seed makes the split
# repeatable. No separate validation subset is created.
movies_train, movies_test = train_test_split(movies, test_size=0.20, random_state=42)

# Training part: the audience score is the target, every other column is a feature.
y_train = movies_train['rottentomatoes_audience_score'].copy()
x_train = movies_train.drop(columns='rottentomatoes_audience_score')

# Held-out part, separated the same way.
y_test = movies_test['rottentomatoes_audience_score'].copy()
x_test = movies_test.drop(columns='rottentomatoes_audience_score')
2700
2160 540
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py:5170: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


def _fit_and_score(regressor, features_fit, target_fit, features_eval, target_eval):
    """Fit a standardised pipeline around *regressor* and score it.

    The regressor is wrapped in ``make_pipeline(StandardScaler(), regressor)``,
    fitted on ``features_fit`` / ``target_fit`` and evaluated on
    ``features_eval`` / ``target_eval``.

    Returns:
        A ``(pipeline, predictions, mse)`` tuple: the fitted pipeline, its
        predictions for ``features_eval`` and the mean squared error of those
        predictions against ``target_eval``.
    """
    pipeline = make_pipeline(StandardScaler(), regressor)
    pipeline.fit(features_fit, target_fit)
    predictions = pipeline.predict(features_eval)
    return pipeline, predictions, mean_squared_error(target_eval, predictions)


# The script only creates a train/test split, so the models are scored on the
# held-out test set. The previous x_val / y_val names were never defined and
# raised NameError.
# NOTE(review): SGDRegressor is created without a random_state, so errorSGD can
# differ slightly from run to run.
modelSGD, y_predicted, errorSGD = _fit_and_score(
    SGDRegressor(), x_train, y_train, x_test, y_test)
modelLR, y_predicted, errorLR = _fit_and_score(
    LinearRegression(), x_train, y_train, x_test, y_test)
modelSVR, y_predicted, errorSVR = _fit_and_score(
    SVR(), x_train, y_train, x_test, y_test)

# Mean squared errors in the order: linear regression, SGD, SVR.
print(errorLR, errorSGD, errorSVR)
0.040133803263361176 0.040076924018007165 0.04124993242855958