2021-03-21 21:32:32 +01:00
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
import seaborn as sns
|
|
|
|
import datetime
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
# Load the enriched Netflix catalogue.
netflix = pd.read_csv('netflix_titles_enriched.csv')

# Keep only rows with a positive Rotten Tomatoes audience score and order
# them by that score (ascending, the sort_values default).
has_audience_score = netflix['rottentomatoes_audience_score'] > 0
netflix_cleaned = netflix.loc[has_audience_score].sort_values('rottentomatoes_audience_score')

# Divide the audience score by 100 (presumably percent -> fraction; confirm
# against the data set).
netflix_cleaned['rottentomatoes_audience_score'] = (
    netflix_cleaned['rottentomatoes_audience_score'] / 100
)
|
|
|
|
# Remove the auxiliary Rotten Tomatoes columns. DataFrame.drop returns a new
# frame (inplace=False by default), so the result must be assigned back;
# the bare call previously left netflix_cleaned unchanged.
netflix_cleaned = netflix_cleaned.drop(
    columns=[
        'rottentomatoes_audience_#reviews',
        'rottentomatoes_audience_review',
        'rottentomatoes_tomatometer_score',
        'rottentomatoes_critics_#reviews',
        'rottentomatoes_critic_review',
    ]
)
|
|
|
|
|
|
|
|
# Parse `date_added` strings of the form '%B %d, %Y' (full month name, day,
# year) into datetimes. Surrounding whitespace is stripped for every value
# (the previous per-row lambda only removed a single leading space and
# raised IndexError on an empty string); missing values become NaT.
netflix_cleaned['date_added'] = pd.to_datetime(
    netflix_cleaned['date_added'].str.strip(),
    format='%B %d, %Y',
)
|
|
|
|
|
|
|
|
# Lower-case every object-dtype column and write the values back into
# netflix_cleaned in place through DataFrame.update.
text_columns = netflix_cleaned.select_dtypes(include='object')
lowered_text = text_columns.apply(lambda column: column.str.lower())
netflix_cleaned.update(lowered_text)
|
|
|
|
|
|
|
|
|
2021-04-25 21:38:20 +02:00
|
|
|
|
2021-03-21 21:32:32 +01:00
|
|
|
# Split the cleaned catalogue by title type. Both frames are modified
# further down (column assignment, pop), so take explicit copies: assigning
# into a boolean-indexed slice triggers pandas' SettingWithCopyWarning and
# leaves it ambiguous whether the parent frame is touched.
movies = netflix_cleaned[netflix_cleaned.type == 'movie'].copy()
series = netflix_cleaned[netflix_cleaned.type == 'tv show'].copy()
|
|
|
|
|
|
|
|
|
|
|
|
# Replace the textual duration ("<digits> min") with the digits as int32.
# extract() yields one column per capture group; column 0 holds the digits.
runtime_parts = movies['duration'].str.extract(r'(\d*)( min)')
movies['duration'] = runtime_parts[0].astype('int32')
|
|
|
|
|
|
|
|
from sklearn.preprocessing import MultiLabelBinarizer
|
|
|
|
# One-hot encode the comma-separated `listed_in` labels of the movies.
mlb = MultiLabelBinarizer()

# pop() removes `listed_in` from movies and hands back the column, which is
# split on ', ' into one list of labels per row.
movie_label_lists = movies.pop('listed_in').str.split(', ')

# One indicator column per label seen during fitting, aligned on the
# original row index so the join below matches row for row.
movie_label_flags = pd.DataFrame(
    mlb.fit_transform(movie_label_lists),
    columns=mlb.classes_,
    index=movies.index,
)
movies = movies.join(movie_label_flags)
|
|
|
|
# NOTE(review): DataFrame.drop returns a new frame by default and the result
# is not assigned here, so `movies` keeps its 'movies' column. The column
# selection later in this script still lists 'movies', so assigning the
# result back would make that selection fail — confirm the intent before
# changing this line.
movies.drop(['movies'], axis = 1)
|
|
|
|
|
2021-04-25 21:38:20 +02:00
|
|
|
# Restrict movies to the columns kept for the train/test/validation export:
# release year, duration, the audience score, and the listed label columns.
movie_feature_columns = [
    'release_year',
    'duration',
    'rottentomatoes_audience_score',
    'action & adventure',
    'anime features',
    'children & family movies',
    'classic movies',
    'comedies',
    'cult movies',
    'documentaries',
    'dramas',
    'faith & spirituality',
    'horror movies',
    'independent movies',
    'international movies',
    'lgbtq movies',
    'movies',
    'music & musicals',
    'romantic movies',
    'sci-fi & fantasy',
    'sports movies',
    'stand-up comedy',
    'thrillers',
]
movies = movies[movie_feature_columns]
|
|
|
|
|
2021-03-21 21:32:32 +01:00
|
|
|
import sklearn
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
# Hold out 20 % of the movies, then halve that hold-out into the test and
# validation sets; both splits use the fixed seed 42.
movies_train, movies_holdout = train_test_split(
    movies, test_size=0.20, random_state=42
)
movies_test, movies_val = train_test_split(
    movies_holdout, test_size=0.50, random_state=42
)
|
|
|
|
|
|
|
|
# Print the row count and summary statistics of each movie subset.
movies_subsets = [movies_train, movies_test, movies_val]
for subset in movies_subsets:
    print(len(subset))
    print(subset.describe(include='all'))
|
|
|
|
|
|
|
|
# Stack the three movie subsets under a subset label; reset_index() exposes
# that label as the 'level_0' column used on the x axis.
movies_subsets = pd.concat(
    movies_subsets, keys=['Train', 'Test', 'Validation']
).reset_index()

# Box plot of the audience score per subset.
ax = sns.boxplot(
    x='level_0',
    y='rottentomatoes_audience_score',
    data=movies_subsets,
)
ax.set_title('Audience score distribution between subsets')
ax.set_ylabel('Audience score on Rotten Tomatoes')
ax.set_xlabel('SUBSET')
|
2021-04-11 12:24:29 +02:00
|
|
|
#plt.show(ax)
|
2021-03-21 21:32:32 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Replace the textual duration ("<digits> season(s)") with the digits as
# int32. Column 0 of the extract() result is the first capture group.
season_count_parts = series['duration'].str.extract(r'(\d*)( seasons?)')
series['duration'] = season_count_parts[0].astype('int32')
|
|
|
|
|
|
|
|
# The unnamed leading CSV column is renamed to 'Season' and then split by
# regex: capture group 0 (an 's<digits>' token) becomes 'Id', capture
# group 2 (the trailing digits) becomes the integer 'Season'.
series = series.rename(columns={'Unnamed: 0': 'Season'})

# Run the pattern once and reuse both capture groups.
season_parts = series['Season'].str.extract(r'(s\d+)(|\',\ )(\d+)')
series['Id'] = season_parts[0]
series['Season'] = season_parts[2].astype('int32')

# Keep only rows whose season number is positive.
series = series[series['Season'] > 0]
|
|
|
|
|
|
|
|
# One-hot encode the comma-separated `listed_in` labels of the series with
# a fresh binarizer (rebinding `mlb`).
mlb = MultiLabelBinarizer()

# pop() removes `listed_in` from series and returns it; each value is split
# on ', ' into a list of labels.
series_label_lists = series.pop('listed_in').str.split(', ')

# Indicator columns share the series row index so the join lines up.
series_label_flags = pd.DataFrame(
    mlb.fit_transform(series_label_lists),
    columns=mlb.classes_,
    index=series.index,
)
series = series.join(series_label_flags)
|
|
|
|
|
|
|
|
# Hold out 20 % of the series, then halve that hold-out into the test and
# validation sets; both splits use the fixed seed 42.
series_train, series_holdout = train_test_split(
    series, test_size=0.20, random_state=42
)
series_test, series_val = train_test_split(
    series_holdout, test_size=0.50, random_state=42
)
|
|
|
|
|
|
|
|
# Print the row count and summary statistics of each series subset.
series_subsets = [series_train, series_test, series_val]
for subset in series_subsets:
    print(len(subset))
    print(subset.describe(include='all'))
|
|
|
|
|
|
|
|
# Stack the three series subsets under a subset label; reset_index() exposes
# that label as the 'level_0' column used on the x axis.
series_subsets = pd.concat(series_subsets, keys=['Train', 'Test', 'Validation'])
series_subsets = series_subsets.reset_index()

# Draw on a fresh figure: without an explicit `ax`, seaborn plots onto the
# current Axes, which in a straight script run is still the movies box plot,
# so the two plots would be overlaid.
series_fig, ax = plt.subplots()
sns.boxplot(
    data=series_subsets,
    x='level_0',
    y='rottentomatoes_audience_score',
    ax=ax,
)
ax.set(
    title='Audience score distribution between subsets',
    ylabel='Audience score on Rotten Tomatoes',
    xlabel='SUBSET',
)
|
2021-04-11 12:24:29 +02:00
|
|
|
#plt.show(ax)
|
2021-04-25 21:38:20 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Write each movie subset to its own CSV file in the working directory
# (default to_csv settings, so the row index is written as well).
movie_exports = (
    ('movies_train.csv', movies_train),
    ('movies_test.csv', movies_test),
    ('movies_val.csv', movies_val),
)
for csv_name, frame in movie_exports:
    frame.to_csv(csv_name)
|