Add script for zadanie 5

This commit is contained in:
Zofia Galla 2021-04-25 21:38:20 +02:00
parent aa0e85f270
commit 997481e85a
5 changed files with 67 additions and 0 deletions

View File

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -15,5 +15,6 @@ RUN pip3 install sklearn
RUN pip3 install pandas RUN pip3 install pandas
RUN pip3 install seaborn RUN pip3 install seaborn
RUN pip3 install matplotlib RUN pip3 install matplotlib
RUN pip3 install tensorflow
CMD ./run.sh CMD ./run.sh

View File

@ -18,6 +18,7 @@ netflix_cleaned.date_added = netflix_cleaned.date_added.dropna().apply(lambda x:
netflix_cleaned.update(netflix_cleaned.select_dtypes(include = 'object').apply(lambda col: col.str.lower())) netflix_cleaned.update(netflix_cleaned.select_dtypes(include = 'object').apply(lambda col: col.str.lower()))
movies = netflix_cleaned[netflix_cleaned.type == 'movie'] movies = netflix_cleaned[netflix_cleaned.type == 'movie']
series = netflix_cleaned[netflix_cleaned.type == 'tv show'] series = netflix_cleaned[netflix_cleaned.type == 'tv show']
@ -32,6 +33,15 @@ movies = movies.join(pd.DataFrame(mlb.fit_transform(movies.pop('listed_in').str.
index=movies.index)) index=movies.index))
movies.drop(['movies'], axis = 1) movies.drop(['movies'], axis = 1)
movies = movies[['release_year', 'duration',
'rottentomatoes_audience_score',
'action & adventure', 'anime features', 'children & family movies',
'classic movies', 'comedies', 'cult movies', 'documentaries', 'dramas',
'faith & spirituality', 'horror movies', 'independent movies',
'international movies', 'lgbtq movies', 'movies', 'music & musicals',
'romantic movies', 'sci-fi & fantasy', 'sports movies',
'stand-up comedy', 'thrillers']]
import sklearn import sklearn
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
movies_train, movies_test = sklearn.model_selection.train_test_split(movies,test_size=0.20, random_state=42) movies_train, movies_test = sklearn.model_selection.train_test_split(movies,test_size=0.20, random_state=42)
@ -75,3 +85,9 @@ series_subsets = series_subsets.reset_index()
ax = sns.boxplot(data = series_subsets, x = 'level_0', y = 'rottentomatoes_audience_score') ax = sns.boxplot(data = series_subsets, x = 'level_0', y = 'rottentomatoes_audience_score')
ax.set(title = 'Audience score distribution between subsets', ylabel = 'Audience score on Rotten Tomatoes', xlabel = 'SUBSET') ax.set(title = 'Audience score distribution between subsets', ylabel = 'Audience score on Rotten Tomatoes', xlabel = 'SUBSET')
#plt.show(ax) #plt.show(ax)
movies_train.to_csv('movies_train.csv')
movies_test.to_csv('movies_test.csv')
movies_val.to_csv('movies_val.csv')

43
ium_zadanie5.py Normal file
View File

@ -0,0 +1,43 @@
import tensorflow as tf
from keras.models import Sequential
from keras import layers
# from keras.layers import Flatten,Dense,Dropout, GlobalAveragePooling2D
from keras.optimizers import Adam
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
# Train a small dense regression network that predicts the Rotten Tomatoes
# audience score of a movie, then write the test-set predictions and the
# test-set mean squared error to disk.

# --- Load the train/test splits from CSV files in the working directory ---
movies_train = pd.read_csv('movies_train.csv')
movies_test = pd.read_csv('movies_test.csv')

# Work on copies so the raw frames stay intact; pop() removes the column
# from the feature frame and returns it as the target series.
x_train = movies_train.copy()
x_test = movies_test.copy()
y_train = x_train.pop('rottentomatoes_audience_score')
y_test = x_test.pop('rottentomatoes_audience_score')

# 'Unnamed: 0' is the header pandas assigns to a column saved without a name
# (presumably the DataFrame index written by to_csv() -- confirm against the
# script that produces these files). It is not a feature, so discard it.
x_train.pop('Unnamed: 0')
x_test.pop('Unnamed: 0')

# --- Build the model ---
model = Sequential()
# Take the input width from the data instead of hard-coding it, so the model
# keeps working if the set of feature columns in the CSV changes.
model.add(layers.Input(shape=(x_train.shape[1],)))
# NOTE(review): these Dense layers have no activation, so the stack is a
# composition of linear maps. Consider activation='relu' on the hidden layers
# if a non-linear model is intended.
model.add(layers.Dense(64))
model.add(layers.Dense(64))
model.add(layers.Dense(32))
model.add(layers.Dense(1))
model.compile(loss='mean_absolute_error', optimizer=Adam(0.001))

# --- Train ---
history = model.fit(
    x=tf.convert_to_tensor(x_train, np.float32),
    y=y_train,
    verbose=0,
    epochs=99,
)

# --- Evaluate ---
# Feed the test features in the same float32 tensor form used for training.
y_predicted = model.predict(
    tf.convert_to_tensor(x_test, np.float32),
    batch_size=64,
)
error = mean_squared_error(y_test, y_predicted)

np.savetxt("test_predictions.csv", y_predicted, delimiter=",")
with open('evaluation.txt', 'w') as f:
    # '%f' keeps the fractional part of the metric; the previous '%d'
    # truncated the mean squared error to an integer.
    f.write('Mean square error: %f' % error)

1
run.sh
View File

@ -1,3 +1,4 @@
#!/bin/bash #!/bin/bash
kaggle kernels output 'eugenioscionti/scraping-rotten-tomatoes-to-enrich-netflix-dataset' kaggle kernels output 'eugenioscionti/scraping-rotten-tomatoes-to-enrich-netflix-dataset'
python3 ium_zadanie1.py python3 ium_zadanie1.py
python3 ium_zadanie5.py