Add script for zadanie 5

This commit is contained in:
Zofia Galla 2021-04-25 21:38:20 +02:00
parent aa0e85f270
commit 997481e85a
5 changed files with 67 additions and 0 deletions

View File

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -15,5 +15,6 @@ RUN pip3 install sklearn
# Install the remaining Python dependencies in a single layer instead of one
# layer per package; --no-cache-dir keeps pip's download cache out of the image.
RUN pip3 install --no-cache-dir pandas seaborn matplotlib tensorflow
# Default command of the container: run the pipeline script.
CMD ./run.sh

View File

@ -18,6 +18,7 @@ netflix_cleaned.date_added = netflix_cleaned.date_added.dropna().apply(lambda x:
# Lower-case every object-dtype (string) column and write the result back
# into netflix_cleaned in place.
text_columns = netflix_cleaned.select_dtypes(include='object')
netflix_cleaned.update(text_columns.apply(lambda column: column.str.lower()))

# Split the catalogue by title type; the labels are lower-case because of the
# normalisation step above.
is_movie = netflix_cleaned['type'] == 'movie'
is_series = netflix_cleaned['type'] == 'tv show'
movies = netflix_cleaned[is_movie]
series = netflix_cleaned[is_series]
@ -32,6 +33,15 @@ movies = movies.join(pd.DataFrame(mlb.fit_transform(movies.pop('listed_in').str.
index=movies.index))
# NOTE(review): the frame returned by drop() is discarded, so this statement
# does not remove the 'movies' column -- it is still selected below.
movies.drop(columns=['movies'])

# Keep only the numeric inputs, the target score and the genre indicator
# columns.
feature_columns = [
    'release_year', 'duration',
    'rottentomatoes_audience_score',
    'action & adventure', 'anime features', 'children & family movies',
    'classic movies', 'comedies', 'cult movies', 'documentaries', 'dramas',
    'faith & spirituality', 'horror movies', 'independent movies',
    'international movies', 'lgbtq movies', 'movies', 'music & musicals',
    'romantic movies', 'sci-fi & fantasy', 'sports movies',
    'stand-up comedy', 'thrillers',
]
movies = movies[feature_columns]

import sklearn
from sklearn.model_selection import train_test_split

# 80/20 train/test split; the fixed seed makes the split reproducible.
movies_train, movies_test = train_test_split(
    movies,
    test_size=0.20,
    random_state=42,
)
@ -75,3 +85,9 @@ series_subsets = series_subsets.reset_index()
# Box plot of the Rotten Tomatoes audience score for each subset; the subset
# label is read from the 'level_0' column of series_subsets.
ax = sns.boxplot(data = series_subsets, x = 'level_0', y = 'rottentomatoes_audience_score')
ax.set(title = 'Audience score distribution between subsets', ylabel = 'Audience score on Rotten Tomatoes', xlabel = 'SUBSET')
#plt.show(ax)
# Write the movie splits to CSV files (relative paths, so they land in the
# current working directory). The DataFrame index is written as well.
# NOTE(review): movies_val is not defined in the visible part of this file --
# confirm it is created before this point.
movies_train.to_csv('movies_train.csv')
movies_test.to_csv('movies_test.csv')
movies_val.to_csv('movies_val.csv')

43
ium_zadanie5.py Normal file
View File

@ -0,0 +1,43 @@
import tensorflow as tf
from keras.models import Sequential
from keras import layers
# from keras.layers import Flatten,Dense,Dropout, GlobalAveragePooling2D
from keras.optimizers import Adam
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
# Train a small dense regression network that predicts the Rotten Tomatoes
# audience score from the movie features, then write the test-set
# predictions and the test error to disk.

# Load the train/test splits from CSV.
movies_train = pd.read_csv('movies_train.csv')
movies_test = pd.read_csv('movies_test.csv')

# Separate the regression target from the input features.
x_train = movies_train.copy()
x_test = movies_test.copy()
y_train = x_train.pop('rottentomatoes_audience_score')
y_test = x_test.pop('rottentomatoes_audience_score')

# 'Unnamed: 0' is the column pandas creates for an unnamed index stored in
# the CSV; it is not a feature, so discard it.
x_train.pop('Unnamed: 0')
x_test.pop('Unnamed: 0')

# Convert both feature sets to float32 tensors in the same way, so training
# and prediction see identically typed inputs.
x_train_tensor = tf.convert_to_tensor(x_train, np.float32)
x_test_tensor = tf.convert_to_tensor(x_test, np.float32)

# Derive the input width from the data instead of hard-coding it, so the
# model keeps working if the set of feature columns changes.
n_features = x_train.shape[1]

# NOTE(review): the Dense layers have no activation, so the stack is a
# composition of linear maps; consider adding non-linearities to the hidden
# layers if a non-linear model is intended.
model = Sequential()
model.add(layers.Input(shape=(n_features,)))
model.add(layers.Dense(64))
model.add(layers.Dense(64))
model.add(layers.Dense(32))
model.add(layers.Dense(1))

# Optimise mean absolute error with Adam (learning rate 0.001).
model.compile(loss='mean_absolute_error', optimizer=Adam(0.001))

history = model.fit(
    x=x_train_tensor,
    y=y_train,
    verbose=0,
    epochs=99,
)

# Evaluate on the held-out split; the reported metric is mean squared error.
y_predicted = model.predict(x_test_tensor, batch_size=64)
error = mean_squared_error(y_test, y_predicted)

np.savetxt("test_predictions.csv", y_predicted, delimiter=",")
with open('evaluation.txt', 'w') as f:
    # %f keeps the fractional part of the error; %d truncated it to an integer.
    f.write('Mean square error: %f' % error)

1
run.sh
View File

@ -1,3 +1,4 @@
#!/bin/bash
# Pipeline entry point: fetch the Kaggle kernel output, then run the two
# Python stages in order.

# Stop at the first failing step so a later stage never runs after an
# earlier one has failed.
set -e

# Download the output files of the Kaggle kernel.
kaggle kernels output 'eugenioscionti/scraping-rotten-tomatoes-to-enrich-netflix-dataset'
# First Python stage (presumably data preparation -- confirm).
python3 ium_zadanie1.py
# Second stage: trains the regression model and writes its evaluation.
python3 ium_zadanie5.py