Projekt_UMA/UMA_projekt.ipynb
2021-06-30 14:58:18 +02:00

225 KiB
Raw Blame History

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

Cel: rozpoznanie średniej oceny użytkowników dla danego filmu na bazie:

  • roku wydania
  • gatunku
  • czasu trwania filmu
  • liczby głosów
  • oceny krytyków (metascore)
  • przychodu

0. Preprocessing

# 0. Preprocessing: load the IMDb dump, keep only the columns used as
# features/target, and one-hot encode the multi-label "genre" column.
# NOTE: "worlwide_gross_income" is the dataset's own (misspelled) column name.
data = pd.read_csv('IMDb movies.csv', low_memory=False)
data = data[["year","genre", "duration", "avg_vote", "votes", "worlwide_gross_income", "metascore"]]
data = data.dropna()
# Drop rows quoted in Nepalese rupees so the income column parses as one currency.
data = data[~data["worlwide_gross_income"].str.contains("NPR")]
data["worlwide_gross_income"] = data["worlwide_gross_income"].str.replace('$ ','', regex=False).astype(float)
# "genre" holds comma-separated labels; expand them into 0/1 indicator columns.
# .groupby(level=0).sum() replaces the deprecated Series.sum(level=0) form.
data["genre"] = data["genre"].str.split(", ")
genres = pd.get_dummies(data["genre"].apply(pd.Series).stack()).groupby(level=0).sum()
data = pd.concat([data.drop(columns=["genre"]), genres.reindex(data.index)], axis=1)
display(data.head(5))
year duration avg_vote votes worlwide_gross_income metascore Action Adventure Animation Biography ... Horror Music Musical Mystery Romance Sci-Fi Sport Thriller War Western
506 1927 153 8.3 156076 1349711.0 98.0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
628 1928 72 8.1 27414 26916.0 90.0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
856 1930 104 7.7 13311 4410.0 88.0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
1048 1931 87 8.5 162668 46008.0 99.0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
1086 1931 70 7.8 63315 1626.0 91.0 0 0 0 0 ... 1 0 0 0 1 0 0 0 0 0

5 rows × 27 columns

# Build the feature matrix: every column except the target "avg_vote".
X = data.drop(columns=["avg_vote"])
X["year"] = X["year"].astype(int)
# Max-scale each numeric feature into [0, 1] (genre indicators are already 0/1).
for column in ["votes", "duration", "worlwide_gross_income", "metascore", "year"]:
    X[column] = X[column] / X[column].max()

# Target: average vote rescaled from the 0-10 scale to [0, 1].
Y = data["avg_vote"] / 10
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)

1. Regresja liniowa

# 1. Linear regression baseline.
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression().fit(X_train, Y_train)

# MSE on both splits to gauge over/under-fitting.
Y_linear_train_pred = linear_model.predict(X_train)
Y_linear_test_pred = linear_model.predict(X_test)
linear_mean_squared_train = mean_squared_error(Y_train, Y_linear_train_pred)
linear_mean_squared = mean_squared_error(Y_test, Y_linear_test_pred)

print(f"Test: {linear_mean_squared}")
print(f"Train: {linear_mean_squared_train}")
Test: 0.0033762327444214367
Train: 0.0036015583726998865
# Average vote as a function of metascore.
# Blue: actual values; green: values predicted by the model.
fig, chart = plt.subplots(figsize=(10, 5))
chart.plot(X_test["metascore"], Y_test, "bo")
chart.plot(X_test["metascore"], Y_linear_test_pred, "go")
plt.ylim([0, 1])
(0.0, 1.0)

2.1. Regresja wielomianowa

# 2.1. Polynomial regression (unregularized).
degree = 3
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

# Expand features to all monomials up to `degree`, then fit ordinary
# least squares on the expanded matrix.
feature_expansion = PolynomialFeatures(degree=degree, include_bias=True)
polynomial_model = make_pipeline(feature_expansion, LinearRegression())
polynomial_model.fit(X_train, Y_train)
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('linearregression', LinearRegression())])
# MSE on both splits for the polynomial model.
Y_polynomial_train_pred = polynomial_model.predict(X_train)
Y_polynomial_test_pred = polynomial_model.predict(X_test)

polynomial_mean_squared_train = mean_squared_error(Y_train, Y_polynomial_train_pred)
polynomial_mean_squared = mean_squared_error(Y_test, Y_polynomial_test_pred)

print(f"Test: {polynomial_mean_squared}")
print(f"Train: {polynomial_mean_squared_train}")
Test: 0.08647485505065261
Train: 0.0024716850438651133
# Saturate predictions into the valid [0, 1] target range (the original
# comment called this a Heaviside step function; it is element-wise
# min/max clipping). np.clip replaces the two manual append loops with
# one vectorized call per array; downstream code (mean_squared_error,
# plotting) accepts the resulting ndarrays just like the original lists.
Y_normalized_polynomial_test_pred = np.clip(Y_polynomial_test_pred, 0, 1)
Y_normalized_polynomial_train_pred = np.clip(Y_polynomial_train_pred, 0, 1)

polynomial_normalized_mean_squared = mean_squared_error(Y_test, Y_normalized_polynomial_test_pred)
polynomial_normalized_mean_squared_train = mean_squared_error(Y_train, Y_normalized_polynomial_train_pred)

print(f"Test: {polynomial_normalized_mean_squared}")
print(f"Train: {polynomial_normalized_mean_squared_train}")
Test: 0.007654936380156268
Train: 0.0024716850438651133
# Average vote as a function of metascore.
# Blue: actual values; green: clipped polynomial predictions.
fig, chart = plt.subplots(figsize=(10, 5))
chart.plot(X_test["metascore"], Y_test, "bo")
chart.plot(X_test["metascore"], Y_normalized_polynomial_test_pred, "go")
plt.ylim([0, 1])
(0.0, 1.0)

2.2 Regresja wielomianowa z regularyzacją

# 2.2. Polynomial regression with L2 (ridge) regularization.
polynomial_regular_model = make_pipeline(
    PolynomialFeatures(degree=degree, include_bias=True),
    Ridge(alpha=10, fit_intercept=True),
)
polynomial_regular_model.fit(X_train, Y_train)

Y_polynomial_regular_train_pred = polynomial_regular_model.predict(X_train)
Y_polynomial_regular_test_pred = polynomial_regular_model.predict(X_test)

polynomial_regular_mean_squared_train = mean_squared_error(Y_train, Y_polynomial_regular_train_pred)
polynomial_regular_mean_squared = mean_squared_error(Y_test, Y_polynomial_regular_test_pred)

print(f"Test: {polynomial_regular_mean_squared}")
print(f"Train: {polynomial_regular_mean_squared_train}")
Test: 0.003350267646086885
Train: 0.0033181075895871736
# Average vote as a function of metascore.
# Blue: actual values; green: ridge-regularized predictions.
fig, chart = plt.subplots(figsize=(10, 5))
chart.plot(X_test["metascore"], Y_test, "bo")
chart.plot(X_test["metascore"], Y_polynomial_regular_test_pred, "go")
plt.ylim([0, 1])
(0.0, 1.0)

3. Sieć neuronowa

# 3. Neural network regressor (Keras).
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

batch_size = 16
epochs = 10

# Derive the input width from the training data instead of hard-coding 26,
# so the model stays correct if the set of genre columns changes.
n_features = X_train.shape[1]

model_nn = keras.Sequential(name="movies")
model_nn.add(keras.Input(shape=(n_features,), name="input"))
model_nn.add(layers.Dense(12, activation="relu", name="layer1"))
model_nn.add(layers.Dense(8, activation="sigmoid", name="layer2"))
# softplus keeps the single output non-negative, matching the [0, 1] target.
model_nn.add(layers.Dense(1, activation="softplus", name="output"))

model_nn.compile(
    loss='mean_squared_error'
)

model_nn.fit(
    X_train.to_numpy().astype(float),
    Y_train.to_numpy(),
    batch_size=batch_size,
    epochs=epochs,
    verbose=0
)
<tensorflow.python.keras.callbacks.History at 0x147a3896bb0>
# MSE of the network on both splits.
Y_nn_train_pred = model_nn.predict(X_train.to_numpy().astype(float))
Y_nn_test_pred = model_nn.predict(X_test.to_numpy().astype(float))

nn_mean_squared_train = mean_squared_error(Y_train, Y_nn_train_pred)
nn_mean_squared = mean_squared_error(Y_test, Y_nn_test_pred)
print(f"Test: {nn_mean_squared}")
print(f"Train: {nn_mean_squared_train}")
Test: 0.0034861018261550737
Train: 0.003624740580524968
# Average vote as a function of metascore.
# Blue: actual values; green: neural-network predictions.
fig, chart = plt.subplots(figsize=(10, 5))
chart.plot(X_test["metascore"], Y_test, "bo")
chart.plot(X_test["metascore"], Y_nn_test_pred, "go")
plt.ylim([0, 1])
(0.0, 1.0)

Podsumowanie

# Summary scatter: predicted vs. true ratings for every model.
fig = plt.figure(figsize=(10,5))
chart = fig.add_subplot()
chart.plot(Y_nn_test_pred, Y_test,"bo", alpha=0.5, label='Sieć neuronowa')
chart.plot(Y_polynomial_test_pred, Y_test,"ro", alpha=0.5, label=f'Regresja wielomianowa (stopnia {degree})')
chart.plot(Y_polynomial_regular_test_pred, Y_test,"yo", alpha=0.5, label=f'Regresja wielomianowa z wygładzaniem (stopnia {degree})')
chart.plot(Y_linear_test_pred, Y_test,"go", alpha=0.5, label='Regresja liniowa')

# Fixed typo in the displayed title: "rozpoznego" -> "rozpoznanego".
plt.title('Stosunek rozpoznanego Y do prawidłowego Y')
plt.ylim([0,1])
plt.xlim([0,1])

chart.legend()
<matplotlib.legend.Legend at 0x147a6ebc310>
# Tabular summary of train/test MSE per model. The original bound the
# results to a variable named `dict`, shadowing the builtin type —
# renamed to `results`. Also fixed the displayed label typo
# "regularyzjacją" -> "regularyzacją".
results = {
    'Nazwa': [
        'Regresja liniowa',
        f'Regresja wielomianowa (stopień {degree})',
        f'Regresja wielomianowa z funkcją skokową Heaviside\'a (stopień {degree})',
        f'Regresja wielomianowa z regularyzacją (stopień {degree})',
        'Sieć neuronowa',
    ],
    'Mean squared error (train)': [linear_mean_squared_train, polynomial_mean_squared_train, polynomial_normalized_mean_squared_train, polynomial_regular_mean_squared_train, nn_mean_squared_train],
    'Mean squared error (test)': [linear_mean_squared, polynomial_mean_squared, polynomial_normalized_mean_squared, polynomial_regular_mean_squared, nn_mean_squared],
}
df = pd.DataFrame(results)
display(df)
Nazwa Mean squared error (train) Mean squared error (test)
0 Regresja liniowa 0.003602 0.003376
1 Regresja wielomianowa (stopień 3) 0.002472 0.086475
2 Regresja wielomianowa z funkcją skokową Heavis... 0.002472 0.007655
3 Regresja wielomianowa z regularyzjacją (stopie... 0.003318 0.003350
4 Sieć neuronowa 0.003625 0.003486