225 KiB
225 KiB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
Cel: przewidzenie średniej oceny użytkowników dla danego filmu na podstawie:
- roku wydania
- gatunku
- czasu trwania filmu
- liczby głosów
- oceny krytyków (metascore)
- przychodu
0. Preprocessing
# Load the IMDb dataset and keep only the feature/target columns used below.
# ("worlwide_gross_income" is the dataset's own misspelled column name.)
data = pd.read_csv('IMDb movies.csv', low_memory=False)
data = data[["year","genre", "duration", "avg_vote", "votes", "worlwide_gross_income", "metascore"]]
data = data.dropna()
# Drop rows whose gross income is quoted in NPR (non-dollar currency),
# then strip the "$ " prefix and convert the remaining values to float.
data = data[~data["worlwide_gross_income"].str.contains("NPR")]
data["worlwide_gross_income"] = data["worlwide_gross_income"].str.replace('$ ','', regex=False).astype(float)
# One-hot encode the multi-valued genre column (a movie may list several genres).
data["genre"] = data["genre"].str.split(", ")
# NOTE: Series.sum(level=0) was deprecated in pandas 1.3 and removed in 2.0;
# groupby(level=0).sum() is the supported equivalent and yields the same
# per-row genre indicator matrix.
genres = pd.get_dummies(data["genre"].apply(pd.Series).stack()).groupby(level=0).sum()
data = pd.concat([data.drop(columns=["genre"]), genres.reindex(data.index)], axis=1)
display(data.head(5))
year | duration | avg_vote | votes | worlwide_gross_income | metascore | Action | Adventure | Animation | Biography | ... | Horror | Music | Musical | Mystery | Romance | Sci-Fi | Sport | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
506 | 1927 | 153 | 8.3 | 156076 | 1349711.0 | 98.0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
628 | 1928 | 72 | 8.1 | 27414 | 26916.0 | 90.0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
856 | 1930 | 104 | 7.7 | 13311 | 4410.0 | 88.0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1048 | 1931 | 87 | 8.5 | 162668 | 46008.0 | 99.0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1086 | 1931 | 70 | 7.8 | 63315 | 1626.0 | 91.0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
5 rows × 27 columns
# Build the feature matrix X (numeric columns max-scaled to [0, 1] plus the
# genre dummies) and the target Y (average vote rescaled to [0, 1]).
X = data.drop(columns=["avg_vote"])
# One loop instead of four copy-pasted scaling statements.
for col in ["votes", "duration", "worlwide_gross_income", "metascore"]:
    X[col] = X[col]/data[col].max()
X["year"] = X["year"].astype(int)
X["year"] = X["year"]/X["year"].max()
Y = data["avg_vote"]/10
# Fixed random_state so the split — and every metric reported below — is
# reproducible under Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=42)
1. Regresja liniowa
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least-squares linear regression.
linear_model = LinearRegression().fit(X_train, Y_train)

# Score both splits with MSE to compare training fit against generalisation.
Y_linear_train_pred = linear_model.predict(X_train)
Y_linear_test_pred = linear_model.predict(X_test)
linear_mean_squared_train = mean_squared_error(Y_train, Y_linear_train_pred)
linear_mean_squared = mean_squared_error(Y_test, Y_linear_test_pred)
print(f"Test: {linear_mean_squared}")
print(f"Train: {linear_mean_squared_train}")
Test: 0.0033762327444214367 Train: 0.0036015583726998865
# Average vote as a function of (scaled) metascore.
# Blue marks the actual ratio, green the model's estimate.
fig = plt.figure(figsize=(10,5))
chart = fig.add_subplot()
chart.plot(X_test["metascore"], Y_test,"bo")
chart.plot(X_test["metascore"], Y_linear_test_pred, "go")
# Title and axis labels so the figure stands alone when skimmed;
# trailing semicolon suppresses the ylim tuple repr in the cell output.
chart.set_title("Linear regression: actual (blue) vs. predicted (green)")
chart.set_xlabel("metascore (scaled)")
chart.set_ylabel("avg_vote / 10")
plt.ylim([0,1]);
(0.0, 1.0)
2.1. Regresja wielomianowa
# Polynomial regression: expand the features to all degree-3 polynomial
# terms, then fit ordinary least squares on the expanded design matrix.
degree = 3

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

polynomial_model = make_pipeline(
    PolynomialFeatures(degree=degree, include_bias=True),
    LinearRegression(),
)
polynomial_model.fit(X_train, Y_train)
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3)), ('linearregression', LinearRegression())])
# Score the polynomial model on both splits with MSE.
Y_polynomial_train_pred = polynomial_model.predict(X_train)
Y_polynomial_test_pred = polynomial_model.predict(X_test)
polynomial_mean_squared_train = mean_squared_error(Y_train, Y_polynomial_train_pred)
polynomial_mean_squared = mean_squared_error(Y_test, Y_polynomial_test_pred)
print(f"Test: {polynomial_mean_squared}")
print(f"Train: {polynomial_mean_squared_train}")
Test: 0.08647485505065261 Train: 0.0024716850438651133
# Clip predictions to the valid [0, 1] rating range — the degree-3 polynomial
# extrapolates wildly outside the training hull, which inflates the test MSE.
# np.clip replaces the two copy-pasted element-wise min/max loops from the
# original; it is vectorized and produces the same clipped values.
Y_normalized_polynomial_test_pred = np.clip(Y_polynomial_test_pred, 0, 1)
Y_normalized_polynomial_train_pred = np.clip(Y_polynomial_train_pred, 0, 1)
polynomial_normalized_mean_squared = mean_squared_error(Y_test, Y_normalized_polynomial_test_pred)
polynomial_normalized_mean_squared_train = mean_squared_error(Y_train, Y_normalized_polynomial_train_pred)
print(f"Test: {polynomial_normalized_mean_squared}")
print(f"Train: {polynomial_normalized_mean_squared_train}")
Test: 0.007654936380156268 Train: 0.0024716850438651133
# Average vote as a function of (scaled) metascore.
# Blue marks the actual ratio, green the clipped polynomial estimate.
fig = plt.figure(figsize=(10,5))
chart = fig.add_subplot()
chart.plot(X_test["metascore"], Y_test,"bo")
chart.plot(X_test["metascore"], Y_normalized_polynomial_test_pred, "go")
# Title and axis labels so the figure stands alone when skimmed;
# trailing semicolon suppresses the ylim tuple repr in the cell output.
chart.set_title("Clipped polynomial regression: actual (blue) vs. predicted (green)")
chart.set_xlabel("metascore (scaled)")
chart.set_ylabel("avg_vote / 10")
plt.ylim([0,1]);
(0.0, 1.0)
2.2 Regresja wielomianowa z regularyzacją
# Degree-3 polynomial features with L2 (ridge) regularisation to curb the
# overfitting seen in the unregularised polynomial model above.
polynomial_regular_model = make_pipeline(
    PolynomialFeatures(degree=degree, include_bias=True),
    Ridge(alpha=10, fit_intercept=True),
)
polynomial_regular_model.fit(X_train, Y_train)

# Score both splits with MSE.
Y_polynomial_regular_train_pred = polynomial_regular_model.predict(X_train)
Y_polynomial_regular_test_pred = polynomial_regular_model.predict(X_test)
polynomial_regular_mean_squared_train = mean_squared_error(Y_train, Y_polynomial_regular_train_pred)
polynomial_regular_mean_squared = mean_squared_error(Y_test, Y_polynomial_regular_test_pred)
print(f"Test: {polynomial_regular_mean_squared}")
print(f"Train: {polynomial_regular_mean_squared_train}")
Test: 0.003350267646086885 Train: 0.0033181075895871736
# Average vote as a function of (scaled) metascore.
# Blue marks the actual ratio, green the ridge-regularised estimate.
fig = plt.figure(figsize=(10,5))
chart = fig.add_subplot()
chart.plot(X_test["metascore"], Y_test,"bo")
chart.plot(X_test["metascore"], Y_polynomial_regular_test_pred, "go")
# Title and axis labels so the figure stands alone when skimmed;
# trailing semicolon suppresses the ylim tuple repr in the cell output.
chart.set_title("Ridge polynomial regression: actual (blue) vs. predicted (green)")
chart.set_xlabel("metascore (scaled)")
chart.set_ylabel("avg_vote / 10")
plt.ylim([0,1]);
(0.0, 1.0)
3. Sieć neuronowa
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

batch_size = 16
epochs = 10

# Small fully-connected regression network. The softplus output keeps
# predictions positive, matching the [0, 1]-scaled target.
model_nn = keras.Sequential(name="movies")
# Derive the input width from the data instead of hard-coding 26, so the
# model stays in sync if the feature set (e.g. the genre dummies) changes.
model_nn.add(keras.Input(shape=(X_train.shape[1],), name="input"))
model_nn.add(layers.Dense(12, activation="relu", name="layer1"))
model_nn.add(layers.Dense(8, activation="sigmoid", name="layer2"))
model_nn.add(layers.Dense(1, activation="softplus", name="output"))
model_nn.compile(
    loss='mean_squared_error'
)
model_nn.fit(
    X_train.to_numpy().astype(float),
    Y_train.to_numpy(),
    batch_size=batch_size,
    epochs=epochs,
    verbose=0
)
<tensorflow.python.keras.callbacks.History at 0x147a3896bb0>
# Score the network on both splits with the same MSE metric as the
# other models (predict expects a float numpy array).
Y_nn_train_pred = model_nn.predict(X_train.to_numpy().astype(float))
Y_nn_test_pred = model_nn.predict(X_test.to_numpy().astype(float))
nn_mean_squared_train = mean_squared_error(Y_train, Y_nn_train_pred)
nn_mean_squared = mean_squared_error(Y_test, Y_nn_test_pred)
print(f"Test: {nn_mean_squared}")
print(f"Train: {nn_mean_squared_train}")
Test: 0.0034861018261550737 Train: 0.003624740580524968
# Average vote as a function of (scaled) metascore.
# Blue marks the actual ratio, green the neural network's estimate.
fig = plt.figure(figsize=(10,5))
chart = fig.add_subplot()
chart.plot(X_test["metascore"], Y_test,"bo")
chart.plot(X_test["metascore"], Y_nn_test_pred, "go")
# Title and axis labels so the figure stands alone when skimmed;
# trailing semicolon suppresses the ylim tuple repr in the cell output.
chart.set_title("Neural network: actual (blue) vs. predicted (green)")
chart.set_xlabel("metascore (scaled)")
chart.set_ylabel("avg_vote / 10")
plt.ylim([0,1]);
(0.0, 1.0)
Podsumowanie
# Predicted vs. actual Y for every model on the test split; a perfect
# model would place all points on the diagonal.
fig = plt.figure(figsize=(10,5))
chart = fig.add_subplot()
chart.plot(Y_nn_test_pred, Y_test,"bo", alpha=0.5, label='Sieć neuronowa')
chart.plot(Y_polynomial_test_pred, Y_test,"ro", alpha=0.5, label=f'Regresja wielomianowa (stopnia {degree})')
chart.plot(Y_polynomial_regular_test_pred, Y_test,"yo", alpha=0.5, label=f'Regresja wielomianowa z wygładzaniem (stopnia {degree})')
chart.plot(Y_linear_test_pred, Y_test,"go", alpha=0.5, label='Regresja liniowa')
# Fixed typo in the displayed title: "rozpoznego" -> "rozpoznanego".
plt.title('Stosunek rozpoznanego Y do prawidłowego Y')
plt.ylim([0,1])
plt.xlim([0,1])
# Trailing semicolon suppresses the Legend repr in the cell output.
chart.legend();
<matplotlib.legend.Legend at 0x147a6ebc310>
# Collect train/test MSE for every model into a single comparison table.
# Renamed the variable from `dict` — it shadowed the builtin — and fixed
# the typo "regularyzjacją" -> "regularyzacją" in the displayed label.
summary = {'Nazwa' : ['Regresja liniowa', f'Regresja wielomianowa (stopień {degree})', f'Regresja wielomianowa z funkcją skokową Heaviside\'a (stopień {degree})', f'Regresja wielomianowa z regularyzacją (stopień {degree})', 'Sieć neuronowa'],
        'Mean squared error (train)' : [linear_mean_squared_train, polynomial_mean_squared_train, polynomial_normalized_mean_squared_train, polynomial_regular_mean_squared_train, nn_mean_squared_train],
        'Mean squared error (test)' : [linear_mean_squared, polynomial_mean_squared, polynomial_normalized_mean_squared, polynomial_regular_mean_squared, nn_mean_squared]}
df = pd.DataFrame(summary)
display(df)
Nazwa | Mean squared error (train) | Mean squared error (test) | |
---|---|---|---|
0 | Regresja liniowa | 0.003602 | 0.003376 |
1 | Regresja wielomianowa (stopień 3) | 0.002472 | 0.086475 |
2 | Regresja wielomianowa z funkcją skokową Heavis... | 0.002472 | 0.007655 |
3 | Regresja wielomianowa z regularyzjacją (stopie... | 0.003318 | 0.003350 |
4 | Sieć neuronowa | 0.003625 | 0.003486 |