# ium_430705/lab05.py
#
# 107 lines
# 2.7 KiB
# Python

import string
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import wget
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Source of the IMDb movies dataset used in this lab.
url = "https://git.wmi.amu.edu.pl/s430705/ium_430705/raw/branch/master/imdb_movies.csv"

# wget.download() returns the path it actually wrote to. When the requested
# output file already exists it saves under a suffixed name instead of
# overwriting, so read from the returned path rather than a hard-coded one;
# otherwise a rerun would silently load the stale first download.
csv_path = wget.download(url, out="imdb_movies.csv", bar=None)
movies_data = pd.read_csv(csv_path)

# Drop rows with missing values
movies_data.dropna(inplace=True)
# TODO: prepare columns for actors, genres, countries instead of dropping them.

# Remove columns that are not used as model inputs (each label listed once).
drop_columns = [
    "title_id",
    "title",
    "original_title",
    "plot",
    "certificate",
    "countries",
    "genres",
    "director",
    "cast",
    "release_date",
]
movies_data.drop(labels=drop_columns, axis=1, inplace=True)

# Disabled text normalisation (strip punctuation, lowercase). It targets
# columns that are dropped above, so it can only be re-enabled together with
# the TODO, and must then run before the drop.
# for column_name in ["original_title", "countries", "genres", "director", "cast"]:
#     movies_data[column_name] = (
#         movies_data[column_name]
#         .str.translate(str.maketrans("", "", string.punctuation))
#         .str.lower()
#     )

# Remove ',' thousands separators from the votes number and convert to int.
# regex=False makes the comma an explicit literal match.
movies_data["votes_number"] = (
    movies_data["votes_number"].str.replace(",", "", regex=False).astype(int)
)
# Separate the features from the regression target.
X = movies_data.drop("rating", axis=1)
Y = movies_data["rating"]

# Split set to train/test 8:2 ratio
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
# Work on explicit copies so the column assignments below do not write
# through to (or warn about) the original frame.
X_train = X_train.copy()
X_test = X_test.copy()

# Normalize number values.
# The scaler is fitted on the training rows only and then applied to the test
# rows; fitting on the full dataset before splitting would leak the test
# set's min/max into training. Scaled test values may therefore fall slightly
# outside [0, 1].
numeric_columns = ["votes_number", "year", "runtime"]
scaler = preprocessing.MinMaxScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])
# Hold out part of the training data for validation. Early stopping selects
# the stopping epoch from the validation loss, so validating on the test set
# would tune the model on the very data used for the final evaluation.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42
)

# Set up model: two small ReLU hidden layers with dropout and a single linear
# output unit for the rating regression, trained with MSE loss.
model = Sequential()
model.add(Dense(8, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(3, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1))
model.compile(optimizer="adam", loss="mse")

# Stop once the validation loss has not improved for 10 epochs and roll the
# weights back to the best epoch instead of keeping the last (worse) ones.
early_stop = EarlyStopping(
    monitor="val_loss",
    mode="min",
    verbose=1,
    patience=10,
    restore_best_weights=True,
)
model.fit(
    x=X_fit,
    y=Y_fit.values,
    validation_data=(X_val, Y_val.values),
    batch_size=128,
    epochs=400,
    callbacks=[early_stop],
)
# Predict movie ratings for the test set and store the raw model output.
predictions = model.predict(X_test)
pd.DataFrame(predictions).to_csv('results.csv')

# Compare outputs.
# The model output is two-dimensional, so each row is a small array rather
# than a number; flatten it (assumes a single output unit per sample) so the
# report prints plain scalars and scalar differences.
predicted_scores = predictions.ravel()
for actual, predicted in zip(Y_test, predicted_scores):
    print(f"Original score: {actual} Predicted score: {predicted} \n")
    print(f"Difference is : {actual - predicted}")

# Evaluate: mean absolute error over the whole test set.
print(mean_absolute_error(Y_test, predictions))