import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset. A ParserError is raised by read_csv itself
# (not by a later DataFrame operation), so malformed rows with an
# inconsistent number of columns are handled here at load time.
try:
    data = pd.read_csv('25k_movies.csv.shuf')
except pd.errors.ParserError as e:
    print(f"Error occurred while parsing the dataset: {e}")
    print("Retrying and skipping rows with an inconsistent number of columns...")
    data = pd.read_csv('25k_movies.csv.shuf', on_bad_lines='skip')

# Step 2: Preprocess the data
features = ['Total Run Time', 'User Rating', 'Genres', 'Director Name', 'Writer Name']
target = 'Movie Rating'
data = data[features + [target]]

# Handle missing values, if any
data = data.dropna()

# Convert categorical variables to numerical representations
data = pd.get_dummies(data, columns=['Genres', 'Director Name', 'Writer Name'])

# Split the data into features and target variables
X = data.drop(target, axis=1)
y = data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Build and train the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Step 4: Evaluate the model.
# predict() returns an (n, 1) array; flatten it so the subtraction against
# the 1-D y_test does not broadcast into an (n, n) matrix.
y_pred = model.predict(X_test).flatten()
mse = np.mean((y_pred - y_test.to_numpy())**2)
print(f"Mean Squared Error (MSE): {mse}")
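
# Optional sanity check (a minimal sketch, assuming the model, X_test, and
# y_test defined above are still in scope): because the compiled loss is
# mean_squared_error, model.evaluate() reports the same test-set MSE
# directly, which is a quick way to confirm the manual calculation.
test_mse = model.evaluate(X_test, y_test.to_numpy(), verbose=0)
print(f"Keras-reported test MSE: {test_mse}")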