"""Load, clean, split, and summarize the 25k IMDb movie dataset.

Reads the raw CSV, removes rows with placeholder values, performs an
80/10/10 train/dev/test split, and prints descriptive statistics.
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the dataset from the CSV file.
dataset_path = '25k_IMDb_movie_Dataset.csv'
df = pd.read_csv(dataset_path)

# Drop the 'path' column.
df.drop('path', axis=1, inplace=True)

# Drop every row containing the placeholder values 'not-released' or
# 'no-rating' in any column (these mark movies without usable release
# or rating data).  NOTE: the previous comment incorrectly claimed this
# removed 'Top Gun: Maverick' rows.  Scanning column-wise is much faster
# than the original row-wise apply, which rebuilt a string Series per row.
as_str = df.astype(str)
bad_rows = (
    as_str.apply(lambda col: col.str.contains('not-released')).any(axis=1)
    | as_str.apply(lambda col: col.str.contains('no-rating')).any(axis=1)
)
df = df[~bad_rows]

# Split into train (80%), dev (10%), and test (10%) subsets.  The second
# split halves the 20% hold-out into dev and test.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
dev_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

# Report the sizes of the full dataset and of each subset.
print(f"Dataset size: {len(df)} examples")
print(f"Train set size: {len(train_df)} examples")
print(f"Dev set size: {len(dev_df)} examples")
print(f"Test set size: {len(test_df)} examples")

# describe() reports count, mean, std, min, max, and the quartiles
# (the 50% row is the median) for each numeric column.
statistics = df.describe()
print("Statistics:")
print(statistics)

# Normalization is prepared but disabled: MinMaxScaler.fit_transform
# would fail on the non-numeric (string) columns still present in df.
# TODO confirm which columns should be selected before enabling it.
scaler = MinMaxScaler()
# df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Per-class frequency distribution (disabled: depends on the normalized
# frame above and on a 'class_column' that does not exist in this CSV —
# verify the real label column name before enabling).
# class_frequency = df_normalized['class_column'].value_counts()
# print("Frequency distribution of examples for each class:")
# print(class_frequency)