# Preprocessing script for the 25k IMDb movie dataset:
# load, clean, split into train/dev/test, and summarize the data.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the dataset from the CSV file.
dataset_path = '25k_IMDb_movie_Dataset.csv'
df = pd.read_csv(dataset_path)

# Drop the 'path' column (file-location metadata, not a feature).
df.drop('path', axis=1, inplace=True)

# Drop rows that contain the placeholder values 'not-released' or
# 'no-rating' in any column.  A column-wise vectorized scan replaces the
# original per-row apply(), which made one Python-level call per row.
for placeholder in ('not-released', 'no-rating'):
    has_placeholder = df.astype(str).apply(
        # bind the loop variable as a default to avoid late-binding closures
        lambda col, p=placeholder: col.str.contains(p, regex=False)
    ).any(axis=1)
    df = df[~has_placeholder]
# Split the dataset into train (80%), dev (10%), and test (10%) subsets.
# The held-out 20% is split in half to form the dev and test sets; a fixed
# random_state keeps the splits reproducible across runs.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
dev_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

# Report the size of the full dataset and of every split.
for label, subset in (
    ("Dataset", df),
    ("Train set", train_df),
    ("Dev set", dev_df),
    ("Test set", test_df),
):
    print(f"{label} size: {len(subset)} examples")
# Summarize the dataset: DataFrame.describe() reports count, mean, std,
# min, quartiles, and max for each numeric column.
statistics = df.describe()

# Print the header and the summary table.
print("Statistics:", statistics, sep="\n")
# Normalize the data in the dataset.
# NOTE(review): the scaler is instantiated but the normalization below is
# still disabled.  As written, MinMaxScaler.fit_transform(df) would raise
# on this dataframe's non-numeric (string) columns — restrict the input to
# df.select_dtypes(include="number") before enabling it.
scaler = MinMaxScaler()
#df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Compute frequency distribution of examples for each class.
# NOTE(review): 'class_column' is a placeholder name — confirm the actual
# label column before enabling.
#class_frequency = df_normalized['class_column'].value_counts()

# Print frequency distribution of examples for each class.
#print("Frequency distribution of examples for each class:")
#print(class_frequency)