add new script

2023-05-11 21:29:38 +02:00 · 2023-05-11 21:29:38 +02:00 · 64864ab931
commit 64864ab931
parent 3582fb8610
1 changed files with 47 additions and 0 deletions
--- a/script2.py
+++ b/script2.py
@@ -0,0 +1,47 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+
+# Location of the dataset CSV, given relative to the working directory
+dataset_path = '25k_IMDb_movie_Dataset.csv'
+
+# Load the CSV and discard its 'path' column in one step; nothing later in
+# the script reads that column
+df = pd.read_csv(dataset_path).drop(columns='path')
+
+# Drop rows in which any column contains the placeholder text 'not-released'
+# or 'no-rating'. Every cell is compared by its string form; the search runs
+# once per column rather than once per row, which selects the same rows.
+placeholder_pattern = 'not-released|no-rating'
+has_placeholder = df.astype(str).apply(
+    lambda column: column.str.contains(placeholder_pattern, na=False)
+).any(axis=1)
+df = df[~has_placeholder]
+
+# Split the dataset into train, dev, and test subsets: hold out 20% of the
+# rows, then cut that holdout in half to form dev and test
+train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
+dev_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)
+
+# Print the sizes of the dataset and subsets
+for label, frame in (("Dataset", df), ("Train set", train_df),
+                     ("Dev set", dev_df), ("Test set", test_df)):
+    print(f"{label} size: {len(frame)} examples")
+
+# Summarize the dataset with DataFrame.describe(); for numeric columns its
+# rows include mean, std, min, max, and the median (reported as the 50% row)
+statistics = df.describe()
+
+# Emit the heading and the summary table on consecutive lines
+print("Statistics:", statistics, sep="\n")
+
+# Normalize the data in the dataset (disabled: the scaler is never applied)
+scaler = MinMaxScaler()
+#df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
+
+# Compute per-class example counts (disabled; 'class_column' may be a placeholder)
+#class_frequency = df_normalized['class_column'].value_counts()
+
+# Print frequency distribution of examples for each class (disabled)
+#print("Frequency distribution of examples for each class:")
+#print(class_frequency)