Add 'skrypt1'
parent
bd324cf46c
commit
94bc4d35f0
72
skrypt1
Normal file
@@ -0,0 +1,72 @@
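# skrypt1: fetch the Kaggle "mobile price classification" dataset, merge its
# train/test CSVs, and print basic statistics about the combined file.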
import subprocess
import zipfile
import os
import pandas as pd
import re

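# Download a Kaggle dataset archive via the Kaggle CLI and return the local path to the zip file.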
def download_kaggle_dataset(dataset_id, destination_folder):
    try:
        result = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_id, "-p", destination_folder],
            check=True, capture_output=True, text=True,
        )
        # The CLI usually names the downloaded archive in its output; if no ".zip" name
        # appears there, fall back to "<dataset-slug>.zip" (assumed default naming).
        match = re.search(r"(\S+\.zip)", result.stdout)
        zip_filename = match.group(1) if match else dataset_id.split("/")[-1] + ".zip"
        print(f"Dataset {dataset_id} successfully downloaded.")
        return os.path.join(destination_folder, zip_filename)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None

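# Extract all files from the downloaded archive into the destination folder.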
def unzip_file(zip_filepath, destination_folder):
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
        print(f"Files extracted to {destination_folder}.")
    except Exception as e:
        print(f"Error unzipping file {zip_filepath}: {e}")

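# Concatenate the train and test CSVs into a single combined CSV file.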
def combine_csv_files(train_file, test_file, output_file):
    try:
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)

        combined_df = pd.concat([train_df, test_df], ignore_index=True)
        combined_df.to_csv(output_file, index=False)
        print(f"Combined CSV files saved to {output_file}.")
    except Exception as e:
        print(f"Error combining CSV files: {e}")

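# Report how many unique (deduplicated) data rows the CSV contains.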
def count_unique_rows(csv_file):
    try:
        df = pd.read_csv(csv_file)
        unique_rows = df.drop_duplicates()
        # pd.read_csv already treats the first line as the header, so the header is not
        # counted as a data row and no extra subtraction is needed here.
        unique_count = len(unique_rows)
        print(f"The number of unique rows in {csv_file} (excluding header): {unique_count}")
    except Exception as e:
        print(f"Error counting unique rows: {e}")

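# Print the dataset size, summary statistics, and percentage frequency
# distributions for columns with fewer than 10 distinct values.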
def describe_dataset(csv_file):
    try:
        df = pd.read_csv(csv_file)
        print(f"Size of the dataset: {len(df)}\n")
        print("Statistics for the dataset:\n")
        print(df.describe())

        for column in df.columns:
            if df[column].nunique() < 10:
                print(f"\nFrequency distribution for {column}:")
                print(df[column].value_counts(normalize=True) * 100)

    except Exception as e:
        print(f"Error describing dataset: {e}")

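# Entry point: download the dataset, unzip it, merge train.csv and test.csv,
# then report the number of unique rows and describe the combined data.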
if __name__ == "__main__":
    dataset_id = "iabhishekofficial/mobile-price-classification"
    destination_folder = r"C:\studia\InżynieriaUczeniaMaszynowego"
    zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)

    if zip_filepath is not None:
        unzip_file(zip_filepath, destination_folder)
        train_file = os.path.join(destination_folder, "train.csv")
        test_file = os.path.join(destination_folder, "test.csv")
        output_file = os.path.join(destination_folder, "combined.csv")
        combine_csv_files(train_file, test_file, output_file)
        count_unique_rows(output_file)
        describe_dataset(output_file)