# ium_z487177/skrypt1

import subprocess
import zipfile
import os
import pandas as pd
import re

def download_kaggle_dataset(dataset_id, destination_folder):
    """Download a Kaggle dataset archive using the ``kaggle`` command-line tool.

    Args:
        dataset_id: Dataset identifier passed to ``kaggle datasets download -d``.
        destination_folder: Directory passed to the CLI through its ``-p`` option.

    Returns:
        The path of the ``.zip`` archive joined onto ``destination_folder``,
        or ``None`` when the CLI could not be started or exited with an error.
    """
    command = ["kaggle", "datasets", "download", "-d", dataset_id, "-p", destination_folder]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers the case where the ``kaggle`` executable cannot be
        # launched at all (e.g. not installed / not on PATH); report it the
        # same way as a failed download instead of crashing the script.
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None

    # The archive name is taken from the CLI's stdout when it mentions one.
    match = re.search(r"(\S+\.zip)", result.stdout)
    if match is not None:
        zip_filename = match.group(1)
    else:
        # No ".zip" token in the output. Fall back to "<slug>.zip".
        # NOTE(review): assumes the CLI names the archive after the dataset
        # slug (the part after the owner) - confirm against the Kaggle CLI.
        zip_filename = f"{dataset_id.rsplit('/', 1)[-1]}.zip"

    print(f"Dataset {dataset_id} successfully downloaded.")
    return os.path.join(destination_folder, zip_filename)

def unzip_file(zip_filepath, destination_folder):
    """Extract all members of a zip archive into ``destination_folder``.

    Any ``Exception`` raised while opening or extracting the archive is
    caught and reported on stdout; nothing is returned.
    """
    try:
        with zipfile.ZipFile(zip_filepath, mode='r') as archive:
            archive.extractall(path=destination_folder)
            success_message = f"Files extracted to {destination_folder}."
            print(success_message)
    except Exception as error:
        failure_message = f"Error unzipping file {zip_filepath}: {error}"
        print(failure_message)

def combine_csv_files(train_file, test_file, output_file):
    """Concatenate two CSV files row-wise and write the result to a new CSV.

    The rows of ``train_file`` come first, followed by those of ``test_file``;
    the row index is renumbered and is not written to ``output_file``.
    Any ``Exception`` is caught and reported on stdout.
    """
    try:
        # Load both inputs in order: train first, then test.
        frames = [pd.read_csv(source) for source in (train_file, test_file)]

        merged = pd.concat(frames, ignore_index=True)
        merged.to_csv(output_file, index=False)
        print(f"Combined CSV files saved to {output_file}.")
    except Exception as error:
        print(f"Error combining CSV files: {error}")

def count_unique_rows(csv_file):
    """Print and return the number of distinct data rows in a CSV file.

    Args:
        csv_file: Path of the CSV file to read with ``pd.read_csv``.

    Returns:
        The number of rows left after dropping exact duplicates, or ``None``
        when the file could not be read (the error is printed to stdout).
    """
    try:
        df = pd.read_csv(csv_file)
        # read_csv already uses the first line as column names, so the frame
        # holds data rows only; subtracting one more would under-count.
        unique_count = len(df.drop_duplicates())
        print(f"The number of unique rows in {csv_file} (excluding header): {unique_count}")
        return unique_count
    except Exception as e:
        print(f"Error counting unique rows: {e}")
        return None

def describe_dataset(csv_file):
    """Print a summary of the CSV file at ``csv_file``.

    Prints the row count, the output of ``DataFrame.describe()``, and, for
    every column with fewer than 10 distinct values, the percentage share of
    each value. Any ``Exception`` is caught and reported on stdout.
    """
    try:
        frame = pd.read_csv(csv_file)

        row_total = len(frame)
        print(f"Size of the dataset: {row_total}\n")
        print("Statistics for the dataset:\n")
        summary = frame.describe()
        print(summary)

        for name, values in frame.items():
            # Only low-cardinality columns get a frequency table.
            if values.nunique() >= 10:
                continue
            print(f"\nFrequency distribution for {name}:")
            percentages = values.value_counts(normalize=True) * 100
            print(percentages)

    except Exception as error:
        print(f"Error describing dataset: {error}")

if __name__ == "__main__":
    # Script entry point: download the dataset archive, extract it, merge the
    # train/test CSVs, then report the unique-row count and a summary.
    dataset_id = "iabhishekofficial/mobile-price-classification"
    destination_folder = r"C:\studia\InżynieriaUczeniaMaszynowego"

    zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)

    # A None path means the download failed; nothing else is attempted then.
    if zip_filepath is not None:
        unzip_file(zip_filepath, destination_folder)

        # All three CSV paths live directly in the destination folder.
        # NOTE(review): expects the archive to contain train.csv and test.csv.
        train_file, test_file, output_file = (
            os.path.join(destination_folder, filename)
            for filename in ("train.csv", "test.csv", "combined.csv")
        )

        combine_csv_files(train_file, test_file, output_file)
        count_unique_rows(output_file)
        describe_dataset(output_file)