import os
import re
import subprocess
import zipfile

import pandas as pd


def download_kaggle_dataset(dataset_id, destination_folder):
    """Download a Kaggle dataset via the Kaggle CLI and return the path to the downloaded zip."""
    try:
        result = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_id, "-p", destination_folder],
            check=True,
            capture_output=True,
            text=True,
        )
        # The CLI prints the name of the downloaded archive; pull it out of stdout.
        match = re.search(r"(\S+\.zip)", result.stdout)
        if match is None:
            print(f"Could not determine the zip filename from the CLI output for {dataset_id}.")
            return None
        print(f"Dataset {dataset_id} successfully downloaded.")
        return os.path.join(destination_folder, match.group(1))
    except subprocess.CalledProcessError as e:
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None


def unzip_file(zip_filepath, destination_folder):
    """Extract all files from the zip archive into the destination folder."""
    try:
        with zipfile.ZipFile(zip_filepath, "r") as zip_ref:
            zip_ref.extractall(destination_folder)
        print(f"Files extracted to {destination_folder}.")
    except Exception as e:
        print(f"Error unzipping file {zip_filepath}: {e}")


def combine_csv_files(train_file, test_file, output_file):
    """Concatenate the train and test CSV files into a single CSV.

    Columns are aligned by name, so columns present in only one file
    (e.g. the target column in train.csv) are filled with NaN.
    """
    try:
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)
        combined_df = pd.concat([train_df, test_df], ignore_index=True)
        combined_df.to_csv(output_file, index=False)
        print(f"Combined CSV files saved to {output_file}.")
    except Exception as e:
        print(f"Error combining CSV files: {e}")


def count_unique_rows(csv_file):
    """Count unique data rows in the CSV (pandas already skips the header, so no extra adjustment is needed)."""
    try:
        df = pd.read_csv(csv_file)
        unique_count = len(df.drop_duplicates())
        print(f"The number of unique rows in {csv_file} (excluding header): {unique_count}")
    except Exception as e:
        print(f"Error counting unique rows: {e}")


def describe_dataset(csv_file):
    """Print the dataset size, summary statistics, and frequency distributions for low-cardinality columns."""
    try:
        df = pd.read_csv(csv_file)
        print(f"Size of the dataset: {len(df)}\n")
        print("Statistics for the dataset:\n")
        print(df.describe())
        for column in df.columns:
            # Treat columns with fewer than 10 distinct values as categorical
            # and report their value frequencies as percentages.
            if df[column].nunique() < 10:
                print(f"\nFrequency distribution for {column}:")
                print(df[column].value_counts(normalize=True) * 100)
    except Exception as e:
        print(f"Error describing dataset: {e}")


if __name__ == "__main__":
    dataset_id = "iabhishekofficial/mobile-price-classification"
    destination_folder = r"C:\studia\InżynieriaUczeniaMaszynowego"

    zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)
    if zip_filepath is not None:
        unzip_file(zip_filepath, destination_folder)

        train_file = os.path.join(destination_folder, "train.csv")
        test_file = os.path.join(destination_folder, "test.csv")
        output_file = os.path.join(destination_folder, "combined.csv")

        combine_csv_files(train_file, test_file, output_file)
        count_unique_rows(output_file)
        describe_dataset(output_file)