ium_z487177/skrypt1
2023-04-18 08:52:42 +02:00

73 lines
2.8 KiB
Plaintext

import subprocess
import zipfile
import os
import pandas as pd
import re
def download_kaggle_dataset(dataset_id, destination_folder):
    """Download a Kaggle dataset archive using the ``kaggle`` command-line tool.

    Runs ``kaggle datasets download -d <dataset_id> -p <destination_folder>``
    and looks for the first ``*.zip`` token in the command's standard output
    to determine the archive's file name.

    Args:
        dataset_id: Dataset identifier handed to the CLI's ``-d`` option.
        destination_folder: Directory handed to the CLI's ``-p`` option.

    Returns:
        ``destination_folder`` joined with the archive name reported by the
        CLI, or ``None`` when the command could not be started, exited with a
        non-zero status, or printed no ``.zip`` file name.
    """
    command = [
        "kaggle", "datasets", "download",
        "-d", dataset_id,
        "-p", destination_folder,
    ]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers the case where the ``kaggle`` executable is missing
        # or not runnable; CalledProcessError covers a non-zero exit status.
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None

    # The archive name is parsed from the CLI output; guard against output
    # that contains no ``.zip`` token instead of calling .group() on None.
    match = re.search(r"(\S+\.zip)", result.stdout)
    if match is None:
        print(f"Error downloading dataset {dataset_id}: no .zip file name found in kaggle output.")
        return None

    print(f"Dataset {dataset_id} successfully downloaded.")
    return os.path.join(destination_folder, match.group(1))
def unzip_file(zip_filepath, destination_folder):
    """Extract every member of a zip archive into a folder.

    Args:
        zip_filepath: Path of the archive to open for reading.
        destination_folder: Directory the members are extracted into.

    Any exception raised while opening or extracting is caught and reported
    on standard output; the function always returns ``None``.
    """
    try:
        with zipfile.ZipFile(zip_filepath) as archive:
            archive.extractall(path=destination_folder)
        print(f"Files extracted to {destination_folder}.")
    except Exception as error:
        print(f"Error unzipping file {zip_filepath}: {error}")
def combine_csv_files(train_file, test_file, output_file):
    """Stack two CSV files vertically and write the result to a new CSV.

    Args:
        train_file: Path of the first CSV; its rows come first in the output.
        test_file: Path of the second CSV; its rows are appended after.
        output_file: Path the combined CSV is written to (no index column).

    The combined frame gets a fresh 0..n-1 index. Any exception is caught and
    reported on standard output; the function always returns ``None``.
    """
    try:
        frames = [pd.read_csv(source) for source in (train_file, test_file)]
        merged = pd.concat(frames, ignore_index=True)
        merged.to_csv(output_file, index=False)
        print(f"Combined CSV files saved to {output_file}.")
    except Exception as error:
        print(f"Error combining CSV files: {error}")
def count_unique_rows(csv_file):
    """Count and report the distinct data rows of a CSV file.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        The number of rows left after dropping exact duplicates, or ``None``
        if reading or processing the file raised an exception (the error is
        reported on standard output).
    """
    try:
        df = pd.read_csv(csv_file)
        # read_csv already consumes the header line as column names, so every
        # row of the frame is data; subtracting one here would undercount.
        unique_count = len(df.drop_duplicates())
        print(f"The number of unique rows in {csv_file} (excluding header): {unique_count}")
        return unique_count
    except Exception as e:
        print(f"Error counting unique rows: {e}")
        return None
def describe_dataset(csv_file):
    """Print a short statistical overview of a CSV file.

    Prints the row count, the output of ``DataFrame.describe()``, and, for
    every column with fewer than 10 distinct values, the percentage share of
    each value.

    Args:
        csv_file: Path of the CSV file to read.

    Any exception is caught and reported on standard output; the function
    always returns ``None``.
    """
    try:
        frame = pd.read_csv(csv_file)
        print(f"Size of the dataset: {len(frame)}\n")
        print("Statistics for the dataset:\n")
        print(frame.describe())
        for column in frame.columns:
            # Only low-cardinality columns get a frequency table.
            if not frame[column].nunique() < 10:
                continue
            print(f"\nFrequency distribution for {column}:")
            shares = frame[column].value_counts(normalize=True)
            print(shares * 100)
    except Exception as error:
        print(f"Error describing dataset: {error}")
if __name__ == "__main__":
    # Script entry point: download the dataset archive, extract it, merge the
    # train/test CSVs into one file, then report on the merged file.
    dataset_id = "iabhishekofficial/mobile-price-classification"
    destination_folder = r"C:\studia\InżynieriaUczeniaMaszynowego"

    zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)
    if zip_filepath is not None:
        # Every later step works on files from the archive, so the whole
        # pipeline runs only when the download produced an archive path.
        unzip_file(zip_filepath, destination_folder)

        train_file, test_file, output_file = (
            os.path.join(destination_folder, csv_name)
            for csv_name in ("train.csv", "test.csv", "combined.csv")
        )

        combine_csv_files(train_file, test_file, output_file)
        count_unique_rows(output_file)
        describe_dataset(output_file)