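"""Prepare the Kaggle "mobile price classification" dataset for analysis.

The script downloads the dataset archive with the Kaggle CLI, unzips it,
concatenates train.csv and test.csv into a single combined.csv, and prints
basic statistics (row counts, describe(), and frequency tables for
low-cardinality columns). It assumes the Kaggle CLI is installed and API
credentials (kaggle.json) are already configured.
"""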
import os
import re
import subprocess
import zipfile

import pandas as pd

def download_kaggle_dataset(dataset_id, destination_folder):
    """Download a Kaggle dataset via the Kaggle CLI; return the zip path, or None on failure."""
    try:
        result = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_id, "-p", destination_folder],
            check=True, capture_output=True, text=True,
        )
        # The CLI normally mentions the archive name in its output; if it does not,
        # fall back to the conventional name derived from the dataset slug.
        match = re.search(r"(\S+\.zip)", result.stdout)
        zip_filename = match.group(1) if match else dataset_id.split("/")[-1] + ".zip"
        print(f"Dataset {dataset_id} successfully downloaded.")
        return os.path.join(destination_folder, zip_filename)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None

def unzip_file(zip_filepath, destination_folder):
    """Extract every file from the given zip archive into destination_folder."""
    try:
        with zipfile.ZipFile(zip_filepath, "r") as zip_ref:
            zip_ref.extractall(destination_folder)
        print(f"Files extracted to {destination_folder}.")
    except Exception as e:
        print(f"Error unzipping file {zip_filepath}: {e}")

def combine_csv_files(train_file, test_file, output_file):
    """Concatenate the train and test CSV files and write the result to output_file."""
    try:
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)

        # Note: if the two files do not share exactly the same columns
        # (e.g. a label column present only in train.csv), concat fills the gaps with NaN.
        combined_df = pd.concat([train_df, test_df], ignore_index=True)
        combined_df.to_csv(output_file, index=False)
        print(f"Combined CSV files saved to {output_file}.")
    except Exception as e:
        print(f"Error combining CSV files: {e}")

def count_unique_rows(csv_file):
    """Print the number of unique (de-duplicated) data rows in csv_file."""
    try:
        df = pd.read_csv(csv_file)
        unique_rows = df.drop_duplicates()
        # pandas already treats the first line as the header, so no extra adjustment is needed.
        unique_count = len(unique_rows)
        print(f"The number of unique rows in {csv_file} (excluding header): {unique_count}")
    except Exception as e:
        print(f"Error counting unique rows: {e}")

def describe_dataset(csv_file):
    """Print the size, summary statistics, and low-cardinality frequency tables for csv_file."""
    try:
        df = pd.read_csv(csv_file)
        print(f"Size of the dataset: {len(df)}\n")
        print("Statistics for the dataset:\n")
        print(df.describe())

        # Show a percentage frequency distribution for columns with few distinct values.
        for column in df.columns:
            if df[column].nunique() < 10:
                print(f"\nFrequency distribution for {column}:")
                print(df[column].value_counts(normalize=True) * 100)

    except Exception as e:
        print(f"Error describing dataset: {e}")

if __name__ == "__main__":
    dataset_id = "iabhishekofficial/mobile-price-classification"
    destination_folder = r"C:\studia\InżynieriaUczeniaMaszynowego"

    zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)

    if zip_filepath is not None:
        unzip_file(zip_filepath, destination_folder)

        train_file = os.path.join(destination_folder, "train.csv")
        test_file = os.path.join(destination_folder, "test.csv")
        output_file = os.path.join(destination_folder, "combined.csv")

        combine_csv_files(train_file, test_file, output_file)
        count_unique_rows(output_file)
        describe_dataset(output_file)
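
# Usage (a sketch, assuming the Kaggle CLI is installed and an API token is
# configured in kaggle.json; the script name below is hypothetical):
#   python prepare_dataset.py
# The run leaves train.csv, test.csv, and combined.csv in destination_folder
# and prints the unique-row count plus summary statistics for combined.csv.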