import subprocess
import zipfile
import os
import re

import pandas as pd
from sklearn.model_selection import train_test_split


# Script 1 functions
def download_kaggle_dataset(dataset_id, destination_folder):
    """Download a Kaggle dataset archive into destination_folder and return the zip path."""
    try:
        result = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_id, "-p", destination_folder],
            check=True,
            capture_output=True,
            text=True,
        )
        # Try to read the archive name from the CLI output; fall back to the
        # dataset slug if the output does not contain a .zip name.
        match = re.search(r"(\S+\.zip)", result.stdout)
        zip_filename = os.path.basename(match.group(1)) if match else dataset_id.split("/")[-1] + ".zip"
        print(f"Dataset {dataset_id} successfully downloaded.")
        return os.path.join(destination_folder, zip_filename)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None


def unzip_file(zip_filepath, destination_folder):
    """Extract a zip archive into destination_folder."""
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
        print(f"Files extracted to {destination_folder}.")
    except Exception as e:
        print(f"Error unzipping file {zip_filepath}: {e}")


def combine_csv_files(train_file, test_file, output_file):
    """Concatenate the train and test CSV files into a single combined CSV."""
    try:
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)
        combined_df = pd.concat([train_df, test_df], ignore_index=True)
        combined_df.to_csv(output_file, index=False)
        print(f"Combined CSV files saved to {output_file}.")
    except Exception as e:
        print(f"Error combining CSV files: {e}")


# Script 2 functions
def split_data(data, train_ratio, dev_ratio, random_seed=42):
    """Split a DataFrame into train/dev/test sets with the given ratios."""
    train_data, temp_data = train_test_split(data, train_size=train_ratio, random_state=random_seed)
    # The remaining fraction (1 - train_ratio) is split so that dev_ratio of the
    # original data goes to the dev set and the rest to the test set.
    dev_data, test_data = train_test_split(
        temp_data, train_size=dev_ratio / (1 - train_ratio), random_state=random_seed
    )
    return train_data, dev_data, test_data


def main():
    # Download and extract the data from Kaggle
    dataset_id = "iabhishekofficial/mobile-price-classification"
    destination_folder = "/app/data"
    zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)
    if zip_filepath is not None:
        unzip_file(zip_filepath, destination_folder)

    train_file = os.path.join(destination_folder, "train.csv")
    test_file = os.path.join(destination_folder, "test.csv")
    output_file = os.path.join(destination_folder, "combined.csv")
    combine_csv_files(train_file, test_file, output_file)

    # Load the combined data from the CSV file
    data = pd.read_csv(output_file)

    # Split the data into train, dev, test sets with 6:2:2 proportions
    train_data, dev_data, test_data = split_data(data, train_ratio=0.6, dev_ratio=0.2)

    # Save the split data to CSV files
    output_dir = "/app/output"
    os.makedirs(output_dir, exist_ok=True)
    train_data.to_csv(os.path.join(output_dir, 'Train1.csv'), index=False)
    dev_data.to_csv(os.path.join(output_dir, 'Dev1.csv'), index=False)
    test_data.to_csv(os.path.join(output_dir, 'Test1.csv'), index=False)

    # Print the number of rows in each file
    print(f"Number of rows in Train1.csv: {len(train_data)}")
    print(f"Number of rows in Dev1.csv: {len(dev_data)}")
    print(f"Number of rows in Test1.csv: {len(test_data)}")


if __name__ == "__main__":
    main()