2023-04-20 11:37:50 +02:00
|
|
|
import subprocess
|
|
|
|
import zipfile
|
|
|
|
import os
|
|
|
|
import pandas as pd
|
|
|
|
import re
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
|
# Script 1 functions
|
|
|
|
def download_kaggle_dataset(dataset_id, destination_folder):
    """Download a Kaggle dataset archive with the ``kaggle`` command-line tool.

    Args:
        dataset_id: Dataset identifier passed to ``kaggle datasets download -d``.
        destination_folder: Folder passed to the CLI via ``-p``.

    Returns:
        The archive file name reported by the CLI joined onto
        ``destination_folder``, or ``None`` when the download fails or the
        archive name cannot be determined. Failures are reported on stdout.
    """
    command = [
        "kaggle", "datasets", "download",
        "-d", dataset_id,
        "-p", destination_folder,
    ]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError: the ``kaggle`` executable is not on PATH.
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None

    # The archive name is taken from the first "*.zip" token the CLI printed.
    match = re.search(r"(\S+\.zip)", result.stdout)
    if match is None:
        # Without this guard a missing match would crash with AttributeError.
        print(
            f"Error downloading dataset {dataset_id}: "
            "no .zip file name found in the kaggle output"
        )
        return None

    print(f"Dataset {dataset_id} successfully downloaded.")
    return os.path.join(destination_folder, match.group(1))
|
|
|
|
|
|
|
|
def unzip_file(zip_filepath, destination_folder):
    """Extract every member of a ZIP archive into ``destination_folder``.

    Any exception raised while opening or extracting the archive is caught
    and reported on stdout; the function returns ``None`` in every case.
    """
    try:
        archive = zipfile.ZipFile(zip_filepath, mode="r")
        with archive:
            archive.extractall(path=destination_folder)
        print(f"Files extracted to {destination_folder}.")
    except Exception as error:
        print(f"Error unzipping file {zip_filepath}: {error}")
|
|
|
|
|
|
|
|
def combine_csv_files(train_file, test_file, output_file):
    """Concatenate two CSV files row-wise and write the result to a new CSV.

    The rows of ``train_file`` come first, followed by the rows of
    ``test_file``; the row index is renumbered and not written to the output.
    Any exception is caught and reported on stdout, so the function returns
    ``None`` in every case.
    """
    try:
        # Read in the same order the rows should appear in the output.
        frames = [pd.read_csv(path) for path in (train_file, test_file)]
        merged = pd.concat(frames, ignore_index=True)
        merged.to_csv(output_file, index=False)
        print(f"Combined CSV files saved to {output_file}.")
    except Exception as error:
        print(f"Error combining CSV files: {error}")
|
|
|
|
|
|
|
|
# Script 2 functions
|
|
|
|
def split_data(data, train_ratio, dev_ratio, random_seed=42):
    """Split ``data`` into train, dev and test subsets.

    The split is done with two successive ``train_test_split`` calls that
    share the same ``random_state``: the first carves off the train part,
    the second divides the remainder into dev and test.

    Args:
        data: Dataset passed straight to ``train_test_split``.
        train_ratio: Fraction of ``data`` for the train subset; must lie
            strictly between 0 and 1.
        dev_ratio: Fraction of ``data`` for the dev subset; must be positive
            and leave a non-empty share for the test subset.
        random_seed: Seed passed as ``random_state`` to both splits.

    Returns:
        A ``(train_data, dev_data, test_data)`` tuple.

    Raises:
        ValueError: If the ratios do not leave a non-empty share for each of
            the three subsets.
    """
    if not 0 < train_ratio < 1:
        # Also prevents the division by zero below when train_ratio == 1.
        raise ValueError(
            f"train_ratio must be strictly between 0 and 1, got {train_ratio!r}"
        )

    # dev_ratio is relative to the full dataset, but the second split only
    # sees the (1 - train_ratio) remainder, so rescale it.
    dev_fraction = dev_ratio / (1 - train_ratio)
    if not 0 < dev_fraction < 1:
        raise ValueError(
            "dev_ratio must be positive and train_ratio + dev_ratio must be "
            f"below 1, got train_ratio={train_ratio!r}, dev_ratio={dev_ratio!r}"
        )

    train_data, temp_data = train_test_split(
        data, train_size=train_ratio, random_state=random_seed
    )
    dev_data, test_data = train_test_split(
        temp_data, train_size=dev_fraction, random_state=random_seed
    )
    return train_data, dev_data, test_data
|
|
|
|
|
|
|
|
def main():
    """Run the whole pipeline: download, unpack, merge, split and save.

    Downloads the Kaggle dataset into ``/app/data``, merges its ``train.csv``
    and ``test.csv`` into ``combined.csv``, splits the merged data 6:2:2 into
    train/dev/test subsets and writes them as CSV files to ``/app/output``.
    Stops early when the download fails.
    """
    # Download and unpack the data from Kaggle
    dataset_id = "iabhishekofficial/mobile-price-classification"
    destination_folder = "/app/data"
    zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)
    if zip_filepath is None:
        # The download step has already reported the error; without an
        # archive there is nothing to unpack, merge or split.
        return

    unzip_file(zip_filepath, destination_folder)
    train_file = os.path.join(destination_folder, "train.csv")
    test_file = os.path.join(destination_folder, "test.csv")
    output_file = os.path.join(destination_folder, "combined.csv")
    combine_csv_files(train_file, test_file, output_file)

    # Load the merged data from the CSV file
    data = pd.read_csv(output_file)

    # Split the data into train, dev and test sets in a 6:2:2 proportion
    train_data, dev_data, test_data = split_data(data, train_ratio=0.6, dev_ratio=0.2)

    # Save the split data to CSV files
    output_dir = "/app/output"
    os.makedirs(output_dir, exist_ok=True)
    train_data.to_csv(os.path.join(output_dir, 'Train1.csv'), index=False)
    dev_data.to_csv(os.path.join(output_dir, 'Dev1.csv'), index=False)
    test_data.to_csv(os.path.join(output_dir, 'Test1.csv'), index=False)

    # Print the number of rows in each file
    print(f"Liczba wierszy w pliku Train1.csv: {len(train_data)}")
    print(f"Liczba wierszy w pliku Dev1.csv: {len(dev_data)}")
    print(f"Liczba wierszy w pliku Test1.csv: {len(test_data)}")
|
2023-04-20 11:37:50 +02:00
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|