# ium_444507/script-download.py

import subprocess
import sys
import pandas as pd
import os
import numpy as np


def unzip_package():
    """Extract ./car-prices-poland.zip into the current directory, then delete it.

    Existing files with the same names are overwritten. The archive is only
    removed after extraction succeeded, so a failed run can be retried.

    Raises:
        FileNotFoundError: if the archive does not exist.
        zipfile.BadZipFile: if the archive is not a valid zip file.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    import zipfile

    archive_path = './car-prices-poland.zip'

    print('Unzipping dataset...')
    # Standard-library extraction instead of shelling out to `unzip`: errors
    # surface as exceptions instead of an ignored exit status.
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall()
    print('Dataset unzipped')

    print('Removing .zip file...')
    os.remove(archive_path)
    print('Zip file removed')


def download_dataset():
    """Download the aleksandrglotov/car-prices-poland dataset with the kaggle CLI.

    Raises:
        subprocess.CalledProcessError: if the kaggle command exits with a
            non-zero status.
        FileNotFoundError: if the kaggle executable cannot be found.
    """
    print('Downloading dataset...')
    # Argument list + check=True: no shell parsing, and a failed download
    # stops the script here instead of being reported as a success.
    subprocess.run(
        ['kaggle', 'datasets', 'download', '-d', 'aleksandrglotov/car-prices-poland'],
        check=True,
    )

    print('Dir after downloading')
    # Purely diagnostic directory listing; deliberately best-effort.
    os.system('ls -la')

    print('Dataset downloaded')


def divide_dataset(dataset):
    """Shuffle the downloaded CSV rows and write dev, test and train splits.

    Rows are read from ``Car_Prices_Poland_Kaggle.csv`` in the current
    directory (the header line is skipped), shuffled, and written without a
    header to ``Car_Prices_Poland_Kaggle_dev.csv``,
    ``Car_Prices_Poland_Kaggle_test.csv`` and
    ``Car_Prices_Poland_Kaggle_train.csv``. Dev and test each receive
    ``len(dataset) // 6`` rows and never overlap; train receives all
    remaining rows.

    Args:
        dataset: Sized object whose length determines the split sizes.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    import random

    print('Dividing dataset...')

    # NOTE(review): only len(dataset) is used; the rows written below come
    # from the CSV on disk, so changes made to `dataset` in memory are not
    # reflected in the split files. Confirm this is intended.
    # Binary mode keeps every row byte-for-byte, whatever its encoding.
    with open('Car_Prices_Poland_Kaggle.csv', 'rb') as source:
        rows = source.readlines()[1:]  # drop the header line

    # A final row without a trailing newline would otherwise be glued to the
    # row that follows it after shuffling.
    rows = [row if row.endswith(b'\n') else row + b'\n' for row in rows]
    random.shuffle(rows)

    split_size = len(dataset) // 6
    splits = {
        'dev': rows[:split_size],
        # Second slice of the shuffled rows, disjoint from dev.
        'test': rows[split_size:split_size * 2],
        'train': rows[split_size * 2:],
    }

    for split_name, split_rows in splits.items():
        with open(f'Car_Prices_Poland_Kaggle_{split_name}.csv', 'wb') as target:
            target.writelines(split_rows)

    written = sum(len(split_rows) for split_rows in splits.values())
    print("Len match: " + str(written == len(dataset)))
    print(len(splits['train']))
    print(len(splits['dev']))
    print(len(splits['test']))

    print('Dataset devided')


def normalize_dataset(dataset):
    """Drop unneeded columns and incomplete rows, then min-max scale numbers.

    The ``"Unnamed: 0"`` and ``"generation_name"`` columns are removed, rows
    that still contain missing values are dropped, and every numeric column
    is rescaled to the [0, 1] range. A numeric column holding a single
    distinct value is set to 0.0 (it has no range to scale over).

    Args:
        dataset: DataFrame containing at least the two columns listed above.
            It is not modified.

    Returns:
        A new DataFrame with the cleaned, scaled data.

    Raises:
        KeyError: if either of the columns to drop is missing.
    """
    print('--------------- Initial dataset length ---------------')
    print(len(dataset))

    # drop() and dropna() return new frames, so the caller's frame stays
    # untouched; the explicit copy avoids chained-assignment warnings when
    # the columns are overwritten below.
    cleaned = dataset.drop(columns=["Unnamed: 0", "generation_name"]).dropna().copy()

    # Select by dtype instead of probing a single row: works for frames with
    # fewer than two rows and for any integer/float width.
    numeric_columns = cleaned.select_dtypes(include=np.number).columns
    for column in numeric_columns:
        lowest = cleaned[column].min()
        span = cleaned[column].max() - lowest
        if span == 0:
            # Constant column: dividing by the zero range would yield NaN.
            cleaned[column] = 0.0
        else:
            cleaned[column] = (cleaned[column] - lowest) / span
    return cleaned


# Script pipeline: fetch the archive, unpack it, load the CSV, clean and
# scale the data, then write the dev/test/train split files.
download_dataset()
unzip_package()
cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
df = normalize_dataset(pd.DataFrame(cars))
divide_dataset(df)