ium_444507/script.py

import subprocess
import sys
import pandas as pd
import os
import numpy as np


def install_dependencies():
    """Upgrade pip, then install the third-party packages used by this script.

    Each install runs as a separate ``python -m pip install ...`` call using the
    interpreter that is executing this script.

    Raises:
        subprocess.CalledProcessError: if any pip invocation exits non-zero.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']

    # Bring pip itself up to date before installing anything else.
    subprocess.check_call(pip_install + ['--upgrade', 'pip'])

    for package in ('kaggle', 'pandas', 'seaborn', 'scikit-learn'):
        subprocess.check_call(pip_install + [package])


def unzip_package():
    """Extract ``car-prices-poland.zip`` into the current working directory.

    Existing files with the same names are overwritten. The standard-library
    ``zipfile`` module is used instead of shelling out to ``unzip`` so that the
    step works without an external binary and a missing or corrupt archive is
    reported instead of being silently ignored.

    Raises:
        FileNotFoundError: if the archive is not present.
        zipfile.BadZipFile: if the archive cannot be read as a zip file.
    """
    # Local import keeps this function self-contained.
    import zipfile

    with zipfile.ZipFile('car-prices-poland.zip') as archive:
        archive.extractall()


def download_dataset():
    """Fetch the car-prices-poland dataset archive with the kaggle CLI.

    The command is run through the shell; its exit status is not inspected.
    """
    dataset_slug = 'aleksandrglotov/car-prices-poland'
    os.system('kaggle datasets download -d ' + dataset_slug)


def divide_dataset(dataset):
    """Shuffle the raw CSV rows and split them into dev, test and train files.

    Reads ``Car_Prices_Poland_Kaggle.csv`` from the current directory, skips
    its header line, shuffles the remaining rows and writes three header-less
    files:

    * ``Car_Prices_Poland_Kaggle_dev.csv``   - first ``len(dataset) // 6`` rows
    * ``Car_Prices_Poland_Kaggle_test.csv``  - next ``len(dataset) // 6`` rows
    * ``Car_Prices_Poland_Kaggle_train.csv`` - every remaining row

    Afterwards it prints whether the number of rows written equals
    ``len(dataset)``, followed by the train, dev and test row counts.

    Args:
        dataset: any sized object; only ``len(dataset)`` is used, to derive
            the size of the dev and test splits.

    Raises:
        FileNotFoundError: if ``Car_Prices_Poland_Kaggle.csv`` is missing.
    """
    # Local import keeps this function self-contained.
    import random

    # Work on bytes so rows are copied verbatim regardless of file encoding.
    with open('Car_Prices_Poland_Kaggle.csv', 'rb') as source:
        rows = source.read().splitlines(keepends=True)[1:]  # drop the header

    # Terminate the last row so shuffling cannot glue two rows together.
    if rows and not rows[-1].endswith(b'\n'):
        rows[-1] += b'\n'
    random.shuffle(rows)

    part = len(dataset) // 6
    splits = {
        'dev': rows[:part],
        # Previously the test split re-read the first `part` rows and was
        # therefore identical to the dev split; it now takes the next slice.
        'test': rows[part:2 * part],
        'train': rows[2 * part:],
    }

    for name, chunk in splits.items():
        with open(f'Car_Prices_Poland_Kaggle_{name}.csv', 'wb') as target:
            target.writelines(chunk)

    # Compare what was actually written against the in-memory dataset; a
    # False here means the CSV and `dataset` have different row counts
    # (e.g. rows were dropped from `dataset` before the split).
    written = sum(len(chunk) for chunk in splits.values())
    print("Len match: " + str(written == len(dataset)))
    for name in ('train', 'dev', 'test'):
        print(len(splits[name]))


def get_statistics(dataset):
    """Print the row count and a ``describe(include='all')`` summary.

    As a side effect, pandas' ``display.max_columns`` option is set to ``None``
    so that the summary is printed with every column visible.

    Args:
        dataset: the pandas DataFrame to summarise.
    """
    row_count = len(dataset)
    print('--------------- Dataset length ---------------')
    print(row_count)

    print('---------------Describe dataset---------------')
    # Lift the column display limit before rendering the summary table.
    pd.set_option('display.max_columns', None)
    summary = dataset.describe(include='all')
    print(summary)


def normalize_dataset(dataset):
    """Drop unneeded columns and incomplete rows, then min-max scale numbers.

    The ``"Unnamed: 0"`` and ``"generation_name"`` columns are removed first,
    so missing values in them do not cause rows to be discarded. Rows that
    still contain a missing value are then dropped, and every numeric column
    is rescaled to the ``[0, 1]`` range. A numeric column whose values are all
    equal has no range to scale by and is set to ``0.0`` rather than NaN.

    The input frame is left unmodified; a new DataFrame is returned.

    Args:
        dataset: pandas DataFrame containing at least the ``"Unnamed: 0"`` and
            ``"generation_name"`` columns.

    Returns:
        A new DataFrame with the two columns removed, incomplete rows dropped
        and numeric columns scaled to ``[0, 1]``.

    Raises:
        KeyError: if either of the two dropped columns is absent.
    """
    # Explicit copy so the column assignments below never target a view of
    # the caller's frame.
    cleaned = (
        dataset.drop(columns=["Unnamed: 0", "generation_name"])
        .dropna()
        .copy()
    )

    # Select by dtype instead of probing the value in row 1: this works for
    # frames with fewer than two rows and for every numeric dtype.
    for column in cleaned.select_dtypes(include="number").columns:
        lowest = cleaned[column].min()
        span = cleaned[column].max() - lowest
        if span == 0:
            # Constant column: avoid 0/0 producing NaN.
            cleaned[column] = 0.0
        else:
            cleaned[column] = (cleaned[column] - lowest) / span
    return cleaned


# Script driver: runs the whole pipeline at import/execution time.
# Order matters - packages are installed first, then the archive is
# downloaded and extracted so that the CSV exists before it is read.
install_dependencies()
download_dataset()
unzip_package()
cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
# Wrap the parsed CSV in a separate DataFrame object for processing.
df = pd.DataFrame(cars)
df = normalize_dataset(df)
# Write the dev/test/train files, then print summary statistics of the
# normalized frame.
divide_dataset(df)
get_statistics(df)
IUM_02 zadanie - script.py 2022-03-19 19:54:20 +01:00			`import subprocess`
			`import sys`
dockerfile 2022-03-31 22:55:56 +02:00			`import pandas as pd`
			`import os`
			`import numpy as np`
IUM_02 zadanie - script.py 2022-03-19 19:54:20 +01:00

			`def install_dependencies():`
			`"""Install kaggle and pandas."""`
			`subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'])`
			`subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kaggle'])`
			`subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas'])`
			`subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn'])`
			`subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])`


			`def unzip_package():`
			`"""Unzip dataset"""`
			`os.system('unzip -o car-prices-poland.zip')`


			`def download_dataset():`
			`"""Download kaggle dataset."""`
			`os.system('kaggle datasets download -d aleksandrglotov/car-prices-poland')`


			`def divide_dataset(dataset):`
			`"""Split dataset to dev, train, test datasets. """`
dockerfile 2022-03-31 22:55:56 +02:00
			`os.system('tail -n +2 Car_Prices_Poland_Kaggle.csv \| shuf > ./Car_Prices_Poland_Kaggle_shuf.csv')`
IUM_02 zadanie - script.py 2022-03-19 19:54:20 +01:00
dockerfile 2022-03-31 22:55:56 +02:00			`len1 = len(dataset) // 6`
			`len2 = (len1 * 2) +1`
IUM_02 zadanie - script.py 2022-03-19 19:54:20 +01:00
dockerfile 2022-03-31 22:55:56 +02:00			`os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv f > Car_Prices_Poland_Kaggle_dev.csv')`
			`os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv\| tail -n {len1} > Car_Prices_Poland_Kaggle_test.csv')`
			`os.system(f'tail -n +{len2} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_train.csv')`
IUM_02 zadanie - script.py 2022-03-19 19:54:20 +01:00
			`os.system('rm ./Car_Prices_Poland_Kaggle_shuf.csv')`
dockerfile 2022-03-31 22:55:56 +02:00			`print("Len match: " + str(sum([len1 * 2, len2]) == len(dataset)))`
			`os.system('cat Car_Prices_Poland_Kaggle_train.csv \| wc -l')`
			`os.system('cat Car_Prices_Poland_Kaggle_dev.csv \| wc -l')`
			`os.system('cat Car_Prices_Poland_Kaggle_test.csv \| wc -l')`
IUM_02 zadanie - script.py 2022-03-19 19:54:20 +01:00

			`def get_statistics(dataset):`
			`"""Mean, min, max, median etc."""`

			`print(f'--------------- Dataset length ---------------')`
			`print(len(dataset))`

			`print(f'---------------Describe dataset---------------')`
			`pd.set_option('display.max_columns', None)`
			`print(dataset.describe(include='all'))`


			`def normalize_dataset(dataset):`
			`"""Drop unnecessary columns and set numeric values to [0,1] range"""`

			`# drop columns`
			`dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True)`
dockerfile 2022-03-31 22:55:56 +02:00			`dataset = dataset.dropna()`
IUM_02 zadanie - script.py 2022-03-19 19:54:20 +01:00
			`# normalize numbers to [0, 1]`
			`for column in dataset.columns:`
			`if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):`
			`dataset[column] = (dataset[column] - dataset[column].min()) / (`
			`dataset[column].max() - dataset[column].min())`
			`return dataset`


script fix 2022-03-31 23:23:30 +02:00			`install_dependencies()`
			`download_dataset()`
IUM_02 zadanie - script.py 2022-03-19 19:54:20 +01:00			`unzip_package()`
			`cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')`
dockerfile 2022-03-31 22:55:56 +02:00			`df = pd.DataFrame(cars)`
			`df = normalize_dataset(df)`
			`divide_dataset(df)`
			`get_statistics(df)`
IUM_02 zadanie - script.py 2022-03-19 19:54:20 +01:00