r1

2022-04-02 20:28:16 +02:00 · 2022-04-02 20:28:16 +02:00 · 316471eda9
commit 316471eda9
parent 10f81c6bf9
4 changed files with 32 additions and 34 deletions
--- a/16
+++ b/16
@ -1,6 +1,6 @@
 FROM ubuntu:latest
-# COPY ./kaggle.json /root/.kaggle/kaggle.json
+COPY ./kaggle.json /root/.kaggle/kaggle.json
 WORKDIR /app
@ -12,16 +12,16 @@ RUN apt-get install -y python3-pip
 RUN python3 -m pip --version
 RUN python3 -m pip install kaggle
 RUN python3 -m pip install pandas
 RUN python3 -m pip freeze
-COPY ./download.sh ./
+ENV PATH="/root/.local/bin:${PATH}"
-COPY ./script.py ./
+COPY . .
-ARG KAGGLE_USERNAME=testKAGGLE_USERNAME
+ARG KAGGLE_USERNAME
-ARG KAGGLE_KEY=test1KAGGLE_KEY
+ARG KAGGLE_KEY
-RUN chmod u+x ./script.py
+RUN chmod u+x ./script-download.py
 RUN chmod u+x ./script-stats.py
 # RUN ./download.sh 117928
-# RUN python3 ./script.py
+RUN python3 ./script-download.py
--- a/2
+++ b/2
@ -20,7 +20,7 @@ pipeline {
    }
    agent { 
        dockerfile{
-            additionalBuildArgs  '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --build-arg --no-cache=true'
+            additionalBuildArgs  '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --no-cache=true'
        } 
    }
    stages {
--- a/script-download.py
+++ b/script-download.py
@ -5,26 +5,20 @@ import os
 import numpy as np
 def install_dependencies():
    """Upgrade pip, then install the third-party packages used by this script.

    Each step runs pip through the current interpreter
    (``sys.executable -m pip``) as a separate subprocess. The packages
    installed are kaggle, pandas, seaborn and scikit-learn.

    Raises:
        subprocess.CalledProcessError: if any pip invocation exits with a
            non-zero status (``check_call`` semantics); later installs are
            then not attempted.
    """
    # Upgrade pip itself before installing anything else.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'])
    # One pip invocation per package.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kaggle'])
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas'])
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn'])
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])
 def unzip_package():
    """Unzip dataset"""
    print('Unzipping dataset...')
    os.system('unzip -o car-prices-poland.zip')
    print('Dataset unzipped')
-
+    print('Removing .zip file...')
    os.system('rm ./car-prices-poland.zip')
    print('Zip file removed')
 def download_dataset():
    """Download kaggle dataset."""
    print('Downloading dataset...')
-    os.system('kaggle datasets download -d anikannal/solar-power-generation-data')
+    os.system('kaggle datasets download -d aleksandrglotov/car-prices-poland')
    print('Dir after downloading')
    os.system('ls -la')
@ -54,17 +48,6 @@ def divide_dataset(dataset):
    print('Dataset devided')
 def get_statistics(dataset):
    """Print the number of rows and a ``describe()`` summary of *dataset*.

    Args:
        dataset: object supporting ``len()`` and
            ``describe(include='all')`` -- presumably a pandas DataFrame;
            confirm against the caller.

    Returns:
        None. All output goes to stdout.
    """
    print(f'--------------- Normalized dataset length ---------------')
    print(len(dataset))
    print(f'---------------Describe dataset---------------')
    # Remove the column display limit so the describe() table is printed
    # without eliding columns. NOTE(review): this changes a process-wide
    # option and is not restored afterwards.
    pd.set_option('display.max_columns', None)
    print(dataset.describe(include='all'))
 def normalize_dataset(dataset):
    """Drop unnecessary columns and set numeric values to [0,1] range"""
@ -78,16 +61,14 @@ def normalize_dataset(dataset):
    # normalize numbers to [0, 1]
    for column in dataset.columns:
        if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):
-            dataset[column] = (dataset[column] - dataset[column].min()) / (
+            dataset[column] = (dataset[column] - dataset[column].min()) / (dataset[column].max() - dataset[column].min())
                    dataset[column].max() - dataset[column].min())
    return dataset
 # print(os.system('python3 -m pip freeze'))
 download_dataset()
 unzip_package()
 cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
 df = pd.DataFrame(cars)
 df = normalize_dataset(df)
 divide_dataset(df)
-get_statistics(df)
+
--- a/script-stats.py
+++ b/script-stats.py
@ -0,0 +1,17 @@
 import pandas as pd
 # Print describe(include='all') summaries for four CSV files read from the
 # current working directory: the full dataset and its dev, test and train
 # variants. The label strings are runtime output and are kept verbatim; the
 # last two are in Polish ("statistics for the test set" / "... train set").
 print('--Full dataset stats--')
 cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
 print(cars.describe(include='all'))
 # Dev subset.
 print('Dev dataset stats')
 cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv')
 print(cars_dev.describe(include='all'))
 # Test subset.
 print('# statystyki dla zbioru test')
 cars_test = pd.read_csv('./Car_Prices_Poland_Kaggle_test.csv')
 print(cars_test.describe(include='all'))
 # Train subset.
 print('# statystyki dla zbioru train')
 cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv')
 print(cars_train.describe(include='all'))