dockerfile

2022-03-31 22:55:56 +02:00 · 2022-03-31 22:55:56 +02:00 · 6cbad8e595
commit 6cbad8e595
parent bd18c8b09d
4 changed files with 54 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@ -150,3 +150,6 @@ crashlytics.properties
 crashlytics-build.properties
 fabric.properties

+# kaggle API credentials (the Dockerfile copies ./kaggle.json, so ignore that name as well)
+.kaggle.json
+kaggle.json
--- a/26
+++ b/26
@ -0,0 +1,26 @@
+# Our image inherits from the Ubuntu image, tag "latest"
+FROM ubuntu:latest
+
+# Install the required dependencies. Note the "-y" flag (assume yes)
+RUN apt update && apt install -y figlet python3 python3-pip unzip
+RUN pip3 install --user kaggle
+RUN pip3 install --user pandas
+# RUN pip3 install --user unzip
+
+ENV PATH="/root/.local/bin:${PATH}"
+
+# Create the /app directory in the container (if it does not exist) and switch to it (all subsequent RUN, CMD, ENTRYPOINT, COPY and ADD instructions will be executed in it)
+WORKDIR /app
+
+# Copy our scripts into the /app directory in the container; kaggle.json goes to /root/.kaggle/
+COPY ./figlet-loop.sh ./
+COPY ./download.sh ./
+COPY ./script.py ./
+COPY ./kaggle.json /root/.kaggle/kaggle.json
+
+RUN ./download.sh 117928
+RUN python3 ./script.py
+
+
+# Default command that will be run in the container after it starts
+# CMD python ./script.py
--- a/figlet-loop.sh
+++ b/figlet-loop.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+while IFS= read -r line; do  # render each stdin line as a banner; IFS= and -r keep whitespace and backslashes intact
+    figlet "$line"
+done
--- a/script.py
+++ b/script.py
@ -1,5 +1,8 @@
 import subprocess
 import sys
+import pandas as pd
+import os
+import numpy as np


 def install_dependencies():
@ -23,22 +26,21 @@ def download_dataset():

 def divide_dataset(dataset):
    """Split dataset to dev, train, test datasets. """
+    # Shuffle the data rows of the raw CSV (tail -n +2 skips the header line) before slicing it.
+    os.system('tail -n +2 Car_Prices_Poland_Kaggle.csv | shuf > ./Car_Prices_Poland_Kaggle_shuf.csv')

-    os.system('cat Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_shuf.csv')
+    len1 = len(dataset) // 6  # number of rows in the dev split and in the test split
+    len2 = (len1 * 2) +1  # 1-based line of the shuffled file at which the train split starts

-    len_train = len(dataset) // 10 * 6
-    len_dev = len(dataset) // 10 * 2
-    len_test = len(dataset) // 10 * 2
-
-    if len_test + len_train + len_dev != len(dataset):
-        len_train += len(dataset) - (len_test + len_train + len_dev)
-
-    os.system(f'head -n {len_train} Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_train.csv')
-    os.system(f'head -n {len_dev} Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_dev.csv')
-    os.system(f'head -n {len_test} Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_test.csv')
+    os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_dev.csv')  # dev: lines 1..len1
+    os.system(f'head -n {len1 * 2} Car_Prices_Poland_Kaggle_shuf.csv| tail -n {len1} > Car_Prices_Poland_Kaggle_test.csv')  # test: lines len1+1..2*len1, disjoint from dev
+    os.system(f'tail -n +{len2} Car_Prices_Poland_Kaggle_shuf.csv  > Car_Prices_Poland_Kaggle_train.csv')  # train: everything from line len2 onwards

    os.system('rm ./Car_Prices_Poland_Kaggle_shuf.csv')
-    print("Len match: " + str(sum([len_test, len_dev, len_train]) == len(dataset)))
+    print("Len match: " + str(sum([len1 * 2, len2]) == len(dataset)))  # NOTE(review): compares 4*len1 + 1 with len(dataset), not the sizes of the written files; rely on the wc -l output below
+    os.system('cat Car_Prices_Poland_Kaggle_train.csv | wc -l')
+    os.system('cat Car_Prices_Poland_Kaggle_dev.csv | wc -l')
+    os.system('cat Car_Prices_Poland_Kaggle_test.csv | wc -l')


 def get_statistics(dataset):
@ -57,31 +59,24 @@ def normalize_dataset(dataset):

    # drop columns
    dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True)
+    dataset = dataset.dropna()

    # normalize numbers to [0, 1]
    for column in dataset.columns:
        if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):
            dataset[column] = (dataset[column] - dataset[column].min()) / (
                    dataset[column].max() - dataset[column].min())
-
-        # There is no null rows
-        # dataset.isnull().sum()
-
    return dataset


-install_dependencies()
-
-import pandas as pd
-import os
-import numpy as np
-
-download_dataset()
+# install_dependencies()
+# download_dataset()
 unzip_package()
 cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
-normalize_dataset(cars)
-divide_dataset(cars)
-get_statistics(cars)
+df = pd.DataFrame(cars)  # NOTE(review): read_csv already returns a DataFrame, so this wrapper looks redundant - confirm
+df = normalize_dataset(df)  # keep the return value: normalize_dataset rebinds its local name via dropna() and returns it
+divide_dataset(df)  # writes the dev/test/train CSV files
+get_statistics(df)