From 6cbad8e595f4dc16dbe5e8d4564e78f1129bd621 Mon Sep 17 00:00:00 2001
From: Adam Wojdyla <adam.wojdyla@outlook.com>
Date: Thu, 31 Mar 2022 22:55:56 +0200
Subject: [PATCH] dockerfile

---
 .gitignore     |  3 +++
 dockerfile     | 26 ++++++++++++++++++++++++++
 figlet-loop.sh |  4 ++++
 script.py      | 47 +++++++++++++++++++++--------------------------
 4 files changed, 54 insertions(+), 26 deletions(-)
 create mode 100644 dockerfile
 create mode 100755 figlet-loop.sh

diff --git a/.gitignore b/.gitignore
index 8f948b9..5833ef2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -150,3 +150,6 @@ crashlytics.properties
 crashlytics-build.properties
 fabric.properties
 
+# kaggle
+.kaggle.json
+
diff --git a/dockerfile b/dockerfile
new file mode 100644
index 0000000..b524e9e
--- /dev/null
+++ b/dockerfile
@@ -0,0 +1,26 @@
+# Pin the base image: "latest" changes over time, and newer Ubuntu releases
+# refuse "pip3 install --user" on the system Python (PEP 668).
+FROM ubuntu:22.04
+
+# Install the required packages. "-y" assumes yes for every prompt and
+# DEBIAN_FRONTEND=noninteractive stops package configuration from waiting on input.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y figlet python3 python3-pip unzip
+RUN pip3 install --user --no-cache-dir kaggle pandas
+
+# Make console scripts installed by "pip3 install --user" (e.g. the kaggle CLI) reachable.
+ENV PATH="/root/.local/bin:${PATH}"
+
+# Create /app in the container if it does not exist and make it the working
+# directory for every following RUN, CMD, ENTRYPOINT, COPY and ADD instruction.
+WORKDIR /app
+
+# Copy the scripts into /app in the container.
+COPY ./figlet-loop.sh ./
+COPY ./download.sh ./
+COPY ./script.py ./
+# NOTE(review): this stores the Kaggle API token in an image layer; keep the image private.
+COPY ./kaggle.json /root/.kaggle/kaggle.json
+
+RUN ./download.sh 117928
+RUN python3 ./script.py
+# Default command on container start (currently disabled): CMD python3 ./script.py
\ No newline at end of file
diff --git a/figlet-loop.sh b/figlet-loop.sh
new file mode 100755
index 0000000..723692a
--- /dev/null
+++ b/figlet-loop.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+while IFS= read -r line || [ -n "$line" ]; do  # raw read (keeps backslashes and edge whitespace); the || also renders a last line that has no trailing newline
+    figlet "$line"
+done
\ No newline at end of file
diff --git a/script.py b/script.py
index 3e9bc9e..bb06b47 100644
--- a/script.py
+++ b/script.py
@@ -1,5 +1,8 @@
 import subprocess
 import sys
+import pandas as pd
+import os
+import numpy as np
 
 
 def install_dependencies():
@@ -23,22 +26,21 @@ def download_dataset():
 
 def divide_dataset(dataset):
     """Split dataset to dev, train, test datasets. """
+    # Drop the CSV header row and shuffle the remaining rows once; all three splits are cut from this file.
+    os.system('tail -n +2 Car_Prices_Poland_Kaggle.csv | shuf > ./Car_Prices_Poland_Kaggle_shuf.csv')
 
-    os.system('cat Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_shuf.csv')
+    len1 = len(dataset) // 6  # rows in each of the dev and test splits
+    len2 = (len1 * 2) + 1  # 1-based line of the shuffled file where the train split starts
 
-    len_train = len(dataset) // 10 * 6
-    len_dev = len(dataset) // 10 * 2
-    len_test = len(dataset) // 10 * 2
-
-    if len_test + len_train + len_dev != len(dataset):
-        len_train += len(dataset) - (len_test + len_train + len_dev)
-
-    os.system(f'head -n {len_train} Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_train.csv')
-    os.system(f'head -n {len_dev} Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_dev.csv')
-    os.system(f'head -n {len_test} Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_test.csv')
+    os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_dev.csv')
+    os.system(f'head -n {len1 * 2} Car_Prices_Poland_Kaggle_shuf.csv | tail -n {len1} > Car_Prices_Poland_Kaggle_test.csv')
+    os.system(f'tail -n +{len2} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_train.csv')
 
     os.system('rm ./Car_Prices_Poland_Kaggle_shuf.csv')
-    print("Len match: " + str(sum([len_test, len_dev, len_train]) == len(dataset)))
+    print("Len match: " + str(sum([len1 * 2, len(dataset) - len2 + 1]) == len(dataset)))  # dev + test rows plus the rows from line len2 to the end; the wc output below shows the real file sizes
+    os.system('cat Car_Prices_Poland_Kaggle_train.csv | wc -l')
+    os.system('cat Car_Prices_Poland_Kaggle_dev.csv | wc -l')
+    os.system('cat Car_Prices_Poland_Kaggle_test.csv | wc -l')
 
 
 def get_statistics(dataset):
@@ -57,31 +59,24 @@ def normalize_dataset(dataset):
 
     # drop columns
     dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True)
+    dataset = dataset.dropna()
 
     # normalize numbers to [0, 1]
     for column in dataset.columns:
         if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):
             dataset[column] = (dataset[column] - dataset[column].min()) / (
                     dataset[column].max() - dataset[column].min())
-
-        # There is no null rows
-        # dataset.isnull().sum()
-
     return dataset
 
 
-install_dependencies()
-
-import pandas as pd
-import os
-import numpy as np
-
-download_dataset()
+# install_dependencies()  # disabled; the dockerfile added in this patch installs kaggle and pandas with pip3
+# download_dataset()  # disabled; the dockerfile runs ./download.sh before this script
 unzip_package()
 cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
-normalize_dataset(cars)
-divide_dataset(cars)
-get_statistics(cars)
+df = pd.DataFrame(cars)
+df = normalize_dataset(df)  # keep the returned frame: normalize_dataset rebinds its argument after dropna()
+divide_dataset(df)
+get_statistics(df)