diff --git a/.gitignore b/.gitignore
index 8f948b9..5833ef2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -150,3 +150,7 @@
 crashlytics.properties
 crashlytics-build.properties
 fabric.properties
+# Kaggle API credentials (both spellings; the dockerfile copies ./kaggle.json)
+.kaggle.json
+kaggle.json
+
diff --git a/dockerfile b/dockerfile
new file mode 100644
index 0000000..b524e9e
--- /dev/null
+++ b/dockerfile
@@ -0,0 +1,26 @@
+# Our image inherits from the latest Ubuntu image
+FROM ubuntu:latest
+
+# Install the required dependencies. Note the "-y" (assume yes) flag
+RUN apt update && apt install -y figlet python3 python3-pip unzip
+RUN pip3 install --user kaggle
+RUN pip3 install --user pandas
+# RUN pip3 install --user unzip
+
+ENV PATH="/root/.local/bin:${PATH}"
+
+# Create the /app directory in the container (if it does not exist) and switch to it (all subsequent RUN, CMD, ENTRYPOINT, COPY and ADD instructions run inside it)
+WORKDIR /app
+
+# Copy our scripts into /app and the Kaggle API token into /root/.kaggle
+COPY ./figlet-loop.sh ./
+COPY ./download.sh ./
+COPY ./script.py ./
+COPY ./kaggle.json /root/.kaggle/kaggle.json
+
+RUN ./download.sh 117928
+RUN python3 ./script.py
+
+
+# Default command that will be run when the container starts
+# CMD python ./script.py
\ No newline at end of file
diff --git a/figlet-loop.sh b/figlet-loop.sh
new file mode 100755
index 0000000..723692a
--- /dev/null
+++ b/figlet-loop.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+while IFS= read -r line; do
+    figlet "$line"
+done
\ No newline at end of file
diff --git a/script.py b/script.py
index 3e9bc9e..bb06b47 100644
--- a/script.py
+++ b/script.py
@@ -1,5 +1,8 @@
 import subprocess
 import sys
+import pandas as pd
+import os
+import numpy as np
 
 
 def install_dependencies():
@@ -23,22 +26,22 @@ def download_dataset():
 def divide_dataset(dataset):
     """Split dataset to dev, train, test datasets.
     """
+
+    os.system('tail -n +2 Car_Prices_Poland_Kaggle.csv | shuf > ./Car_Prices_Poland_Kaggle_shuf.csv')
 
-    os.system('cat Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_shuf.csv')
+    len1 = len(dataset) // 6
+    len2 = (len1 * 2) +1
 
-    len_train = len(dataset) // 10 * 6
-    len_dev = len(dataset) // 10 * 2
-    len_test = len(dataset) // 10 * 2
-
-    if len_test + len_train + len_dev != len(dataset):
-        len_train += len(dataset) - (len_test + len_train + len_dev)
-
-    os.system(f'head -n {len_train} Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_train.csv')
-    os.system(f'head -n {len_dev} Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_dev.csv')
-    os.system(f'head -n {len_test} Car_Prices_Poland_Kaggle.csv | shuf > Car_Prices_Poland_Kaggle_test.csv')
+    # dev = shuffled rows 1..len1, test = rows len1+1..2*len1, train = rows len2..end (disjoint splits)
+    os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_dev.csv')
+    os.system(f'head -n {len1 * 2} Car_Prices_Poland_Kaggle_shuf.csv | tail -n {len1} > Car_Prices_Poland_Kaggle_test.csv')
+    os.system(f'tail -n +{len2} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_train.csv')
 
     os.system('rm ./Car_Prices_Poland_Kaggle_shuf.csv')
-    print("Len match: " + str(sum([len_test, len_dev, len_train]) == len(dataset)))
+    print("Len match: " + str(sum([len1 * 2, len2]) == len(dataset)))
+    os.system('cat Car_Prices_Poland_Kaggle_train.csv | wc -l')
+    os.system('cat Car_Prices_Poland_Kaggle_dev.csv | wc -l')
+    os.system('cat Car_Prices_Poland_Kaggle_test.csv | wc -l')
 
 
 def get_statistics(dataset):
@@ -57,31 +60,24 @@ def normalize_dataset(dataset):
 
     # drop columns
     dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True)
+    dataset = dataset.dropna()
 
     # normalize numbers to [0, 1]
     for column in dataset.columns:
         if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):
             dataset[column] = (dataset[column] - dataset[column].min()) / (
                     dataset[column].max() - dataset[column].min())
-
-    # There is no null rows
-    # dataset.isnull().sum()
-
 
     return dataset
 
 
-install_dependencies()
-
-import pandas as pd
-import os
-import numpy as np
-
-download_dataset()
+# install_dependencies()
+# download_dataset()
 unzip_package()
 
 cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
 
-normalize_dataset(cars)
-divide_dataset(cars)
-get_statistics(cars)
+df = pd.DataFrame(cars)
+df = normalize_dataset(df)
+divide_dataset(df)
+get_statistics(df)