dockerfile

This commit is contained in:
Adam Wojdyla 2022-03-31 22:55:56 +02:00
parent bd18c8b09d
commit 6cbad8e595
4 changed files with 54 additions and 26 deletions

3
.gitignore vendored
View File

@ -150,3 +150,6 @@ crashlytics.properties
crashlytics-build.properties
fabric.properties
# kaggle
.kaggle.json
# The dockerfile copies ./kaggle.json (no leading dot), so ignore that name
# as well to keep the API token out of version control.
kaggle.json

26
dockerfile Normal file
View File

@ -0,0 +1,26 @@
# The image inherits from the latest Ubuntu image.
# NOTE(review): "latest" is a moving tag; pin a release (e.g. ubuntu:22.04) for
# reproducible builds. Releases whose system Python is externally managed
# (PEP 668) reject the plain "pip3 install --user" call below.
FROM ubuntu:latest
# Install the required system packages; "-y" answers yes to every prompt.
# apt-get is used because the "apt" CLI is not meant for scripts, the
# noninteractive frontend keeps package configuration from blocking the build,
# and the package index is removed afterwards to keep the layer small.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y figlet python3 python3-pip unzip \
    && rm -rf /var/lib/apt/lists/*
# Python dependencies are installed for the root user (into /root/.local) in a
# single layer, without keeping pip's download cache in the image.
RUN pip3 install --user --no-cache-dir kaggle pandas
# Put the executables installed by "pip --user" on PATH.
ENV PATH="/root/.local/bin:${PATH}"
# Create /app in the container if it does not exist and switch to it; all
# following RUN, CMD, ENTRYPOINT, COPY and ADD instructions run relative to it.
WORKDIR /app
# Copy the scripts into /app inside the container.
COPY ./figlet-loop.sh ./download.sh ./script.py ./
# NOTE(review): this bakes the Kaggle API token into an image layer; do not
# publish the resulting image. Prefer a build secret or a runtime mount.
COPY ./kaggle.json /root/.kaggle/kaggle.json
# Restrict the token file to its owner.
RUN chmod 600 /root/.kaggle/kaggle.json
# Download the dataset and run the processing script at build time.
RUN ./download.sh 117928
RUN python3 ./script.py
# No default command is set because the script already ran during the build.
# To run it at container start instead, use: CMD ["python3", "./script.py"]

4
figlet-loop.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# Render every line read from standard input as a figlet banner.
#
# "IFS=" preserves leading/trailing whitespace and "-r" keeps backslashes
# literal. The "|| [[ -n ... ]]" clause also processes a final line that is
# not terminated by a newline, which plain "read" would drop.
while IFS= read -r line || [[ -n "$line" ]]; do
    figlet "$line"
done

View File

@ -1,5 +1,8 @@
import subprocess
import sys
import pandas as pd
import os
import numpy as np
def install_dependencies():
@ -24,21 +27,20 @@ def download_dataset():
def divide_dataset(dataset):
    """Split the downloaded CSV into shuffled dev, test and train files.

    Reads ``Car_Prices_Poland_Kaggle.csv`` from the current directory, drops
    its header row, shuffles the remaining rows and writes them (without a
    header) to ``Car_Prices_Poland_Kaggle_dev.csv``,
    ``Car_Prices_Poland_Kaggle_test.csv`` and
    ``Car_Prices_Poland_Kaggle_train.csv``.  Dev and test each receive
    ``len(dataset) // 6`` rows and never overlap; train receives every
    remaining row.

    Prints whether the number of rows written equals ``len(dataset)``,
    followed by the row counts of the train, dev and test files.

    Args:
        dataset: Sized object (e.g. the loaded DataFrame).  Only its length
            is used, to size the dev and test parts.

    Raises:
        FileNotFoundError: If the source CSV is not in the current directory.
    """
    import random  # function-scope import: the file's import block stays untouched

    with open('Car_Prices_Poland_Kaggle.csv', 'rb') as source:
        # Rows are handled as raw bytes, so no text encoding is assumed.
        # Every row is forced to end with a newline so that an unterminated
        # final line cannot merge with its neighbour once shuffled.
        rows = [row if row.endswith(b'\n') else row + b'\n' for row in source]
    rows = rows[1:]  # drop the header row
    random.shuffle(rows)

    part_len = len(dataset) // 6
    # Insertion order (train, dev, test) is also the order the counts are
    # reported in below.
    parts = {
        'train': rows[2 * part_len:],
        'dev': rows[:part_len],
        'test': rows[part_len:2 * part_len],
    }
    for split_name, split_rows in parts.items():
        with open(f'Car_Prices_Poland_Kaggle_{split_name}.csv', 'wb') as target:
            target.writelines(split_rows)

    written = sum(len(split_rows) for split_rows in parts.values())
    print("Len match: " + str(written == len(dataset)))
    for split_rows in parts.values():
        print(len(split_rows))
def get_statistics(dataset):
@ -57,31 +59,24 @@ def normalize_dataset(dataset):
# drop columns
dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True)
dataset = dataset.dropna()
# normalize numbers to [0, 1]
for column in dataset.columns:
if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):
dataset[column] = (dataset[column] - dataset[column].min()) / (
dataset[column].max() - dataset[column].min())
# There is no null rows
# dataset.isnull().sum()
return dataset
# Top-level driver: prepares the CSV, loads it, normalizes it, splits it and
# prints statistics.
# NOTE(review): this span appears to interleave the pre- and post-commit
# versions of the driver (the diff markers were lost), which is why the
# normalize/divide/statistics steps show up twice - confirm against script.py.
install_dependencies()
# NOTE(review): these imports follow the install_dependencies() call -
# presumably so the packages exist before they are imported; TODO confirm.
import pandas as pd
import os
import numpy as np
download_dataset()
# install_dependencies()
# download_dataset()
unzip_package()
cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
# NOTE(review): the return value is discarded here, although the visible body
# of normalize_dataset ends with "return dataset" - verify this is intended.
normalize_dataset(cars)
divide_dataset(cars)
get_statistics(cars)
# Same pipeline again, this time keeping the frame returned by
# normalize_dataset and passing it to the later steps.
df = pd.DataFrame(cars)
df = normalize_dataset(df)
divide_dataset(df)
get_statistics(df)