This commit is contained in:
Adam Wojdyla 2022-04-02 17:23:10 +02:00
parent 3f3d593e77
commit 3aefa5d8cd
3 changed files with 9 additions and 44 deletions

View File

@@ -15,15 +15,8 @@ RUN python3 -m pip install pandas
RUN python3 -m pip freeze
# ENV PATH="/root/.local/bin:${PATH}"
COPY . .
# COPY ./figlet-loop.sh ./
# COPY ./download.sh ./
# COPY ./script.py ./
# COPY ./kaggle.json /root/.kaggle/kaggle.json
ARG KAGGLE_USERNAME=testKAGGLE_USERNAME
ARG KAGGLE_KEY=test1KAGGLE_KEY

View File

@@ -1,24 +0,0 @@
# Image definition removed by this commit: an Ubuntu base with figlet,
# Python 3, the kaggle and pandas packages, and the build context under /app.
# Base image uses the floating `latest` tag (not pinned to a release).
FROM ubuntu:latest
# Relative paths in the instructions below resolve against /app.
WORKDIR /app
# System packages: figlet, Python 3 with pip, and unzip.
RUN apt-get update && apt-get install -y figlet python3 python3-pip unzip
# Install the kaggle and pandas Python packages.
RUN pip3 install kaggle
RUN pip3 install pandas
# ENV PATH="/root/.local/bin:${PATH}"
# Copy the entire build context into the working directory.
COPY . .
# NOTE(review): these three copies look redundant after `COPY . .`
# unless a .dockerignore excludes the files — confirm.
COPY ./figlet-loop.sh ./
COPY ./download.sh ./
COPY ./script.py ./
# COPY ./kaggle.json /root/.kaggle/kaggle.json
# Build arguments with placeholder defaults; presumably overridden with real
# Kaggle credentials via --build-arg — confirm against the build pipeline.
ARG KAGGLE_USERNAME=testKAGGLE_USERNAME
ARG KAGGLE_KEY=test1KAGGLE_KEY
# Print the installed Python packages during the build.
RUN pip freeze
# Give the file owner execute permission on the script.
RUN chmod u+x ./script.py
# RUN ./download.sh 117928
# RUN python3 ./script.py

View File

@@ -35,25 +35,25 @@ def divide_dataset(dataset):
os.system('tail -n +2 Car_Prices_Poland_Kaggle.csv | shuf > ./Car_Prices_Poland_Kaggle_shuf.csv')
len1 = len(dataset) // 6
len2 = (len1 * 2) +1
len2 = (len1 * 2) + 1
os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv f > Car_Prices_Poland_Kaggle_dev.csv')
os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv| tail -n {len1} > Car_Prices_Poland_Kaggle_test.csv')
os.system(f'tail -n +{len2} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_train.csv')
os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_dev.csv')
os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv | tail -n {len1} > Car_Prices_Poland_Kaggle_test.csv')
os.system(f'tail -n +{len2} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_train.csv')
os.system('rm ./Car_Prices_Poland_Kaggle_shuf.csv')
print("Len match: " + str(sum([len1 * 2, len2]) == len(dataset)))
os.system('cat Car_Prices_Poland_Kaggle_train.csv | wc -l')
os.system('cat Car_Prices_Poland_Kaggle_dev.csv | wc -l')
os.system('cat Car_Prices_Poland_Kaggle_test.csv | wc -l')
print('Dataset devided')
def get_statistics(dataset):
"""Mean, min, max, median etc."""
print(f'--------------- Dataset length ---------------')
print(f'--------------- Normalized dataset length ---------------')
print(len(dataset))
print(f'---------------Describe dataset---------------')
@@ -64,6 +64,9 @@ def get_statistics(dataset):
def normalize_dataset(dataset):
"""Drop unnecessary columns and set numeric values to [0,1] range"""
print(f'--------------- Initial dataset length ---------------')
print(len(dataset))
# drop columns
dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True)
dataset = dataset.dropna()
@@ -76,9 +79,6 @@ def normalize_dataset(dataset):
return dataset
# install_dependencies()
print(os.system('python3 -m pip freeze'))
download_dataset()
@@ -88,7 +88,3 @@ df = pd.DataFrame(cars)
df = normalize_dataset(df)
divide_dataset(df)
get_statistics(df)