From e575276d819cdd1f4ef0a5d992ac0705e50ff475 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krzysztof=20Raczy=C5=84ski?=
Date: Sun, 28 Apr 2024 20:29:40 +0200
Subject: [PATCH] IUM_05

---
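Note (this section sits before the diffstat and is ignored by git am): a
minimal sketch of how the image might be built and the dataset step run on
its own, assuming a valid kaggle.json API token sits next to the Dockerfile;
the image tag ium05 and the split size of 1000 rows are arbitrary example
values:

    docker build -t ium05 .
    docker run --rm ium05 bash create-dataset.sh 1000

create-dataset.sh takes one positional argument: the number of rows in each
of the test and dev sets; the remaining rows become the training set.
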
 Dockerfile        | 20 ++++++++++++++++++++
 create-dataset.sh | 68 ++++++++++++++++++++++++++++++++++-------------------
 model.py          | 37 ++++++++++++++++++++++++++
 predict.py        | 29 +++++++++++++++++++++
 4 files changed, 125 insertions(+), 29 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 model.py
 create mode 100644 predict.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..ed19e8c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+FROM ubuntu:latest
+
+# Install the required system tools
+RUN apt-get update && apt-get install -y python3-pip python3-venv unzip coreutils dos2unix
+
+# Create and activate a virtual environment
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Install the Python libraries inside the virtual environment
+RUN pip install numpy tensorflow scikit-learn kaggle pandas
+
+WORKDIR /app
+
+# Copy the configuration file and the scripts
+COPY kaggle.json /root/.kaggle/kaggle.json
+COPY ./create-dataset.sh ./
+RUN dos2unix ./create-dataset.sh
+COPY ./model.py ./
+COPY ./predict.py ./
diff --git a/create-dataset.sh b/create-dataset.sh
index 41102c3..d979c87 100644
--- a/create-dataset.sh
+++ b/create-dataset.sh
@@ -1,29 +1,39 @@
-#!/bin/bash
-
-pip install kaggle
-
-kaggle datasets download -d syedanwarafridi/vehicle-sales-data
-
-unzip -o vehicle-sales-data.zip
-
-# Shuffle the rows
-shuf car_prices.csv -o car_prices_shuf.csv
-
-# Split the data into subsets
-total_rows=$(wc -l < car_prices_shuf.csv)
-test_dev_rows=$(( $1 * 2 ))
-
-head -n $1 car_prices_shuf.csv > car_prices_test.csv
-head -n $test_dev_rows car_prices_shuf.csv | tail -n +$(( $1 + 1 )) > car_prices_dev.csv
-tail -n +$(( $test_dev_rows + 1 )) car_prices_shuf.csv > car_prices_train.csv
-
-test_size=$(wc -l < car_prices_test.csv)
-dev_size=$(wc -l < car_prices_dev.csv)
-train_size=$(wc -l < car_prices_train.csv)
-echo "Test set size: $test_size"
-echo "Dev set size: $dev_size"
-echo "Training set size: $train_size"
-
-# Save the artifacts
-mkdir -p data
-mv car_prices.csv car_prices_shuf.csv car_prices_test.csv car_prices_dev.csv car_prices_train.csv data/
+#!/bin/bash
+
+export PATH=$PATH:/root/.local/bin
+
+pip install kaggle
+
+kaggle datasets download -d syedanwarafridi/vehicle-sales-data
+
+unzip -o vehicle-sales-data.zip
+
+# Keep the header aside, drop rows with empty fields (adjacent commas), shuffle the rest
+head -n 1 car_prices.csv > car_prices_header.csv
+tail -n +2 car_prices.csv | awk -F, '!/,,/' | shuf > car_prices_no_null.csv
+
+# Split the data into subsets ($1 = number of rows in each of test and dev)
+total_rows=$(wc -l < car_prices_no_null.csv)
+test_dev_rows=$(( $1 * 2 ))
+
+head -n $1 car_prices_no_null.csv > car_prices_test_temp.csv
+head -n $test_dev_rows car_prices_no_null.csv | tail -n +$(( $1 + 1 )) > car_prices_dev_temp.csv
+tail -n +$(( $test_dev_rows + 1 )) car_prices_no_null.csv > car_prices_train_temp.csv
+
+# Put the header back on each subset
+cat car_prices_header.csv car_prices_test_temp.csv > car_prices_test.csv
+cat car_prices_header.csv car_prices_dev_temp.csv > car_prices_dev.csv
+cat car_prices_header.csv car_prices_train_temp.csv > car_prices_train.csv
+
+rm car_prices_test_temp.csv car_prices_dev_temp.csv car_prices_train_temp.csv car_prices_no_null.csv car_prices_header.csv
+
+test_size=$(wc -l < car_prices_test.csv)
+dev_size=$(wc -l < car_prices_dev.csv)
+train_size=$(wc -l < car_prices_train.csv)
+echo "Test set size: $test_size"
+echo "Dev set size: $dev_size"
+echo "Training set size: $train_size"
+
+# Save the artifacts
+mkdir -p data
+mv car_prices.csv car_prices_test.csv car_prices_dev.csv car_prices_train.csv data/
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..b2ffddc
--- /dev/null
+++ b/model.py
@@ -0,0 +1,37 @@
+import pandas as pd
+import numpy as np
+from tensorflow.keras import Sequential
+from tensorflow.keras.layers import Dense
+from sklearn.preprocessing import MinMaxScaler
+
+train_data = pd.read_csv('./data/car_prices_train.csv')
+
+train_data.dropna(inplace=True)
+
+y_train = train_data['sellingprice'].astype(np.float32)
+
+# Copy the feature slice so the scaling below does not trigger SettingWithCopyWarning
+X_train = train_data[['year', 'condition', 'transmission']].copy()
+
+# Scale the numeric feature into [0, 1]
+scaler_x = MinMaxScaler()
+X_train['condition'] = scaler_x.fit_transform(X_train[['condition']])
+
+# Scale the target as well; predictions must be inverse-transformed later
+scaler_y = MinMaxScaler()
+y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
+
+# One-hot encode the categorical column and cast all features to float32
+X_train = pd.get_dummies(X_train, columns=['transmission']).astype(np.float32)
+
+model = Sequential([
+    Dense(64, activation='relu'),
+    Dense(32, activation='relu'),
+    Dense(1)
+])
+
+model.compile(optimizer='adam', loss='mean_squared_error')
+
+model.fit(X_train, y_train, epochs=20, batch_size=32)
+
+model.save('car_prices_predict_model.h5')
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..3059f66
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,29 @@
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+from sklearn.preprocessing import MinMaxScaler
+
+test_data = pd.read_csv('./data/car_prices_test.csv')
+test_data.dropna(inplace=True)
+
+y_test = test_data['sellingprice'].astype(np.float32)
+X_test = test_data[['year', 'condition', 'transmission']].copy()
+
+# Re-fitting the scalers on the test data only approximates the training-time
+# scaling; persisting the scalers from model.py would be exact
+scaler_y = MinMaxScaler()
+scaler_y.fit(y_test.values.reshape(-1, 1))
+
+scaler_X = MinMaxScaler()
+X_test['condition'] = scaler_X.fit_transform(X_test[['condition']])
+X_test = pd.get_dummies(X_test, columns=['transmission']).astype(np.float32)  # assumes the same transmission categories as in training
+
+model = tf.keras.models.load_model('car_prices_predict_model.h5')
+
+y_pred_scaled = model.predict(X_test)
+
+# Map the predictions back to the original price scale
+y_pred = scaler_y.inverse_transform(y_pred_scaled)
+
+y_pred_df = pd.DataFrame(y_pred, columns=['PredictedSellingPrice'])
+y_pred_df.to_csv('predicted_selling_prices.csv', index=False)
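-- 
A possible end-to-end smoke test (hypothetical, not part of the patch),
chaining the three steps in one container run so that model.py finds data/
and predict.py finds the saved model:

    docker run --rm ium05 bash -c "bash create-dataset.sh 1000 && python3 model.py && python3 predict.py"

model.py trains and saves car_prices_predict_model.h5; predict.py then
writes the predictions to predicted_selling_prices.csv.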