patrycjalazna 2021-04-25 19:08:03 +02:00
parent fad654ca64
commit 78645d7188
5 changed files with 6771 additions and 28 deletions

.idea/vcs.xml Normal file (+6)

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

Dockerfile

@@ -4,12 +4,28 @@ RUN apt update
 RUN apt install -y python3 python3-pip
 RUN pip3 install kaggle
-RUN apt install -y unzip
+RUN pip3 install pandas
+RUN pip3 install tensorflow
+RUN pip3 install scikit-learn
+RUN pip3 install pandas
+# RUN apt install -y unzip
 RUN mkdir /.kaggle
 RUN chmod -R 777 /.kaggle
 WORKDIR /app
-COPY ./avocado-preprocessing.sh ./
-RUN chmod +x avocado-preprocessing.sh
-CMD ./avocado-preprocessing.sh
+COPY ./requirements.txt ./
+COPY ./avocado-preprocessing.py ./
+RUN chmod +x ./requirements.txt
+RUN chmod +x ./avocado-preprocessing.py
+RUN pip3 install -r ./requirements.txt
+CMD python3 avocado-preprocessing.py
+# COPY ./avocado-preprocessing.sh ./
+# RUN chmod +x avocado-preprocessing.sh
+# CMD ./avocado-preprocessing.sh

avocado-preprocessing.py

@@ -2,42 +2,34 @@ import kaggle
 import pandas as pd
 import numpy as np
 from sklearn import preprocessing
+from sklearn.linear_model import LinearRegression
+import tensorflow as tf
+from tensorflow.keras.layers import Input, Dense, Activation, Dropout
+from tensorflow.keras.models import Model
+from tensorflow.keras.callbacks import EarlyStopping
+from keras.models import Sequential
+device = 'cpu'
 # kaggle
 kaggle.api.authenticate()
 kaggle.api.dataset_download_files('timmate/avocado-prices-2020', path='.', unzip=True)
+# load the data
 avocado_with_year = pd.read_csv('avocado-updated-2020.csv')
+# drop the redundant 'year' column and map 'type' to 0 or 1
 new = ['date', 'average_price', 'total_volume', '4046', '4225', '4770', 'total_bags', 'small_bags', 'large_bags', 'xlarge_bags', 'type', 'geography']
 avocado = avocado_with_year[new]
 avocado.to_csv("avocado.csv", index=False)
 avocado = pd.read_csv('avocado.csv')
+avocado['type'] = avocado.type.map(dict(organic=1, conventional=0))
-avocado_train, avocado_validate, avocado_test = np.split(avocado.sample(frac=1), [int(.6*len(avocado)), int(.8*len(avocado))])
-print("Avocado: ".ljust(20), np.size(avocado))
-print("Avocado (train) : ".ljust(20), np.size(avocado_train))
-print("Avocado (validate): ".ljust(20), np.size(avocado_validate))
-print("Avocado (test) ".ljust(20), np.size(avocado_test))
-avocado.describe(include = 'all')
-avocado_train.describe(include= 'all')
-avocado_validate.describe(include = 'all')
-avocado_test.describe(include = 'all')
-avocado.geography.value_counts()
-avocado_test.geography.value_counts()
-avocado_train.geography.value_counts()
-pd.value_counts(avocado['type']).plot.bar()
-pd.value_counts(avocado_train['type']).plot.bar()
-pd.value_counts(avocado_test['type']).plot.bar()
-avocado['average_price'].hist()
-avocado_train['average_price'].hist()
-avocado_validate['average_price'].hist()
-avocado_test['average_price'].hist()
+# drop rows with empty values
+avocado.isnull().sum()
+avocado.dropna()
+# preprocessing
 num_values = avocado.select_dtypes(include='float64').values
 scaler = preprocessing.MinMaxScaler()
 x_scaled = scaler.fit_transform(num_values)
@@ -47,5 +39,66 @@ for col in avocado.columns:
     if col in num_columns:
         avocado[col] = avocado_normalized[col]
-avocado.isnull().sum()
-avocado.dropna()
+avocado_normalized['type'] = avocado['type']
+avocado_normalized['geography'] = avocado['geography']
+# train/dev/test split
+avocado_train, avocado_validate, avocado_test = np.split(avocado_normalized.sample(frac=1), [int(.6*len(avocado_normalized)), int(.8*len(avocado_normalized))])
+print("Avocado: ".ljust(20), np.size(avocado_normalized))
+print("Avocado (train) : ".ljust(20), np.size(avocado_train))
+print("Avocado (validate): ".ljust(20), np.size(avocado_validate))
+print("Avocado (test) ".ljust(20), np.size(avocado_test))
+# inspect the data
+avocado_normalized.describe(include = 'all')
+avocado_train.describe(include= 'all')
+avocado_validate.describe(include = 'all')
+avocado_test.describe(include = 'all')
+avocado_normalized.geography.value_counts()
+avocado_test.geography.value_counts()
+avocado_train.geography.value_counts()
+pd.value_counts(avocado_normalized['type']).plot.bar()
+pd.value_counts(avocado_train['type']).plot.bar()
+pd.value_counts(avocado_test['type']).plot.bar()
+avocado_normalized['average_price'].hist()
+avocado_train['average_price'].hist()
+avocado_validate['average_price'].hist()
+avocado_test['average_price'].hist()
+# print(avocado_train[:10])
+# print(avocado_test[:10])
+print(avocado_normalized)
+# split into X and y
+X_train = avocado_train[['average_price', 'total_volume', '4046', '4225', '4770', 'total_bags', 'small_bags', 'large_bags', 'xlarge_bags']]
+y_train = avocado_train[['type']]
+X_test = avocado_test[['average_price', 'total_volume', '4046', '4225', '4770', 'total_bags', 'small_bags', 'large_bags', 'xlarge_bags']]
+y_test = avocado_test[['type']]
+print(X_train.shape[1])
+# keras model
+model = Sequential()
+model.add(Dense(9, input_dim = X_train.shape[1], kernel_initializer='normal', activation='relu'))
+model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
+early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=10)
+# compile the model
+model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+# model fit
+model.fit(X_train, y_train, epochs=30, batch_size=10, validation_data=(X_test, y_test))
+# predict
+predictions = model.predict(X_test)
+pd.DataFrame(predictions).to_csv('prediction_results.csv')
+# evaluation
+# _, accuracy = model.evaluate(y_test, predictions)
+# print('Accuracy: %.2f' % (accuracy*100))

prediction_results.csv Normal file (+6610)

File diff suppressed because it is too large

requirements.txt Normal file (+58)

@@ -0,0 +1,58 @@
appnope==0.1.0
attrs==19.1.0
backcall==0.1.0
bleach==3.1.0
click==7.1.2
csv-diff==1.0
cycler==0.10.0
decorator==4.4.0
defusedxml==0.6.0
dictdiffer==0.8.1
entrypoints==0.3
ipykernel==5.1.1
ipython==7.5.0
ipython-genutils==0.2.0
ipywidgets==7.4.2
jedi==0.13.3
jsonschema==3.0.1
jupyter==1.0.0
jupyter-client==5.2.4
jupyter-console==6.0.0
jupyter-core==4.4.0
kiwisolver==1.1.0
matplotlib==3.1.0
mistune==0.8.4
mpmath==1.1.0
nbconvert==5.5.0
nbformat==4.4.0
nose==1.3.7
notebook==5.7.8
numpy==1.16.4
pandas==0.24.2
pandocfilters==1.4.2
parso==0.4.0
pexpect==4.7.0
pickleshare==0.7.5
prometheus-client==0.7.0
prompt-toolkit==2.0.9
ptyprocess==0.6.0
pygame==1.9.4
Pygments==2.4.2
pyparsing==2.4.0
pyrsistent==0.15.2
python-dateutil==2.8.0
pytz==2019.1
pyzmq==18.0.1
qtconsole==4.5.1
scipy==1.3.0
Send2Trash==1.5.0
six==1.12.0
sympy==1.4
terminado==0.8.2
testpath==0.4.2
tornado==6.0.2
traitlets==4.3.2
wcwidth==0.1.7
webencodings==0.5.1
widgetsnbextension==3.4.2
xlrd==1.2.0