update dockerfile, data_expl, add nn_train

This commit is contained in:
Kamila 2022-04-19 16:17:09 +02:00
parent 85feb828bd
commit b298b6299f
3 changed files with 81 additions and 3 deletions

View File

@ -7,9 +7,13 @@ WORKDIR /app
COPY ./data_expl.py ./ COPY ./data_expl.py ./
COPY ./googleplaystore.csv ./ COPY ./googleplaystore.csv ./
COPY ./nn_train.py ./
RUN pip3 install pandas RUN pip3 install pandas
RUN pip3 install numpy RUN pip3 install numpy
RUN pip3 install tensorflow
RUN pip3 install keras
RUN pip3 install sklearn
CMD python3 data_expl.py CMD python3 data_expl.py
CMD python3 nn_train.py

View File

@ -14,6 +14,10 @@ for column in to_lowercase:
data["Installs"] = data["Installs"].replace({'\+': ''}, regex=True) data["Installs"] = data["Installs"].replace({'\+': ''}, regex=True)
data["Installs"] = data["Installs"].replace({',': ''}, regex=True) data["Installs"] = data["Installs"].replace({',': ''}, regex=True)
data["Price"] = data["Price"].replace({'\$': ''}, regex=True)
data["Genres"] = data["Genres"].astype('category')
data["Genres_numeric_value"] = (data["Genres"].cat.codes).astype('float32')
# normalizing numbers # normalizing numbers
data["Reviews"] = pd.to_numeric(data["Reviews"], errors='coerce') data["Reviews"] = pd.to_numeric(data["Reviews"], errors='coerce')
@ -26,7 +30,13 @@ max_value = data["Installs"].max()
min_value = data["Installs"].min() min_value = data["Installs"].min()
data["Installs"] = (data["Installs"] - min_value) / (max_value - min_value) data["Installs"] = (data["Installs"] - min_value) / (max_value - min_value)
#print(data) data["Rating"] = np.asarray(data["Rating"]).astype('float32')
data["Reviews"] = np.asarray(data["Reviews"]).astype('float32')
data["Installs"] = np.asarray(data["Installs"]).astype('float32')
data["Price"] = np.asarray(data["Price"]).astype('float32')
print(data)
# splitting into sets # splitting into sets

64
nn_train.py Normal file
View File

@ -0,0 +1,64 @@
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
# reading data
def read_data():
    """Load the train, test and validation splits from the working directory.

    Reads ``apps_train.csv``, ``apps_test.csv`` and ``apps_validate.csv``
    (in that order), treating the first row of each file as the header.

    Returns:
        list: Three ``pandas.DataFrame`` objects, ordered train, test,
        validate.

    Raises:
        FileNotFoundError: If any of the three CSV files is missing.
    """
    return [
        pd.read_csv(f'apps_{name}.csv', header=0)
        for name in ('train', 'test', 'validate')
    ]
# Load the three splits and drop the "Unnamed: 0" column from each
# (presumably the DataFrame index that was written out with the CSV — confirm).
train_set, test_set, validate_set = read_data()
train_set = train_set.drop(columns=["Unnamed: 0"])
test_set = test_set.drop(columns=["Unnamed: 0"])
validate_set = validate_set.drop(columns=["Unnamed: 0"])

# Feature columns fed to the network; the target is the "Category" column.
numeric_columns = ["Rating", "Reviews", "Installs", "Price", "Genres_numeric_value"]

# A single encoder, fitted on the training labels only, is shared by every
# split. Fitting a separate encoder per split can assign different integer
# codes to the same category (and yield one-hot matrices of different widths)
# whenever a split does not contain every category.
encoder = LabelEncoder()
encoder.fit(train_set["Category"])

# Derive the model dimensions from the data instead of hard-coding them.
number_of_classes = len(encoder.classes_)
number_of_features = len(numeric_columns)

# train set set-up
x_train_set = train_set[numeric_columns]
y_train_set = train_set["Category"]
encoded_Y = encoder.transform(y_train_set)
dummy_y = np_utils.to_categorical(encoded_Y, num_classes=number_of_classes)

# validation set set-up
# NOTE: transform() raises ValueError if this split contains a category that
# never occurs in the training split.
x_validate_set = validate_set[numeric_columns]
y_validate_set = validate_set["Category"]
encoded_Yv = encoder.transform(y_validate_set)
dummy_yv = np_utils.to_categorical(encoded_Yv, num_classes=number_of_classes)

# test set set-up
x_test_set = test_set[numeric_columns]
y_test_set = test_set["Category"]
encoded_Ytt = encoder.transform(y_test_set)
dummy_ytt = np_utils.to_categorical(encoded_Ytt, num_classes=number_of_classes)

# Class names in the same (sorted) order as the encoder's integer codes, so
# index i of the softmax output corresponds to y_class_names[i].
y_class_names = encoder.classes_

# model definition
model = Sequential()
# The input dimension belongs on the first layer of the stack.
model.add(Dense(number_of_classes, activation='relu', input_dim=number_of_features))
model.add(Dense(number_of_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'categorical_accuracy'])

model.fit(x_train_set, dummy_y, epochs=200, validation_data=(x_validate_set, dummy_yv))
#model.save("my_model/")

# model predictions
#model = keras.models.load_model('my_model')
yhat = model.predict(x_test_set)

# Most probable class per test row, mapped back to its category name.
predicted_names = y_class_names[np.argmax(yhat, axis=1)]

# Write one line per test row: predicted label, actual label, and whether
# they match. The context manager closes the file even if a write fails.
with open("results.txt", "w") as f:
    for predicted, actual in zip(predicted_names, y_test_set):
        f.write(f"PREDICTED: {predicted}, ACTUAL: {actual} {predicted == actual}\n")