diff --git a/Dockerfile b/Dockerfile
index 1a9839b..e44fa55 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,9 +7,13 @@ WORKDIR /app
 
 COPY ./data_expl.py ./
 COPY ./googleplaystore.csv ./
+COPY ./nn_train.py ./
 
 RUN pip3 install pandas
 RUN pip3 install numpy
+RUN pip3 install tensorflow
+RUN pip3 install keras
+RUN pip3 install scikit-learn
 
-
-CMD python3 data_expl.py
\ No newline at end of file
+# Only the last CMD of a Dockerfile is used, so both steps share one CMD; training reads the apps_*.csv splits, presumably written by data_expl.py (confirm).
+CMD python3 data_expl.py && python3 nn_train.py
diff --git a/data_expl.py b/data_expl.py
index 82e7860..8ff9648 100644
--- a/data_expl.py
+++ b/data_expl.py
@@ -14,6 +14,10 @@ for column in to_lowercase:
 
 data["Installs"] = data["Installs"].replace({'\+': ''}, regex=True)
 data["Installs"] = data["Installs"].replace({',': ''}, regex=True)
+data["Price"] = data["Price"].replace({r'\$': ''}, regex=True)
+
+data["Genres"] = data["Genres"].astype('category')
+data["Genres_numeric_value"] = (data["Genres"].cat.codes).astype('float32')
 
 # normalizing numbers
 data["Reviews"] = pd.to_numeric(data["Reviews"], errors='coerce')
@@ -26,7 +30,13 @@ max_value = data["Installs"].max()
 min_value = data["Installs"].min()
 data["Installs"] = (data["Installs"] - min_value) / (max_value - min_value)
 
-#print(data)
+data["Rating"] = np.asarray(data["Rating"]).astype('float32')
+data["Reviews"] = np.asarray(data["Reviews"]).astype('float32')
+data["Installs"] = np.asarray(data["Installs"]).astype('float32')
+data["Price"] = np.asarray(data["Price"]).astype('float32')
+
+
+print(data)
 
 
 # splitting into sets
diff --git a/nn_train.py b/nn_train.py
new file mode 100644
index 0000000..6f58717
--- /dev/null
+++ b/nn_train.py
@@ -0,0 +1,68 @@
+"""Train a dense classifier for an app's Category and log test predictions.
+
+Reads apps_train.csv, apps_test.csv and apps_validate.csv from the working
+directory (presumably written by data_expl.py -- TODO confirm) and writes one
+line per test row to results.txt.
+"""
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.utils import to_categorical
+
+
+def read_data():
+    """Return the train, test and validate splits as DataFrames, in that order."""
+    return [pd.read_csv(f'apps_{name}.csv', header=0)
+            for name in ['train', 'test', 'validate']]
+
+
+# Every split carries an "Unnamed: 0" column that is not a feature
+# (presumably the index saved by to_csv -- confirm), so it is dropped.
+train_set, test_set, validate_set = (
+    split.drop(columns=["Unnamed: 0"]) for split in read_data()
+)
+numeric_columns = ["Rating", "Reviews", "Installs", "Price", "Genres_numeric_value"]
+
+x_train_set = train_set[numeric_columns]
+x_validate_set = validate_set[numeric_columns]
+x_test_set = test_set[numeric_columns]
+y_train_set = train_set["Category"]
+y_validate_set = validate_set["Category"]
+y_test_set = test_set["Category"]
+
+# One encoder shared by all splits: a LabelEncoder fitted per split assigns
+# different indices (and one-hot widths) as soon as a category is missing from
+# a split, so the targets would no longer line up with the model outputs.
+encoder = LabelEncoder()
+encoder.fit(pd.concat([y_train_set, y_validate_set, y_test_set]))
+y_class_names = encoder.classes_
+number_of_classes = len(y_class_names)
+number_of_features = len(numeric_columns)
+
+
+def one_hot(labels):
+    """One-hot encode labels with the shared encoder and a fixed class count."""
+    return to_categorical(encoder.transform(labels), num_classes=number_of_classes)
+
+
+dummy_y = one_hot(y_train_set)
+dummy_yv = one_hot(y_validate_set)
+dummy_ytt = one_hot(y_test_set)
+
+# model definition
+model = Sequential()
+# The input size is declared on the first layer, the one that receives the features.
+model.add(Dense(number_of_classes, activation='relu', input_dim=number_of_features))
+model.add(Dense(number_of_classes, activation='softmax'))
+model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'categorical_accuracy'])
+model.fit(x_train_set, dummy_y, epochs=200, validation_data=(x_validate_set, dummy_yv))
+
+# model predictions
+yhat = model.predict(x_test_set)
+predicted_names = encoder.inverse_transform(np.argmax(yhat, axis=1))
+with open("results.txt", "w") as f:
+    for predicted, actual in zip(predicted_names, y_test_set):
+        f.write(f"PREDICTED: {predicted}, ACTUAL: {actual} {predicted == actual}\n")