2021-05-20 23:48:25 +02:00
|
|
|
import warnings
|
|
|
|
|
2021-04-24 21:18:57 +02:00
|
|
|
import pandas as pd
|
2021-04-24 22:23:04 +02:00
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
from tensorflow import keras
|
2021-05-17 21:36:02 +02:00
|
|
|
import sys
|
2021-05-22 17:10:35 +02:00
|
|
|
import mlflow
|
|
|
|
import mlflow.models
|
2021-05-20 23:48:25 +02:00
|
|
|
import logging
|
|
|
|
|
|
|
|
from evaluate_network import evaluate_model
|
|
|
|
|
|
|
|
# Keep library log output quiet: only WARNING and above reach the console.
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# Tracking server reached via the Docker bridge gateway address; all runs
# from this script are grouped under the "s434765" experiment.
mlflow.set_tracking_uri("http://172.17.0.1:5000")
mlflow.set_experiment("s434765")

# Suppress warnings in the training log.
warnings.filterwarnings("ignore")
# Fix the NumPy RNG for reproducibility.
# NOTE(review): this seeds NumPy only, not TensorFlow/Keras weight init —
# confirm whether fully deterministic runs are expected.
np.random.seed(40)
|
2021-05-17 19:24:30 +02:00
|
|
|
|
|
|
|
def normalize_data(data):
    """Min-max scale *data* into the [0, 1] range.

    Parameters
    ----------
    data : array-like (numpy array or pandas Series/DataFrame)
        Numeric values to rescale.

    Returns
    -------
    Same type as *data*, linearly mapped so the minimum becomes 0 and the
    maximum becomes 1. A constant input (max == min) is mapped to all
    zeros instead of dividing by zero and producing NaNs.
    """
    min_val = np.min(data)
    span = np.max(data) - min_val
    if np.all(span == 0):
        # Guard: a constant series has no range; return zeros rather than
        # the NaNs that 0/0 would yield.
        return data - min_val
    return (data - min_val) / span
|
|
|
|
|
|
|
|
|
2021-05-17 21:04:57 +02:00
|
|
|
# Load the training split of the trending-videos dataset. Malformed rows
# are skipped (error_bad_lines=False) and rows with missing values dropped.
data = pd.read_csv("data_train", sep=',', skip_blank_lines=True, nrows=1087, error_bad_lines=False,
                   names=["video_id", "last_trending_date", "publish_date", "publish_hour", "category_id",
                          "channel_title", "views", "likes", "dislikes", "comment_count"]).dropna()

# Single-feature regression: predict "likes" from "views".
X = data.loc[:, data.columns == "views"].astype(int)
y = data.loc[:, data.columns == "likes"].astype(int)

# Min-max scale the feature with the shared helper (previously this logic
# was duplicated inline). The bounds are printed so predictions can later
# be de-normalized with the same constants.
min_val_sub = np.min(X)
max_val_sub = np.max(X)
X = normalize_data(X)
print(min_val_sub)
print(max_val_sub)

# Same scaling for the target.
min_val_like = np.min(y)
max_val_like = np.max(y)
y = normalize_data(y)
print(min_val_like)
print(max_val_like)
|
2021-04-24 22:23:04 +02:00
|
|
|
|
2021-05-20 23:48:25 +02:00
|
|
|
with mlflow.start_run() as run:
    # Echo run identifiers so console output can be matched to the
    # tracking-server UI.
    print("MLflow run experiment_id: {0}".format(run.info.experiment_id))
    print("MLflow run artifact_uri: {0}".format(run.info.artifact_uri))

    # Auto-capture Keras params/metrics in addition to the explicit
    # logging below.
    mlflow.keras.autolog()
    # Epoch count comes from the first CLI argument.
    mlflow.log_param("epochs", int(sys.argv[1]))

    # Fully-connected regression network: normalized views -> likes.
    model = keras.Sequential([
        keras.layers.Dense(512, input_dim=X.shape[1], activation='relu'),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(1, activation='linear'),
    ])

    model.compile(loss='mean_absolute_error', optimizer="Adam", metrics=['mean_absolute_error'])

    model.fit(X, y, epochs=int(sys.argv[1]), validation_split=0.3)

    # Persist the trained network for evaluate_model() to pick up.
    model.save('model')

    # evaluate_model() (evaluate_network.py) scores the saved model; the
    # resulting error is logged as "rmse".
    error = evaluate_model()
    mlflow.log_metric("rmse", error)

    # BUG FIX: the signature must pair the model INPUT (X) with the model
    # output on that same input. The original called model.predict(y),
    # feeding the targets to the network instead of the features.
    signature = mlflow.models.signature.infer_signature(X, model.predict(X))

    # Dev split, loaded only to supply an input_example for the logged model.
    data = pd.read_csv("data_dev", sep=',', error_bad_lines=False,
                       skip_blank_lines=True, nrows=527,
                       names=["video_id", "last_trending_date",
                              "publish_date", "publish_hour", "category_id",
                              "channel_title", "views", "likes", "dislikes",
                              "comment_count"]).dropna()
    X_test = data.loc[:, data.columns == "views"].astype(int)

    # NOTE(review): model.save('model') above already created the 'model'
    # directory, and mlflow.keras.save_model refuses to write to an
    # existing path — confirm the intended output location for the MLflow
    # model artifact.
    mlflow.keras.save_model(model, "model", signature=signature, input_example=X_test)
|