ium_444501/biblioteki_ml.py

import sys
from urllib.parse import urlparse
import numpy as np
import mlflow
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# MLflow configuration: tracking server address and target experiment.
mlflow.set_tracking_uri("http://172.17.0.1:5000")
# The original experiment "s444501" was deleted by accident and could not be
# restored, so a second one without the leading "s" is used instead.
mlflow.set_experiment("444501")


# Command-line parameters: the first positional argument is the epoch count.
try:
    epochs = int(sys.argv[1])
except (IndexError, ValueError):
    # IndexError: no argument was given; ValueError: it is not an integer.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    print('No epoch number passed. Defaulting to 100')
    epochs = 100


# Model
class Model(nn.Module):
    """Fully connected network with two ReLU hidden layers and a linear output.

    The output layer applies no activation, so ``forward`` returns raw scores
    with ``output_features`` values per input row.
    """

    def __init__(self, input_features=2, hidden_layer1=60, hidden_layer2=90, output_features=3):
        """Create the three linear layers with the given widths."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_features, out_features=hidden_layer1)
        self.fc2 = nn.Linear(in_features=hidden_layer1, out_features=hidden_layer2)
        self.out = nn.Linear(in_features=hidden_layer2, out_features=output_features)

    def forward(self, x):
        """Run ``x`` through both hidden layers and return the output scores."""
        activations = x
        for hidden in (self.fc1, self.fc2):
            activations = F.relu(hidden(activations))
        return self.out(activations)


def _load_reviews(path):
    """Read one review CSV and keep only the columns used for training.

    A copy is returned so that later column assignments modify an independent
    frame instead of a slice of the full CSV contents.
    """
    frame = pd.read_csv(path, encoding='latin-1')
    return frame[['Rating', 'Branch', 'Reviewer_Location']].copy()


def train_main(epochs, run):
    """Train the branch classifier, evaluate it and log everything to MLflow.

    Reads ``d_train.csv`` and ``d_test.csv`` from the working directory, trains
    ``Model`` to predict ``Branch`` from ``Rating`` and ``Reviewer_Location``,
    then writes ``neural_network_prediction_results.csv`` and ``model.pkl``.
    The ``epochs`` parameter, the final training loss and the test accuracy are
    logged to the active MLflow run together with the model itself.

    Args:
        epochs: Number of full-batch optimisation steps to perform.
        run: The active MLflow run. It is not used inside the function and is
            kept only so the call signature stays unchanged.

    Raises:
        KeyError: If a ``Branch`` value is not one of the three known parks.
    """
    # Load data
    train_set = _load_reviews('d_train.csv')
    test_set = _load_reviews('d_test.csv')

    # Encode 'Reviewer_Location' as integers. The encoder is fitted on both
    # splits so that locations occurring only in the test set can be encoded.
    le = LabelEncoder()
    le.fit(pd.concat([train_set['Reviewer_Location'], test_set['Reviewer_Location']]))

    # 'Branch' is the target and uses a fixed, explicit class index mapping.
    mappings = {
        'Disneyland_California': 0,
        'Disneyland_Paris': 1,
        'Disneyland_HongKong': 2
    }
    for split in (train_set, test_set):
        split['Reviewer_Location'] = le.transform(split['Reviewer_Location'])
        split['Branch'] = split['Branch'].apply(lambda name: mappings[name])

    # Convert the data to tensors
    feature_columns = ['Rating', 'Reviewer_Location']
    X_train = torch.FloatTensor(train_set[feature_columns].to_numpy())
    X_test = torch.FloatTensor(test_set[feature_columns].to_numpy())
    y_train = torch.LongTensor(train_set['Branch'].to_numpy())
    y_test = torch.LongTensor(test_set['Branch'].to_numpy())

    # Model, loss and optimiser
    model = Model()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Training (full batch). Losses are stored as plain floats so the history
    # does not keep tensors and their autograd references alive.
    losses = []
    for i in range(epochs):
        y_pred = model(X_train)
        loss = criterion(y_pred, y_train)
        loss_value = loss.item()
        losses.append(loss_value)
        print(f'epoch: {i:2}  loss: {loss_value:10.8f}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluation: one batched forward pass instead of a per-row Python loop.
    with torch.no_grad():
        preds = model(X_test).argmax(dim=1).tolist()

    # Convert the target tensor explicitly so the frame holds plain integers.
    df = pd.DataFrame({'Testing Y': y_test.numpy(), 'Predicted Y': preds})
    df['Correct'] = (df['Testing Y'] == df['Predicted Y']).astype(int)
    accuracy = float(df['Correct'].mean())
    # The logged metric stays a fraction; only the message is scaled to percent.
    print(f"{accuracy * 100:.2f} percent of predictions correct")

    # Logging
    mlflow.log_param("epochs", epochs)
    if losses:
        # With epochs == 0 no loss was ever computed, so there is nothing to log.
        mlflow.log_metric("final_loss", losses[-1])
    mlflow.log_metric("accuracy", accuracy)

    signature = mlflow.models.signature.infer_signature(X_train.numpy(), np.array(preds))
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # A handful of rows is enough as an input example; logging the whole test
    # set would only bloat the stored artifact.
    log_kwargs = {
        'signature': signature,
        'input_example': X_test[:5].numpy(),
    }
    if tracking_url_type_store != "file":
        # Register the model only when the tracking URI is not a file store.
        log_kwargs['registered_model_name'] = '444501'
    mlflow.pytorch.log_model(model, '444501', **log_kwargs)

    # Save results to disk
    df.to_csv('neural_network_prediction_results.csv', index=False)
    torch.save(model, "model.pkl")


# Open an MLflow run, report where it is stored, then train inside it so that
# everything train_main logs is attached to this run.
with mlflow.start_run() as run:
    print(
        f"MLflow run experiment_id: {run.info.experiment_id}",
        f"MLflow run artifact_uri: {run.info.artifact_uri}",
        sep="\n",
    )
    train_main(epochs=epochs, run=run)
. 2022-05-01 14:59:44 +02:00			`import sys`
mlflow 2022-05-13 00:59:11 +02:00			`from urllib.parse import urlparse`
			`import numpy as np`
			`import mlflow`
pytorch 2022-04-24 02:37:51 +02:00			`import torch`
			`import torch.nn as nn`
			`import torch.nn.functional as F`
			`from sklearn.preprocessing import LabelEncoder`
			`import pandas as pd`
mlflow 2022-05-13 00:59:11 +02:00
mlflow 2022-05-13 01:51:10 +02:00			`# MLFlow`
mlflow 2022-05-13 02:00:31 +02:00			`mlflow.set_tracking_uri("http://172.17.0.1:5000")`
mlflow 2022-05-13 02:39:21 +02:00			`# Niechcący usunąłem eksperyment s444501 i nie mogę go przywrócić dlatego stworzyłem drugi bez literki 's' na początku`
mlflow 2022-05-13 02:11:38 +02:00			`mlflow.set_experiment("444501")`
mlflow 2022-05-13 00:59:11 +02:00

			`# Parametry z konsoli`
			`try:`
			`epochs = int(sys.argv[1])`
			`except:`
			`print('No epoch number passed. Defaulting to 100')`
			`epochs = 100`
. 2022-05-01 14:59:44 +02:00

pytorch 2022-04-24 02:37:51 +02:00			`# Model`
			`class Model(nn.Module):`
			`def __init__(self, input_features=2, hidden_layer1=60, hidden_layer2=90, output_features=3):`
			`super().__init__()`
			`self.fc1 = nn.Linear(input_features, hidden_layer1)`
			`self.fc2 = nn.Linear(hidden_layer1, hidden_layer2)`
			`self.out = nn.Linear(hidden_layer2, output_features)`

			`def forward(self, x):`
			`x = F.relu(self.fc1(x))`
			`x = F.relu(self.fc2(x))`
			`x = self.out(x)`
			`return x`


mlflow 2022-05-13 00:59:11 +02:00			`def train_main(epochs, run):`
sacred 2022-05-02 17:39:03 +02:00			`# Ładowanie danych`
			`train_set = pd.read_csv('d_train.csv', encoding='latin-1')`
			`train_set = train_set[['Rating', 'Branch', 'Reviewer_Location']]`

			`test_set = pd.read_csv('d_test.csv', encoding='latin-1')`
			`test_set = test_set[['Rating', 'Branch', 'Reviewer_Location']]`
pytorch 2022-04-24 02:37:51 +02:00

sacred 2022-05-02 17:39:03 +02:00			`# Mapowanie kolumny 'Reviewer_Location' na cyfry`
			`le = LabelEncoder()`
			`le.fit(pd.concat([train_set['Reviewer_Location'], test_set['Reviewer_Location']]))`
			`train_set['Reviewer_Location'] = le.transform(train_set['Reviewer_Location'])`
			`test_set['Reviewer_Location'] = le.transform(test_set['Reviewer_Location'])`
pytorch 2022-04-24 02:37:51 +02:00

sacred 2022-05-02 17:39:03 +02:00			`# Mapowanie kolumny 'Branch' na inny sposób`
			`mappings = {`
			`'Disneyland_California': 0,`
			`'Disneyland_Paris': 1,`
			`'Disneyland_HongKong': 2`
			`}`
			`train_set['Branch'] = train_set['Branch'].apply(lambda x: mappings[x])`
			`test_set['Branch'] = test_set['Branch'].apply(lambda x: mappings[x])`
pytorch 2022-04-24 02:37:51 +02:00

sacred 2022-05-02 17:39:03 +02:00			`# Zamiana danych na tensory`
			`X_train = train_set[['Rating', 'Reviewer_Location']].to_numpy()`
			`X_test = test_set[['Rating', 'Reviewer_Location']].to_numpy()`
			`y_train = train_set['Branch'].to_numpy()`
			`y_test = test_set['Branch'].to_numpy()`
pytorch 2022-04-24 02:37:51 +02:00
sacred 2022-05-02 17:39:03 +02:00			`X_train = torch.FloatTensor(X_train)`
			`X_test = torch.FloatTensor(X_test)`
			`y_train = torch.LongTensor(y_train)`
			`y_test = torch.LongTensor(y_test)`
pytorch 2022-04-24 02:37:51 +02:00

sacred 2022-05-02 17:39:03 +02:00			`# Hiperparametry`
			`model = Model()`
			`criterion = nn.CrossEntropyLoss()`
			`optimizer = torch.optim.Adam(model.parameters(), lr=0.01)`
pytorch 2022-04-24 02:37:51 +02:00

sacred 2022-05-02 17:39:03 +02:00			`# Trening`
			`losses = []`
			`for i in range(epochs):`
			`y_pred = model.forward(X_train)`
			`loss = criterion(y_pred, y_train)`
			`losses.append(loss)`
			`print(f'epoch: {i:2} loss: {loss.item():10.8f}')`
pytorch 2022-04-24 02:37:51 +02:00
sacred 2022-05-02 17:39:03 +02:00			`optimizer.zero_grad()`
			`loss.backward()`
			`optimizer.step()`
pytorch 2022-04-24 02:37:51 +02:00

sacred 2022-05-02 17:39:03 +02:00			`# Testy`
			`preds = []`
			`with torch.no_grad():`
			`for val in X_test:`
			`y_hat = model.forward(val)`
			`preds.append(y_hat.argmax().item())`
pytorch 2022-04-24 02:37:51 +02:00
sacred 2022-05-02 17:39:03 +02:00			`df = pd.DataFrame({'Testing Y': y_test, 'Predicted Y': preds})`
			`df['Correct'] = [1 if corr == pred else 0 for corr, pred in zip(df['Testing Y'], df['Predicted Y'])]`
mlflow 2022-05-13 00:59:11 +02:00			`correct = df['Correct'].sum() / len(df)`
			`print(f"{correct} percent of predictions correct")`


			`# Logi`
			`mlflow.log_param("epochs", epochs)`
			`mlflow.log_metric("final_loss", losses[-1].item())`
			`mlflow.log_metric("accuracy", correct)`

			`signature = mlflow.models.signature.infer_signature(X_train.numpy(), np.array(preds))`
			`tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme`

			`if tracking_url_type_store != "file":`
			`mlflow.pytorch.log_model(model,`
mlflow 2022-05-13 02:11:38 +02:00			`'444501',`
			`registered_model_name='444501',`
mlflow 2022-05-13 00:59:11 +02:00			`signature=signature,`
			`input_example=X_test.numpy())`
			`else:`
			`mlflow.pytorch.log_model(model,`
mlflow 2022-05-13 02:11:38 +02:00			`'444501',`
mlflow 2022-05-13 00:59:11 +02:00			`signature=signature,`
			`input_example=X_test.numpy())`
pytorch 2022-04-24 02:37:51 +02:00
sacred 2022-05-02 17:39:03 +02:00			`# Zapis do pliku`
			`df.to_csv('neural_network_prediction_results.csv', index=False)`
			`torch.save(model, "model.pkl")`
pytorch 2022-04-24 02:37:51 +02:00
mlflow 2022-05-13 00:59:11 +02:00
			`with mlflow.start_run() as run:`
			`print(f"MLflow run experiment_id: {run.info.experiment_id}")`
			`print(f"MLflow run artifact_uri: {run.info.artifact_uri}")`
			`train_main(epochs, run)`