diff --git a/biblioteki_ml.py b/biblioteki_ml.py index 4f93aad..47c7f66 100644 --- a/biblioteki_ml.py +++ b/biblioteki_ml.py @@ -2,16 +2,10 @@ import sys import torch import torch.nn as nn import torch.nn.functional as F +from sacred.observers import FileStorageObserver, MongoObserver from sklearn.preprocessing import LabelEncoder import pandas as pd - - -# Parametry z konsoli -try: - epochs = int(sys.argv[1]) -except: - print('No epoch number passed. Defaulting to 100') - epochs = 100 +from sacred import Experiment # Model @@ -29,74 +23,105 @@ class Model(nn.Module): return x -# Ładowanie danych -train_set = pd.read_csv('d_train.csv', encoding='latin-1') -train_set = train_set[['Rating', 'Branch', 'Reviewer_Location']] - -test_set = pd.read_csv('d_test.csv', encoding='latin-1') -test_set = test_set[['Rating', 'Branch', 'Reviewer_Location']] +# Sacred +ex = Experiment() +ex.observers.append(FileStorageObserver('my_runs')) +# Parametry treningu -> my_runs/X/config.json +# Plik z modelem jako artefakt -> my_runs/X/model.pkl +# Kod źródłowy -> my_runs/_sources/biblioteki_ml_XXXXXXXXXXX.py +# Wyniki (ostateczny loss) -> my_runs/X/metrics.json +ex.observers.append(MongoObserver(url='mongodb://mongo_user:mongo_password_IUM_2021@localhost:27017', + db_name='sacred')) -# Mapowanie kolumny 'Reviewer_Location' na cyfry -le = LabelEncoder() -le.fit(pd.concat([train_set['Reviewer_Location'], test_set['Reviewer_Location']])) -train_set['Reviewer_Location'] = le.transform(train_set['Reviewer_Location']) -test_set['Reviewer_Location'] = le.transform(test_set['Reviewer_Location']) +@ex.config +def my_config(): + epochs = 100 -# Mapowanie kolumny 'Branch' na inny sposób -mappings = { - 'Disneyland_California': 0, - 'Disneyland_Paris': 1, - 'Disneyland_HongKong': 2 -} -train_set['Branch'] = train_set['Branch'].apply(lambda x: mappings[x]) -test_set['Branch'] = test_set['Branch'].apply(lambda x: mappings[x]) +@ex.automain +def train_main(epochs, _run): + # Parametry z konsoli + # 
try: + # epochs = int(sys.argv[1]) + # except: + # print('No epoch number passed. Defaulting to 100') + # epochs = 100 -# Zamiana danych na tensory -X_train = train_set[['Rating', 'Reviewer_Location']].to_numpy() -X_test = test_set[['Rating', 'Reviewer_Location']].to_numpy() -y_train = train_set['Branch'].to_numpy() -y_test = test_set['Branch'].to_numpy() + # Ładowanie danych + train_set = pd.read_csv('d_train.csv', encoding='latin-1') + train_set = train_set[['Rating', 'Branch', 'Reviewer_Location']] -X_train = torch.FloatTensor(X_train) -X_test = torch.FloatTensor(X_test) -y_train = torch.LongTensor(y_train) -y_test = torch.LongTensor(y_test) + test_set = pd.read_csv('d_test.csv', encoding='latin-1') + test_set = test_set[['Rating', 'Branch', 'Reviewer_Location']] -# Hiperparametry -model = Model() -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + # Mapowanie kolumny 'Reviewer_Location' na cyfry + le = LabelEncoder() + le.fit(pd.concat([train_set['Reviewer_Location'], test_set['Reviewer_Location']])) + train_set['Reviewer_Location'] = le.transform(train_set['Reviewer_Location']) + test_set['Reviewer_Location'] = le.transform(test_set['Reviewer_Location']) -# Trening -losses = [] -for i in range(epochs): - y_pred = model.forward(X_train) - loss = criterion(y_pred, y_train) - losses.append(loss) - print(f'epoch: {i:2} loss: {loss.item():10.8f}') - - optimizer.zero_grad() - loss.backward() - optimizer.step() + # Mapowanie kolumny 'Branch' na inny sposób + mappings = { + 'Disneyland_California': 0, + 'Disneyland_Paris': 1, + 'Disneyland_HongKong': 2 + } + train_set['Branch'] = train_set['Branch'].apply(lambda x: mappings[x]) + test_set['Branch'] = test_set['Branch'].apply(lambda x: mappings[x]) -# Testy -preds = [] -with torch.no_grad(): - for val in X_test: - y_hat = model.forward(val) - preds.append(y_hat.argmax().item()) + # Zamiana danych na tensory + X_train = train_set[['Rating', 'Reviewer_Location']].to_numpy() + 
X_test = test_set[['Rating', 'Reviewer_Location']].to_numpy() + y_train = train_set['Branch'].to_numpy() + y_test = test_set['Branch'].to_numpy() -X_train = torch.FloatTensor(X_train) -X_test = torch.FloatTensor(X_test) -y_train = torch.LongTensor(y_train) -y_test = torch.LongTensor(y_test) + X_train = torch.FloatTensor(X_train) + X_test = torch.FloatTensor(X_test) + y_train = torch.LongTensor(y_train) + y_test = torch.LongTensor(y_test) -# Hiperparametry -model = Model() -criterion = nn.CrossEntropyLoss() -optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + # Hiperparametry + model = Model() + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + + + # Trening + losses = [] + for i in range(epochs): + y_pred = model.forward(X_train) + loss = criterion(y_pred, y_train) + losses.append(loss.item()) + print(f'epoch: {i:2} loss: {loss.item():10.8f}') + + optimizer.zero_grad() + loss.backward() + optimizer.step() + _run.log_scalar("training.final_loss", losses[-1]) # Ostateczny loss + + + # Testy + preds = [] + with torch.no_grad(): + for val in X_test: + y_hat = model.forward(val) + preds.append(y_hat.argmax().item()) + + df = pd.DataFrame({'Testing Y': y_test, 'Predicted Y': preds}) + df['Correct'] = [1 if corr == pred else 0 for corr, pred in zip(df['Testing Y'], df['Predicted Y'])] + print(f"{df['Correct'].sum() / len(df)} percent of predictions correct") + + + # Zapis do pliku + df.to_csv('neural_network_prediction_results.csv', index=False) + torch.save(model, "model.pkl") + + # Zapis Sacred + ex.add_artifact("model.pkl") + ex.add_artifact("neural_network_prediction_results.csv") diff --git a/training.Jenkinsfile b/training.Jenkinsfile index 8a608e0..c2bddfb 100644 --- a/training.Jenkinsfile +++ b/training.Jenkinsfile @@ -24,13 +24,13 @@ pipeline { stage('Train model') { steps { withEnv(["EPOCH=${params.EPOCH}"]) { - sh 'python 
biblioteki_ml.py $EPOCH' + sh 'python biblioteki_ml.py with "epochs=$EPOCH"' } } } - stage('Archive model') { + stage('Archive artifacts') { steps { - archiveArtifacts artifacts: 'model.pkl, neural_network_prediction_results.csv' + archiveArtifacts artifacts: 'model.pkl, neural_network_prediction_results.csv, my_runs/**' } } stage ('Model - evaluation') {