diff --git a/Dockerfile b/Dockerfile
index 680f6aa..64aa453 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,3 +9,5 @@ RUN pip3 install numpy
 RUN pip3 install matplotlib
 RUN pip3 install sklearn
 RUN pip3 install silence-tensorflow
+RUN pip3 install sacred
+RUN pip3 install mlflow
diff --git a/Jenkinsfile_eval_tensorflow b/Jenkinsfile_eval_tensorflow
index 93d37d2..78a4afa 100644
--- a/Jenkinsfile_eval_tensorflow
+++ b/Jenkinsfile_eval_tensorflow
@@ -42,7 +42,7 @@ pipeline {
         }
 
         stage('archiveArtifacts') {
-            steps{
+            steps {
                 archiveArtifacts 'evaluation.txt'
             }
         }
diff --git a/eval-tensorflow.py b/eval-tensorflow.py
index 736c115..c123fae 100644
--- a/eval-tensorflow.py
+++ b/eval-tensorflow.py
@@ -21,8 +21,8 @@ predictions = [1 if p > 0.5 else 0 for p in predictions]
 
 accuracy = accuracy_score(test_y, predictions)
 f1 = f1_score(test_y, predictions)
 
-file = open('evaluation.txt', 'w')
+file = open('evaluation.txt', 'a')
 file.writelines(accuracy.__str__() + '\n')
-file.writelines(f1.__str__())
 file.close()
+
diff --git a/train-tensorflow-sacred.py b/train-tensorflow-sacred.py
new file mode 100644
index 0000000..fd0b02e
--- /dev/null
+++ b/train-tensorflow-sacred.py
@@ -0,0 +1,113 @@
+import pandas as pd
+from collections import Counter
+
+from silence_tensorflow import silence_tensorflow
+silence_tensorflow()  # must be called before TensorFlow is imported to take effect
+
+from tensorflow import keras
+from tensorflow.keras import layers
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+from sacred import Experiment
+from sacred.observers import FileStorageObserver
+
+
+ex = Experiment("file_observer")
+ex.observers.append(FileStorageObserver('my_runs'))
+
+
+@ex.config
+def my_config():
+    epochs = 15
+    batch_size = 16
+
+
+def counter_word(text_col):
+    # count how often each whitespace-separated token occurs in the column
+    count = Counter()
+    for text in text_col.values:
+        for word in text.split():
+            count[word] += 1
+    return count
+
+
+@ex.capture
+def prepare_model(epochs, batch_size):
+    df = pd.read_csv('data.csv')
+    train_df = pd.read_csv('train.csv')
+    val_df = pd.read_csv('dev.csv')
+    test_df = pd.read_csv('test.csv')
+
+    # drop rows without review text from every split
+    df.dropna(subset=['reviews.text'], inplace=True)
+    train_df.dropna(subset=['reviews.text'], inplace=True)
+    val_df.dropna(subset=['reviews.text'], inplace=True)
+    test_df.dropna(subset=['reviews.text'], inplace=True)
+
+    train_sentences = train_df['reviews.text'].to_numpy()
+    train_labels = train_df['reviews.doRecommend'].to_numpy()
+    val_sentences = val_df['reviews.text'].to_numpy()
+    val_labels = val_df['reviews.doRecommend'].to_numpy()
+    test_sentences = test_df['reviews.text'].to_numpy()
+    test_labels = test_df['reviews.doRecommend'].to_numpy()
+
+    # size the vocabulary from the full dataset
+    counter = counter_word(df['reviews.text'])
+    num_unique_words = len(counter)
+
+    tokenizer = Tokenizer(num_words=num_unique_words)
+    tokenizer.fit_on_texts(train_sentences)
+
+    train_sequences = tokenizer.texts_to_sequences(train_sentences)
+    val_sequences = tokenizer.texts_to_sequences(val_sentences)
+    test_sequences = tokenizer.texts_to_sequences(test_sentences)
+
+    # pad/truncate every review to a fixed length of 30 tokens
+    max_length = 30
+    train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
+    val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
+    test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")
+
+    # persist the padded test sequences for the evaluation stage; .tolist() is
+    # needed because pandas rejects a 2-D array assigned to a single column
+    test_df['reviews.text'] = test_padded.tolist()
+    test_df.to_csv('test.csv')
+
+    model = keras.models.Sequential()
+    model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
+    model.add(layers.LSTM(64, dropout=0.1))
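+    # single sigmoid unit: outputs the probability that the review recommends
+    # the product, matching the binary 'reviews.doRecommend' labels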
+    model.add(layers.Dense(1, activation="sigmoid"))
+
+    model.summary()
+
+    loss = keras.losses.BinaryCrossentropy(from_logits=False)
+    optim = keras.optimizers.Adam(learning_rate=0.001)  # 'lr' is deprecated
+    metrics = ["accuracy"]
+
+    model.compile(loss=loss, optimizer=optim, metrics=metrics)
+    # train with the Sacred config values instead of hard-coded constants so
+    # command-line overrides ('with epochs=30') actually take effect
+    model.fit(train_padded, train_labels, epochs=epochs, batch_size=batch_size,
+              validation_data=(val_padded, val_labels), verbose=2)
+
+    predictions = model.predict(test_padded)
+    predictions = [1 if p > 0.5 else 0 for p in predictions]
+
+    model.save('trained_model')
+
+    with open('results.txt', 'w') as file:
+        file.write(str(predictions))
+
+
+# @ex.automain runs the experiment when the script is executed, so a separate
+# ex.run() call is not needed (it would start a second run)
+@ex.automain
+def my_main():
+    prepare_model()
+    # attach the run outputs to the observer; add_artifact expects file paths,
+    # so the SavedModel graph file is added rather than the directory
+    ex.add_artifact('results.txt')
+    ex.add_artifact('trained_model/saved_model.pb')
diff --git a/trained_model/saved_model.pb b/trained_model/saved_model.pb
index 8b310d4..e25b652 100644
Binary files a/trained_model/saved_model.pb and b/trained_model/saved_model.pb differ
diff --git a/trained_model/variables/variables.data-00000-of-00001 b/trained_model/variables/variables.data-00000-of-00001
index ecf4302..39876e3 100644
Binary files a/trained_model/variables/variables.data-00000-of-00001 and b/trained_model/variables/variables.data-00000-of-00001 differ
diff --git a/trained_model/variables/variables.index b/trained_model/variables/variables.index
index 65587b0..97af11b 100644
Binary files a/trained_model/variables/variables.index and b/trained_model/variables/variables.index differ