From e067972a057e8701a89dbf397b8904fb0004a8ed Mon Sep 17 00:00:00 2001
From: AWieczarek
Date: Tue, 16 Apr 2024 18:58:43 +0200
Subject: [PATCH] IUM_05

Add split, train and predict scripts for the beer review dataset and run
them from dedicated Jenkins stages that use the Dockerfile agent.

The model is a small dense regression network over the four sub-rating
columns. The prediction script reads the same columns and loads the saved
model without compiling it. The image's default command now points at a
script that is actually copied into the image, and the pipeline calls
python3, the interpreter the Dockerfile installs.
---
Reviewer notes (not part of the commit message):
* This patch was rebuilt from a whitespace-collapsed copy. Indentation and
  blank-line placement of context and removed lines in the Dockerfile and
  Jenkinsfile hunks are assumptions; verify them against the target tree
  before applying.
* Blob ids on the "index" lines are carried over from the original patch
  and do not reflect the revised file contents.
* The sub-rating columns and review_overall are assumed to be numeric;
  the scripts fail on the float32 conversion if they are not.

 Dockerfile        |  9 +++++----
 IUM_05-model.py   | 25 +++++++++++++++++++++++++
 IUM_05-predict.py | 17 +++++++++++++++++
 IUM_05-split.py   | 11 +++++++++++
 Jenkinsfile       | 38 ++++++++++++++++++++++++++------------
 5 files changed, 84 insertions(+), 16 deletions(-)
 create mode 100644 IUM_05-model.py
 create mode 100644 IUM_05-predict.py
 create mode 100644 IUM_05-split.py

diff --git a/Dockerfile b/Dockerfile
index 5daf552..76a671c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,12 +10,13 @@ ENV TZ=Etc/UTC
 RUN apt update && \
     apt install -y python3 python3-pip unzip
 
-RUN pip install kaggle pandas seaborn scikit-learn
+RUN pip install kaggle pandas seaborn scikit-learn tensorflow
 
 WORKDIR /app
 
-COPY dataset_stats.py /app/
-COPY IUM_02.py /app/
+COPY IUM_05-model.py ./
+COPY IUM_05-predict.py ./
+COPY IUM_05-split.py ./
 
-CMD ["python3", "IUM_02.py"]
+CMD ["python3", "IUM_05-model.py"]
 
diff --git a/IUM_05-model.py b/IUM_05-model.py
new file mode 100644
index 0000000..452d0bc
--- /dev/null
+++ b/IUM_05-model.py
@@ -0,0 +1,25 @@
+"""Train a small regression network that predicts review_overall."""
+import pandas as pd
+import tensorflow as tf
+
+# Sub-rating columns used as model inputs (expected to hold numeric scores).
+FEATURES = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']
+TARGET = 'review_overall'
+
+train_data = pd.read_csv('./beer_reviews_train.csv')
+X_train = train_data[FEATURES].to_numpy(dtype='float32')
+y_train = train_data[TARGET].to_numpy(dtype='float32')
+
+# The inputs are plain feature columns, not text, so no tokenizer is involved.
+model = tf.keras.Sequential([
+    tf.keras.layers.Input(shape=(len(FEATURES),)),
+    tf.keras.layers.Dense(16, activation='relu'),
+    tf.keras.layers.Dense(1),
+])
+
+# The target is a rating value, hence a regression loss and metric.
+model.compile(optimizer='adam', loss='mse', metrics=['mae'])
+
+model.fit(X_train, y_train, epochs=40, batch_size=32, validation_split=0.1)
+
+model.save('beer_review_sentiment_model.h5')
diff --git a/IUM_05-predict.py b/IUM_05-predict.py
new file mode 100644
index 0000000..7f222a3
--- /dev/null
+++ b/IUM_05-predict.py
@@ -0,0 +1,17 @@
+"""Predict review_overall for the held-out reviews with the saved model."""
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+
+# Must match the feature columns used by IUM_05-model.py.
+FEATURES = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']
+
+test_data = pd.read_csv('./beer_reviews_test.csv')
+X_test = test_data[FEATURES].to_numpy(dtype='float32')
+
+# Inference needs no loss or optimizer, so the model is loaded uncompiled.
+model = tf.keras.models.load_model('beer_review_sentiment_model.h5', compile=False)
+
+predictions = model.predict(X_test)
+
+np.savetxt('beer_review_sentiment_predictions.csv', predictions, delimiter=',', fmt='%.10f')
diff --git a/IUM_05-split.py b/IUM_05-split.py
new file mode 100644
index 0000000..44af6d1
--- /dev/null
+++ b/IUM_05-split.py
@@ -0,0 +1,11 @@
+"""Split the beer review dataset into train and test CSV files."""
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+data = pd.read_csv('./beer_reviews.csv')
+
+# A fixed seed keeps the 80/20 split identical between pipeline runs.
+train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
+
+train_data.to_csv('beer_reviews_train.csv', index=False)
+test_data.to_csv('beer_reviews_test.csv', index=False)
diff --git a/Jenkinsfile b/Jenkinsfile
index 07e6dfc..e700ed6 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,5 +1,5 @@
 pipeline {
-    agent { dockerfile true }
+    agent any
 
     parameters {
         string(name: 'CUTOFF', defaultValue: '10000', description: 'Liczba wierszy do obcięcia ze zbioru danych')
@@ -13,22 +13,36 @@ pipeline {
                 git url: "https://git.wmi.amu.edu.pl/s464979/ium_464979"
             }
         }
-
-        stage('Download, Process, and Split Dataset') {
+        stage('Download dataset') {
             steps {
-                withEnv([
-                    "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
-                    "KAGGLE_KEY=${env.KAGGLE_KEY}"
-                ]) {
-                    sh 'export KAGGLE_USERNAME=${env.KAGGLE_USERNAME}"'
-                    sh 'export KAGGLE_KEY=${env.KAGGLE_KEY}"'
-                    sh "python3 IUM_02.py"
+                withEnv(["KAGGLE_USERNAME=${env.KAGGLE_USERNAME}", "KAGGLE_KEY=${env.KAGGLE_KEY}"]) {
+                    sh "kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate --unzip"
                 }
             }
         }
-        stage('Archive Results') {
+        stage('Process and Split Dataset') {
+            agent {
+                dockerfile {
+                    filename 'Dockerfile'
+                    reuseNode true
+                }
+            }
             steps {
-                archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
+                sh "python3 ./IUM_05-split.py"
+                archiveArtifacts artifacts: 'beer_reviews.csv,beer_reviews_train.csv,beer_reviews_test.csv', onlyIfSuccessful: true
+            }
+        }
+        stage("Run") {
+            agent {
+                dockerfile {
+                    filename 'Dockerfile'
+                    reuseNode true
+                }
+            }
+            steps {
+                sh "python3 ./IUM_05-model.py"
+                sh "python3 ./IUM_05-predict.py"
+                archiveArtifacts artifacts: 'beer_review_sentiment_model.h5,beer_review_sentiment_predictions.csv', onlyIfSuccessful: true
             }
         }
     }