From e9f53be95453a8da8811653ba3c4a6e75895cd33 Mon Sep 17 00:00:00 2001 From: AWieczarek Date: Tue, 28 May 2024 18:36:32 +0200 Subject: [PATCH] IUM_10 --- .gitignore | 4 ++++ IUM_05-model.py | 3 ++- IUM_05-predict.py | 14 +++++++------- IUM_05-split.py | 2 +- dvc.lock | 46 ++++++++++++++++++++++++++++++++++++++++++++++ dvc.yaml | 23 +++++++++++++++++++++++ 6 files changed, 83 insertions(+), 9 deletions(-) create mode 100644 .gitignore create mode 100644 dvc.lock create mode 100644 dvc.yaml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8b679b5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/beer_reviews_train.csv +/beer_reviews_test.csv +/beer_review_sentiment_model.h5 +/beer_review_sentiment_predictions.csv diff --git a/IUM_05-model.py b/IUM_05-model.py index 452d0bc..0f8a5a6 100644 --- a/IUM_05-model.py +++ b/IUM_05-model.py @@ -1,5 +1,6 @@ import pandas as pd import tensorflow as tf +import sys train_data = pd.read_csv('./beer_reviews_train.csv') X_train = train_data[['review_aroma', 'review_appearance', 'review_palate', 'review_taste']] @@ -22,6 +23,6 @@ model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) -model.fit(X_train_pad, y_train, epochs=40, batch_size=32, validation_split=0.1) +model.fit(X_train_pad, y_train, epochs=int(sys.argv[1]), batch_size=int(sys.argv[2]), validation_split=0.1) model.save('beer_review_sentiment_model.h5') diff --git a/IUM_05-predict.py b/IUM_05-predict.py index 7f222a3..54038e4 100644 --- a/IUM_05-predict.py +++ b/IUM_05-predict.py @@ -1,18 +1,18 @@ import pandas as pd -import numpy as np import tensorflow as tf test_data = pd.read_csv('./beer_reviews_test.csv') X_test = test_data[['review_aroma', 'review_appearance', 'review_palate', 'review_taste']] +y_test = test_data['review_overall'] model = tf.keras.models.load_model('beer_review_sentiment_model.h5') -tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000) +predictions = model.predict(X_test) +print(f'Predictions shape: {predictions.shape}') -X_test_seq = tokenizer.texts_to_sequences(X_test) -X_test_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, maxlen=100) +if len(predictions.shape) > 1: + predictions = predictions[:, 0] -predictions = model.predict(X_test_pad) - -np.savetxt('beer_review_sentiment_predictions.csv', predictions, delimiter=',', fmt='%.10f') +results = pd.DataFrame({'Predictions': predictions, 'Actual': y_test}) +results.to_csv('beer_review_sentiment_predictions.csv', index=False) \ No newline at end of file diff --git a/IUM_05-split.py b/IUM_05-split.py index 44af6d1..7460c7b 100644 --- a/IUM_05-split.py +++ b/IUM_05-split.py @@ -1,7 +1,7 @@ import pandas as pd from sklearn.model_selection import train_test_split -data = pd.read_csv('./beer_reviews.csv') +data = pd.read_csv('data/beer_reviews.csv') train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..b7e2060 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,46 @@ +schema: '2.0' +stages: + split_data: + cmd: python IUM_05-split.py + deps: + - path: data/beer_reviews.csv + hash: md5 + md5: 50f6eec0d0fe78bc0f10e35edd271998 + size: 201644905 + outs: + - path: beer_reviews_test.csv + hash: md5 + md5: edbd0a7f05c59a0c0e936917f60e9b96 + size: 40632354 + - path: beer_reviews_train.csv + hash: md5 + md5: 8c6877a26fef1542369bfae6b39d163c + size: 162599343 + train_model: + cmd: python IUM_05-model.py 10 32 + deps: + - path: beer_reviews_train.csv + hash: md5 + md5: 8c6877a26fef1542369bfae6b39d163c + size: 162599343 + outs: + - path: beer_review_sentiment_model.h5 + hash: md5 + md5: c126bd5d332a905262c66894585450e3 + size: 1950856 + predict: + cmd: python IUM_05-predict.py + deps: + - path: beer_review_sentiment_model.h5 + hash: md5 + md5: c126bd5d332a905262c66894585450e3 + size: 1950856 + - path: beer_reviews_test.csv + hash: md5 + md5: edbd0a7f05c59a0c0e936917f60e9b96 + size: 40632354 + outs: + - path: beer_review_sentiment_predictions.csv + hash: md5 + md5: 12a66fafb7f4d7d19eb0c4a90cc7d3ad + size: 4814242 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..8b2daa0 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,23 @@ +stages: + split_data: + cmd: python IUM_05-split.py + deps: + - data/beer_reviews.csv + outs: + - beer_reviews_train.csv + - beer_reviews_test.csv + + train_model: + cmd: python IUM_05-model.py 10 32 + deps: + - beer_reviews_train.csv + outs: + - beer_review_sentiment_model.h5 + + predict: + cmd: python IUM_05-predict.py + deps: + - beer_review_sentiment_model.h5 + - beer_reviews_test.csv + outs: + - beer_review_sentiment_predictions.csv