From 179a5101cc1a76d5b8f1130bb523e2500748e8bd Mon Sep 17 00:00:00 2001
From: mszmyd
Date: Sat, 4 May 2024 23:30:01 +0200
Subject: [PATCH 01/10] move

---
 .gitignore                   | 56 ++++++++++++++++++++++++-
 DataManager.py               | 55 ------------------------
 Makefile                     |  8 ++++
 file_manager/__init__.py     |  0
 file_manager/data_manager.py | 81 ++++++++++++++++++++++++++++++++++++
 requirements.txt             |  5 +++
 6 files changed, 148 insertions(+), 57 deletions(-)
 delete mode 100644 DataManager.py
 create mode 100644 Makefile
 create mode 100644 file_manager/__init__.py
 create mode 100644 file_manager/data_manager.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index 6f951ce..8ee8ec4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,54 @@
-data
-archive.zip
\ No newline at end of file
+data/
+archive.zip
+
+# https://github.com/microsoft/vscode-python/blob/main/.gitignore
+.DS_Store
+.huskyrc.json
+out
+log.log
+**/node_modules
+*.pyc
+*.vsix
+envVars.txt
+**/.vscode/.ropeproject/**
+**/testFiles/**/.cache/**
+*.noseids
+.nyc_output
+.vscode-test
+__pycache__
+npm-debug.log
+**/.mypy_cache/**
+!yarn.lock
+coverage/
+cucumber-report.json
+**/.vscode-test/**
+**/.vscode test/**
+**/.vscode-smoke/**
+**/.venv*/
+port.txt
+precommit.hook
+python_files/lib/**
+python_files/get-pip.py
+debug_coverage*/**
+languageServer/**
+languageServer.*/**
+bin/**
+obj/**
+.pytest_cache
+tmp/**
+.python-version
+.vs/
+test-results*.xml
+xunit-test-results.xml
+build/ci/performance/performance-results.json
+!build/
+debug*.log
+debugpy*.log
+pydevd*.log
+nodeLanguageServer/**
+nodeLanguageServer.*/**
+dist/**
+# translation files
+*.xlf
+package.nls.*.json
+l10n/
\ No newline at end of file
diff --git a/DataManager.py b/DataManager.py
deleted file mode 100644
index d837ec8..0000000
--- a/DataManager.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import glob
-import shutil
-import cv2
-from zipfile import ZipFile
-import os
-import wget
-
-mainPath="data/"
-pathToTrainAndValidDate = mainPath + "%s/**/*.*"
-pathToTestDataset = mainPath + "/test"
-originalDatasetName = "original dataset"
-
-class DataManager:
-
-    def downloadData(self):
-        if not os.path.isfile("archive.zip"):
-            wget.download("https://storage.googleapis.com/kaggle-data-sets/78313/182633/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240502%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240502T181500Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=87d0661313e358206b6e10d44f135d41e23501d601e58b1e8236ca28a82ccc434534564b45baa84c4d829dd1995ff384d51fe5dba3f543d00eb0763169fd712c6c8f91bb4f298db38a19b31b2d489798a9723a271aa4108d7b93345c5a64a7ef00b9b8f27d1d5f728e373c870f0287eb89bc747941f0aeeb4703c288059e2e07b7ece3a83114a9607276874a90d4ec96dde06fddb94a0d3af72848565661b1404e3ea248eeebf46374daada7df1f37db7d62b21b4ac90706ea64cc74200a58f35bfe379703e7691aeda9e39635b02f58a9f8399fa64b031b1a9bccd7f109d256c6f4886ef94fcdc11034d6da13c0f1d4d8b97cabdd295862a5107b587824ebe8")
-
-    def unzipData(self, fileName, pathToExtract):
-        if not os.path.exists(mainPath):
-            os.makedirs("data")
-        ZipFile(fileName).extractall(mainPath + pathToExtract)
-        shutil.move("data/original dataset/test/test", "data", copy_function = shutil.copytree)
-        shutil.move("data/original dataset/New Plant Diseases Dataset(Augmented)/New Plant Diseases Dataset(Augmented)/train", "data/original dataset/train", copy_function = shutil.copytree)
-        shutil.move("data/original dataset/New Plant Diseases Dataset(Augmented)/New Plant Diseases Dataset(Augmented)/valid", "data/original dataset/valid", copy_function = shutil.copytree)
-        shutil.rmtree("data/original dataset/New Plant Diseases Dataset(Augmented)")
-        shutil.rmtree("data/Detection-of-plant-diseases/data/original dataset/test")
-
-    def writeImageToGivenPath(self, image, path):
-        os.makedirs(path.rsplit('/', 1)[0], exist_ok=True)
-        cv2.imwrite(path, image)
-
-    def resizeDataset(self, soruceDatasetName, width, height):
-        if not os.path.exists(mainPath + "resized dataset"):
-            for file in glob.glob(pathToTrainAndValidDate % soruceDatasetName, recursive=True):
-                pathToFile = file.replace("\\","/")
-                image = cv2.imread(pathToFile)
-                image = cv2.resize(image, (width, height))
-                newPath = pathToFile.replace(soruceDatasetName,"resized dataset")
-                self.writeImageToGivenPath(image,newPath)
-
-    def sobelx(self, soruceDatasetName):
-        if not os.path.exists(mainPath + "sobel dataset"):
-            for file in glob.glob(pathToTrainAndValidDate % soruceDatasetName, recursive=True):
-                pathToFile = file.replace("\\","/")
-                image = cv2.imread(pathToFile)
-                sobel = cv2.Sobel(image,cv2.CV_64F,1,0,ksize=5)
-                newPath = pathToFile.replace(soruceDatasetName,"sobel dataset")
-                self.writeImageToGivenPath(sobel,newPath)
-
-dataManager = DataManager()
-dataManager.downloadData()
-dataManager.unzipData("archive.zip","original dataset")
-dataManager.resizeDataset("original dataset", 64, 64)
-dataManager.sobelx("resized dataset")
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..17eb68f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,8 @@
+.PHONY: download-dataset sobel-dataset
+
+
+download-dataset:
+	python3 ./file_manager/data_manager.py --download
+
+sobel-dataset:
+	python3 ./file_manager/data_manager.py --sobel
\ No newline at end of file
diff --git a/file_manager/__init__.py b/file_manager/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/file_manager/data_manager.py b/file_manager/data_manager.py
new file mode 100644
index 0000000..58ef563
--- /dev/null
+++ b/file_manager/data_manager.py
@@ -0,0 +1,81 @@
+import glob
+import shutil
+import cv2
+from zipfile import ZipFile
+import os
+import wget
+import argparse
+from pathlib import Path
+
+main_path = Path("data/")
+path_to_train_and_valid = main_path / "%s/**/*.*"
+path_to_test_dataset = main_path / "test"
+original_dataset_name = "original_dataset"
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--download", action="store_true",
+                    help="Download the data")
+parser.add_argument("--sobel", action="store_true",
+                    help="Apply Sobel filter to the dataset")
+
+args = parser.parse_args()
+
+
+class DataManager:
+
+    def download_data(self):
+        if not os.path.isfile("archive.zip"):
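+            # NOTE: the URL below is a pre-signed Kaggle link; per its own
+            # X-Goog-Expires=259200 parameter it is only valid for about
+            # three days after X-Goog-Date, then it must be regenerated.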
wget.download("https://storage.googleapis.com/kaggle-data-sets/78313/182633/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240502%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240502T181500Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=87d0661313e358206b6e10d44f135d41e23501d601e58b1e8236ca28a82ccc434534564b45baa84c4d829dd1995ff384d51fe5dba3f543d00eb0763169fd712c6c8f91bb4f298db38a19b31b2d489798a9723a271aa4108d7b93345c5a64a7ef00b9b8f27d1d5f728e373c870f0287eb89bc747941f0aeeb4703c288059e2e07b7ece3a83114a9607276874a90d4ec96dde06fddb94a0d3af72848565661b1404e3ea248eeebf46374daada7df1f37db7d62b21b4ac90706ea64cc74200a58f35bfe379703e7691aeda9e39635b02f58a9f8399fa64b031b1a9bccd7f109d256c6f4886ef94fcdc11034d6da13c0f1d4d8b97cabdd295862a5107b587824ebe8") + + def unzip_data(self, file_name, path_to_extract): + full_path_to_extract = main_path / path_to_extract + old_path = "New Plant Diseases Dataset(Augmented)/New Plant Diseases Dataset(Augmented)" + if not os.path.exists(main_path): + os.makedirs(main_path) + ZipFile(file_name).extractall(full_path_to_extract) + shutil.move("data/test/test", + full_path_to_extract, copy_function=shutil.copytree) + shutil.move(full_path_to_extract / old_path / "train", + full_path_to_extract / "train", copy_function=shutil.copytree) + shutil.move(full_path_to_extract / old_path / "valid", + full_path_to_extract / "valid", copy_function=shutil.copytree) + shutil.rmtree( + full_path_to_extract / "New Plant Diseases Dataset(Augmented)") + shutil.rmtree( + "data/Detection-of-plant-diseases/data/original dataset/test") + + def write_image(self, image, path): + os.makedirs(path.rsplit('/', 1)[0], exist_ok=True) + cv2.imwrite(path, image) + + def resize_dataset(self, source_dataset_name, width, height): + dataset_name = "resized_dataset" + if not os.path.exists(main_path / dataset_name): + for file in glob.glob(path_to_train_and_valid % source_dataset_name, recursive=True): + path_to_file = file.replace("\\", "/") + image = cv2.imread(path_to_file) + image = cv2.resize(image, (width, height)) + new_path = path_to_file.replace( + source_dataset_name, dataset_name) + self.write_image(image, new_path) + + def sobelx(self, source_dataset_name): + dataset_name = "sobel_dataset" + if not os.path.exists(main_path / dataset_name): + for file in glob.glob(path_to_train_and_valid % source_dataset_name, recursive=True): + path_to_file = file.replace("\\", "/") + image = cv2.imread(path_to_file) + sobel = cv2.Sobel(image, cv2.CV_64F, 1, 0, ksize=5) + new_path = path_to_file.replace( + source_dataset_name, dataset_name) + self.write_image(sobel, new_path) + + +if __name__ == "__main__": + data_manager = DataManager() + if args.download: + data_manager.download_data() + data_manager.unzip_data("archive.zip", original_dataset_name) + data_manager.resize_dataset(original_dataset_name, 64, 64) + if args.sobel: + data_manager.sobelx("resized_dataset") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cd27218 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +tensorflow==2.16.1 +tensorflow-io==0.37.0 +numpy==1.26.4 +opencv-python==4.9.0.80 +wget==3.2 From 9dafc8c895ad497fa5e15cac8e0f4e997ca8e1b6 Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sat, 4 May 2024 23:30:49 +0200 Subject: [PATCH 02/10] git --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8ee8ec4..ac91879 100644 --- a/.gitignore +++ 
b/.gitignore @@ -1,5 +1,5 @@ data/ -archive.zip +*.zip # https://github.com/microsoft/vscode-python/blob/main/.gitignore .DS_Store From 9ac4b3ec4ee187604b75ef45dd15eb8cc3b4c6c1 Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sat, 4 May 2024 23:42:37 +0200 Subject: [PATCH 03/10] fix --- Makefile | 1 - file_manager/data_manager.py | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 17eb68f..5734438 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,5 @@ .PHONY: download-dataset sobel-dataset - download-dataset: python3 ./file_manager/data_manager.py --download diff --git a/file_manager/data_manager.py b/file_manager/data_manager.py index 58ef563..9136161 100644 --- a/file_manager/data_manager.py +++ b/file_manager/data_manager.py @@ -33,16 +33,18 @@ class DataManager: if not os.path.exists(main_path): os.makedirs(main_path) ZipFile(file_name).extractall(full_path_to_extract) - shutil.move("data/test/test", - full_path_to_extract, copy_function=shutil.copytree) + # shutil.move("data/test/test", + # full_path_to_extract, copy_function=shutil.copytree) shutil.move(full_path_to_extract / old_path / "train", full_path_to_extract / "train", copy_function=shutil.copytree) shutil.move(full_path_to_extract / old_path / "valid", full_path_to_extract / "valid", copy_function=shutil.copytree) shutil.rmtree( - full_path_to_extract / "New Plant Diseases Dataset(Augmented)") + full_path_to_extract / "New Plant Diseases Dataset(Augmented)" + ) shutil.rmtree( - "data/Detection-of-plant-diseases/data/original dataset/test") + full_path_to_extract / "new plant diseases dataset(augmented)" + ) def write_image(self, image, path): os.makedirs(path.rsplit('/', 1)[0], exist_ok=True) @@ -51,7 +53,7 @@ class DataManager: def resize_dataset(self, source_dataset_name, width, height): dataset_name = "resized_dataset" if not os.path.exists(main_path / dataset_name): - for file in glob.glob(path_to_train_and_valid % source_dataset_name, recursive=True): + for file in glob.glob(str(path_to_train_and_valid) % source_dataset_name, recursive=True): path_to_file = file.replace("\\", "/") image = cv2.imread(path_to_file) image = cv2.resize(image, (width, height)) @@ -62,7 +64,7 @@ class DataManager: def sobelx(self, source_dataset_name): dataset_name = "sobel_dataset" if not os.path.exists(main_path / dataset_name): - for file in glob.glob(path_to_train_and_valid % source_dataset_name, recursive=True): + for file in glob.glob(str(path_to_train_and_valid) % source_dataset_name, recursive=True): path_to_file = file.replace("\\", "/") image = cv2.imread(path_to_file) sobel = cv2.Sobel(image, cv2.CV_64F, 1, 0, ksize=5) From e3002d5ef8bbffafed014b64e1b231a3fbcb615d Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sun, 5 May 2024 01:20:04 +0200 Subject: [PATCH 04/10] add loader --- dataset/__init__.py | 0 dataset/dataset.py | 66 +++++++++++++++++++++++++++++++++++++ file_manager/shard_files.py | 19 +++++++++++ test.py | 10 ++++++ 4 files changed, 95 insertions(+) create mode 100644 dataset/__init__.py create mode 100644 dataset/dataset.py create mode 100644 file_manager/shard_files.py create mode 100644 test.py diff --git a/dataset/__init__.py b/dataset/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataset/dataset.py b/dataset/dataset.py new file mode 100644 index 0000000..6631a20 --- /dev/null +++ b/dataset/dataset.py @@ -0,0 +1,66 @@ +import os +from pathlib import Path + +import tensorflow as tf + + +class Dataset: + ''' Class to load and preprocess the dataset. 
+    Loads images and labels from the given directory to tf.data.Dataset.
+
+
+    Args:
+        `data_dir (Path)`: Path to the dataset directory.
+        `seed (int)`: Seed for shuffling the dataset.
+        `repeat (int)`: Number of times to repeat the dataset.
+        `shuffle_buffer_size (int)`: Size of the buffer for shuffling the dataset.
+        `batch_size (int)`: Batch size for the dataset.
+    '''
+    def __init__(self,
+                 data_dir: Path,
+                 seed: int = 42,
+                 repeat: int = 1,
+                 shuffle_buffer_size: int = 10_000,
+                 batch_size: int = 64) -> None:
+        self.data_dir = data_dir
+        self.seed = seed
+        self.repeat = repeat
+        self.batch_size = batch_size
+
+        self.dataset = self._load_dataset()\
+            .shuffle(shuffle_buffer_size, seed=self.seed)\
+            .repeat(self.repeat)\
+            .prefetch(tf.data.experimental.AUTOTUNE)
+
+    def _load_dataset(self) -> tf.data.Dataset:
+        # check if path has 'test' word in it
+        dataset = tf.data.Dataset.list_files(str(self.data_dir / '*/*'))
+        if 'test' in str(self.data_dir).lower():
+            # file names issue - labels have camel case (regex?) and differs from the train/valid sets
+            pass
+        else:
+            dataset = dataset.map(
+                _preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+        return dataset
+
+
+def _get_labels(image_path):
+    path = tf.strings.split(image_path, os.path.sep)[-2]
+    plant = tf.strings.split(path, '___')[0]
+    disease = tf.strings.split(path, '___')[1]
+    return tf.cast(plant, dtype=tf.string, name=None), tf.cast(disease, dtype=tf.string, name=None)
+
+
+def _get_image(image_path):
+    img = tf.io.read_file(image_path)
+    img = tf.io.decode_jpeg(img, channels=3) / 255
+    return tf.cast(img, dtype=tf.float32, name=None)
+
+
+def _preprocess(image_path):
+    labels = _get_labels(image_path)
+    image = _get_image(image_path)
+
+    # returns X, Y1, Y2
+    return image, labels
diff --git a/file_manager/shard_files.py b/file_manager/shard_files.py
new file mode 100644
index 0000000..d4236f7
--- /dev/null
+++ b/file_manager/shard_files.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+# TODO: split the files into smaller dirs and make list of them
+class FileSharder:
+    def __init__(self,
+                 train_dir: Path = Path('./data/resized_dataset/train'),
+                 valid_dir: Path = Path('./data/resized_dataset/valid'),
+                 test_dir: Path = Path('./data/resized_dataset/test'),
+                 shard_size = 5_000) -> None:
+        self.shard_size = shard_size
+
+        self.train_dir = train_dir
+        self.valid_dir = valid_dir
+        self.test_dir = test_dir
+
+        self.shard()
+
+    def shard(self):
+        pass
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..41d5b6f
--- /dev/null
+++ b/test.py
@@ -0,0 +1,10 @@
+
+from pathlib import Path
+
+from dataset.dataset import Dataset
+
+train_dataset = Dataset(Path('data/resized_dataset/train'))
+valid_dataset = Dataset(Path('data/resized_dataset/valid'))
+
+for image, labels in train_dataset.dataset.take(1):
+    print(image, labels)

From de27695d530ae13eeef990bedfe5f4696965bfe9 Mon Sep 17 00:00:00 2001
From: mszmyd
Date: Sun, 5 May 2024 12:41:54 +0200
Subject: [PATCH 05/10] move funcs to class

---
 dataset/dataset.py | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/dataset/dataset.py b/dataset/dataset.py
index 6631a20..d098ead 100644
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
@@ -7,8 +7,8 @@
 class Dataset:
     ''' Class to load and preprocess the dataset.
     Loads images and labels from the given directory to tf.data.Dataset.
-
-
+
+
     Args:
         `data_dir (Path)`: Path to the dataset directory.
         `seed (int)`: Seed for shuffling the dataset.
@@ -16,6 +16,7 @@ class Dataset:
         `shuffle_buffer_size (int)`: Size of the buffer for shuffling the dataset.
         `batch_size (int)`: Batch size for the dataset.
     '''
+
     def __init__(self,
                  data_dir: Path,
                  seed: int = 42,
@@ -25,10 +26,11 @@ class Dataset:
         self.data_dir = data_dir
         self.seed = seed
         self.repeat = repeat
+        self.shuffle_buffer_size = shuffle_buffer_size
         self.batch_size = batch_size
 
         self.dataset = self._load_dataset()\
-            .shuffle(shuffle_buffer_size, seed=self.seed)\
+            .shuffle(self.shuffle_buffer_size, seed=self.seed)\
             .repeat(self.repeat)\
             .prefetch(tf.data.experimental.AUTOTUNE)
 
@@ -40,27 +42,24 @@ class Dataset:
             pass
         else:
             dataset = dataset.map(
-                _preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+                self._preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
 
         return dataset
 
+    def _get_labels(self, image_path):
+        path = tf.strings.split(image_path, os.path.sep)[-2]
+        plant = tf.strings.split(path, '___')[0]
+        disease = tf.strings.split(path, '___')[1]
+        return tf.cast(plant, dtype=tf.string, name=None), tf.cast(disease, dtype=tf.string, name=None)
 
-def _get_labels(image_path):
-    path = tf.strings.split(image_path, os.path.sep)[-2]
-    plant = tf.strings.split(path, '___')[0]
-    disease = tf.strings.split(path, '___')[1]
-    return tf.cast(plant, dtype=tf.string, name=None), tf.cast(disease, dtype=tf.string, name=None)
+    def _get_image(self, image_path):
+        img = tf.io.read_file(image_path)
+        img = tf.io.decode_jpeg(img, channels=3) / 255
+        return tf.cast(img, dtype=tf.float32, name=None)
 
-def _get_image(image_path):
-    img = tf.io.read_file(image_path)
-    img = tf.io.decode_jpeg(img, channels=3) / 255
-    return tf.cast(img, dtype=tf.float32, name=None)
+    def _preprocess(self, image_path):
+        labels = self._get_labels(image_path)
+        image = self._get_image(image_path)
 
-
-def _preprocess(image_path):
-    labels = _get_labels(image_path)
-    image = _get_image(image_path)
-
-    # returns X, Y1, Y2
-    return image, labels
+        # returns X, Y1, Y2
+        return image, labels

From b7ca0fae45b2056ae3fb54b663755575f2142a64 Mon Sep 17 00:00:00 2001
From: mszmyd
Date: Sun, 5 May 2024 12:44:54 +0200
Subject: [PATCH 06/10] div by float

---
 dataset/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataset/dataset.py b/dataset/dataset.py
index d098ead..6b4489b 100644
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
@@ -54,7 +54,7 @@ class Dataset:
 
     def _get_image(self, image_path):
         img = tf.io.read_file(image_path)
-        img = tf.io.decode_jpeg(img, channels=3) / 255
+        img = tf.io.decode_jpeg(img, channels=3) / 255.
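+        # 255. (note the trailing dot) is a float literal, making the
+        # normalization an explicit float division of the uint8 image
+        # that tf.io.decode_jpeg returns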
         return tf.cast(img, dtype=tf.float32, name=None)
 
     def _preprocess(self, image_path):

From d4b6a714bb83fd0e66910198c248b0b62f101a6a Mon Sep 17 00:00:00 2001
From: mszmyd
Date: Sun, 5 May 2024 13:25:53 +0200
Subject: [PATCH 07/10] add onehot and getattr

---
 dataset/consts.py  | 40 ++++++++++++++++++++++++++++++++++++++++
 dataset/dataset.py | 17 +++++++++++++----
 test.py            |  4 ++--
 3 files changed, 55 insertions(+), 6 deletions(-)
 create mode 100644 dataset/consts.py

diff --git a/dataset/consts.py b/dataset/consts.py
new file mode 100644
index 0000000..5f71df7
--- /dev/null
+++ b/dataset/consts.py
@@ -0,0 +1,40 @@
+PLANT_CLASSES = [
+    "Tomato",
+    "Potato",
+    "Corn_(maize)",
+    "Apple",
+    "Blueberry",
+    "Soybean",
+    "Cherry_(including_sour)",
+    "Squash",
+    "Strawberry",
+    "Pepper,_bell",
+    "Peach",
+    "Grape",
+    "Orange",
+    "Raspberry",
+]
+
+DISEASE_CLASSES = [
+    "healthy",
+    "Northern_Leaf_Blight",
+    "Tomato_mosaic_virus",
+    "Early_blight",
+    "Leaf_scorch",
+    "Tomato_Yellow_Leaf_Curl_Virus",
+    "Cedar_apple_rust",
+    "Late_blight",
+    "Spider_mites Two-spotted_spider_mite",
+    "Black_rot",
+    "Bacterial_spot",
+    "Apple_scab",
+    "Powdery_mildew",
+    "Esca_(Black_Measles)",
+    "Haunglongbing_(Citrus_greening)",
+    "Leaf_Mold",
+    "Common_rust_",
+    "Target_Spot",
+    "Leaf_blight_(Isariopsis_Leaf_Spot)",
+    "Septoria_leaf_spot",
+    "Cercospora_leaf_spot Gray_leaf_spot",
+]
\ No newline at end of file
diff --git a/dataset/dataset.py b/dataset/dataset.py
index 6b4489b..b299e25 100644
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
@@ -3,6 +3,8 @@ from pathlib import Path
 
 import tensorflow as tf
 
+from .consts import DISEASE_CLASSES, PLANT_CLASSES
+
 
 class Dataset:
     ''' Class to load and preprocess the dataset.
@@ -50,16 +52,23 @@ class Dataset:
         path = tf.strings.split(image_path, os.path.sep)[-2]
         plant = tf.strings.split(path, '___')[0]
         disease = tf.strings.split(path, '___')[1]
-        return tf.cast(plant, dtype=tf.string, name=None), tf.cast(disease, dtype=tf.string, name=None)
+
+        one_hot_plant = plant == PLANT_CLASSES
+        one_hot_disease = disease == DISEASE_CLASSES
+
+        return tf.cast(one_hot_plant, dtype=tf.uint8, name=None), tf.cast(one_hot_disease, dtype=tf.uint8, name=None)
 
     def _get_image(self, image_path):
         img = tf.io.read_file(image_path)
-        img = tf.io.decode_jpeg(img, channels=3) / 255.
-        return tf.cast(img, dtype=tf.float32, name=None)
+        img = tf.io.decode_jpeg(img, channels=3)
+        return tf.cast(img, dtype=tf.float32, name=None) / 255.
 
     def _preprocess(self, image_path):
         labels = self._get_labels(image_path)
         image = self._get_image(image_path)
 
         # returns X, Y1, Y2
-        return image, labels
+        return image, labels[0], labels[1]
+
+    def __getattr__(self, attr):
+        return getattr(self.dataset, attr)
diff --git a/test.py b/test.py
index 41d5b6f..a75f18f 100644
--- a/test.py
+++ b/test.py
@@ -6,5 +6,5 @@ from dataset.dataset import Dataset
 train_dataset = Dataset(Path('data/resized_dataset/train'))
 valid_dataset = Dataset(Path('data/resized_dataset/valid'))
 
-for image, labels in train_dataset.dataset.take(1):
-    print(image, labels)
+for i in train_dataset.take(1):
+    print(i)

From 8e6318b1fec263d37aed2431c43705764ae9faf7 Mon Sep 17 00:00:00 2001
From: mszmyd
Date: Sun, 5 May 2024 13:41:12 +0200
Subject: [PATCH 08/10] del tfio from req

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index cd27218..cc6fe18 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 tensorflow==2.16.1
-tensorflow-io==0.37.0
 numpy==1.26.4
 opencv-python==4.9.0.80
 wget==3.2

From 1cfb74db6aa454c832bb372c0c7dedda90476e86 Mon Sep 17 00:00:00 2001
From: mszmyd
Date: Sun, 5 May 2024 19:03:15 +0200
Subject: [PATCH 09/10] priv

---
 dataset/dataset.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/dataset/dataset.py b/dataset/dataset.py
index b299e25..0c9c035 100644
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
@@ -31,12 +31,12 @@ class Dataset:
         self.shuffle_buffer_size = shuffle_buffer_size
         self.batch_size = batch_size
 
-        self.dataset = self._load_dataset()\
+        self.dataset = self.__load_dataset()\
             .shuffle(self.shuffle_buffer_size, seed=self.seed)\
             .repeat(self.repeat)\
             .prefetch(tf.data.experimental.AUTOTUNE)
 
-    def _load_dataset(self) -> tf.data.Dataset:
+    def __load_dataset(self) -> tf.data.Dataset:
         # check if path has 'test' word in it
         dataset = tf.data.Dataset.list_files(str(self.data_dir / '*/*'))
         if 'test' in str(self.data_dir).lower():
@@ -44,11 +44,11 @@
             pass
         else:
             dataset = dataset.map(
-                self._preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+                self.__preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
 
         return dataset
 
-    def _get_labels(self, image_path):
+    def __get_labels(self, image_path):
         path = tf.strings.split(image_path, os.path.sep)[-2]
         plant = tf.strings.split(path, '___')[0]
         disease = tf.strings.split(path, '___')[1]
@@ -58,14 +58,14 @@
         one_hot_plant = plant == PLANT_CLASSES
         one_hot_disease = disease == DISEASE_CLASSES
 
         return tf.cast(one_hot_plant, dtype=tf.uint8, name=None), tf.cast(one_hot_disease, dtype=tf.uint8, name=None)
 
-    def _get_image(self, image_path):
+    def __get_image(self, image_path):
         img = tf.io.read_file(image_path)
         img = tf.io.decode_jpeg(img, channels=3)
         return tf.cast(img, dtype=tf.float32, name=None) / 255.
 
-    def _preprocess(self, image_path):
-        labels = self._get_labels(image_path)
-        image = self._get_image(image_path)
+    def __preprocess(self, image_path):
+        labels = self.__get_labels(image_path)
+        image = self.__get_image(image_path)
 
         # returns X, Y1, Y2
         return image, labels[0], labels[1]

From c6f6ae28ca1276dd0a1e437121b47b3c6302e0b0 Mon Sep 17 00:00:00 2001
From: mszmyd
Date: Sun, 5 May 2024 19:28:40 +0200
Subject: [PATCH 10/10] add batch

---
 dataset/dataset.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dataset/dataset.py b/dataset/dataset.py
index 0c9c035..e190384 100644
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
@@ -34,6 +34,7 @@
         self.dataset = self.__load_dataset()\
             .shuffle(self.shuffle_buffer_size, seed=self.seed)\
             .repeat(self.repeat)\
+            .batch(self.batch_size, drop_remainder=True)\
             .prefetch(tf.data.experimental.AUTOTUNE)
 
     def __load_dataset(self) -> tf.data.Dataset: