From e3002d5ef8bbffafed014b64e1b231a3fbcb615d Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sun, 5 May 2024 01:20:04 +0200 Subject: [PATCH 1/7] add loader --- dataset/__init__.py | 0 dataset/dataset.py | 66 +++++++++++++++++++++++++++++++++++++ file_manager/shard_files.py | 19 +++++++++++ test.py | 10 ++++++ 4 files changed, 95 insertions(+) create mode 100644 dataset/__init__.py create mode 100644 dataset/dataset.py create mode 100644 file_manager/shard_files.py create mode 100644 test.py diff --git a/dataset/__init__.py b/dataset/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataset/dataset.py b/dataset/dataset.py new file mode 100644 index 0000000..6631a20 --- /dev/null +++ b/dataset/dataset.py @@ -0,0 +1,66 @@ +import os +from pathlib import Path + +import tensorflow as tf + + +class Dataset: + ''' Class to load and preprocess the dataset. + Loads images and labels from the given directory to tf.data.Dataset. + + + Args: + `data_dir (Path)`: Path to the dataset directory. + `seed (int)`: Seed for shuffling the dataset. + `repeat (int)`: Number of times to repeat the dataset. + `shuffle_buffer_size (int)`: Size of the buffer for shuffling the dataset. + `batch_size (int)`: Batch size for the dataset. + ''' + def __init__(self, + data_dir: Path, + seed: int = 42, + repeat: int = 1, + shuffle_buffer_size: int = 10_000, + batch_size: int = 64) -> None: + self.data_dir = data_dir + self.seed = seed + self.repeat = repeat + self.batch_size = batch_size + + self.dataset = self._load_dataset()\ + .shuffle(shuffle_buffer_size, seed=self.seed)\ + .repeat(self.repeat)\ + .prefetch(tf.data.experimental.AUTOTUNE) + + def _load_dataset(self) -> tf.data.Dataset: + # check if path has 'test' word in it + dataset = tf.data.Dataset.list_files(str(self.data_dir / '*/*')) + if 'test' in str(self.data_dir).lower(): + # file names issue - labels have camel case (regex?) and differs from the train/valid sets + pass + else: + dataset = dataset.map( + _preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + return dataset + + +def _get_labels(image_path): + path = tf.strings.split(image_path, os.path.sep)[-2] + plant = tf.strings.split(path, '___')[0] + disease = tf.strings.split(path, '___')[1] + return tf.cast(plant, dtype=tf.string, name=None), tf.cast(disease, dtype=tf.string, name=None) + + +def _get_image(image_path): + img = tf.io.read_file(image_path) + img = tf.io.decode_jpeg(img, channels=3) / 255 + return tf.cast(img, dtype=tf.float32, name=None) + + +def _preprocess(image_path): + labels = _get_labels(image_path) + image = _get_image(image_path) + + # returns X, Y1, Y2 + return image, labels diff --git a/file_manager/shard_files.py b/file_manager/shard_files.py new file mode 100644 index 0000000..d4236f7 --- /dev/null +++ b/file_manager/shard_files.py @@ -0,0 +1,19 @@ +from pathlib import Path + +# TODO: split the files into smaller dirs and make list of them +class FileSharder: + def __init__(self, + train_dir: Path = Path('./data/resized_dataset/train'), + valid_dir: Path = Path('./data/resized_dataset/valid'), + test_dir: Path = Path('./data/resized_dataset/test'), + shard_size = 5_000) -> None: + self.shard_size = shard_size + + self.train_dir = train_dir + self.valid_dir = valid_dir + self.test_dir = test_dir + + self.shard() + + def shard(self): + pass diff --git a/test.py b/test.py new file mode 100644 index 0000000..41d5b6f --- /dev/null +++ b/test.py @@ -0,0 +1,10 @@ + +from pathlib import Path + +from dataset.dataset import Dataset + +train_dataset = Dataset(Path('data/resized_dataset/train')) +valid_dataset = Dataset(Path('data/resized_dataset/valid')) + +for image, labels in train_dataset.dataset.take(1): + print(image, labels) -- 2.20.1 From de27695d530ae13eeef990bedfe5f4696965bfe9 Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sun, 5 May 2024 12:41:54 +0200 Subject: [PATCH 2/7] move funcs to class --- dataset/dataset.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/dataset/dataset.py b/dataset/dataset.py index 6631a20..d098ead 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -7,8 +7,8 @@ import tensorflow as tf class Dataset: ''' Class to load and preprocess the dataset. Loads images and labels from the given directory to tf.data.Dataset. - - + + Args: `data_dir (Path)`: Path to the dataset directory. `seed (int)`: Seed for shuffling the dataset. @@ -16,6 +16,7 @@ class Dataset: `shuffle_buffer_size (int)`: Size of the buffer for shuffling the dataset. `batch_size (int)`: Batch size for the dataset. ''' + def __init__(self, data_dir: Path, seed: int = 42, @@ -25,10 +26,11 @@ class Dataset: self.data_dir = data_dir self.seed = seed self.repeat = repeat + self.shuffle_buffer_size = shuffle_buffer_size self.batch_size = batch_size self.dataset = self._load_dataset()\ - .shuffle(shuffle_buffer_size, seed=self.seed)\ + .shuffle(self.shuffle_buffer_size, seed=self.seed)\ .repeat(self.repeat)\ .prefetch(tf.data.experimental.AUTOTUNE) @@ -40,27 +42,24 @@ class Dataset: pass else: dataset = dataset.map( - _preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE) + self._preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE) return dataset + def _get_labels(self, image_path): + path = tf.strings.split(image_path, os.path.sep)[-2] + plant = tf.strings.split(path, '___')[0] + disease = tf.strings.split(path, '___')[1] + return tf.cast(plant, dtype=tf.string, name=None), tf.cast(disease, dtype=tf.string, name=None) -def _get_labels(image_path): - path = tf.strings.split(image_path, os.path.sep)[-2] - plant = tf.strings.split(path, '___')[0] - disease = tf.strings.split(path, '___')[1] - return tf.cast(plant, dtype=tf.string, name=None), tf.cast(disease, dtype=tf.string, name=None) + def _get_image(self, image_path): + img = tf.io.read_file(image_path) + img = tf.io.decode_jpeg(img, channels=3) / 255 + return tf.cast(img, dtype=tf.float32, name=None) + def _preprocess(self, image_path): + labels = self._get_labels(image_path) + image = self._get_image(image_path) -def _get_image(image_path): - img = tf.io.read_file(image_path) - img = tf.io.decode_jpeg(img, channels=3) / 255 - return tf.cast(img, dtype=tf.float32, name=None) - - -def _preprocess(image_path): - labels = _get_labels(image_path) - image = _get_image(image_path) - - # returns X, Y1, Y2 - return image, labels + # returns X, Y1, Y2 + return image, labels -- 2.20.1 From b7ca0fae45b2056ae3fb54b663755575f2142a64 Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sun, 5 May 2024 12:44:54 +0200 Subject: [PATCH 3/7] div by float --- dataset/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset/dataset.py b/dataset/dataset.py index d098ead..6b4489b 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -54,7 +54,7 @@ class Dataset: def _get_image(self, image_path): img = tf.io.read_file(image_path) - img = tf.io.decode_jpeg(img, channels=3) / 255 + img = tf.io.decode_jpeg(img, channels=3) / 255. return tf.cast(img, dtype=tf.float32, name=None) def _preprocess(self, image_path): -- 2.20.1 From d4b6a714bb83fd0e66910198c248b0b62f101a6a Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sun, 5 May 2024 13:25:53 +0200 Subject: [PATCH 4/7] add onehot and getattr --- dataset/consts.py | 40 ++++++++++++++++++++++++++++++++++++++++ dataset/dataset.py | 17 +++++++++++++---- test.py | 4 ++-- 3 files changed, 55 insertions(+), 6 deletions(-) create mode 100644 dataset/consts.py diff --git a/dataset/consts.py b/dataset/consts.py new file mode 100644 index 0000000..5f71df7 --- /dev/null +++ b/dataset/consts.py @@ -0,0 +1,40 @@ +PLANT_CLASSES = [ + "Tomato", + "Potato", + "Corn_(maize)", + "Apple", + "Blueberry", + "Soybean", + "Cherry_(including_sour)", + "Squash", + "Strawberry", + "Pepper,_bell", + "Peach", + "Grape", + "Orange", + "Raspberry", +] + +DISEASE_CLASSES = [ + "healthy", + "Northern_Leaf_Blight", + "Tomato_mosaic_virus", + "Early_blight", + "Leaf_scorch", + "Tomato_Yellow_Leaf_Curl_Virus", + "Cedar_apple_rust", + "Late_blight", + "Spider_mites Two-spotted_spider_mite", + "Black_rot", + "Bacterial_spot", + "Apple_scab", + "Powdery_mildew", + "Esca_(Black_Measles)", + "Haunglongbing_(Citrus_greening)", + "Leaf_Mold", + "Common_rust_", + "Target_Spot", + "Leaf_blight_(Isariopsis_Leaf_Spot)", + "Septoria_leaf_spot", + "Cercospora_leaf_spot Gray_leaf_spot", +] \ No newline at end of file diff --git a/dataset/dataset.py b/dataset/dataset.py index 6b4489b..b299e25 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -3,6 +3,8 @@ from pathlib import Path import tensorflow as tf +from .consts import DISEASE_CLASSES, PLANT_CLASSES + class Dataset: ''' Class to load and preprocess the dataset. @@ -50,16 +52,23 @@ class Dataset: path = tf.strings.split(image_path, os.path.sep)[-2] plant = tf.strings.split(path, '___')[0] disease = tf.strings.split(path, '___')[1] - return tf.cast(plant, dtype=tf.string, name=None), tf.cast(disease, dtype=tf.string, name=None) + + one_hot_plant = plant == PLANT_CLASSES + one_hot_disease = disease == DISEASE_CLASSES + + return tf.cast(one_hot_plant, dtype=tf.uint8, name=None), tf.cast(one_hot_disease, dtype=tf.uint8, name=None) def _get_image(self, image_path): img = tf.io.read_file(image_path) - img = tf.io.decode_jpeg(img, channels=3) / 255. - return tf.cast(img, dtype=tf.float32, name=None) + img = tf.io.decode_jpeg(img, channels=3) + return tf.cast(img, dtype=tf.float32, name=None) / 255. def _preprocess(self, image_path): labels = self._get_labels(image_path) image = self._get_image(image_path) # returns X, Y1, Y2 - return image, labels + return image, labels[0], labels[1] + + def __getattr__(self, attr): + return getattr(self.dataset, attr) diff --git a/test.py b/test.py index 41d5b6f..a75f18f 100644 --- a/test.py +++ b/test.py @@ -6,5 +6,5 @@ from dataset.dataset import Dataset train_dataset = Dataset(Path('data/resized_dataset/train')) valid_dataset = Dataset(Path('data/resized_dataset/valid')) -for image, labels in train_dataset.dataset.take(1): - print(image, labels) +for i in train_dataset.take(1): + print(i) -- 2.20.1 From 8e6318b1fec263d37aed2431c43705764ae9faf7 Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sun, 5 May 2024 13:41:12 +0200 Subject: [PATCH 5/7] del tfio from req --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cd27218..cc6fe18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ tensorflow==2.16.1 -tensorflow-io==0.37.0 numpy==1.26.4 opencv-python==4.9.0.80 wget==3.2 -- 2.20.1 From 1cfb74db6aa454c832bb372c0c7dedda90476e86 Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sun, 5 May 2024 19:03:15 +0200 Subject: [PATCH 6/7] priv --- dataset/dataset.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dataset/dataset.py b/dataset/dataset.py index b299e25..0c9c035 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -31,12 +31,12 @@ class Dataset: self.shuffle_buffer_size = shuffle_buffer_size self.batch_size = batch_size - self.dataset = self._load_dataset()\ + self.dataset = self.__load_dataset()\ .shuffle(self.shuffle_buffer_size, seed=self.seed)\ .repeat(self.repeat)\ .prefetch(tf.data.experimental.AUTOTUNE) - def _load_dataset(self) -> tf.data.Dataset: + def __load_dataset(self) -> tf.data.Dataset: # check if path has 'test' word in it dataset = tf.data.Dataset.list_files(str(self.data_dir / '*/*')) if 'test' in str(self.data_dir).lower(): @@ -44,11 +44,11 @@ class Dataset: pass else: dataset = dataset.map( - self._preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE) + self.__preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE) return dataset - def _get_labels(self, image_path): + def __get_labels(self, image_path): path = tf.strings.split(image_path, os.path.sep)[-2] plant = tf.strings.split(path, '___')[0] disease = tf.strings.split(path, '___')[1] @@ -58,14 +58,14 @@ class Dataset: return tf.cast(one_hot_plant, dtype=tf.uint8, name=None), tf.cast(one_hot_disease, dtype=tf.uint8, name=None) - def _get_image(self, image_path): + def __get_image(self, image_path): img = tf.io.read_file(image_path) img = tf.io.decode_jpeg(img, channels=3) return tf.cast(img, dtype=tf.float32, name=None) / 255. - def _preprocess(self, image_path): - labels = self._get_labels(image_path) - image = self._get_image(image_path) + def __preprocess(self, image_path): + labels = self.__get_labels(image_path) + image = self.__get_image(image_path) # returns X, Y1, Y2 return image, labels[0], labels[1] -- 2.20.1 From c6f6ae28ca1276dd0a1e437121b47b3c6302e0b0 Mon Sep 17 00:00:00 2001 From: mszmyd Date: Sun, 5 May 2024 19:28:40 +0200 Subject: [PATCH 7/7] add batch --- dataset/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dataset/dataset.py b/dataset/dataset.py index 0c9c035..e190384 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -34,6 +34,7 @@ class Dataset: self.dataset = self.__load_dataset()\ .shuffle(self.shuffle_buffer_size, seed=self.seed)\ .repeat(self.repeat)\ + .batch(self.batch_size, drop_remainder=True)\ .prefetch(tf.data.experimental.AUTOTUNE) def __load_dataset(self) -> tf.data.Dataset: -- 2.20.1