From e3002d5ef8bbffafed014b64e1b231a3fbcb615d Mon Sep 17 00:00:00 2001
From: mszmyd
Date: Sun, 5 May 2024 01:20:04 +0200
Subject: [PATCH] add loader

---
Review notes for this revision:
- Dataset: apply the documented batch_size (it was stored but never used).
- Dataset: seed list_files, which shuffles by default, so that the file
  order is reproducible for a given seed.
- _get_labels/_get_image: split the directory name once, drop no-op casts,
  make the float32 conversion explicit (same values as before).
- FileSharder: annotate shard_size, add docstrings, move the TODO into shard.
- test.py: run the smoke test only when executed as a script.
- The blob ids on the "index" lines were not regenerated.

 dataset/__init__.py         |  0
 dataset/dataset.py          | 77 +++++++++++++++++++++++++++++++++++++
 file_manager/shard_files.py | 28 ++++++++++++++
 test.py                     | 16 ++++++++
 4 files changed, 121 insertions(+)
 create mode 100644 dataset/__init__.py
 create mode 100644 dataset/dataset.py
 create mode 100644 file_manager/shard_files.py
 create mode 100644 test.py

diff --git a/dataset/__init__.py b/dataset/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dataset/dataset.py b/dataset/dataset.py
new file mode 100644
index 0000000..6631a20
--- /dev/null
+++ b/dataset/dataset.py
@@ -0,0 +1,77 @@
+import os
+from pathlib import Path
+
+import tensorflow as tf
+
+
+class Dataset:
+    ''' Class to load and preprocess the dataset.
+    Loads images and labels from the given directory to tf.data.Dataset.
+
+    The directory is expected to hold one sub-directory per class, named
+    `<plant>___<disease>`, with the JPEG images of that class inside.
+
+    Args:
+        `data_dir (Path)`: Path to the dataset directory.
+        `seed (int)`: Seed for shuffling the dataset.
+        `repeat (int)`: Number of times to repeat the dataset.
+        `shuffle_buffer_size (int)`: Size of the buffer for shuffling the dataset.
+        `batch_size (int)`: Batch size for the dataset.
+    '''
+    def __init__(self,
+                 data_dir: Path,
+                 seed: int = 42,
+                 repeat: int = 1,
+                 shuffle_buffer_size: int = 10_000,
+                 batch_size: int = 64) -> None:
+        self.data_dir = data_dir
+        self.seed = seed
+        self.repeat = repeat
+        self.batch_size = batch_size
+
+        self.dataset = self._load_dataset()\
+            .shuffle(shuffle_buffer_size, seed=self.seed)\
+            .repeat(self.repeat)\
+            .batch(self.batch_size)\
+            .prefetch(tf.data.experimental.AUTOTUNE)
+
+    def _load_dataset(self) -> tf.data.Dataset:
+        ''' Lists the files matching `data_dir/*/*`.
+        Train/valid files are mapped to `(image, (plant, disease))`.
+        '''
+        # list_files shuffles by default; seed it so the order is reproducible
+        dataset = tf.data.Dataset.list_files(
+            str(self.data_dir / '*/*'), seed=self.seed)
+
+        # check if path has 'test' word in it
+        if 'test' in str(self.data_dir).lower():
+            # file names issue - labels have camel case (regex?) and differ
+            # from the train/valid sets, so only the file paths are returned
+            return dataset
+
+        return dataset.map(
+            _preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+
+def _get_labels(image_path):
+    ''' Returns the `(plant, disease)` labels of an image.
+    They are parsed from its parent directory name, `<plant>___<disease>`.
+    '''
+    class_dir = tf.strings.split(image_path, os.path.sep)[-2]
+    parts = tf.strings.split(class_dir, '___')
+    return parts[0], parts[1]
+
+
+def _get_image(image_path):
+    ''' Reads a JPEG file as a float32 RGB image scaled to [0, 1]. '''
+    img = tf.io.decode_jpeg(tf.io.read_file(image_path), channels=3)
+    return tf.cast(img, dtype=tf.float32) / 255.0
+
+
+def _preprocess(image_path):
+    ''' Maps an image path to `(image, (plant, disease))`. '''
+    labels = _get_labels(image_path)
+    image = _get_image(image_path)
+
+    # returns X, (Y1, Y2)
+    return image, labels
diff --git a/file_manager/shard_files.py b/file_manager/shard_files.py
new file mode 100644
index 0000000..d4236f7
--- /dev/null
+++ b/file_manager/shard_files.py
@@ -0,0 +1,28 @@
+from pathlib import Path
+
+
+class FileSharder:
+    ''' Holds the train/valid/test directories that are to be sharded.
+
+    Args:
+        `train_dir (Path)`: Path to the train set directory.
+        `valid_dir (Path)`: Path to the validation set directory.
+        `test_dir (Path)`: Path to the test set directory.
+        `shard_size (int)`: Shard size; not used yet, `shard` is a stub.
+    '''
+    def __init__(self,
+                 train_dir: Path = Path('./data/resized_dataset/train'),
+                 valid_dir: Path = Path('./data/resized_dataset/valid'),
+                 test_dir: Path = Path('./data/resized_dataset/test'),
+                 shard_size: int = 5_000) -> None:
+        self.shard_size = shard_size
+
+        self.train_dir = train_dir
+        self.valid_dir = valid_dir
+        self.test_dir = test_dir
+
+        self.shard()
+
+    def shard(self) -> None:
+        ''' Placeholder, does nothing yet. '''
+        # TODO: split the files into smaller dirs and make list of them
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..41d5b6f
--- /dev/null
+++ b/test.py
@@ -0,0 +1,16 @@
+from pathlib import Path
+
+from dataset.dataset import Dataset
+
+
+def main() -> None:
+    ''' Smoke test: builds the train/valid datasets, prints one element. '''
+    train_dataset = Dataset(Path('data/resized_dataset/train'))
+    Dataset(Path('data/resized_dataset/valid'))  # only checks that it builds
+
+    for image, labels in train_dataset.dataset.take(1):
+        print(image, labels)
+
+
+if __name__ == '__main__':
+    main()