diff --git a/dataset/__init__.py b/dataset/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dataset/consts.py b/dataset/consts.py
new file mode 100644
index 0000000..5f71df7
--- /dev/null
+++ b/dataset/consts.py
@@ -0,0 +1,47 @@
+# Class names are matched verbatim against the two halves of each image's
+# parent directory name ("<plant>___<disease>") in dataset/dataset.py, so every
+# string must stay byte-identical to the on-disk directory naming. A name's
+# position in its list is its index in the one-hot label, so do not reorder.
+# NOTE(review): unusual spellings such as "Haunglongbing_(Citrus_greening)" and
+# the trailing underscore in "Common_rust_" presumably mirror the dataset's
+# directory names - verify against the data before "correcting" them.
+PLANT_CLASSES = [
+    "Tomato",
+    "Potato",
+    "Corn_(maize)",
+    "Apple",
+    "Blueberry",
+    "Soybean",
+    "Cherry_(including_sour)",
+    "Squash",
+    "Strawberry",
+    "Pepper,_bell",
+    "Peach",
+    "Grape",
+    "Orange",
+    "Raspberry",
+]
+
+DISEASE_CLASSES = [
+    "healthy",
+    "Northern_Leaf_Blight",
+    "Tomato_mosaic_virus",
+    "Early_blight",
+    "Leaf_scorch",
+    "Tomato_Yellow_Leaf_Curl_Virus",
+    "Cedar_apple_rust",
+    "Late_blight",
+    "Spider_mites Two-spotted_spider_mite",
+    "Black_rot",
+    "Bacterial_spot",
+    "Apple_scab",
+    "Powdery_mildew",
+    "Esca_(Black_Measles)",
+    "Haunglongbing_(Citrus_greening)",
+    "Leaf_Mold",
+    "Common_rust_",
+    "Target_Spot",
+    "Leaf_blight_(Isariopsis_Leaf_Spot)",
+    "Septoria_leaf_spot",
+    "Cercospora_leaf_spot Gray_leaf_spot",
+]
diff --git a/dataset/dataset.py b/dataset/dataset.py
new file mode 100644
index 0000000..e190384
--- /dev/null
+++ b/dataset/dataset.py
@@ -0,0 +1,103 @@
+import os
+from pathlib import Path
+
+import tensorflow as tf
+
+from .consts import DISEASE_CLASSES, PLANT_CLASSES
+
+
+class Dataset:
+    """Load an image dataset from disk into a batched ``tf.data.Dataset``.
+
+    Files are discovered with the glob ``<data_dir>/*/*``. The parent
+    directory of each file is expected to be named ``<plant>___<disease>``;
+    both parts are one-hot encoded against ``PLANT_CLASSES`` and
+    ``DISEASE_CLASSES``. The pipeline is shuffled, repeated, batched (dropping
+    the last partial batch) and prefetched. Attributes not defined on this
+    class are forwarded to the wrapped ``tf.data.Dataset`` (e.g. ``take``).
+
+    Args:
+        data_dir (Path): Path to the dataset directory.
+        seed (int): Seed for shuffling the dataset.
+        repeat (int): Number of times to repeat the dataset.
+        shuffle_buffer_size (int): Size of the buffer for shuffling the dataset.
+        batch_size (int): Batch size for the dataset.
+    """
+
+    def __init__(self,
+                 data_dir: Path,
+                 seed: int = 42,
+                 repeat: int = 1,
+                 shuffle_buffer_size: int = 10_000,
+                 batch_size: int = 64) -> None:
+        # Coerce so that plain string paths also work with the `/` join used
+        # when building the glob pattern.
+        self.data_dir = Path(data_dir)
+        self.seed = seed
+        self.repeat = repeat
+        self.shuffle_buffer_size = shuffle_buffer_size
+        self.batch_size = batch_size
+
+        self.dataset = (
+            self.__load_dataset()
+            .shuffle(self.shuffle_buffer_size, seed=self.seed)
+            .repeat(self.repeat)
+            .batch(self.batch_size, drop_remainder=True)
+            .prefetch(tf.data.AUTOTUNE)
+        )
+
+    def __load_dataset(self) -> tf.data.Dataset:
+        """Return the unbatched dataset for ``data_dir``.
+
+        Yields ``(image, plant_one_hot, disease_one_hot)`` tuples, except when
+        the directory path contains ``test``: those files are not preprocessed
+        yet and the dataset yields raw file paths.
+        """
+        # list_files shuffles file names by default; seed it as well so the
+        # order is reproducible instead of only the buffer shuffle being seeded.
+        dataset = tf.data.Dataset.list_files(
+            str(self.data_dir / '*/*'), seed=self.seed)
+
+        if 'test' in str(self.data_dir).lower():
+            # file names issue - labels have camel case (regex?) and differs
+            # from the train/valid sets
+            return dataset
+
+        return dataset.map(
+            self.__preprocess, num_parallel_calls=tf.data.AUTOTUNE)
+
+    def __get_labels(self, image_path):
+        """Return ``(plant, disease)`` one-hot ``uint8`` vectors for a path."""
+        # The class is encoded in the parent directory: <plant>___<disease>.
+        folder = tf.strings.split(image_path, os.path.sep)[-2]
+        parts = tf.strings.split(folder, '___')
+
+        # Comparing the scalar name against the class list broadcasts to one
+        # boolean per class.
+        one_hot_plant = tf.math.equal(parts[0], PLANT_CLASSES)
+        one_hot_disease = tf.math.equal(parts[1], DISEASE_CLASSES)
+
+        return (tf.cast(one_hot_plant, dtype=tf.uint8),
+                tf.cast(one_hot_disease, dtype=tf.uint8))
+
+    def __get_image(self, image_path):
+        """Read a JPEG file and return it as RGB ``float32`` scaled by 1/255."""
+        img = tf.io.read_file(image_path)
+        img = tf.io.decode_jpeg(img, channels=3)
+        return tf.cast(img, dtype=tf.float32) / 255.
+
+    def __preprocess(self, image_path):
+        """Map a file path to ``(image, plant_one_hot, disease_one_hot)``."""
+        one_hot_plant, one_hot_disease = self.__get_labels(image_path)
+        image = self.__get_image(image_path)
+
+        # returns X, Y1, Y2
+        return image, one_hot_plant, one_hot_disease
+
+    def __getattr__(self, attr):
+        """Forward unknown attributes to the wrapped ``tf.data.Dataset``."""
+        # __getattr__ only runs when normal lookup fails. If ``dataset`` itself
+        # is missing (instance built without __init__, e.g. by copy/pickle),
+        # reading self.dataset would re-enter this method until RecursionError.
+        if attr == 'dataset':
+            raise AttributeError(attr)
+        return getattr(self.dataset, attr)
diff --git a/file_manager/shard_files.py b/file_manager/shard_files.py
new file mode 100644
index 0000000..d4236f7
--- /dev/null
+++ b/file_manager/shard_files.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+
+
+# TODO: split the files into smaller dirs and make list of them
+class FileSharder:
+    """Placeholder for splitting the dataset directories into shards.
+
+    Sharding is not implemented yet: the constructor stores its arguments and
+    calls ``shard()``, which currently does nothing.
+
+    Args:
+        train_dir (Path): Training-set directory.
+        valid_dir (Path): Validation-set directory.
+        test_dir (Path): Test-set directory.
+        shard_size (int): Presumably the number of files per shard; stored
+            but not used yet.
+    """
+
+    def __init__(self,
+                 train_dir: Path = Path('./data/resized_dataset/train'),
+                 valid_dir: Path = Path('./data/resized_dataset/valid'),
+                 test_dir: Path = Path('./data/resized_dataset/test'),
+                 shard_size: int = 5_000) -> None:
+        self.shard_size = shard_size
+
+        self.train_dir = train_dir
+        self.valid_dir = valid_dir
+        self.test_dir = test_dir
+
+        self.shard()
+
+    def shard(self) -> None:
+        """Shard the dataset directories (not implemented; currently a no-op)."""
diff --git a/requirements.txt b/requirements.txt
index cd27218..cc6fe18 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 tensorflow==2.16.1
-tensorflow-io==0.37.0
 numpy==1.26.4
 opencv-python==4.9.0.80
 wget==3.2
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..a75f18f
--- /dev/null
+++ b/test.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+from dataset.dataset import Dataset
+
+
+def main() -> None:
+    """Build the train/valid pipelines and print one training batch."""
+    train_dataset = Dataset(Path('data/resized_dataset/train'))
+    # Not consumed below; constructing it still builds the validation pipeline.
+    valid_dataset = Dataset(Path('data/resized_dataset/valid'))
+
+    for batch in train_dataset.take(1):
+        print(batch)
+
+
+if __name__ == '__main__':
+    main()