352 KiB
352 KiB
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms, utils
%matplotlib inline
# Parameters
IMG_SIZE = 224
BATCH_SIZE = 32
IMG_SHOW_NUM = 6
# Data Augmentation: resize, normalize (ImageNet mean and std)
transformer = transforms.Compose([
transforms.Resize(size = (IMG_SIZE, IMG_SIZE), antialias = True),
transforms.ToTensor(),
transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]),
])
testTransformer = transforms.Compose([
transforms.Resize(size = (IMG_SIZE, IMG_SIZE), antialias = True),
transforms.ToTensor(),
transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]),
])
# Dataset from: https://www.kaggle.com/datasets/fanconic/skin-cancer-malignant-vs-benign
trainPath = "./train"
testPath = "./test"
trainData = datasets.ImageFolder(root = trainPath, transform = transformer)
testData = datasets.ImageFolder(root = testPath, transform = testTransformer)
trainLoader = DataLoader(trainData, batch_size = BATCH_SIZE, shuffle = True, num_workers = 4)
testLoader = DataLoader(testData, batch_size = BATCH_SIZE, shuffle = False, num_workers = 4)
# Show images
def show_images(imgs, title):
grid = utils.make_grid(imgs, nrow = 3, padding = 2, normalize=True)
plt.figure(figsize = (7, 10))
plt.imshow(np.transpose(grid, (1, 2, 0)))
plt.title(title)
dataiter = iter(trainLoader)
images, labels = next(dataiter)
show_images(images[:IMG_SHOW_NUM], title = "Training images")
# Train data statistics: number of samples, number of classes, classes, shape of the data
print("Train data statistics")
print("Number of samples: ", len(trainData))
print("Number of classes: ", len(trainData.classes))
print("Classes: ", trainData.classes)
print("Shape of the data: ", trainData[0][0].shape)
Train data statistics Number of samples: 11879 Number of classes: 2 Classes: ['Benign', 'Malignant'] Shape of the data: torch.Size([3, 224, 224])
# Test data statistics: number of samples, number of classes, classes, shape of the data
print("Test data statistics")
print("Number of samples: ", len(testData))
print("Number of classes: ", len(testData.classes))
print("Classes: ", testData.classes)
print("Shape of the data: ", testData[0][0].shape)
Test data statistics Number of samples: 2000 Number of classes: 2 Classes: ['Benign', 'Malignant'] Shape of the data: torch.Size([3, 224, 224])
# Number of samples per class in train data
class_dict = dict.fromkeys(testData.class_to_idx, 0)
for i in range(len(trainData)):
class_dict[trainData.classes[trainData[i][1]]] += 1
print(class_dict)
{'Benign': 6289, 'Malignant': 5590}
# Mean and std of the train data
mean = 0
std = 0
for images, _ in trainLoader:
batch_samples = images.size(0)
images = images.view(batch_samples, images.size(1), -1)
mean += images.mean(2).sum(0)
std += images.std(2).sum(0)
mean /= len(trainLoader.dataset)
std /= len(trainLoader.dataset)
print("Mean: ", mean)
print("Std: ", std)
Mean: tensor([1.0394, 0.4445, 0.5912]) Std: tensor([0.5279, 0.5983, 0.6442])
# Split train data into train and validation data
trainData, valData = random_split(trainData, [int(0.8*len(trainData)), len(trainData) - int(0.8*len(trainData))])
valLoader = DataLoader(valData, batch_size = BATCH_SIZE, shuffle = False, num_workers = 4)
print("Number of samples in train data: ", len(trainData))
print("Number of samples in validation data: ", len(valData))
Number of samples in train data: 9503 Number of samples in validation data: 2376