!wget --quiet http://sceneparsing.csail.mit.edu/data/ChallengeData2017/images.tar
!wget --quiet http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar
!tar -xf images.tar
!tar -xf annotations_instance.tar
!rm images.tar annotations_instance.tar
!pip install -qU torch_snippets
!wget --quiet https://raw.githubusercontent.com/pytorch/vision/release/0.12/references/detection/engine.py
!wget --quiet https://raw.githubusercontent.com/pytorch/vision/release/0.12/references/detection/utils.py
!wget --quiet https://raw.githubusercontent.com/pytorch/vision/release/0.12/references/detection/transforms.py
!wget --quiet https://raw.githubusercontent.com/pytorch/vision/release/0.12/references/detection/coco_eval.py
!wget --quiet https://raw.githubusercontent.com/pytorch/vision/release/0.12/references/detection/coco_utils.py
!pip install -q -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
from torch_snippets import *
from torch_snippets.inspector import inspect
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from engine import train_one_epoch, evaluate
import utils
import transforms as T
device = 'cuda' if torch.cuda.is_available() else 'cpu'
all_images = Glob('images/training')
all_annots = Glob('annotations_instance/training')
f = 'ADE_train_00014301'
im = read(find(f, all_images), 1)
an = read(find(f, all_annots), 1).transpose(2,0,1)
r,g,b = an  # R channel: semantic class id, G channel: per-class instance id
nzs = np.nonzero(r==4) # 4 stands for person
instances = np.unique(g[nzs])  # instance ids of the persons in this image
masks = np.zeros((len(instances), *r.shape))
for ix,_id in enumerate(instances):
    masks[ix] = g==_id
subplots([im, *masks], sz=20)
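Since the red channel carries the semantic class id and the green channel numbers the instances within that class, the same recipe extends to any class. A small helper along these lines (a sketch; masks_for_class is not part of the original notebook):
# Sketch: per-instance binary masks for an arbitrary class id
def masks_for_class(ann_path, class_id):
    r, g, _ = read(ann_path, 1).transpose(2,0,1)
    ids = np.unique(g[np.nonzero(r == class_id)])
    return np.stack([g == i for i in ids]) if len(ids) else np.zeros((0, *r.shape))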
annots = []
for ann in Tqdm(all_annots[:5000]):
    _ann = read(ann, 1).transpose(2,0,1)
    r,g,b = _ann
    if 4 not in np.unique(r): continue  # keep only images that contain at least one person
    annots.append(ann)
from sklearn.model_selection import train_test_split
_annots = stems(annots)
trn_items, val_items = train_test_split(_annots, random_state=2)
def get_transform(train):
    image_transforms = []
    image_transforms.append(T.PILToTensor())
    if train:
        image_transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(image_transforms)
class MasksDataset(Dataset):
    def __init__(self, items, transforms, N):
        self.items = items
        self.transforms = transforms
        self.N = N
    def get_mask(self, path):
        an = read(path, 1).transpose(2,0,1)
        r,g,b = an
        nzs = np.nonzero(r==4)  # person pixels
        instances = np.unique(g[nzs])
        masks = np.zeros((len(instances), *r.shape))
        for ix,_id in enumerate(instances):
            masks[ix] = g==_id
        return masks
    def __getitem__(self, ix):
        _id = self.items[ix]
        img_path = f'images/training/{_id}.jpg'
        mask_path = f'annotations_instance/training/{_id}.png'
        masks = self.get_mask(mask_path)
        obj_ids = np.arange(1, len(masks)+1)
        img = Image.open(img_path).convert("RGB")
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            obj_pixels = np.where(masks[i])
            xmin = np.min(obj_pixels[1])
            xmax = np.max(obj_pixels[1])
            ymin = np.min(obj_pixels[0])
            ymax = np.max(obj_pixels[0])
            # guard against degenerate (near-zero-area) boxes, which break box regression
            if ((xmax-xmin) <= 10) or ((ymax-ymin) <= 10):
                xmax = xmin + 10
                ymax = ymin + 10
            boxes.append([xmin, ymin, xmax, ymax])
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.ones((num_objs,), dtype=torch.int64)  # single foreground class: person
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        image_id = torch.tensor([ix])
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        # T.PILToTensor yields a uint8 tensor; scale it to floats in [0, 1]
        if img.dtype == torch.uint8:
            img = img/255.
        return img, target
    def __len__(self):
        return self.N
    def choose(self):
        return self[randint(len(self))]
x = MasksDataset(trn_items, get_transform(train=True), N=100)
im,targ = x[0]
inspect(im,targ)
subplots([im, *targ['masks']], sz=10)
Tensor    Shape: torch.Size([3, 512, 684])   Min: 0.000     Max: 1.000       Mean: 0.486     dtype: torch.float32
Dict Of 6 items
BOXES:    Tensor  Shape: torch.Size([3, 4])         Min: 42.000    Max: 477.000     Mean: 259.417   dtype: torch.float32
LABELS:   Tensor  Shape: torch.Size([3])            Min: 1.000     Max: 1.000       Mean: 1.000     dtype: torch.int64
MASKS:    Tensor  Shape: torch.Size([3, 512, 684])  Min: 0.000     Max: 1.000       Mean: 0.008     dtype: torch.uint8
IMAGE_ID: Tensor  Shape: torch.Size([1])            Min: 0.000     Max: 0.000       Mean: 0.000     dtype: torch.int64
AREA:     Tensor  Shape: torch.Size([3])            Min: 1932.000  Max: 10688.000   Mean: 5270.667  dtype: torch.float32
... 1 more item(s)
def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    # get the number of input features for the box classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained box head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer, num_classes)
    return model
model = get_model_instance_segmentation(2).to(device)
model
MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      ... ResNet-50 stem plus layer1-layer4 Bottleneck stacks, all with FrozenBatchNorm2d ...
    )
    (fpn): FeaturePyramidNetwork(
      (inner_blocks): ModuleList( ... four 1x1 convs mapping 256/512/1024/2048 channels to 256 ... )
      (layer_blocks): ModuleList( ... four 3x3 convs, 256 -> 256 channels ... )
      (extra_blocks): LastLevelMaxPool()
    )
  )
  (rpn): RegionProposalNetwork(
    (anchor_generator): AnchorGenerator()
    (head): RPNHead(
      (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
      (bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
    )
  )
  (roi_heads): RoIHeads(
    (box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
    (box_head): TwoMLPHead(
      (fc6): Linear(in_features=12544, out_features=1024, bias=True)
      (fc7): Linear(in_features=1024, out_features=1024, bias=True)
    )
    (box_predictor): FastRCNNPredictor(
      (cls_score): Linear(in_features=1024, out_features=2, bias=True)
      (bbox_pred): Linear(in_features=1024, out_features=8, bias=True)
    )
    (mask_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(14, 14), sampling_ratio=2)
    (mask_head): MaskRCNNHeads(
      ... four 3x3 Conv2d(256, 256) + ReLU blocks ...
    )
    (mask_predictor): MaskRCNNPredictor(
      (conv5_mask): ConvTranspose2d(256, 256, kernel_size=(2, 2), stride=(2, 2))
      (relu): ReLU(inplace=True)
      (mask_fcn_logits): Conv2d(256, 2, kernel_size=(1, 1), stride=(1, 1))
    )
  )
)
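Before training, it is worth confirming that the head surgery took effect; the printout above shows both new heads emitting two outputs (background and person), which a quick sketch can assert programmatically:
# Sanity check (sketch): both replaced heads should now predict 2 classes
assert model.roi_heads.box_predictor.cls_score.out_features == 2
assert model.roi_heads.mask_predictor.mask_fcn_logits.out_channels == 2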
dataset = MasksDataset(trn_items, get_transform(train=True), N=len(trn_items))
dataset_test = MasksDataset(val_items, get_transform(train=False), N=len(val_items))
# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True, num_workers=0,
    collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=0,
    collate_fn=utils.collate_fn)
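Note that detection batches cannot be stacked into a single tensor, because the images differ in size and in the number of objects; utils.collate_fn therefore returns tuples of images and targets. A quick way to see this (a sketch, not part of the original flow):
# Sketch: fetch one batch; images and targets arrive as tuples, not stacked tensors
imgs, targets = next(iter(data_loader))
print(len(imgs), imgs[0].shape, targets[0]['boxes'].shape)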
num_classes = 2
model = get_model_instance_segmentation(num_classes).to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
                            momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler that decays the LR by 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)
num_epochs = 1
trn_history = []
for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    res = train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    trn_history.append(res)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    res = evaluate(model, data_loader_test, device=device)
Epoch: [0]  [  0/482]  eta: 0:19:00  lr: 0.000015  loss: 5.5752 (5.5752)  loss_classifier: 0.8792 (0.8792)  loss_box_reg: 0.8384 (0.8384)  loss_mask: 3.7591 (3.7591)  loss_objectness: 0.0579 (0.0579)  loss_rpn_box_reg: 0.0406 (0.0406)  time: 2.3664  data: 0.2396  max mem: 5656
Epoch: [0]  [100/482]  eta: 0:13:17  lr: 0.001054  loss: 0.9208 (1.3765)  loss_classifier: 0.1397 (0.2671)  loss_box_reg: 0.3707 (0.3114)  loss_mask: 0.3647 (0.7410)  loss_objectness: 0.0144 (0.0351)  loss_rpn_box_reg: 0.0127 (0.0219)  time: 2.0945  data: 0.0482  max mem: 7397
Epoch: [0]  [200/482]  eta: 0:09:49  lr: 0.002092  loss: 0.6406 (1.0779)  loss_classifier: 0.1390 (0.1998)  loss_box_reg: 0.1775 (0.2714)  loss_mask: 0.3061 (0.5442)  loss_objectness: 0.0176 (0.0383)  loss_rpn_box_reg: 0.0076 (0.0242)  time: 2.1139  data: 0.0526  max mem: 7472
Epoch: [0]  [300/482]  eta: 0:06:20  lr: 0.003131  loss: 0.6487 (0.9488)  loss_classifier: 0.0922 (0.1709)  loss_box_reg: 0.1191 (0.2404)  loss_mask: 0.3320 (0.4754)  loss_objectness: 0.0427 (0.0389)  loss_rpn_box_reg: 0.0121 (0.0232)  time: 2.0836  data: 0.0515  max mem: 7472
Epoch: [0]  [400/482]  eta: 0:02:51  lr: 0.004169  loss: 0.7474 (0.8952)  loss_classifier: 0.1134 (0.1600)  loss_box_reg: 0.2025 (0.2292)  loss_mask: 0.3490 (0.4431)  loss_objectness: 0.0315 (0.0381)  loss_rpn_box_reg: 0.0130 (0.0248)  time: 2.1243  data: 0.0560  max mem: 7472
Epoch: [0]  [481/482]  eta: 0:00:02  lr: 0.005000  loss: 0.6555 (0.8611)  loss_classifier: 0.1407 (0.1528)  loss_box_reg: 0.1366 (0.2213)  loss_mask: 0.3131 (0.4257)  loss_objectness: 0.0219 (0.0359)  loss_rpn_box_reg: 0.0134 (0.0254)  time: 1.9022  data: 0.0456  max mem: 7472
Epoch: [0] Total time: 0:16:38 (2.0720 s / it)
creating index...
index created!
Test:  [  0/321]  eta: 0:03:07  model_time: 0.5358 (0.5358)  evaluator_time: 0.0336 (0.0336)  time: 0.5840  data: 0.0141  max mem: 7472
Test:  [320/321]  eta: 0:00:00  model_time: 0.4608 (0.4663)  evaluator_time: 0.0370 (0.0441)  time: 0.5569  data: 0.0280  max mem: 7472
Test: Total time: 0:02:51 (0.5352 s / it)
Averaged stats: model_time: 0.4608 (0.4663)  evaluator_time: 0.0370 (0.0441)
Accumulating evaluation results... DONE (t=0.13s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.338
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.665
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.296
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.208
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.402
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.488
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.140
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.405
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.485
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.374
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.545
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.614
IoU metric: segm
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.295
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.630
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.231
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.158
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.355
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.466
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.126
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.356
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.422
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.304
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.487
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.553
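With an epoch of training done, it is sensible to persist the fine-tuned weights so inference does not require retraining (a sketch; the filename is illustrative):
# Save the fine-tuned weights (hypothetical path)
torch.save(model.state_dict(), 'maskrcnn_person.pth')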
import matplotlib.pyplot as plt
plt.title('Training Loss')
losses = [np.mean(list(trn_history[i].meters['loss'].deque)) for i in range(len(trn_history))]
plt.plot(losses)
model.eval()
im = dataset_test[10][0]
show(im)
with torch.no_grad():
    prediction = model([im.to(device)])
    for i in range(len(prediction[0]['masks'])):
        plt.imshow(Image.fromarray(prediction[0]['masks'][i, 0].mul(255).byte().cpu().numpy()))
        plt.title('Class: '+str(prediction[0]['labels'][i].cpu().numpy())+' Score:'+str(prediction[0]['scores'][i].cpu().numpy()))
        plt.show()
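In practice you would typically keep only confident detections and binarize the soft mask probabilities before overlaying them; a minimal sketch, with both 0.5 thresholds chosen purely for illustration:
# Sketch: filter detections by score and binarize the predicted soft masks
keep = prediction[0]['scores'] > 0.5
binary_masks = (prediction[0]['masks'][keep, 0] > 0.5).cpu().numpy()
print(f'{keep.sum().item()} person instance(s) above threshold 0.5')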