From d7fd12985ca6a0749e9b112a2a5117615324de20 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Mon, 12 Sep 2022 19:01:14 +0200 Subject: [PATCH 1/3] [proto] Added consistency tests for detection transforms --- test/test_prototype_transforms_consistency.py | 680 +++++++++++++++++- 1 file changed, 679 insertions(+), 1 deletion(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 2bb98002e12..d3496d623cc 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -6,7 +6,7 @@ import pytest import torch -from prototype_common_utils import ArgsKwargs, assert_equal, make_images +from prototype_common_utils import ArgsKwargs, assert_equal, make_bounding_box, make_images from torchvision import transforms as legacy_transforms from torchvision._utils import sequence_to_str from torchvision.prototype import features, transforms as prototype_transforms @@ -840,3 +840,681 @@ def test_aa(self, inpt, interpolation): output = t(inpt) assert_equal(expected_output, output) + + +class TestRefDetTransforms: + def make_datapoints(self, with_mask=True): + size = (600, 800) + + pil_image = PIL.Image.new("RGB", size[::-1], 123) + target = { + "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(22,), dtype=torch.float), + "labels": features.Label(torch.randint(0, 81, size=(22,))), + } + if with_mask: + target["masks"] = features.SegmentationMask(torch.randint(0, 2, size=(22, *size), dtype=torch.long)) + + yield (pil_image, target) + + tensor_image = torch.randint(0, 256, size=(3, *size), dtype=torch.uint8) + target = { + "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(22,), dtype=torch.float), + "labels": features.Label(torch.randint(0, 81, size=(22,))), + } + if with_mask: + target["masks"] = features.SegmentationMask(torch.randint(0, 2, size=(22, *size), dtype=torch.long)) + + yield (tensor_image, target) + + feature_image = features.Image(torch.randint(0, 256, size=(3, *size), dtype=torch.uint8)) + target = { + "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(22,), dtype=torch.float), + "labels": features.Label(torch.randint(0, 81, size=(22,))), + } + if with_mask: + target["masks"] = features.SegmentationMask(torch.randint(0, 2, size=(22, *size), dtype=torch.long)) + + yield (feature_image, target) + + def _test_transform(self, t_ref, t, data_kwargs={}): + for dp in self.make_datapoints(**data_kwargs): + + # We should use prototype transform first as reference transform performs inplace target update + torch.manual_seed(12) + output = t(dp) + + torch.manual_seed(12) + expected_output = t_ref(*dp) + + assert_equal(expected_output, output) + + def test_randomhorizontalflip(self): + t_ref = RandomHorizontalFlip(p=1.0) + t = prototype_transforms.RandomHorizontalFlip(p=1.0) + self._test_transform(t_ref, t) + + def test_randomioucrop(self): + t_ref = RandomIoUCrop() + t = prototype_transforms.RandomIoUCrop() + self._test_transform(t_ref, t, {"with_mask": False}) + + def test_randomzoomout(self): + t_ref = RandomZoomOut() + t = prototype_transforms.RandomZoomOut() + self._test_transform(t_ref, t, {"with_mask": False}) + + def test_scalejitter(self): + t_ref = ScaleJitter((1024, 1024)) + t = prototype_transforms.ScaleJitter((1024, 1024)) + self._test_transform(t_ref, t) + + def test_fixedsizecrop(self): + t_ref = FixedSizeCrop(size=(1024, 1024), fill=0) + t = prototype_transforms.FixedSizeCrop(size=(1024, 1024), fill=0) + 
self._test_transform(t_ref, t) + + def test_randomshortestsize(self): + t_ref = RandomShortestSize(min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333) + t = prototype_transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 + ) + self._test_transform(t_ref, t) + + +# ----- +# Dumped reference detection transforms here for consistency checks +# torchvision/references/detection/transforms.py +# ----- +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torchvision +from torch import nn, Tensor +from torchvision import ops +from torchvision.transforms import functional as F, InterpolationMode, transforms as T + + +def _flip_coco_person_keypoints(kps, width): + flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] + flipped_data = kps[:, flip_inds] + flipped_data[..., 0] = width - flipped_data[..., 0] + # Maintain COCO convention that if visibility == 0, then x, y = 0 + inds = flipped_data[..., 2] == 0 + flipped_data[inds] = 0 + return flipped_data + + +class Compose: + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + +class RandomHorizontalFlip(T.RandomHorizontalFlip): + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if torch.rand(1) < self.p: + image = F.hflip(image) + if target is not None: + _, _, width = F.get_dimensions(image) + target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]] + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + if "keypoints" in target: + keypoints = target["keypoints"] + keypoints = _flip_coco_person_keypoints(keypoints, width) + target["keypoints"] = keypoints + return image, target + + +class PILToTensor(nn.Module): + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + image = F.pil_to_tensor(image) + return image, target + + +class ConvertImageDtype(nn.Module): + def __init__(self, dtype: torch.dtype) -> None: + super().__init__() + self.dtype = dtype + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + image = F.convert_image_dtype(image, self.dtype) + return image, target + + +class RandomIoUCrop(nn.Module): + def __init__( + self, + min_scale: float = 0.3, + max_scale: float = 1.0, + min_aspect_ratio: float = 0.5, + max_aspect_ratio: float = 2.0, + sampler_options: Optional[List[float]] = None, + trials: int = 40, + ): + super().__init__() + # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 + self.min_scale = min_scale + self.max_scale = max_scale + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + if sampler_options is None: + sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] + self.options = sampler_options + self.trials = trials + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if target is None: + raise ValueError("The targets can't be None for this transform.") + + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError(f"image should be 2/3 dimensional. 
Got {image.ndimension()} dimensions.") + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + _, orig_h, orig_w = F.get_dimensions(image) + + while True: + # sample an option + idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) + min_jaccard_overlap = self.options[idx] + if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option + return image, target + + for _ in range(self.trials): + # check the aspect ratio limitations + r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) + new_w = int(orig_w * r[0]) + new_h = int(orig_h * r[1]) + aspect_ratio = new_w / new_h + if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): + continue + + # check for 0 area crops + r = torch.rand(2) + left = int((orig_w - new_w) * r[0]) + top = int((orig_h - new_h) * r[1]) + right = left + new_w + bottom = top + new_h + if left == right or top == bottom: + continue + + # check for any valid boxes with centers within the crop area + cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) + cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) + is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom) + if not is_within_crop_area.any(): + continue + + # check at least 1 box with jaccard limitations + boxes = target["boxes"][is_within_crop_area] + ious = torchvision.ops.boxes.box_iou( + boxes, torch.tensor([[left, top, right, bottom]], dtype=boxes.dtype, device=boxes.device) + ) + if ious.max() < min_jaccard_overlap: + continue + + # keep only valid boxes and perform cropping + target["boxes"] = boxes + target["labels"] = target["labels"][is_within_crop_area] + target["boxes"][:, 0::2] -= left + target["boxes"][:, 1::2] -= top + target["boxes"][:, 0::2].clamp_(min=0, max=new_w) + target["boxes"][:, 1::2].clamp_(min=0, max=new_h) + image = F.crop(image, top, left, new_h, new_w) + + return image, target + + +class RandomZoomOut(nn.Module): + def __init__( + self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1.0, 4.0), p: float = 0.5 + ): + super().__init__() + if fill is None: + fill = [0.0, 0.0, 0.0] + self.fill = fill + self.side_range = side_range + if side_range[0] < 1.0 or side_range[0] > side_range[1]: + raise ValueError(f"Invalid canvas side range provided {side_range}.") + self.p = p + + @torch.jit.unused + def _get_fill_value(self, is_pil): + # type: (bool) -> int + # We fake the type to make it work on JIT + return tuple(int(x) for x in self.fill) if is_pil else 0 + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError(f"image should be 2/3 dimensional. 
Got {image.ndimension()} dimensions.") + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + if torch.rand(1) >= self.p: + return image, target + + _, orig_h, orig_w = F.get_dimensions(image) + + r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) + canvas_width = int(orig_w * r) + canvas_height = int(orig_h * r) + + r = torch.rand(2) + left = int((canvas_width - orig_w) * r[0]) + top = int((canvas_height - orig_h) * r[1]) + right = canvas_width - (left + orig_w) + bottom = canvas_height - (top + orig_h) + + if torch.jit.is_scripting(): + fill = 0 + else: + fill = self._get_fill_value(F._is_pil_image(image)) + + image = F.pad(image, [left, top, right, bottom], fill=fill) + if isinstance(image, torch.Tensor): + # PyTorch's pad supports only integers on fill. So we need to overwrite the colour + v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1) + image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h) :, :] = image[ + ..., :, (left + orig_w) : + ] = v + + if target is not None: + target["boxes"][:, 0::2] += left + target["boxes"][:, 1::2] += top + + return image, target + + +class RandomPhotometricDistort(nn.Module): + def __init__( + self, + contrast: Tuple[float, float] = (0.5, 1.5), + saturation: Tuple[float, float] = (0.5, 1.5), + hue: Tuple[float, float] = (-0.05, 0.05), + brightness: Tuple[float, float] = (0.875, 1.125), + p: float = 0.5, + ): + super().__init__() + self._brightness = T.ColorJitter(brightness=brightness) + self._contrast = T.ColorJitter(contrast=contrast) + self._hue = T.ColorJitter(hue=hue) + self._saturation = T.ColorJitter(saturation=saturation) + self.p = p + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + r = torch.rand(7) + + if r[0] < self.p: + image = self._brightness(image) + + contrast_before = r[1] < 0.5 + if contrast_before: + if r[2] < self.p: + image = self._contrast(image) + + if r[3] < self.p: + image = self._saturation(image) + + if r[4] < self.p: + image = self._hue(image) + + if not contrast_before: + if r[5] < self.p: + image = self._contrast(image) + + if r[6] < self.p: + channels, _, _ = F.get_dimensions(image) + permutation = torch.randperm(channels) + + is_pil = F._is_pil_image(image) + if is_pil: + image = F.pil_to_tensor(image) + image = F.convert_image_dtype(image) + image = image[..., permutation, :, :] + if is_pil: + image = F.to_pil_image(image) + + return image, target + + +class ScaleJitter(nn.Module): + """Randomly resizes the image and its bounding boxes within the specified scale range. + The class implements the Scale Jitter augmentation as described in the paper + `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_. + + Args: + target_size (tuple of ints): The target size for the transform provided in (height, weight) format. + scale_range (tuple of ints): scaling factor interval, e.g (a, b), then scale is randomly sampled from the + range a <= scale <= b. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. 
+ """ + + def __init__( + self, + target_size: Tuple[int, int], + scale_range: Tuple[float, float] = (0.1, 2.0), + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + ): + super().__init__() + self.target_size = target_size + self.scale_range = scale_range + self.interpolation = interpolation + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + _, orig_height, orig_width = F.get_dimensions(image) + + scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) + r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale + new_width = int(orig_width * r) + new_height = int(orig_height * r) + + image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) + + if target is not None: + target["boxes"][:, 0::2] *= new_width / orig_width + target["boxes"][:, 1::2] *= new_height / orig_height + if "masks" in target: + target["masks"] = F.resize( + target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST + ) + + return image, target + + +class FixedSizeCrop(nn.Module): + def __init__(self, size, fill=0, padding_mode="constant"): + super().__init__() + size = tuple(T._setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")) + self.crop_height = size[0] + self.crop_width = size[1] + self.fill = fill # TODO: Fill is currently respected only on PIL. Apply tensor patch. + self.padding_mode = padding_mode + + def _pad(self, img, target, padding): + # Taken from the functional_tensor.py pad + if isinstance(padding, int): + pad_left = pad_right = pad_top = pad_bottom = padding + elif len(padding) == 1: + pad_left = pad_right = pad_top = pad_bottom = padding[0] + elif len(padding) == 2: + pad_left = pad_right = padding[0] + pad_top = pad_bottom = padding[1] + else: + pad_left = padding[0] + pad_top = padding[1] + pad_right = padding[2] + pad_bottom = padding[3] + + padding = [pad_left, pad_top, pad_right, pad_bottom] + img = F.pad(img, padding, self.fill, self.padding_mode) + if target is not None: + target["boxes"][:, 0::2] += pad_left + target["boxes"][:, 1::2] += pad_top + if "masks" in target: + target["masks"] = F.pad(target["masks"], padding, 0, "constant") + + return img, target + + def _crop(self, img, target, top, left, height, width): + img = F.crop(img, top, left, height, width) + if target is not None: + boxes = target["boxes"] + boxes[:, 0::2] -= left + boxes[:, 1::2] -= top + boxes[:, 0::2].clamp_(min=0, max=width) + boxes[:, 1::2].clamp_(min=0, max=height) + + is_valid = (boxes[:, 0] < boxes[:, 2]) & (boxes[:, 1] < boxes[:, 3]) + + target["boxes"] = boxes[is_valid] + target["labels"] = target["labels"][is_valid] + if "masks" in target: + target["masks"] = F.crop(target["masks"][is_valid], top, left, height, width) + + return img, target + + def forward(self, img, target=None): + _, height, width = F.get_dimensions(img) + new_height = min(height, self.crop_height) + new_width = min(width, self.crop_width) + + if new_height != height or new_width != width: + offset_height = max(height - self.crop_height, 0) + offset_width = max(width - self.crop_width, 0) + + r = torch.rand(1) + top = int(offset_height * r) + left = int(offset_width * r) + + img, 
target = self._crop(img, target, top, left, new_height, new_width) + + pad_bottom = max(self.crop_height - new_height, 0) + pad_right = max(self.crop_width - new_width, 0) + if pad_bottom != 0 or pad_right != 0: + img, target = self._pad(img, target, [0, 0, pad_right, pad_bottom]) + + return img, target + + +class RandomShortestSize(nn.Module): + def __init__( + self, + min_size: Union[List[int], Tuple[int], int], + max_size: int, + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + ): + super().__init__() + self.min_size = [min_size] if isinstance(min_size, int) else list(min_size) + self.max_size = max_size + self.interpolation = interpolation + + def forward( + self, image: Tensor, target: Optional[Dict[str, Tensor]] = None + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + _, orig_height, orig_width = F.get_dimensions(image) + + min_size = self.min_size[torch.randint(len(self.min_size), (1,)).item()] + r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) + + new_width = int(orig_width * r) + new_height = int(orig_height * r) + + image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) + + if target is not None: + target["boxes"][:, 0::2] *= new_width / orig_width + target["boxes"][:, 1::2] *= new_height / orig_height + if "masks" in target: + target["masks"] = F.resize( + target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST + ) + + return image, target + + +def _copy_paste( + image: torch.Tensor, + target: Dict[str, Tensor], + paste_image: torch.Tensor, + paste_target: Dict[str, Tensor], + blending: bool = True, + resize_interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, +) -> Tuple[torch.Tensor, Dict[str, Tensor]]: + + # Random paste targets selection: + num_masks = len(paste_target["masks"]) + + if num_masks < 1: + # Such degerante case with num_masks=0 can happen with LSJ + # Let's just return (image, target) + return image, target + + # We have to please torch script by explicitly specifying dtype as torch.long + random_selection = torch.randint(0, num_masks, (num_masks,), device=paste_image.device) + random_selection = torch.unique(random_selection).to(torch.long) + + paste_masks = paste_target["masks"][random_selection] + paste_boxes = paste_target["boxes"][random_selection] + paste_labels = paste_target["labels"][random_selection] + + masks = target["masks"] + + # We resize source and paste data if they have different sizes + # This is something we introduced here as originally the algorithm works + # on equal-sized data (for example, coming from LSJ data augmentations) + size1 = image.shape[-2:] + size2 = paste_image.shape[-2:] + if size1 != size2: + paste_image = F.resize(paste_image, size1, interpolation=resize_interpolation) + paste_masks = F.resize(paste_masks, size1, interpolation=F.InterpolationMode.NEAREST) + # resize bboxes: + ratios = torch.tensor((size1[1] / size2[1], size1[0] / size2[0]), device=paste_boxes.device) + paste_boxes = paste_boxes.view(-1, 2, 2).mul(ratios).view(paste_boxes.shape) + + paste_alpha_mask = paste_masks.sum(dim=0) > 0 + + if blending: + paste_alpha_mask = F.gaussian_blur( + paste_alpha_mask.unsqueeze(0), + kernel_size=(5, 5), + sigma=[ + 2.0, + ], + ) + + # Copy-paste images: + image = (image * (~paste_alpha_mask)) + (paste_image * paste_alpha_mask) + + # Copy-paste masks: + masks = masks * (~paste_alpha_mask) + non_all_zero_masks = masks.sum((-1, -2)) > 0 + masks = masks[non_all_zero_masks] + + # Do a shallow copy 
of the target dict + out_target = {k: v for k, v in target.items()} + + out_target["masks"] = torch.cat([masks, paste_masks]) + + # Copy-paste boxes and labels + boxes = ops.masks_to_boxes(masks) + out_target["boxes"] = torch.cat([boxes, paste_boxes]) + + labels = target["labels"][non_all_zero_masks] + out_target["labels"] = torch.cat([labels, paste_labels]) + + # Update additional optional keys: area and iscrowd if exist + if "area" in target: + out_target["area"] = out_target["masks"].sum((-1, -2)).to(torch.float32) + + if "iscrowd" in target and "iscrowd" in paste_target: + # target['iscrowd'] size can be differ from mask size (non_all_zero_masks) + # For example, if previous transforms geometrically modifies masks/boxes/labels but + # does not update "iscrowd" + if len(target["iscrowd"]) == len(non_all_zero_masks): + iscrowd = target["iscrowd"][non_all_zero_masks] + paste_iscrowd = paste_target["iscrowd"][random_selection] + out_target["iscrowd"] = torch.cat([iscrowd, paste_iscrowd]) + + # Check for degenerated boxes and remove them + boxes = out_target["boxes"] + degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] + if degenerate_boxes.any(): + valid_targets = ~degenerate_boxes.any(dim=1) + + out_target["boxes"] = boxes[valid_targets] + out_target["masks"] = out_target["masks"][valid_targets] + out_target["labels"] = out_target["labels"][valid_targets] + + if "area" in out_target: + out_target["area"] = out_target["area"][valid_targets] + if "iscrowd" in out_target and len(out_target["iscrowd"]) == len(valid_targets): + out_target["iscrowd"] = out_target["iscrowd"][valid_targets] + + return image, out_target + + +class SimpleCopyPaste(torch.nn.Module): + def __init__(self, blending=True, resize_interpolation=F.InterpolationMode.BILINEAR): + super().__init__() + self.resize_interpolation = resize_interpolation + self.blending = blending + + def forward( + self, images: List[torch.Tensor], targets: List[Dict[str, Tensor]] + ) -> Tuple[List[torch.Tensor], List[Dict[str, Tensor]]]: + torch._assert( + isinstance(images, (list, tuple)) and all([isinstance(v, torch.Tensor) for v in images]), + "images should be a list of tensors", + ) + torch._assert( + isinstance(targets, (list, tuple)) and len(images) == len(targets), + "targets should be a list of the same size as images", + ) + for target in targets: + # Can not check for instance type dict with inside torch.jit.script + # torch._assert(isinstance(target, dict), "targets item should be a dict") + for k in ["masks", "boxes", "labels"]: + torch._assert(k in target, f"Key {k} should be present in targets") + torch._assert(isinstance(target[k], torch.Tensor), f"Value for the key {k} should be a tensor") + + # images = [t1, t2, ..., tN] + # Let's define paste_images as shifted list of input images + # paste_images = [t2, t3, ..., tN, t1] + # FYI: in TF they mix data on the dataset level + images_rolled = images[-1:] + images[:-1] + targets_rolled = targets[-1:] + targets[:-1] + + output_images: List[torch.Tensor] = [] + output_targets: List[Dict[str, Tensor]] = [] + + for image, target, paste_image, paste_target in zip(images, targets, images_rolled, targets_rolled): + output_image, output_data = _copy_paste( + image, + target, + paste_image, + paste_target, + blending=self.blending, + resize_interpolation=self.resize_interpolation, + ) + output_images.append(output_image) + output_targets.append(output_data) + + return output_images, output_targets + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}(blending={self.blending}, 
resize_interpolation={self.resize_interpolation})" + return s From b7ebf254a6d950f464882929012e9dbfdcea1657 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Thu, 15 Sep 2022 11:29:24 +0200 Subject: [PATCH 2/3] Updated tests according to the review --- test/test_prototype_transforms_consistency.py | 697 ++---------------- 1 file changed, 53 insertions(+), 644 deletions(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index d3496d623cc..9a64f16f746 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -1,16 +1,26 @@ import enum import inspect +from importlib.machinery import SourceFileLoader +from pathlib import Path import numpy as np import PIL.Image import pytest import torch -from prototype_common_utils import ArgsKwargs, assert_equal, make_bounding_box, make_images +from prototype_common_utils import ( + ArgsKwargs, + assert_equal, + make_bounding_box, + make_detection_mask, + make_image, + make_images, + make_label, +) from torchvision import transforms as legacy_transforms from torchvision._utils import sequence_to_str from torchvision.prototype import features, transforms as prototype_transforms -from torchvision.prototype.transforms.functional import to_image_pil +from torchvision.prototype.transforms.functional import to_image_pil, to_pil_image DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=[features.ColorSpace.RGB], extra_dims=[(4,)]) @@ -842,41 +852,71 @@ def test_aa(self, inpt, interpolation): assert_equal(expected_output, output) +# Import reference detection transforms here for consistency checks +# torchvision/references/detection/transforms.py +ref_det_filepath = Path(__file__).parent.parent / "references" / "detection" / "transforms.py" +det_transforms = SourceFileLoader(ref_det_filepath.stem, ref_det_filepath.as_posix()).load_module() + + class TestRefDetTransforms: def make_datapoints(self, with_mask=True): size = (600, 800) + num_objects = 22 - pil_image = PIL.Image.new("RGB", size[::-1], 123) + pil_image = to_pil_image(make_image(size=size, color_space=features.ColorSpace.RGB)) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(22,), dtype=torch.float), - "labels": features.Label(torch.randint(0, 81, size=(22,))), + "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "labels": make_label(size=(num_objects,)), } if with_mask: - target["masks"] = features.SegmentationMask(torch.randint(0, 2, size=(22, *size), dtype=torch.long)) + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) yield (pil_image, target) tensor_image = torch.randint(0, 256, size=(3, *size), dtype=torch.uint8) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(22,), dtype=torch.float), - "labels": features.Label(torch.randint(0, 81, size=(22,))), + "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "labels": make_label(size=(num_objects,)), } if with_mask: - target["masks"] = features.SegmentationMask(torch.randint(0, 2, size=(22, *size), dtype=torch.long)) + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) yield (tensor_image, target) feature_image = features.Image(torch.randint(0, 256, size=(3, *size), dtype=torch.uint8)) target = { - "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(22,), dtype=torch.float), 
- "labels": features.Label(torch.randint(0, 81, size=(22,))), + "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "labels": make_label(size=(num_objects,)), } if with_mask: - target["masks"] = features.SegmentationMask(torch.randint(0, 2, size=(22, *size), dtype=torch.long)) + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) yield (feature_image, target) - def _test_transform(self, t_ref, t, data_kwargs={}): + @pytest.mark.parametrize( + "t_ref, t, data_kwargs", + [ + (det_transforms.RandomHorizontalFlip(p=1.0), prototype_transforms.RandomHorizontalFlip(p=1.0), {}), + (det_transforms.RandomIoUCrop(), prototype_transforms.RandomIoUCrop(), {"with_mask": False}), + (det_transforms.RandomZoomOut(), prototype_transforms.RandomZoomOut(), {"with_mask": False}), + (det_transforms.ScaleJitter((1024, 1024)), prototype_transforms.ScaleJitter((1024, 1024)), {}), + ( + det_transforms.FixedSizeCrop((1024, 1024), fill=0), + prototype_transforms.FixedSizeCrop((1024, 1024), fill=0), + {}, + ), + ( + det_transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 + ), + prototype_transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 + ), + {}, + ), + ], + ) + def test_transform(self, t_ref, t, data_kwargs): for dp in self.make_datapoints(**data_kwargs): # We should use prototype transform first as reference transform performs inplace target update @@ -887,634 +927,3 @@ def _test_transform(self, t_ref, t, data_kwargs={}): expected_output = t_ref(*dp) assert_equal(expected_output, output) - - def test_randomhorizontalflip(self): - t_ref = RandomHorizontalFlip(p=1.0) - t = prototype_transforms.RandomHorizontalFlip(p=1.0) - self._test_transform(t_ref, t) - - def test_randomioucrop(self): - t_ref = RandomIoUCrop() - t = prototype_transforms.RandomIoUCrop() - self._test_transform(t_ref, t, {"with_mask": False}) - - def test_randomzoomout(self): - t_ref = RandomZoomOut() - t = prototype_transforms.RandomZoomOut() - self._test_transform(t_ref, t, {"with_mask": False}) - - def test_scalejitter(self): - t_ref = ScaleJitter((1024, 1024)) - t = prototype_transforms.ScaleJitter((1024, 1024)) - self._test_transform(t_ref, t) - - def test_fixedsizecrop(self): - t_ref = FixedSizeCrop(size=(1024, 1024), fill=0) - t = prototype_transforms.FixedSizeCrop(size=(1024, 1024), fill=0) - self._test_transform(t_ref, t) - - def test_randomshortestsize(self): - t_ref = RandomShortestSize(min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333) - t = prototype_transforms.RandomShortestSize( - min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 - ) - self._test_transform(t_ref, t) - - -# ----- -# Dumped reference detection transforms here for consistency checks -# torchvision/references/detection/transforms.py -# ----- -from typing import Dict, List, Optional, Tuple, Union - -import torch -import torchvision -from torch import nn, Tensor -from torchvision import ops -from torchvision.transforms import functional as F, InterpolationMode, transforms as T - - -def _flip_coco_person_keypoints(kps, width): - flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] - flipped_data = kps[:, flip_inds] - flipped_data[..., 0] = width - flipped_data[..., 0] - # Maintain COCO convention that if visibility == 0, then x, y = 0 - inds = flipped_data[..., 2] == 0 - 
flipped_data[inds] = 0 - return flipped_data - - -class Compose: - def __init__(self, transforms): - self.transforms = transforms - - def __call__(self, image, target): - for t in self.transforms: - image, target = t(image, target) - return image, target - - -class RandomHorizontalFlip(T.RandomHorizontalFlip): - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if torch.rand(1) < self.p: - image = F.hflip(image) - if target is not None: - _, _, width = F.get_dimensions(image) - target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]] - if "masks" in target: - target["masks"] = target["masks"].flip(-1) - if "keypoints" in target: - keypoints = target["keypoints"] - keypoints = _flip_coco_person_keypoints(keypoints, width) - target["keypoints"] = keypoints - return image, target - - -class PILToTensor(nn.Module): - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - image = F.pil_to_tensor(image) - return image, target - - -class ConvertImageDtype(nn.Module): - def __init__(self, dtype: torch.dtype) -> None: - super().__init__() - self.dtype = dtype - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - image = F.convert_image_dtype(image, self.dtype) - return image, target - - -class RandomIoUCrop(nn.Module): - def __init__( - self, - min_scale: float = 0.3, - max_scale: float = 1.0, - min_aspect_ratio: float = 0.5, - max_aspect_ratio: float = 2.0, - sampler_options: Optional[List[float]] = None, - trials: int = 40, - ): - super().__init__() - # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 - self.min_scale = min_scale - self.max_scale = max_scale - self.min_aspect_ratio = min_aspect_ratio - self.max_aspect_ratio = max_aspect_ratio - if sampler_options is None: - sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0] - self.options = sampler_options - self.trials = trials - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if target is None: - raise ValueError("The targets can't be None for this transform.") - - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. 
Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - _, orig_h, orig_w = F.get_dimensions(image) - - while True: - # sample an option - idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) - min_jaccard_overlap = self.options[idx] - if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option - return image, target - - for _ in range(self.trials): - # check the aspect ratio limitations - r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) - new_w = int(orig_w * r[0]) - new_h = int(orig_h * r[1]) - aspect_ratio = new_w / new_h - if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): - continue - - # check for 0 area crops - r = torch.rand(2) - left = int((orig_w - new_w) * r[0]) - top = int((orig_h - new_h) * r[1]) - right = left + new_w - bottom = top + new_h - if left == right or top == bottom: - continue - - # check for any valid boxes with centers within the crop area - cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) - cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) - is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom) - if not is_within_crop_area.any(): - continue - - # check at least 1 box with jaccard limitations - boxes = target["boxes"][is_within_crop_area] - ious = torchvision.ops.boxes.box_iou( - boxes, torch.tensor([[left, top, right, bottom]], dtype=boxes.dtype, device=boxes.device) - ) - if ious.max() < min_jaccard_overlap: - continue - - # keep only valid boxes and perform cropping - target["boxes"] = boxes - target["labels"] = target["labels"][is_within_crop_area] - target["boxes"][:, 0::2] -= left - target["boxes"][:, 1::2] -= top - target["boxes"][:, 0::2].clamp_(min=0, max=new_w) - target["boxes"][:, 1::2].clamp_(min=0, max=new_h) - image = F.crop(image, top, left, new_h, new_w) - - return image, target - - -class RandomZoomOut(nn.Module): - def __init__( - self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1.0, 4.0), p: float = 0.5 - ): - super().__init__() - if fill is None: - fill = [0.0, 0.0, 0.0] - self.fill = fill - self.side_range = side_range - if side_range[0] < 1.0 or side_range[0] > side_range[1]: - raise ValueError(f"Invalid canvas side range provided {side_range}.") - self.p = p - - @torch.jit.unused - def _get_fill_value(self, is_pil): - # type: (bool) -> int - # We fake the type to make it work on JIT - return tuple(int(x) for x in self.fill) if is_pil else 0 - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. 
Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - if torch.rand(1) >= self.p: - return image, target - - _, orig_h, orig_w = F.get_dimensions(image) - - r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) - canvas_width = int(orig_w * r) - canvas_height = int(orig_h * r) - - r = torch.rand(2) - left = int((canvas_width - orig_w) * r[0]) - top = int((canvas_height - orig_h) * r[1]) - right = canvas_width - (left + orig_w) - bottom = canvas_height - (top + orig_h) - - if torch.jit.is_scripting(): - fill = 0 - else: - fill = self._get_fill_value(F._is_pil_image(image)) - - image = F.pad(image, [left, top, right, bottom], fill=fill) - if isinstance(image, torch.Tensor): - # PyTorch's pad supports only integers on fill. So we need to overwrite the colour - v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1) - image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h) :, :] = image[ - ..., :, (left + orig_w) : - ] = v - - if target is not None: - target["boxes"][:, 0::2] += left - target["boxes"][:, 1::2] += top - - return image, target - - -class RandomPhotometricDistort(nn.Module): - def __init__( - self, - contrast: Tuple[float, float] = (0.5, 1.5), - saturation: Tuple[float, float] = (0.5, 1.5), - hue: Tuple[float, float] = (-0.05, 0.05), - brightness: Tuple[float, float] = (0.875, 1.125), - p: float = 0.5, - ): - super().__init__() - self._brightness = T.ColorJitter(brightness=brightness) - self._contrast = T.ColorJitter(contrast=contrast) - self._hue = T.ColorJitter(hue=hue) - self._saturation = T.ColorJitter(saturation=saturation) - self.p = p - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - r = torch.rand(7) - - if r[0] < self.p: - image = self._brightness(image) - - contrast_before = r[1] < 0.5 - if contrast_before: - if r[2] < self.p: - image = self._contrast(image) - - if r[3] < self.p: - image = self._saturation(image) - - if r[4] < self.p: - image = self._hue(image) - - if not contrast_before: - if r[5] < self.p: - image = self._contrast(image) - - if r[6] < self.p: - channels, _, _ = F.get_dimensions(image) - permutation = torch.randperm(channels) - - is_pil = F._is_pil_image(image) - if is_pil: - image = F.pil_to_tensor(image) - image = F.convert_image_dtype(image) - image = image[..., permutation, :, :] - if is_pil: - image = F.to_pil_image(image) - - return image, target - - -class ScaleJitter(nn.Module): - """Randomly resizes the image and its bounding boxes within the specified scale range. - The class implements the Scale Jitter augmentation as described in the paper - `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_. - - Args: - target_size (tuple of ints): The target size for the transform provided in (height, weight) format. - scale_range (tuple of ints): scaling factor interval, e.g (a, b), then scale is randomly sampled from the - range a <= scale <= b. - interpolation (InterpolationMode): Desired interpolation enum defined by - :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. 
- """ - - def __init__( - self, - target_size: Tuple[int, int], - scale_range: Tuple[float, float] = (0.1, 2.0), - interpolation: InterpolationMode = InterpolationMode.BILINEAR, - ): - super().__init__() - self.target_size = target_size - self.scale_range = scale_range - self.interpolation = interpolation - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - if isinstance(image, torch.Tensor): - if image.ndimension() not in {2, 3}: - raise ValueError(f"image should be 2/3 dimensional. Got {image.ndimension()} dimensions.") - elif image.ndimension() == 2: - image = image.unsqueeze(0) - - _, orig_height, orig_width = F.get_dimensions(image) - - scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) - r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale - new_width = int(orig_width * r) - new_height = int(orig_height * r) - - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) - - if target is not None: - target["boxes"][:, 0::2] *= new_width / orig_width - target["boxes"][:, 1::2] *= new_height / orig_height - if "masks" in target: - target["masks"] = F.resize( - target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST - ) - - return image, target - - -class FixedSizeCrop(nn.Module): - def __init__(self, size, fill=0, padding_mode="constant"): - super().__init__() - size = tuple(T._setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")) - self.crop_height = size[0] - self.crop_width = size[1] - self.fill = fill # TODO: Fill is currently respected only on PIL. Apply tensor patch. - self.padding_mode = padding_mode - - def _pad(self, img, target, padding): - # Taken from the functional_tensor.py pad - if isinstance(padding, int): - pad_left = pad_right = pad_top = pad_bottom = padding - elif len(padding) == 1: - pad_left = pad_right = pad_top = pad_bottom = padding[0] - elif len(padding) == 2: - pad_left = pad_right = padding[0] - pad_top = pad_bottom = padding[1] - else: - pad_left = padding[0] - pad_top = padding[1] - pad_right = padding[2] - pad_bottom = padding[3] - - padding = [pad_left, pad_top, pad_right, pad_bottom] - img = F.pad(img, padding, self.fill, self.padding_mode) - if target is not None: - target["boxes"][:, 0::2] += pad_left - target["boxes"][:, 1::2] += pad_top - if "masks" in target: - target["masks"] = F.pad(target["masks"], padding, 0, "constant") - - return img, target - - def _crop(self, img, target, top, left, height, width): - img = F.crop(img, top, left, height, width) - if target is not None: - boxes = target["boxes"] - boxes[:, 0::2] -= left - boxes[:, 1::2] -= top - boxes[:, 0::2].clamp_(min=0, max=width) - boxes[:, 1::2].clamp_(min=0, max=height) - - is_valid = (boxes[:, 0] < boxes[:, 2]) & (boxes[:, 1] < boxes[:, 3]) - - target["boxes"] = boxes[is_valid] - target["labels"] = target["labels"][is_valid] - if "masks" in target: - target["masks"] = F.crop(target["masks"][is_valid], top, left, height, width) - - return img, target - - def forward(self, img, target=None): - _, height, width = F.get_dimensions(img) - new_height = min(height, self.crop_height) - new_width = min(width, self.crop_width) - - if new_height != height or new_width != width: - offset_height = max(height - self.crop_height, 0) - offset_width = max(width - self.crop_width, 0) - - r = torch.rand(1) - top = int(offset_height * r) - left = int(offset_width * r) - - img, 
target = self._crop(img, target, top, left, new_height, new_width) - - pad_bottom = max(self.crop_height - new_height, 0) - pad_right = max(self.crop_width - new_width, 0) - if pad_bottom != 0 or pad_right != 0: - img, target = self._pad(img, target, [0, 0, pad_right, pad_bottom]) - - return img, target - - -class RandomShortestSize(nn.Module): - def __init__( - self, - min_size: Union[List[int], Tuple[int], int], - max_size: int, - interpolation: InterpolationMode = InterpolationMode.BILINEAR, - ): - super().__init__() - self.min_size = [min_size] if isinstance(min_size, int) else list(min_size) - self.max_size = max_size - self.interpolation = interpolation - - def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: - _, orig_height, orig_width = F.get_dimensions(image) - - min_size = self.min_size[torch.randint(len(self.min_size), (1,)).item()] - r = min(min_size / min(orig_height, orig_width), self.max_size / max(orig_height, orig_width)) - - new_width = int(orig_width * r) - new_height = int(orig_height * r) - - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) - - if target is not None: - target["boxes"][:, 0::2] *= new_width / orig_width - target["boxes"][:, 1::2] *= new_height / orig_height - if "masks" in target: - target["masks"] = F.resize( - target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST - ) - - return image, target - - -def _copy_paste( - image: torch.Tensor, - target: Dict[str, Tensor], - paste_image: torch.Tensor, - paste_target: Dict[str, Tensor], - blending: bool = True, - resize_interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, -) -> Tuple[torch.Tensor, Dict[str, Tensor]]: - - # Random paste targets selection: - num_masks = len(paste_target["masks"]) - - if num_masks < 1: - # Such degerante case with num_masks=0 can happen with LSJ - # Let's just return (image, target) - return image, target - - # We have to please torch script by explicitly specifying dtype as torch.long - random_selection = torch.randint(0, num_masks, (num_masks,), device=paste_image.device) - random_selection = torch.unique(random_selection).to(torch.long) - - paste_masks = paste_target["masks"][random_selection] - paste_boxes = paste_target["boxes"][random_selection] - paste_labels = paste_target["labels"][random_selection] - - masks = target["masks"] - - # We resize source and paste data if they have different sizes - # This is something we introduced here as originally the algorithm works - # on equal-sized data (for example, coming from LSJ data augmentations) - size1 = image.shape[-2:] - size2 = paste_image.shape[-2:] - if size1 != size2: - paste_image = F.resize(paste_image, size1, interpolation=resize_interpolation) - paste_masks = F.resize(paste_masks, size1, interpolation=F.InterpolationMode.NEAREST) - # resize bboxes: - ratios = torch.tensor((size1[1] / size2[1], size1[0] / size2[0]), device=paste_boxes.device) - paste_boxes = paste_boxes.view(-1, 2, 2).mul(ratios).view(paste_boxes.shape) - - paste_alpha_mask = paste_masks.sum(dim=0) > 0 - - if blending: - paste_alpha_mask = F.gaussian_blur( - paste_alpha_mask.unsqueeze(0), - kernel_size=(5, 5), - sigma=[ - 2.0, - ], - ) - - # Copy-paste images: - image = (image * (~paste_alpha_mask)) + (paste_image * paste_alpha_mask) - - # Copy-paste masks: - masks = masks * (~paste_alpha_mask) - non_all_zero_masks = masks.sum((-1, -2)) > 0 - masks = masks[non_all_zero_masks] - - # Do a shallow copy 
of the target dict - out_target = {k: v for k, v in target.items()} - - out_target["masks"] = torch.cat([masks, paste_masks]) - - # Copy-paste boxes and labels - boxes = ops.masks_to_boxes(masks) - out_target["boxes"] = torch.cat([boxes, paste_boxes]) - - labels = target["labels"][non_all_zero_masks] - out_target["labels"] = torch.cat([labels, paste_labels]) - - # Update additional optional keys: area and iscrowd if exist - if "area" in target: - out_target["area"] = out_target["masks"].sum((-1, -2)).to(torch.float32) - - if "iscrowd" in target and "iscrowd" in paste_target: - # target['iscrowd'] size can be differ from mask size (non_all_zero_masks) - # For example, if previous transforms geometrically modifies masks/boxes/labels but - # does not update "iscrowd" - if len(target["iscrowd"]) == len(non_all_zero_masks): - iscrowd = target["iscrowd"][non_all_zero_masks] - paste_iscrowd = paste_target["iscrowd"][random_selection] - out_target["iscrowd"] = torch.cat([iscrowd, paste_iscrowd]) - - # Check for degenerated boxes and remove them - boxes = out_target["boxes"] - degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] - if degenerate_boxes.any(): - valid_targets = ~degenerate_boxes.any(dim=1) - - out_target["boxes"] = boxes[valid_targets] - out_target["masks"] = out_target["masks"][valid_targets] - out_target["labels"] = out_target["labels"][valid_targets] - - if "area" in out_target: - out_target["area"] = out_target["area"][valid_targets] - if "iscrowd" in out_target and len(out_target["iscrowd"]) == len(valid_targets): - out_target["iscrowd"] = out_target["iscrowd"][valid_targets] - - return image, out_target - - -class SimpleCopyPaste(torch.nn.Module): - def __init__(self, blending=True, resize_interpolation=F.InterpolationMode.BILINEAR): - super().__init__() - self.resize_interpolation = resize_interpolation - self.blending = blending - - def forward( - self, images: List[torch.Tensor], targets: List[Dict[str, Tensor]] - ) -> Tuple[List[torch.Tensor], List[Dict[str, Tensor]]]: - torch._assert( - isinstance(images, (list, tuple)) and all([isinstance(v, torch.Tensor) for v in images]), - "images should be a list of tensors", - ) - torch._assert( - isinstance(targets, (list, tuple)) and len(images) == len(targets), - "targets should be a list of the same size as images", - ) - for target in targets: - # Can not check for instance type dict with inside torch.jit.script - # torch._assert(isinstance(target, dict), "targets item should be a dict") - for k in ["masks", "boxes", "labels"]: - torch._assert(k in target, f"Key {k} should be present in targets") - torch._assert(isinstance(target[k], torch.Tensor), f"Value for the key {k} should be a tensor") - - # images = [t1, t2, ..., tN] - # Let's define paste_images as shifted list of input images - # paste_images = [t2, t3, ..., tN, t1] - # FYI: in TF they mix data on the dataset level - images_rolled = images[-1:] + images[:-1] - targets_rolled = targets[-1:] + targets[:-1] - - output_images: List[torch.Tensor] = [] - output_targets: List[Dict[str, Tensor]] = [] - - for image, target, paste_image, paste_target in zip(images, targets, images_rolled, targets_rolled): - output_image, output_data = _copy_paste( - image, - target, - paste_image, - paste_target, - blending=self.blending, - resize_interpolation=self.resize_interpolation, - ) - output_images.append(output_image) - output_targets.append(output_data) - - return output_images, output_targets - - def __repr__(self) -> str: - s = f"{self.__class__.__name__}(blending={self.blending}, 
resize_interpolation={self.resize_interpolation})" - return s From 12eb9bebc93e02fe48050e6464474edb52ce440a Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Thu, 15 Sep 2022 12:14:49 +0200 Subject: [PATCH 3/3] More updates --- test/test_prototype_transforms_consistency.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py index 9a64f16f746..fac2eb0bd94 100644 --- a/test/test_prototype_transforms_consistency.py +++ b/test/test_prototype_transforms_consistency.py @@ -20,7 +20,7 @@ from torchvision import transforms as legacy_transforms from torchvision._utils import sequence_to_str from torchvision.prototype import features, transforms as prototype_transforms -from torchvision.prototype.transforms.functional import to_image_pil, to_pil_image +from torchvision.prototype.transforms.functional import to_image_pil DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=[features.ColorSpace.RGB], extra_dims=[(4,)]) @@ -863,10 +863,10 @@ def make_datapoints(self, with_mask=True): size = (600, 800) num_objects = 22 - pil_image = to_pil_image(make_image(size=size, color_space=features.ColorSpace.RGB)) + pil_image = to_image_pil(make_image(size=size, color_space=features.ColorSpace.RGB)) target = { "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), - "labels": make_label(size=(num_objects,)), + "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) @@ -876,7 +876,7 @@ def make_datapoints(self, with_mask=True): tensor_image = torch.randint(0, 256, size=(3, *size), dtype=torch.uint8) target = { "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), - "labels": make_label(size=(num_objects,)), + "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) @@ -886,7 +886,7 @@ def make_datapoints(self, with_mask=True): feature_image = features.Image(torch.randint(0, 256, size=(3, *size), dtype=torch.uint8)) target = { "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), - "labels": make_label(size=(num_objects,)), + "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long)