diff --git a/test/test_ious.py b/test/test_ious.py
new file mode 100644
index 00000000000..4e87d64b477
--- /dev/null
+++ b/test/test_ious.py
@@ -0,0 +1,147 @@
+from typing import List, Callable
+
+import pytest
+import torch
+import torch.fx
+from torch import Tensor
+from torchvision import ops
+
+
+class IouTestBase:
+    @staticmethod
+    def _run_test(target_fn: Callable, test_input: List, dtypes: List[torch.dtype], tolerance: float, expected: List):
+        def assert_close(box: Tensor, expected: Tensor, tolerance):
+            out = target_fn(box, box)
+            torch.testing.assert_close(out, expected, rtol=0.0, check_dtype=False, atol=tolerance)
+
+        for dtype in dtypes:
+            actual_box = torch.tensor(test_input, dtype=dtype)
+            expected_box = torch.tensor(expected)
+            assert_close(actual_box, expected_box, tolerance)
+
+    @staticmethod
+    def _run_jit_test(target_fn: Callable, test_input: List):
+        box_tensor = torch.tensor(test_input, dtype=torch.float)
+        expected = target_fn(box_tensor, box_tensor)
+        scripted_fn = torch.jit.script(target_fn)
+        scripted_out = scripted_fn(box_tensor, box_tensor)
+        torch.testing.assert_close(scripted_out, expected, rtol=0.0, atol=1e-3)
+
+
+def _generate_int_input():
+    return [[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]]
+
+
+def _generate_float_input():
+    return [
+        [285.3538, 185.5758, 1193.5110, 851.4551],
+        [285.1472, 188.7374, 1192.4984, 851.0669],
+        [279.2440, 197.9812, 1189.4746, 849.2019],
+    ]
+
+
+class TestBoxIou(IouTestBase):
+    def _generate_int_expected():
+        return [[1.0, 0.25, 0.0], [0.25, 1.0, 0.0], [0.0, 0.0, 1.0]]
+
+    def _generate_float_input():
+        return [
+            [285.3538, 185.5758, 1193.5110, 851.4551],
+            [285.1472, 188.7374, 1192.4984, 851.0669],
+            [279.2440, 197.9812, 1189.4746, 849.2019],
+        ]
+
+    def _generate_float_expected():
+        return [[1.0, 0.9933, 0.9673], [0.9933, 1.0, 0.9737], [0.9673, 0.9737, 1.0]]
+
+    @pytest.mark.parametrize(
+        "test_input, dtypes, tolerance, expected",
+        [
+            pytest.param(
+                _generate_int_input(), [torch.int16, torch.int32, torch.int64], 1e-4, _generate_int_expected()
+            ),
+            pytest.param(_generate_float_input(), [torch.float16], 0.002, _generate_float_expected()),
+            pytest.param(_generate_float_input(), [torch.float32, torch.float64], 1e-3, _generate_float_expected()),
+        ],
+    )
+    def test_iou(self, test_input, dtypes, tolerance, expected):
+        self._run_test(ops.box_iou, test_input, dtypes, tolerance, expected)
+
+    def test_iou_jit(self):
+        self._run_jit_test(ops.box_iou, [[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]])
+
+
+class TestGenBoxIou(IouTestBase):
+    def _generate_int_expected():
+        return [[1.0, 0.25, -0.7778], [0.25, 1.0, -0.8611], [-0.7778, -0.8611, 1.0]]
+
+    def _generate_float_expected():
+        return [[1.0, 0.9933, 0.9673], [0.9933, 1.0, 0.9737], [0.9673, 0.9737, 1.0]]
+
+    @pytest.mark.parametrize(
+        "test_input, dtypes, tolerance, expected",
+        [
+            pytest.param(
+                _generate_int_input(), [torch.int16, torch.int32, torch.int64], 1e-4, _generate_int_expected()
+            ),
+            pytest.param(_generate_float_input(), [torch.float16], 0.002, _generate_float_expected()),
+            pytest.param(_generate_float_input(), [torch.float32, torch.float64], 1e-3, _generate_float_expected()),
+        ],
+    )
+    def test_iou(self, test_input, dtypes, tolerance, expected):
+        self._run_test(ops.generalized_box_iou, test_input, dtypes, tolerance, expected)
+
+    def test_iou_jit(self):
+        self._run_jit_test(ops.generalized_box_iou, [[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]])
+
+
+class TestDistanceBoxIoU(IouTestBase):
+    def _generate_int_expected():
+        return [[1.0, 0.25, 0.0], [0.25, 1.0, 0.0], [0.0, 0.0, 1.0]]
+
+    def _generate_float_expected():
+        return [[1.0, 0.9933, 0.9673], [0.9933, 1.0, 0.9737], [0.9673, 0.9737, 1.0]]
+
+    @pytest.mark.parametrize(
+        "test_input, dtypes, tolerance, expected",
+        [
+            pytest.param(
+                _generate_int_input(), [torch.int16, torch.int32, torch.int64], 1e-4, _generate_int_expected()
+            ),
+            pytest.param(_generate_float_input(), [torch.float16], 0.002, _generate_float_expected()),
+            pytest.param(_generate_float_input(), [torch.float32, torch.float64], 1e-3, _generate_float_expected()),
+        ],
+    )
+    def test_iou(self, test_input, dtypes, tolerance, expected):
+        self._run_test(ops.distance_box_iou, test_input, dtypes, tolerance, expected)
+
+    def test_iou_jit(self):
+        self._run_jit_test(ops.distance_box_iou, [[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]])
+
+
+class TestCompleteBoxIou(IouTestBase):
+    def _generate_int_expected():
+        return [[1.0, 0.25, 0.0], [0.25, 1.0, 0.0], [0.0, 0.0, 1.0]]
+
+    def _generate_float_expected():
+        return [[1.0, 0.9933, 0.9673], [0.9933, 1.0, 0.9737], [0.9673, 0.9737, 1.0]]
+
+    @pytest.mark.parametrize(
+        "test_input, dtypes, tolerance, expected",
+        [
+            pytest.param(
+                _generate_int_input(), [torch.int16, torch.int32, torch.int64], 1e-4, _generate_int_expected()
+            ),
+            pytest.param(_generate_float_input(), [torch.float16], 0.002, _generate_float_expected()),
+            pytest.param(_generate_float_input(), [torch.float32, torch.float64], 1e-3, _generate_float_expected()),
+        ],
+    )
+    def test_iou(self, test_input, dtypes, tolerance, expected):
+        self._run_test(ops.complete_box_iou, test_input, dtypes, tolerance, expected)
+
+    def test_iou_jit(self):
+        self._run_jit_test(ops.complete_box_iou, [[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]])
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/test/test_losses.py b/test/test_losses.py
new file mode 100644
index 00000000000..c2d7f9452ef
--- /dev/null
+++ b/test/test_losses.py
@@ -0,0 +1,229 @@
+import pytest
+import torch
+import torch.nn.functional as F
+from common_utils import cpu_and_gpu
+from torchvision import ops
+
+
+def get_boxes(dtype, device):
+    box1 = torch.tensor([-1, -1, 1, 1], dtype=dtype, device=device)
+    box2 = torch.tensor([0, 0, 1, 1], dtype=dtype, device=device)
+    box3 = torch.tensor([0, 1, 1, 2], dtype=dtype, device=device)
+    box4 = torch.tensor([1, 1, 2, 2], dtype=dtype, device=device)
+
+    box1s = torch.stack([box2, box2], dim=0)
+    box2s = torch.stack([box3, box4], dim=0)
+
+    return box1, box2, box3, box4, box1s, box2s
+
+
+def assert_iou_loss(iou_fn, box1, box2, expected_loss, dtype, device, reduction="none"):
+    tol = 1e-3 if dtype is torch.half else 1e-5
+    computed_loss = iou_fn(box1, box2, reduction=reduction)
+    expected_loss = torch.tensor(expected_loss, device=device)
+    torch.testing.assert_close(computed_loss, expected_loss, rtol=tol, atol=tol)
+
+
+def assert_empty_loss(iou_fn, dtype, device):
+    box1 = torch.randn([0, 4], dtype=dtype, device=device).requires_grad_()
+    box2 = torch.randn([0, 4], dtype=dtype, device=device).requires_grad_()
+    loss = iou_fn(box1, box2, reduction="mean")
+    loss.backward()
+    tol = 1e-3 if dtype is torch.half else 1e-5
+    torch.testing.assert_close(loss, torch.tensor(0.0, device=device), rtol=tol, atol=tol)
+    assert box1.grad is not None, "box1.grad should not be None after backward is called"
+    assert box2.grad is not None, "box2.grad should not be None after backward is called"
+    loss = iou_fn(box1, box2, reduction="none")
+    assert loss.numel() == 0, f"{str(iou_fn)} for two empty box should be empty"
+
+
+class TestGeneralizedBoxIouLoss:
+    # We refer to original test: https://github.com/facebookresearch/fvcore/blob/main/tests/test_giou_loss.py
+    @pytest.mark.parametrize("device", cpu_and_gpu())
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
+    def test_giou_loss(self, dtype, device):
+
+        box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device)
+
+        # Identical boxes should have loss of 0
+        assert_iou_loss(ops.generalized_box_iou_loss, box1, box1, 0.0, dtype=dtype, device=device)
+
+        # quarter size box inside other box = IoU of 0.25
+        assert_iou_loss(ops.generalized_box_iou_loss, box1, box2, 0.75, dtype=dtype, device=device)
+
+        # Two side by side boxes, area=union
+        # IoU=0 and GIoU=0 (loss 1.0)
+        assert_iou_loss(ops.generalized_box_iou_loss, box2, box3, 1.0, dtype=dtype, device=device)
+
+        # Two diagonally adjacent boxes, area=2*union
+        # IoU=0 and GIoU=-0.5 (loss 1.5)
+        assert_iou_loss(ops.generalized_box_iou_loss, box2, box4, 1.5, dtype=dtype, device=device)
+
+        # Test batched loss and reductions
+        assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 2.5, dtype=dtype, device=device, reduction="sum")
+        assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 1.25, dtype=dtype, device=device, reduction="mean")
+
+    @pytest.mark.parametrize("device", cpu_and_gpu())
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
+    def test_empty_inputs(self, dtype, device):
+        assert_empty_loss(ops.generalized_box_iou_loss, dtype, device)
+
+
+class TestCIOULoss:
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
+    @pytest.mark.parametrize("device", cpu_and_gpu())
+    def test_ciou_loss(self, dtype, device):
+        box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device)
+
+        assert_iou_loss(ops.complete_box_iou_loss, box1, box1, 0.0, dtype=dtype, device=device)
+        assert_iou_loss(ops.complete_box_iou_loss, box1, box2, 0.8125, dtype=dtype, device=device)
+        assert_iou_loss(ops.complete_box_iou_loss, box1, box3, 1.1923, dtype=dtype, device=device)
+        assert_iou_loss(ops.complete_box_iou_loss, box1, box4, 1.2500, dtype=dtype, device=device)
+        assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 1.2250, dtype=dtype, device=device, reduction="mean")
+        assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 2.4500, dtype=dtype, device=device, reduction="sum")
+
+    @pytest.mark.parametrize("device", cpu_and_gpu())
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
+    def test_empty_inputs(self, dtype, device):
+        assert_empty_loss(ops.complete_box_iou_loss, dtype, device)
+
+
+class TestDIouLoss:
+    @pytest.mark.parametrize("device", cpu_and_gpu())
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
+    def test_distance_iou_loss(self, dtype, device):
+        box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device)
+
+        assert_iou_loss(ops.distance_box_iou_loss, box1, box1, 0.0, dtype=dtype, device=device)
+        assert_iou_loss(ops.distance_box_iou_loss, box1, box2, 0.8125, dtype=dtype, device=device)
+        assert_iou_loss(ops.distance_box_iou_loss, box1, box3, 1.1923, dtype=dtype, device=device)
+        assert_iou_loss(ops.distance_box_iou_loss, box1, box4, 1.2500, dtype=dtype, device=device)
+        assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 1.2250, dtype=dtype, device=device, reduction="mean")
+        assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 2.4500, dtype=dtype, device=device, reduction="sum")
+
+    @pytest.mark.parametrize("device", cpu_and_gpu())
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
+    def test_empty_distance_iou_inputs(self, dtype, device):
+        assert_empty_loss(ops.distance_box_iou_loss, dtype, device)
+
+
+class TestFocalLoss:
+    def _generate_diverse_input_target_pair(self, shape=(5, 2), **kwargs):
+        def logit(p):
+            return torch.log(p / (1 - p))
+
+        def generate_tensor_with_range_type(shape, range_type, **kwargs):
+            if range_type != "random_binary":
+                low, high = {
+                    "small": (0.0, 0.2),
+                    "big": (0.8, 1.0),
+                    "zeros": (0.0, 0.0),
+                    "ones": (1.0, 1.0),
+                    "random": (0.0, 1.0),
+                }[range_type]
+                return torch.testing.make_tensor(shape, low=low, high=high, **kwargs)
+            else:
+                return torch.randint(0, 2, shape, **kwargs)
+
+        # This function will return inputs and targets with shape: (shape[0]*9, shape[1])
+        inputs = []
+        targets = []
+        for input_range_type, target_range_type in [
+            ("small", "zeros"),
+            ("small", "ones"),
+            ("small", "random_binary"),
+            ("big", "zeros"),
+            ("big", "ones"),
+            ("big", "random_binary"),
+            ("random", "zeros"),
+            ("random", "ones"),
+            ("random", "random_binary"),
+        ]:
+            inputs.append(logit(generate_tensor_with_range_type(shape, input_range_type, **kwargs)))
+            targets.append(generate_tensor_with_range_type(shape, target_range_type, **kwargs))
+
+        return torch.cat(inputs), torch.cat(targets)
+
+    @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0])
+    @pytest.mark.parametrize("gamma", [0, 2])
+    @pytest.mark.parametrize("device", cpu_and_gpu())
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
+    @pytest.mark.parametrize("seed", [0, 1])
+    def test_correct_ratio(self, alpha, gamma, device, dtype, seed):
+        if device == "cpu" and dtype is torch.half:
+            pytest.skip("Currently torch.half is not fully supported on cpu")
+        # For testing the ratio with manual calculation, we require the reduction to be "none"
+        reduction = "none"
+        torch.random.manual_seed(seed)
+        inputs, targets = self._generate_diverse_input_target_pair(dtype=dtype, device=device)
+        focal_loss = ops.sigmoid_focal_loss(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction)
+        ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction=reduction)
+
+        assert torch.all(
+            focal_loss <= ce_loss
+        ), "focal loss must be less or equal to cross entropy loss with same input"
+
+        loss_ratio = (focal_loss / ce_loss).squeeze()
+        prob = torch.sigmoid(inputs)
+        p_t = prob * targets + (1 - prob) * (1 - targets)
+        correct_ratio = (1.0 - p_t) ** gamma
+        if alpha >= 0:
+            alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+            correct_ratio = correct_ratio * alpha_t
+
+        tol = 1e-3 if dtype is torch.half else 1e-5
+        torch.testing.assert_close(correct_ratio, loss_ratio, rtol=tol, atol=tol)
+
+    @pytest.mark.parametrize("reduction", ["mean", "sum"])
+    @pytest.mark.parametrize("device", cpu_and_gpu())
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
+    @pytest.mark.parametrize("seed", [2, 3])
+    def test_equal_ce_loss(self, reduction, device, dtype, seed):
+        if device == "cpu" and dtype is torch.half:
+            pytest.skip("Currently torch.half is not fully supported on cpu")
+        # focal loss should be equal ce_loss if alpha=-1 and gamma=0
+        alpha = -1
+        gamma = 0
+        torch.random.manual_seed(seed)
+        inputs, targets = self._generate_diverse_input_target_pair(dtype=dtype, device=device)
+        inputs_fl = inputs.clone().requires_grad_()
+        targets_fl = targets.clone()
+        inputs_ce = inputs.clone().requires_grad_()
+        targets_ce = targets.clone()
+        focal_loss = ops.sigmoid_focal_loss(inputs_fl, targets_fl, gamma=gamma, alpha=alpha, reduction=reduction)
+        ce_loss = F.binary_cross_entropy_with_logits(inputs_ce, targets_ce, reduction=reduction)
+
+        tol = 1e-3 if dtype is torch.half else 1e-5
+        torch.testing.assert_close(focal_loss, ce_loss, rtol=tol, atol=tol)
+
+        focal_loss.backward()
+        ce_loss.backward()
+        torch.testing.assert_close(inputs_fl.grad, inputs_ce.grad, rtol=tol, atol=tol)
+
+    @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0])
+    @pytest.mark.parametrize("gamma", [0, 2])
+    @pytest.mark.parametrize("reduction", ["none", "mean", "sum"])
+    @pytest.mark.parametrize("device", cpu_and_gpu())
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
+    @pytest.mark.parametrize("seed", [4, 5])
+    def test_jit(self, alpha, gamma, reduction, device, dtype, seed):
+        if device == "cpu" and dtype is torch.half:
+            pytest.skip("Currently torch.half is not fully supported on cpu")
+        script_fn = torch.jit.script(ops.sigmoid_focal_loss)
+        torch.random.manual_seed(seed)
+        inputs, targets = self._generate_diverse_input_target_pair(dtype=dtype, device=device)
+        focal_loss = ops.sigmoid_focal_loss(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction)
+        if device == "cpu":
+            scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction)
+        else:
+            with torch.jit.fuser("fuser2"):
+                # Use fuser2 to prevent a bug on fuser: https://github.com/pytorch/pytorch/issues/75476
+                # We may remove this condition once the bug is resolved
+                scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction)
+
+        tol = 1e-3 if dtype is torch.half else 1e-5
+        torch.testing.assert_close(focal_loss, scripted_focal_loss, rtol=tol, atol=tol)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/test/test_ops.py b/test/test_ops.py
index 96cfb630e8d..df5d397713c 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -3,13 +3,12 @@
 from abc import ABC, abstractmethod
 from functools import lru_cache
 from itertools import product
-from typing import Callable, List, Tuple
+from typing import Tuple
 
 import numpy as np
 import pytest
 import torch
 import torch.fx
-import torch.nn.functional as F
 from common_utils import assert_equal, cpu_and_gpu, needs_cuda
 from PIL import Image
 from torch import nn, Tensor
@@ -1021,7 +1020,7 @@ def test_convert_boxes_to_roi_format(self, box_sequence):
             assert_equal(ref_tensor, ops._utils.convert_boxes_to_roi_format(box_sequence))
 
 
-class TestBox:
+class TestBoxConvert:
     def test_bbox_same(self):
         box_tensor = torch.tensor(
             [[0, 0, 100, 100], [0, 0, 0, 0], [10, 15, 30, 35], [23, 35, 93, 95]], dtype=torch.float
@@ -1051,7 +1050,7 @@ def test_bbox_xyxy_xywh(self):
         assert_equal(box_xyxy, box_tensor)
 
     def test_bbox_xyxy_cxcywh(self):
-        # Simple test convert boxes to xywh and back. Make sure they are same.
+        # Simple test convert boxes to cxcywh and back. Make sure they are same.
         # box_tensor is in x1 y1 x2 y2 format.
         box_tensor = torch.tensor(
             [[0, 0, 100, 100], [0, 0, 0, 0], [10, 15, 30, 35], [23, 35, 93, 95]], dtype=torch.float
@@ -1073,7 +1072,6 @@ def test_bbox_xywh_cxcywh(self):
             [[0, 0, 100, 100], [0, 0, 0, 0], [10, 15, 20, 20], [23, 35, 70, 60]], dtype=torch.float
         )
 
-        # This is wrong
         exp_cxcywh = torch.tensor(
             [[50, 50, 100, 100], [0, 0, 0, 0], [20, 25, 20, 20], [58, 65, 70, 60]], dtype=torch.float
         )
@@ -1113,277 +1111,48 @@ def test_bbox_convert_jit(self):
         torch.testing.assert_close(scripted_cxcywh, box_cxcywh, rtol=0.0, atol=TOLERANCE)
 
 
-class BoxTestBase(ABC):
-    @abstractmethod
-    def _target_fn(self) -> Tuple[bool, Callable]:
-        pass
+def area_check(box, expected, tolerance=1e-4):
+    out = ops.box_area(box)
+    torch.testing.assert_close(out, expected, rtol=0.0, check_dtype=False, atol=tolerance)
 
-    def _perform_box_operation(self, box: Tensor, run_as_script: bool = False) -> Tensor:
-        is_binary_fn = self._target_fn()[0]
-        target_fn = self._target_fn()[1]
-        box_operation = torch.jit.script(target_fn) if run_as_script else target_fn
-        return box_operation(box, box) if is_binary_fn else box_operation(box)
 
-    def _run_test(self, test_input: List, dtypes: List[torch.dtype], tolerance: float, expected: List) -> None:
-        def assert_close(box: Tensor, expected: Tensor, tolerance):
-            out = self._perform_box_operation(box)
-            torch.testing.assert_close(out, expected, rtol=0.0, check_dtype=False, atol=tolerance)
-
-        for dtype in dtypes:
-            actual_box = torch.tensor(test_input, dtype=dtype)
-            expected_box = torch.tensor(expected)
-            assert_close(actual_box, expected_box, tolerance)
-
-    def _run_jit_test(self, test_input: List) -> None:
-        box_tensor = torch.tensor(test_input, dtype=torch.float)
-        expected = self._perform_box_operation(box_tensor, True)
-        scripted_area = self._perform_box_operation(box_tensor, True)
-        torch.testing.assert_close(scripted_area, expected, rtol=0.0, atol=1e-3)
+class TestBoxArea:
+    @pytest.mark.parametrize("dtype", [torch.int8, torch.int16, torch.int32, torch.int64])
+    def test_int_boxes(self, dtype):
+        # Check for int boxes
+        box_tensor = torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=dtype)
+        expected = torch.tensor([10000, 0])
+        area_check(box_tensor, expected)
 
-
-class TestBoxArea(BoxTestBase):
-    def _target_fn(self) -> Tuple[bool, Callable]:
-        return (False, ops.box_area)
-
-    def _generate_int_input() -> List[List[int]]:
-        return [[0, 0, 100, 100], [0, 0, 0, 0]]
-
-    def _generate_int_expected() -> List[int]:
-        return [10000, 0]
-
-    def _generate_float_input(index: int) -> List[List[float]]:
-        return [
+    # Check for float32 and float64 boxes
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float64])
+    def test_float_boxes(self, dtype):
+        box_tensor = torch.tensor(
             [
                 [285.3538, 185.5758, 1193.5110, 851.4551],
                 [285.1472, 188.7374, 1192.4984, 851.0669],
                 [279.2440, 197.9812, 1189.4746, 849.2019],
             ],
-            [[285.25, 185.625, 1194.0, 851.5], [285.25, 188.75, 1192.0, 851.0], [279.25, 198.0, 1189.0, 849.0]],
-        ][index]
-
-    def _generate_float_expected(index: int) -> List[float]:
-        return [[604723.0806, 600965.4666, 592761.0085], [605113.875, 600495.1875, 592247.25]][index]
-
-    @pytest.mark.parametrize(
-        "test_input, dtypes, tolerance, expected",
-        [
-            pytest.param(
-                _generate_int_input(),
-                [torch.int8, torch.int16, torch.int32, torch.int64],
-                1e-4,
-                _generate_int_expected(),
-            ),
-            pytest.param(_generate_float_input(0), [torch.float32, torch.float64], 0.05, _generate_float_expected(0)),
-            pytest.param(_generate_float_input(1), [torch.float16], 1e-4, _generate_float_expected(1)),
-        ],
-    )
-    def test_box_area(self, test_input: List, dtypes: List[torch.dtype], tolerance: float, expected: List) -> None:
-        self._run_test(test_input, dtypes, tolerance, expected)
-
-    def test_box_area_jit(self) -> None:
-        self._run_jit_test([[0, 0, 100, 100], [0, 0, 0, 0]])
-
-
-class TestBoxIou(BoxTestBase):
-    def _target_fn(self) -> Tuple[bool, Callable]:
-        return (True, ops.box_iou)
-
-    def _generate_int_input() -> List[List[int]]:
-        return [[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]]
-
-    def _generate_int_expected() -> List[List[float]]:
-        return [[1.0, 0.25, 0.0], [0.25, 1.0, 0.0], [0.0, 0.0, 1.0]]
-
-    def _generate_float_input() -> List[List[float]]:
-        return [
-            [285.3538, 185.5758, 1193.5110, 851.4551],
-            [285.1472, 188.7374, 1192.4984, 851.0669],
-            [279.2440, 197.9812, 1189.4746, 849.2019],
-        ]
-
-    def _generate_float_expected() -> List[List[float]]:
-        return [[1.0, 0.9933, 0.9673], [0.9933, 1.0, 0.9737], [0.9673, 0.9737, 1.0]]
-
-    @pytest.mark.parametrize(
-        "test_input, dtypes, tolerance, expected",
-        [
-            pytest.param(
-                _generate_int_input(), [torch.int16, torch.int32, torch.int64], 1e-4, _generate_int_expected()
-            ),
-            pytest.param(_generate_float_input(), [torch.float16], 0.002, _generate_float_expected()),
-            pytest.param(_generate_float_input(), [torch.float32, torch.float64], 1e-4, _generate_float_expected()),
-        ],
-    )
-    def test_iou(self, test_input: List, dtypes: List[torch.dtype], tolerance: float, expected: List) -> None:
-        self._run_test(test_input, dtypes, tolerance, expected)
-
-    def test_iou_jit(self) -> None:
-        self._run_jit_test([[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]])
-
-
-class TestGenBoxIou(BoxTestBase):
-    def _target_fn(self) -> Tuple[bool, Callable]:
-        return (True, ops.generalized_box_iou)
-
-    def _generate_int_input() -> List[List[int]]:
-        return [[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]]
-
-    def _generate_int_expected() -> List[List[float]]:
-        return [[1.0, 0.25, -0.7778], [0.25, 1.0, -0.8611], [-0.7778, -0.8611, 1.0]]
-
-    def _generate_float_input() -> List[List[float]]:
-        return [
-            [285.3538, 185.5758, 1193.5110, 851.4551],
-            [285.1472, 188.7374, 1192.4984, 851.0669],
-            [279.2440, 197.9812, 1189.4746, 849.2019],
-        ]
-
-    def _generate_float_expected() -> List[List[float]]:
-        return [[1.0, 0.9933, 0.9673], [0.9933, 1.0, 0.9737], [0.9673, 0.9737, 1.0]]
-
-    @pytest.mark.parametrize(
-        "test_input, dtypes, tolerance, expected",
-        [
-            pytest.param(
-                _generate_int_input(), [torch.int16, torch.int32, torch.int64], 1e-4, _generate_int_expected()
-            ),
-            pytest.param(_generate_float_input(), [torch.float16], 0.002, _generate_float_expected()),
-            pytest.param(_generate_float_input(), [torch.float32, torch.float64], 0.001, _generate_float_expected()),
-        ],
-    )
-    def test_gen_iou(self, test_input: List, dtypes: List[torch.dtype], tolerance: float, expected: List) -> None:
-        self._run_test(test_input, dtypes, tolerance, expected)
-
-    def test_giou_jit(self) -> None:
-        self._run_jit_test([[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]])
-
-
-class TestDistanceBoxIoU(BoxTestBase):
-    def _target_fn(self):
-        return (True, ops.distance_box_iou)
-
-    def _generate_int_input():
-        return [[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]]
-
-    def _generate_int_expected():
-        return [[1.0, 0.25, 0.0], [0.25, 1.0, 0.0], [0.0, 0.0, 1.0]]
-
-    def _generate_float_input():
-        return [
-            [285.3538, 185.5758, 1193.5110, 851.4551],
-            [285.1472, 188.7374, 1192.4984, 851.0669],
-            [279.2440, 197.9812, 1189.4746, 849.2019],
-        ]
-
-    def _generate_float_expected():
-        return [[1.0, 0.9933, 0.9673], [0.9933, 1.0, 0.9737], [0.9673, 0.9737, 1.0]]
-
-    @pytest.mark.parametrize(
-        "test_input, dtypes, tolerance, expected",
-        [
-            pytest.param(
-                _generate_int_input(), [torch.int16, torch.int32, torch.int64], 1e-4, _generate_int_expected()
-            ),
-            pytest.param(_generate_float_input(), [torch.float16], 0.002, _generate_float_expected()),
-            pytest.param(_generate_float_input(), [torch.float32, torch.float64], 0.001, _generate_float_expected()),
-        ],
-    )
-    def test_distance_iou(self, test_input, dtypes, tolerance, expected):
-        self._run_test(test_input, dtypes, tolerance, expected)
-
-    def test_distance_iou_jit(self):
-        self._run_jit_test([[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]])
-
-
-@pytest.mark.parametrize("device", cpu_and_gpu())
-@pytest.mark.parametrize("dtype", [torch.float32, torch.half])
-def test_distance_iou_loss(dtype, device):
-    box1 = torch.tensor([-1, -1, 1, 1], dtype=dtype, device=device)
-    box2 = torch.tensor([0, 0, 1, 1], dtype=dtype, device=device)
-    box3 = torch.tensor([0, 1, 1, 2], dtype=dtype, device=device)
-    box4 = torch.tensor([1, 1, 2, 2], dtype=dtype, device=device)
-
-    box1s = torch.stack(
-        [box2, box2],
-        dim=0,
-    )
-    box2s = torch.stack(
-        [box3, box4],
-        dim=0,
-    )
-
-    def assert_distance_iou_loss(box1, box2, expected_output, reduction="none"):
-        output = ops.distance_box_iou_loss(box1, box2, reduction=reduction)
-        # TODO: When passing the dtype, the torch.half fails as usual.
-        expected_output = torch.tensor(expected_output, device=device)
-        tol = 1e-5 if dtype != torch.half else 1e-3
-        torch.testing.assert_close(output, expected_output, rtol=tol, atol=tol)
-
-    assert_distance_iou_loss(box1, box1, 0.0)
-
-    assert_distance_iou_loss(box1, box2, 0.8125)
-
-    assert_distance_iou_loss(box1, box3, 1.1923)
-
-    assert_distance_iou_loss(box1, box4, 1.2500)
-
-    assert_distance_iou_loss(box1s, box2s, 1.2250, reduction="mean")
-    assert_distance_iou_loss(box1s, box2s, 2.4500, reduction="sum")
-
-
-@pytest.mark.parametrize("device", cpu_and_gpu())
-@pytest.mark.parametrize("dtype", [torch.float32, torch.half])
-def test_empty_distance_iou_inputs(dtype, device) -> None:
-    box1 = torch.randn([0, 4], dtype=dtype, device=device).requires_grad_()
-    box2 = torch.randn([0, 4], dtype=dtype, device=device).requires_grad_()
-
-    loss = ops.distance_box_iou_loss(box1, box2, reduction="mean")
-    loss.backward()
-
-    tol = 1e-3 if dtype is torch.half else 1e-5
-    torch.testing.assert_close(loss, torch.tensor(0.0, device=device), rtol=tol, atol=tol)
-    assert box1.grad is not None, "box1.grad should not be None after backward is called"
-    assert box2.grad is not None, "box2.grad should not be None after backward is called"
-
-    loss = ops.distance_box_iou_loss(box1, box2, reduction="none")
-    assert loss.numel() == 0, "diou_loss for two empty box should be empty"
-
-
-class TestCompleteBoxIou(BoxTestBase):
-    def _target_fn(self) -> Tuple[bool, Callable]:
-        return (True, ops.complete_box_iou)
-
-    def _generate_int_input() -> List[List[int]]:
-        return [[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]]
-
-    def _generate_int_expected() -> List[List[float]]:
-        return [[1.0, 0.25, 0.0], [0.25, 1.0, 0.0], [0.0, 0.0, 1.0]]
-
-    def _generate_float_input() -> List[List[float]]:
-        return [
-            [285.3538, 185.5758, 1193.5110, 851.4551],
-            [285.1472, 188.7374, 1192.4984, 851.0669],
-            [279.2440, 197.9812, 1189.4746, 849.2019],
-        ]
-
-    def _generate_float_expected() -> List[List[float]]:
-        return [[1.0, 0.9933, 0.9673], [0.9933, 1.0, 0.9737], [0.9673, 0.9737, 1.0]]
-
-    @pytest.mark.parametrize(
-        "test_input, dtypes, tolerance, expected",
-        [
-            pytest.param(
-                _generate_int_input(), [torch.int16, torch.int32, torch.int64], 1e-4, _generate_int_expected()
-            ),
-            pytest.param(_generate_float_input(), [torch.float32, torch.float64], 0.002, _generate_float_expected()),
-            pytest.param(_generate_float_input(), [torch.float32, torch.float64], 0.001, _generate_float_expected()),
-        ],
-    )
-    def test_complete_iou(self, test_input: List, dtypes: List[torch.dtype], tolerance: float, expected: List) -> None:
-        self._run_test(test_input, dtypes, tolerance, expected)
+            dtype=dtype,
+        )
+        expected = torch.tensor([604723.0806, 600965.4666, 592761.0085], dtype=torch.float64)
+        area_check(box_tensor, expected, tolerance=0.05)
 
-    def test_ciou_jit(self) -> None:
-        self._run_jit_test([[0, 0, 100, 100], [0, 0, 50, 50], [200, 200, 300, 300]])
+    def test_float16_box(self):
+        # Check for float16 box
+        box_tensor = torch.tensor(
+            [[285.25, 185.625, 1194.0, 851.5], [285.25, 188.75, 1192.0, 851.0], [279.25, 198.0, 1189.0, 849.0]],
+            dtype=torch.float16,
+        )
+        expected = torch.tensor([605113.875, 600495.1875, 592247.25])
+        area_check(box_tensor, expected)
+
+    def test_box_area_jit(self):
+        box_tensor = torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=torch.float)
+        expected = ops.box_area(box_tensor)
+        scripted_fn = torch.jit.script(ops.box_area)
+        scripted_area = scripted_fn(box_tensor)
+        torch.testing.assert_close(scripted_area, expected, rtol=0.0, atol=1e-3)
 
 
 class TestMasksToBoxes:
@@ -1579,227 +1348,5 @@ def test_is_leaf_node(self, dim, p, block_size, inplace):
         assert len(graph_node_names[0]) == 1 + op_obj.n_inputs
 
 
-class TestFocalLoss:
-    def _generate_diverse_input_target_pair(self, shape=(5, 2), **kwargs):
-        def logit(p: Tensor) -> Tensor:
-            return torch.log(p / (1 - p))
-
-        def generate_tensor_with_range_type(shape, range_type, **kwargs):
-            if range_type != "random_binary":
-                low, high = {
-                    "small": (0.0, 0.2),
-                    "big": (0.8, 1.0),
-                    "zeros": (0.0, 0.0),
-                    "ones": (1.0, 1.0),
-                    "random": (0.0, 1.0),
-                }[range_type]
-                return torch.testing.make_tensor(shape, low=low, high=high, **kwargs)
-            else:
-                return torch.randint(0, 2, shape, **kwargs)
-
-        # This function will return inputs and targets with shape: (shape[0]*9, shape[1])
-        inputs = []
-        targets = []
-        for input_range_type, target_range_type in [
-            ("small", "zeros"),
-            ("small", "ones"),
-            ("small", "random_binary"),
-            ("big", "zeros"),
-            ("big", "ones"),
-            ("big", "random_binary"),
-            ("random", "zeros"),
-            ("random", "ones"),
-            ("random", "random_binary"),
-        ]:
-            inputs.append(logit(generate_tensor_with_range_type(shape, input_range_type, **kwargs)))
-            targets.append(generate_tensor_with_range_type(shape, target_range_type, **kwargs))
-
-        return torch.cat(inputs), torch.cat(targets)
-
-    @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0])
-    @pytest.mark.parametrize("gamma", [0, 2])
-    @pytest.mark.parametrize("device", cpu_and_gpu())
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
-    @pytest.mark.parametrize("seed", [0, 1])
-    def test_correct_ratio(self, alpha, gamma, device, dtype, seed) -> None:
-        if device == "cpu" and dtype is torch.half:
-            pytest.skip("Currently torch.half is not fully supported on cpu")
-        # For testing the ratio with manual calculation, we require the reduction to be "none"
-        reduction = "none"
-        torch.random.manual_seed(seed)
-        inputs, targets = self._generate_diverse_input_target_pair(dtype=dtype, device=device)
-        focal_loss = ops.sigmoid_focal_loss(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction)
-        ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction=reduction)
-
-        assert torch.all(
-            focal_loss <= ce_loss
-        ), "focal loss must be less or equal to cross entropy loss with same input"
-
-        loss_ratio = (focal_loss / ce_loss).squeeze()
-        prob = torch.sigmoid(inputs)
-        p_t = prob * targets + (1 - prob) * (1 - targets)
-        correct_ratio = (1.0 - p_t) ** gamma
-        if alpha >= 0:
-            alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
-            correct_ratio = correct_ratio * alpha_t
-
-        tol = 1e-3 if dtype is torch.half else 1e-5
-        torch.testing.assert_close(correct_ratio, loss_ratio, rtol=tol, atol=tol)
-
-    @pytest.mark.parametrize("reduction", ["mean", "sum"])
-    @pytest.mark.parametrize("device", cpu_and_gpu())
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
-    @pytest.mark.parametrize("seed", [2, 3])
-    def test_equal_ce_loss(self, reduction, device, dtype, seed) -> None:
-        if device == "cpu" and dtype is torch.half:
-            pytest.skip("Currently torch.half is not fully supported on cpu")
-        # focal loss should be equal ce_loss if alpha=-1 and gamma=0
-        alpha = -1
-        gamma = 0
-        torch.random.manual_seed(seed)
-        inputs, targets = self._generate_diverse_input_target_pair(dtype=dtype, device=device)
-        inputs_fl = inputs.clone().requires_grad_()
-        targets_fl = targets.clone()
-        inputs_ce = inputs.clone().requires_grad_()
-        targets_ce = targets.clone()
-        focal_loss = ops.sigmoid_focal_loss(inputs_fl, targets_fl, gamma=gamma, alpha=alpha, reduction=reduction)
-        ce_loss = F.binary_cross_entropy_with_logits(inputs_ce, targets_ce, reduction=reduction)
-
-        tol = 1e-3 if dtype is torch.half else 1e-5
-        torch.testing.assert_close(focal_loss, ce_loss, rtol=tol, atol=tol)
-
-        focal_loss.backward()
-        ce_loss.backward()
-        torch.testing.assert_close(inputs_fl.grad, inputs_ce.grad, rtol=tol, atol=tol)
-
-    @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0])
-    @pytest.mark.parametrize("gamma", [0, 2])
-    @pytest.mark.parametrize("reduction", ["none", "mean", "sum"])
-    @pytest.mark.parametrize("device", cpu_and_gpu())
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
-    @pytest.mark.parametrize("seed", [4, 5])
-    def test_jit(self, alpha, gamma, reduction, device, dtype, seed) -> None:
-        if device == "cpu" and dtype is torch.half:
-            pytest.skip("Currently torch.half is not fully supported on cpu")
-        script_fn = torch.jit.script(ops.sigmoid_focal_loss)
-        torch.random.manual_seed(seed)
-        inputs, targets = self._generate_diverse_input_target_pair(dtype=dtype, device=device)
-        focal_loss = ops.sigmoid_focal_loss(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction)
-        if device == "cpu":
-            scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction)
-        else:
-            with torch.jit.fuser("fuser2"):
-                # Use fuser2 to prevent a bug on fuser: https://github.com/pytorch/pytorch/issues/75476
-                # We may remove this condition once the bug is resolved
-                scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction)
-
-        tol = 1e-3 if dtype is torch.half else 1e-5
-        torch.testing.assert_close(focal_loss, scripted_focal_loss, rtol=tol, atol=tol)
-
-
-class TestGeneralizedBoxIouLoss:
-    # We refer to original test: https://github.com/facebookresearch/fvcore/blob/main/tests/test_giou_loss.py
-    @pytest.mark.parametrize("device", cpu_and_gpu())
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
-    def test_giou_loss(self, dtype, device) -> None:
-        box1 = torch.tensor([-1, -1, 1, 1], dtype=dtype, device=device)
-        box2 = torch.tensor([0, 0, 1, 1], dtype=dtype, device=device)
-        box3 = torch.tensor([0, 1, 1, 2], dtype=dtype, device=device)
-        box4 = torch.tensor([1, 1, 2, 2], dtype=dtype, device=device)
-
-        box1s = torch.stack([box2, box2], dim=0)
-        box2s = torch.stack([box3, box4], dim=0)
-
-        def assert_giou_loss(box1, box2, expected_loss, reduction="none"):
-            tol = 1e-3 if dtype is torch.half else 1e-5
-            computed_loss = ops.generalized_box_iou_loss(box1, box2, reduction=reduction)
-            expected_loss = torch.tensor(expected_loss, device=device)
-            torch.testing.assert_close(computed_loss, expected_loss, rtol=tol, atol=tol)
-
-        # Identical boxes should have loss of 0
-        assert_giou_loss(box1, box1, 0.0)
-
-        # quarter size box inside other box = IoU of 0.25
-        assert_giou_loss(box1, box2, 0.75)
-
-        # Two side by side boxes, area=union
-        # IoU=0 and GIoU=0 (loss 1.0)
-        assert_giou_loss(box2, box3, 1.0)
-
-        # Two diagonally adjacent boxes, area=2*union
-        # IoU=0 and GIoU=-0.5 (loss 1.5)
-        assert_giou_loss(box2, box4, 1.5)
-
-        # Test batched loss and reductions
-        assert_giou_loss(box1s, box2s, 2.5, reduction="sum")
-        assert_giou_loss(box1s, box2s, 1.25, reduction="mean")
-
-    @pytest.mark.parametrize("device", cpu_and_gpu())
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
-    def test_empty_inputs(self, dtype, device) -> None:
-        box1 = torch.randn([0, 4], dtype=dtype).requires_grad_()
-        box2 = torch.randn([0, 4], dtype=dtype).requires_grad_()
-
-        loss = ops.generalized_box_iou_loss(box1, box2, reduction="mean")
-        loss.backward()
-
-        tol = 1e-3 if dtype is torch.half else 1e-5
-        torch.testing.assert_close(loss, torch.tensor(0.0), rtol=tol, atol=tol)
-        assert box1.grad is not None, "box1.grad should not be None after backward is called"
-        assert box2.grad is not None, "box2.grad should not be None after backward is called"
-
-        loss = ops.generalized_box_iou_loss(box1, box2, reduction="none")
-        assert loss.numel() == 0, "giou_loss for two empty box should be empty"
-
-
-class TestCIOULoss:
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
-    @pytest.mark.parametrize("device", cpu_and_gpu())
-    def test_ciou_loss(self, dtype, device):
-        box1 = torch.tensor([-1, -1, 1, 1], dtype=dtype, device=device)
-        box2 = torch.tensor([0, 0, 1, 1], dtype=dtype, device=device)
-        box3 = torch.tensor([0, 1, 1, 2], dtype=dtype, device=device)
-        box4 = torch.tensor([1, 1, 2, 2], dtype=dtype, device=device)
-
-        box1s = torch.stack([box2, box2], dim=0)
-        box2s = torch.stack([box3, box4], dim=0)
-
-        def assert_ciou_loss(box1, box2, expected_output, reduction="none"):
-
-            output = ops.complete_box_iou_loss(box1, box2, reduction=reduction)
-            # TODO: When passing the dtype, the torch.half test doesn't pass...
-            expected_output = torch.tensor(expected_output, device=device)
-            tol = 1e-5 if dtype != torch.half else 1e-3
-            torch.testing.assert_close(output, expected_output, rtol=tol, atol=tol)
-
-        assert_ciou_loss(box1, box1, 0.0)
-
-        assert_ciou_loss(box1, box2, 0.8125)
-
-        assert_ciou_loss(box1, box3, 1.1923)
-
-        assert_ciou_loss(box1, box4, 1.2500)
-
-        assert_ciou_loss(box1s, box2s, 1.2250, reduction="mean")
-        assert_ciou_loss(box1s, box2s, 2.4500, reduction="sum")
-
-    @pytest.mark.parametrize("device", cpu_and_gpu())
-    @pytest.mark.parametrize("dtype", [torch.float32, torch.half])
-    def test_empty_inputs(self, dtype, device) -> None:
-        box1 = torch.randn([0, 4], dtype=dtype).requires_grad_()
-        box2 = torch.randn([0, 4], dtype=dtype).requires_grad_()
-
-        loss = ops.complete_box_iou_loss(box1, box2, reduction="mean")
-        loss.backward()
-
-        tol = 1e-3 if dtype is torch.half else 1e-5
-        torch.testing.assert_close(loss, torch.tensor(0.0), rtol=tol, atol=tol)
-        assert box1.grad is not None, "box1.grad should not be None after backward is called"
-        assert box2.grad is not None, "box2.grad should not be None after backward is called"
-
-        loss = ops.complete_box_iou_loss(box1, box2, reduction="none")
-        assert loss.numel() == 0, "ciou_loss for two empty box should be empty"
-
-
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/torchvision/ops/__init__.py b/torchvision/ops/__init__.py
index cd711578a6c..d3f27ef1657 100644
--- a/torchvision/ops/__init__.py
+++ b/torchvision/ops/__init__.py
@@ -5,13 +5,13 @@
     remove_small_boxes,
     clip_boxes_to_image,
     box_area,
+    box_convert,
     box_iou,
     generalized_box_iou,
     distance_box_iou,
     complete_box_iou,
     masks_to_boxes,
 )
-from .boxes import box_convert
 from .ciou_loss import complete_box_iou_loss
 from .deform_conv import deform_conv2d, DeformConv2d
 from .diou_loss import distance_box_iou_loss
diff --git a/torchvision/ops/_utils.py b/torchvision/ops/_utils.py
index 8a02490ab13..a6ca557a98b 100644
--- a/torchvision/ops/_utils.py
+++ b/torchvision/ops/_utils.py
@@ -67,3 +67,40 @@ def split_normalization_params(
         else:
             other_params.extend(p for p in module.parameters() if p.requires_grad)
     return norm_params, other_params
+
+
+def _upcast(t: Tensor) -> Tensor:
+    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
+    if t.is_floating_point():
+        return t if t.dtype in (torch.float32, torch.float64) else t.float()
+    else:
+        return t if t.dtype in (torch.int32, torch.int64) else t.int()
+
+
+def _upcast_non_float(t: Tensor) -> Tensor:
+    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
+    if t.dtype not in (torch.float32, torch.float64):
+        return t.float()
+    return t
+
+
+def _loss_inter_union(
+    boxes1: torch.Tensor,
+    boxes2: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+
+    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+    # Intersection keypoints
+    xkis1 = torch.max(x1, x1g)
+    ykis1 = torch.max(y1, y1g)
+    xkis2 = torch.min(x2, x2g)
+    ykis2 = torch.min(y2, y2g)
+
+    intsctk = torch.zeros_like(x1)
+    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
+    intsctk[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
+    unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsctk
+
+    return intsctk, unionk
diff --git a/torchvision/ops/boxes.py b/torchvision/ops/boxes.py
index 3b994879ecf..72c95442b78 100644
--- a/torchvision/ops/boxes.py
+++ b/torchvision/ops/boxes.py
@@ -7,6 +7,7 @@
 
 from ..utils import _log_api_usage_once
 from ._box_convert import _box_cxcywh_to_xyxy, _box_xyxy_to_cxcywh, _box_xywh_to_xyxy, _box_xyxy_to_xywh
+from ._utils import _upcast
 
 
 def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
@@ -215,14 +216,6 @@ def box_convert(boxes: Tensor, in_fmt: str, out_fmt: str) -> Tensor:
     return boxes
 
 
-def _upcast(t: Tensor) -> Tensor:
-    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
-    if t.is_floating_point():
-        return t if t.dtype in (torch.float32, torch.float64) else t.float()
-    else:
-        return t if t.dtype in (torch.int32, torch.int64) else t.int()
-
-
 def box_area(boxes: Tensor) -> Tensor:
     """
     Computes the area of a set of bounding boxes, which are specified by their
@@ -330,22 +323,7 @@ def complete_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tenso
     boxes1 = _upcast(boxes1)
     boxes2 = _upcast(boxes2)
 
-    inter, union = _box_inter_union(boxes1, boxes2)
-    iou = inter / union
-
-    lti = torch.min(boxes1[:, None, :2], boxes2[:, None, :2])
-    rbi = torch.max(boxes1[:, None, 2:], boxes2[:, None, 2:])
-
-    whi = (rbi - lti).clamp(min=0)  # [N,M,2]
-    diagonal_distance_squared = (whi[:, :, 0] ** 2) + (whi[:, :, 1] ** 2) + eps
-
-    # centers of boxes
-    x_p = (boxes1[:, 0] + boxes1[:, 2]) / 2
-    y_p = (boxes1[:, 1] + boxes1[:, 3]) / 2
-    x_g = (boxes2[:, 0] + boxes2[:, 2]) / 2
-    y_g = (boxes2[:, 1] + boxes2[:, 3]) / 2
-    # The distance between boxes' centers squared.
-    centers_distance_squared = (x_p - x_g) ** 2 + (y_p - y_g) ** 2
+    diou, iou = _box_diou_iou(boxes1, boxes2, eps)
 
     w_pred = boxes1[:, 2] - boxes1[:, 0]
     h_pred = boxes1[:, 3] - boxes1[:, 1]
@@ -356,7 +334,7 @@ def complete_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tenso
     v = (4 / (torch.pi ** 2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2)
     with torch.no_grad():
         alpha = v / (1 - iou + v + eps)
-    return iou - (centers_distance_squared / diagonal_distance_squared) - alpha * v
+    return diou - alpha * v
 
 
 def distance_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tensor:
@@ -380,16 +358,17 @@ def distance_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tenso
 
     boxes1 = _upcast(boxes1)
     boxes2 = _upcast(boxes2)
+    diou, _ = _box_diou_iou(boxes1, boxes2)
+    return diou
 
-    inter, union = _box_inter_union(boxes1, boxes2)
-    iou = inter / union
 
+def _box_diou_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tuple[Tensor, Tensor]:
+
+    iou = box_iou(boxes1, boxes2)
     lti = torch.min(boxes1[:, None, :2], boxes2[:, :2])
     rbi = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
-
     whi = _upcast(rbi - lti).clamp(min=0)  # [N,M,2]
     diagonal_distance_squared = (whi[:, :, 0] ** 2) + (whi[:, :, 1] ** 2) + eps
-
     # centers of boxes
     x_p = (boxes1[:, 0] + boxes1[:, 2]) / 2
     y_p = (boxes1[:, 1] + boxes1[:, 3]) / 2
@@ -397,10 +376,9 @@ def distance_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tenso
     y_g = (boxes2[:, 1] + boxes2[:, 3]) / 2
     # The distance between boxes' centers squared.
     centers_distance_squared = (_upcast(x_p - x_g) ** 2) + (_upcast(y_p - y_g) ** 2)
-
     # The distance IoU is the IoU penalized by a normalized
     # distance between boxes' centers squared.
-    return iou - (centers_distance_squared / diagonal_distance_squared)
+    return iou - (centers_distance_squared / diagonal_distance_squared), iou
 
 
 def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
diff --git a/torchvision/ops/ciou_loss.py b/torchvision/ops/ciou_loss.py
index d53e2d6af2a..1f271fb0a1d 100644
--- a/torchvision/ops/ciou_loss.py
+++ b/torchvision/ops/ciou_loss.py
@@ -1,7 +1,8 @@
 import torch
 
 from ..utils import _log_api_usage_once
-from .giou_loss import _upcast
+from ._utils import _upcast_non_float
+from .diou_loss import _diou_iou_loss
 
 
 def complete_box_iou_loss(
@@ -30,50 +31,28 @@ def complete_box_iou_loss(
             ``'sum'``: The output will be summed. Default: ``'none'``
         eps : (float): small number to prevent division by zero. Default: 1e-7
 
-    Reference:
+    Returns:
+        Tensor: Loss tensor with the reduction option applied.
 
-    Complete Intersection over Union Loss (Zhaohui Zheng et. al)
-    https://arxiv.org/abs/1911.08287
+    Reference:
+        Zhaohui Zheng et. al: Complete Intersection over Union Loss:
+        https://arxiv.org/abs/1911.08287
 
     """
 
-    # Original Implementation : https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/losses.py
+    # Original Implementation from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/losses.py
 
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(complete_box_iou_loss)
 
-    boxes1 = _upcast(boxes1)
-    boxes2 = _upcast(boxes2)
+    boxes1 = _upcast_non_float(boxes1)
+    boxes2 = _upcast_non_float(boxes2)
+
+    diou_loss, iou = _diou_iou_loss(boxes1, boxes2)
 
     x1, y1, x2, y2 = boxes1.unbind(dim=-1)
     x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
 
-    # Intersection keypoints
-    xkis1 = torch.max(x1, x1g)
-    ykis1 = torch.max(y1, y1g)
-    xkis2 = torch.min(x2, x2g)
-    ykis2 = torch.min(y2, y2g)
-
-    intsct = torch.zeros_like(x1)
-    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
-    intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
-    union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
-    iou = intsct / union
-
-    # smallest enclosing box
-    xc1 = torch.min(x1, x1g)
-    yc1 = torch.min(y1, y1g)
-    xc2 = torch.max(x2, x2g)
-    yc2 = torch.max(y2, y2g)
-    diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
-
-    # centers of boxes
-    x_p = (x2 + x1) / 2
-    y_p = (y2 + y1) / 2
-    x_g = (x1g + x2g) / 2
-    y_g = (y1g + y2g) / 2
-    distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
-
     # width and height of boxes
     w_pred = x2 - x1
     h_pred = y2 - y1
@@ -83,7 +62,7 @@ def complete_box_iou_loss(
     with torch.no_grad():
         alpha = v / (1 - iou + v + eps)
 
-    loss = 1 - iou + (distance / diag_len) + alpha * v
+    loss = diou_loss + alpha * v
     if reduction == "mean":
         loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
     elif reduction == "sum":
diff --git a/torchvision/ops/diou_loss.py b/torchvision/ops/diou_loss.py
index ea7ead19344..4b38d58a28f 100644
--- a/torchvision/ops/diou_loss.py
+++ b/torchvision/ops/diou_loss.py
@@ -1,7 +1,12 @@
+from typing import Tuple
+
 import torch
 
 from ..utils import _log_api_usage_once
-from .boxes import _upcast
+from ._utils import _loss_inter_union, _upcast_non_float
+
+
+# Original Implementation from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/losses.py
 
 
 def distance_box_iou_loss(
@@ -10,6 +15,7 @@ def distance_box_iou_loss(
     reduction: str = "none",
     eps: float = 1e-7,
 ) -> torch.Tensor:
+
     """
     Gradient-friendly IoU loss with an additional penalty that is non-zero when the
     distance between boxes' centers isn't zero. Indeed, for two exactly overlapping
@@ -37,29 +43,31 @@ def distance_box_iou_loss(
         https://arxiv.org/abs/1911.08287
     """
 
-    # Original Implementation : https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/losses.py
-
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(distance_box_iou_loss)
 
-    boxes1 = _upcast(boxes1)
-    boxes2 = _upcast(boxes2)
+    boxes1 = _upcast_non_float(boxes1)
+    boxes2 = _upcast_non_float(boxes2)
 
-    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
-    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+    loss, _ = _diou_iou_loss(boxes1, boxes2, eps)
 
-    # Intersection keypoints
-    xkis1 = torch.max(x1, x1g)
-    ykis1 = torch.max(y1, y1g)
-    xkis2 = torch.min(x2, x2g)
-    ykis2 = torch.min(y2, y2g)
+    if reduction == "mean":
+        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+    elif reduction == "sum":
+        loss = loss.sum()
+    return loss
 
-    intsct = torch.zeros_like(x1)
-    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
-    intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
-    union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
-    iou = intsct / union
 
+def _diou_iou_loss(
+    boxes1: torch.Tensor,
+    boxes2: torch.Tensor,
+    eps: float = 1e-7,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+
+    x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+    x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+    intsct, union = _loss_inter_union(boxes1, boxes2)
+    iou = intsct / (union + eps)
     # smallest enclosing box
     xc1 = torch.min(x1, x1g)
     yc1 = torch.min(y1, y1g)
@@ -67,7 +75,6 @@ def distance_box_iou_loss(
     yc2 = torch.max(y2, y2g)
     # The diagonal distance of the smallest enclosing box squared
     diagonal_distance_squared = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
-
     # centers of boxes
     x_p = (x2 + x1) / 2
     y_p = (y2 + y1) / 2
@@ -75,12 +82,7 @@ def distance_box_iou_loss(
     y_g = (y1g + y2g) / 2
     # The distance between boxes' centers squared.
     centers_distance_squared = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
-
     # The distance IoU is the IoU penalized by a normalized
     # distance between boxes' centers squared.
     loss = 1 - iou + (centers_distance_squared / diagonal_distance_squared)
-    if reduction == "mean":
-        loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
-    elif reduction == "sum":
-        loss = loss.sum()
-    return loss
+    return loss, iou
diff --git a/torchvision/ops/giou_loss.py b/torchvision/ops/giou_loss.py
index 4d6f946f5e8..efb9cd1f992 100644
--- a/torchvision/ops/giou_loss.py
+++ b/torchvision/ops/giou_loss.py
@@ -1,14 +1,9 @@
 import torch
-from torch import Tensor
 
 from ..utils import _log_api_usage_once
+from ._utils import _upcast_non_float, _loss_inter_union
 
-
-def _upcast(t: Tensor) -> Tensor:
-    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
-    if t.dtype not in (torch.float32, torch.float64):
-        return t.float()
-    return t
+# Original implementation from https://github.com/facebookresearch/fvcore/blob/bfff2ef/fvcore/nn/giou_loss.py
 
 
 def generalized_box_iou_loss(
@@ -17,10 +12,8 @@ def generalized_box_iou_loss(
     reduction: str = "none",
     eps: float = 1e-7,
 ) -> torch.Tensor:
-    """
-    Original implementation from
-    https://github.com/facebookresearch/fvcore/blob/bfff2ef/fvcore/nn/giou_loss.py
 
+    """
     Gradient-friendly IoU loss with an additional penalty that is non-zero when the
     boxes do not overlap and scales with the size of their smallest enclosing box.
     This loss is symmetric, so the boxes1 and boxes2 arguments are interchangeable.
@@ -38,6 +31,9 @@ def generalized_box_iou_loss(
             ``'sum'``: The output will be summed. Default: ``'none'``
         eps (float): small number to prevent division by zero. Default: 1e-7
 
+    Returns:
+        Tensor: Loss tensor with the reduction option applied.
+
     Reference:
         Hamid Rezatofighi et. al: Generalized Intersection over Union:
         A Metric and A Loss for Bounding Box Regression:
@@ -46,21 +42,12 @@ def generalized_box_iou_loss(
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(generalized_box_iou_loss)
 
-    boxes1 = _upcast(boxes1)
-    boxes2 = _upcast(boxes2)
+    boxes1 = _upcast_non_float(boxes1)
+    boxes2 = _upcast_non_float(boxes2)
     x1, y1, x2, y2 = boxes1.unbind(dim=-1)
     x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
 
-    # Intersection keypoints
-    xkis1 = torch.max(x1, x1g)
-    ykis1 = torch.max(y1, y1g)
-    xkis2 = torch.min(x2, x2g)
-    ykis2 = torch.min(y2, y2g)
-
-    intsctk = torch.zeros_like(x1)
-    mask = (ykis2 > ykis1) & (xkis2 > xkis1)
-    intsctk[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
-    unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsctk
+    intsctk, unionk = _loss_inter_union(boxes1, boxes2)
     iouk = intsctk / (unionk + eps)
 
     # smallest enclosing box