Arm backend: Add Ethos-U55 permute check

Erik-Lundell · Erik-Lundell · commit ffe9181e0f75 · 2025-04-01T14:51:57.000+02:00
Signed-off-by: Erik Lundell <erik.lundell@arm.com>
Change-Id: Id7c6d6469e96e4133b7b1a54be6ea66bc7dc861a
diff --git a/backends/arm/operator_support/ethos_u55_support.py b/backends/arm/operator_support/ethos_u55_support.py
@@ -11,12 +11,27 @@
 import torch.fx as fx
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.arm._passes.insert_table_ops import TableOps
+from executorch.backends.arm.operators.op_permute import transform_permutation_vector
+from executorch.backends.arm.tosa_utils import tosa_shape
 from executorch.exir.backend.utils import WhyNoPartitionReporter
 
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch.fx.passes.operator_support import OperatorSupportBase
 
 
+def _try_determine_dtype(node: fx.Node) -> torch.dtype | None:
+    """Attempt to figure out the quantized data type of node. On failure, return None."""
+    dtype = get_first_fake_tensor(node).dtype
+    if not dtype.is_floating_point:
+        return dtype
+    if node.target is exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default:
+        return get_first_fake_tensor(node.all_input_nodes[0]).dtype
+    q_node = next(iter(node.users), None)  # None when the node has no users.
+    if q_node is not None and q_node.target is exir_ops.edge.quantized_decomposed.quantize_per_tensor.default:
+        return typing.cast(torch.dtype, q_node.args[-1])
+    return None  # We can't easily figure out dtype.
+
+
 class EthosU55DtypeSupport(OperatorSupportBase):
 
     def __init__(self, reporter: WhyNoPartitionReporter):
@@ -33,37 +48,11 @@ def __init__(self, reporter: WhyNoPartitionReporter):
 
     target_ops_i8 = tuple(TableOps.included_ops())
 
-    def _try_determine_dtype(self, node: fx.Node) -> torch.dtype | None:
-        """Attempt to figure out the quantized data type of node. On failure, return None."""
-
-        dtype = get_first_fake_tensor(node).dtype
-        if not dtype.is_floating_point:
-            return dtype
-
-        if (
-            node.target
-            is exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
-        ):
-            return get_first_fake_tensor(node.all_input_nodes[0]).dtype
-
-        if len(node.users) == 0:
-            return None
-
-        q_node = list(node.users)[0]
-        if (
-            q_node.target
-            is exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
-        ):
-            return typing.cast(torch.dtype, q_node.args[-1])
-
-        # We can't easily figure out dtype, return None
-        return None
-
     def is_node_supported(  # noqa: C901
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
     ) -> bool:
 
-        dtype = self._try_determine_dtype(node)
+        dtype = _try_determine_dtype(node)
         if dtype is None:
             # If we couldn't determine dtype, just return ok.
             return True
@@ -84,21 +73,21 @@ def is_node_supported(  # noqa: C901
 
         if node.target == exir_ops.edge.aten.convolution.default:
             ifm, weight = node.all_input_nodes[0:2]
-            ifm_dtype = self._try_determine_dtype(ifm)
+            ifm_dtype = _try_determine_dtype(ifm)
             if ifm_dtype is not None and ifm_dtype not in (torch.int8, torch.int16):
                 self.reporter.report_reject(
                     node, f"Unsupported input dtype {dtype} (Supports i8, i16)."
                 )
                 return False
-            weight_dtype = self._try_determine_dtype(weight)
+            weight_dtype = _try_determine_dtype(weight)
             if weight_dtype is not None and weight_dtype not in (torch.int8,):
                 self.reporter.report_reject(
                     node, f"Unsupported weight dtype {dtype} (Supports i8)."
                 )
                 return False
             if len(node.all_input_nodes) > 2:
                 bias = node.all_input_nodes[2]
-                bias_dtype = self._try_determine_dtype(bias)
+                bias_dtype = _try_determine_dtype(bias)
                 if bias_dtype is not None and bias_dtype not in (torch.int32,):
                     self.reporter.report_reject(
                         node, f"Unsupported bias dtype {dtype} (Supports i32)."
@@ -110,7 +99,7 @@ def is_node_supported(  # noqa: C901
             exir_ops.edge.aten.bmm.default,
         ):
             for input_node in node.all_input_nodes:
-                dtype = self._try_determine_dtype(input_node)
+                dtype = _try_determine_dtype(input_node)
                 if dtype is not None and dtype != torch.int8:
                     self.reporter.report_reject(
                         input_node,
@@ -174,3 +163,114 @@ def is_node_supported(
             return False
 
         return True
+
+
+shape_t = list[int]
+
+
+class EthosU55TransposeCheck(OperatorSupportBase):
+    """Rejects permute_copy nodes that fail the shape and dtype checks below."""
+
+    def __init__(self, reporter: WhyNoPartitionReporter):
+        super().__init__()
+        self.reporter = reporter
+
+    def _pad_to_rank_4(
+        self, shape: shape_t, permutation: list[int]
+    ) -> tuple[shape_t, shape_t]:
+        """Left-pads shape with 1s and permutation with identity axes up to rank 4."""
+        diff = 4 - len(shape)
+        padded_shape = [1] * diff + shape
+        # Build a new list so that the caller's permutation is not mutated.
+        padded_permutation = list(range(diff)) + [axis + diff for axis in permutation]
+        return padded_shape, padded_permutation
+
+    def axes_product(self, nhwc_shape: shape_t) -> int:
+        """Returns the product of all axis sizes in nhwc_shape."""
+        product = 1
+        for axis_size in nhwc_shape:
+            product *= axis_size
+        return product
+
+    def _permute_constraint_i8_i16(
+        self, nhwc_shape: list[int], permutation: list[int]
+    ) -> bool:
+        """Returns True if the constraints are ok."""
+        N, H, W, C = nhwc_shape
+        match permutation:
+            case (0, 1, 2, 3):  # NHWC -> NHWC
+                return True
+            case (0, 2, 1, 3) | (0, 1, 3, 2) | (0, 3, 1, 2):  # NHWC -> NWHC, NHCW, NCWH
+                return N * H <= 65536 and W <= 65536 and C <= 65536
+            case _:
+                return self.axes_product(nhwc_shape) <= 65536
+
+    def _permute_constraint_i32(
+        self, nhwc_shape: list[int], permutation: list[int]
+    ) -> bool:
+        """Returns True if the constraints are ok."""
+        N, H, W, C = nhwc_shape
+        match permutation:
+            case (0, 1, 2, 3):  # NHWC -> NHWC
+                return C <= 32768
+            case (0, 2, 1, 3):  # NHWC -> NWHC
+                return N == 1 and H <= 65536 and W <= 65536 and C <= 16384
+            case (0, 1, 3, 2):  # NHWC -> NHCW
+                return N * H <= 65536 and W <= 65536 and C <= 65536
+            case _:
+                return False
+
+    def _permute_constraint(self, shape, permutation, dtype):
+        """Returns True if the constraints are ok; unchecked dtypes always pass."""
+        if dtype in (torch.int8, torch.int16):
+            return self._permute_constraint_i8_i16(shape, permutation)
+        if dtype == torch.int32:
+            return self._permute_constraint_i32(shape, permutation)
+        return True
+
+    def is_node_supported(
+        self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
+    ) -> bool:
+        """Returns False (and reports why) for unsupported permute_copy nodes."""
+        if not node.target == exir_ops.edge.aten.permute_copy.default:
+            return True
+
+        shape = list(get_first_fake_tensor(node).shape)
+        dtype = _try_determine_dtype(node)
+        permutation = list(typing.cast(list[int], node.args[1]))
+        rank = len(shape)
+        if rank > 4:
+            if dtype == torch.int32:
+                self.reporter.report_reject(
+                    node, f"No support for {permutation=} in int32."
+                )
+                return False
+            if dtype in (torch.int8, torch.int16):
+                if self.axes_product(shape) > 65536:
+                    self.reporter.report_reject(
+                        node,
+                        f"No support for {shape=}, {dtype=}. Product of axes must be <=65536",
+                    )
+                    return False
+            return True
+
+        shape, permutation = self._pad_to_rank_4(shape, permutation)
+        if rank == 3 or rank == 4:
+            # For rank 3 and 4, we can have channels first or channels last dim order.
+            # Since we don't know which at partition-time, test both.
+            nhwc_shape = tosa_shape(shape, [0, 2, 3, 1])
+            nhwc_permutation = transform_permutation_vector(permutation, [0, 2, 3, 1])
+            if not self._permute_constraint(nhwc_shape, nhwc_permutation, dtype):
+                self.reporter.report_reject(
+                    node,
+                    f"Unsupported NHWC {nhwc_shape=} for {nhwc_permutation=}, {dtype=}",
+                )
+                return False
+
+        if not self._permute_constraint(shape, permutation, dtype):
+            self.reporter.report_reject(
+                node, f"Unsupported NCHW {shape=} for {permutation=}, {dtype=}"
+            )
+            return False
+
+        return True
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
@@ -21,6 +21,7 @@
 from executorch.backends.arm.operator_support.ethos_u55_support import (
     EthosU55DtypeSupport,
     EthosU55NotSupported,
+    EthosU55TransposeCheck,
 )
 from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
 from executorch.exir import ExportedProgram
@@ -123,6 +124,7 @@ def tosa_support_factory(
     if isinstance(tosa_spec, Tosa_0_80) and tosa_spec.is_U55_subset:
         negative_checks.append(EthosU55NotSupported(reporter))
         negative_checks.append(EthosU55DtypeSupport(reporter))
+        negative_checks.append(EthosU55TransposeCheck(reporter))
 
     return chain(
         reporter.wrap_check(
diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py
@@ -65,6 +65,29 @@ def permutation_matrix_to_vector(permutation_matrix: torch.Tensor) -> list[int]:
     return p
 
 
+def transform_permutation_vector(permutation_vector: list[int], dim_order: list[int]):
+    """Transforms a permutation to dim_order."""
+
+    # Applying the permutation in another dim_order takes three steps: transform to
+    # dim_order, apply the permutation P, and transform back to the original dim_order.
+    # The dim_order transformation, S, is itself a permutation with dim_order as its vector.
+
+    # Permutation matrices make it easy to chain and invert these steps, so build
+    # the matrix forms of S and P first.
+    to_dim_order = permutation_vector_to_matrix(dim_order)
+    permutation = permutation_vector_to_matrix(permutation_vector)
+    # The inverse of a permutation matrix is its transpose.
+    from_dim_order = to_dim_order.t()
+
+    # The complete transformation is S * P * S_inverse.
+    combined = to_dim_order.matmul(permutation.matmul(from_dim_order))
+
+    # A combination of permutations is also a permutation, so the result can
+    # again be described by a new permutation vector.
+    transformed_vector = permutation_matrix_to_vector(combined)
+    return transformed_vector
+
+
 @register_node_visitor
 class PermuteVisitor(NodeVisitor):
     target = "aten.permute_copy.default"
@@ -86,23 +109,10 @@ def define_node(
 
         if output.dim_order != tuple(range(len(output.dim_order))):
             # the permutation vector can't be used directly if we are not in NCHW dim_order.
-            # We need to first transform to NCHW, apply P,
-            # and then transform back to the original dim_order.
-            # This transformation, S, is also a permutation, with the dim_order as permutation vector.
-
-            # To do this, represent P and S with permutation matrices.
-            # Matrices can handle chained transformations and inversion easily.
-            S = permutation_vector_to_matrix(output.dim_order)
-            # The inverse of a permutation matrix is its transpose.
-            S_inverse = S.transpose(1, 0)
-            P = permutation_vector_to_matrix(permutation_vector)
-
-            # The complete transformation is S * P * S_inverse.
-            transformation_matrix = S.matmul(P.matmul(S_inverse))
-
-            # Luckily, since it is just a combination of permutations, the result is also a permutation
-            # that can again be described by a new permutation vector.
-            permutation_vector = permutation_matrix_to_vector(transformation_matrix)
+            # Transform to dim_order.
+            permutation_vector = transform_permutation_vector(
+                permutation_vector, output.dim_order
+            )
 
         attr = ts.TosaSerializerAttribute()
         attr.TransposeAttribute(permutation_vector)
diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -20,6 +20,7 @@
 )
 from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import OpNotSupportedPipeline
 from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.compile_spec_schema import CompileSpec
@@ -163,3 +164,26 @@ def test_permute_u85_BI_xfails(
         self._test_permute_ethos_BI_pipeline(
             self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,)
         )
+
+
+# Each entry is (input shape, permutation, dtype), unpacked in that order by the test.
+reject_data_suite = {
+    "int8_r3_axes_product": ([1, 700, 1000], [2, 1, 0], torch.int8),
+    "int8_r5_axes_product": ([1, 1, 1, 700, 1000], [0, 1, 2, 3, 4], torch.int8),
+    "int8_r4_NH_too_large": ([700, 100, 1, 1], [0, 1, 3, 2], torch.int8),
+    "int32_r5_no_support": ([2, 2, 2, 2, 2], [3, 4, 2, 1, 0], torch.int32),
+}
+input_t = tuple[torch.Tensor]
+
+
+@common.parametrize("test_data", reject_data_suite)
+def test_permute_u55_BI_not_delegated(test_data):
+    # These permutes are not supported on U55, so check that they are not delegated.
+    shape, permutation, dtype = test_data
+    inputs = ((torch.rand(shape) * 10).to(dtype),)
+    module = TestPermute.Permute(dims=permutation)
+    op_counts = {"executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1}
+    pipeline = OpNotSupportedPipeline[input_t](
+        module, inputs, "TOSA-0.80+BI+u55", op_counts
+    )
+    pipeline.run()