Skip to content

Commit bd266ad

Browse files
committed
Update on "Rename ModuleLinear -> ModuleAddMul"
In export_program, `ModuleLinear` is a decomposed add-mul. Rename it to ModuleAddMul, so that we can add a ModuleLinear that calls nn.Linear for backend program-data separation testing. Differential Revision: [D73679750](https://our.internmc.facebook.com/intern/diff/D73679750/) [ghstack-poisoned]
2 parents 12117cc + 285f400 commit bd266ad

File tree

148 files changed

+5594
-1194
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

148 files changed

+5594
-1194
lines changed

.ci/scripts/build-qnn-sdk.sh

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ set_up_aot() {
3333
cmake .. \
3434
-DCMAKE_INSTALL_PREFIX=$PWD \
3535
-DEXECUTORCH_BUILD_QNN=ON \
36+
-DANDROID_NATIVE_API_LEVEL=30 \
3637
-DQNN_SDK_ROOT=${QNN_SDK_ROOT} \
3738
-DEXECUTORCH_BUILD_DEVTOOLS=ON \
3839
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \

backends/apple/mps/setup.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,12 @@ cd executorch
7676
## Run the mv3 generated model using the mps_executor_runner
7777

7878
```bash
79-
./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program
79+
./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_float16_bundled.pte --bundled_program
8080
```
8181

8282
- You should see the following results. Note that no output file will be generated in this example:
8383
```
84-
I 00:00:00.003290 executorch:mps_executor_runner.mm:286] Model file mv3_mps_bundled_fp16.pte is loaded.
84+
I 00:00:00.003290 executorch:mps_executor_runner.mm:286] Model file mv3_mps_float16_bundled.pte is loaded.
8585
I 00:00:00.003306 executorch:mps_executor_runner.mm:292] Program methods: 1
8686
I 00:00:00.003308 executorch:mps_executor_runner.mm:294] Running method forward
8787
I 00:00:00.003311 executorch:mps_executor_runner.mm:349] Setting up non-const buffer 1, size 606112.
@@ -118,7 +118,7 @@ python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_
118118
```
119119
2. Run your Program on the ExecuTorch runtime and generate an [ETDump](../../../docs/source/etdump.md).
120120
```
121-
./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs
121+
./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_float16_bundled.pte --bundled_program --dump-outputs
122122
```
123123
3. Create an instance of the Inspector API by passing in the ETDump you have sourced from the runtime along with the optionally generated ETRecord from step 1.
124124
```bash

backends/arm/_passes/arm_pass_manager.py

+4
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@
5959
)
6060

6161
from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
62+
from executorch.backends.transforms.decompose_sdpa import (
63+
DecomposeScaledDotProductAttention,
64+
)
6265
from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
6366
from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
6467
from executorch.exir import ExportedProgram
@@ -194,6 +197,7 @@ def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
194197
)
195198

196199
def transform_for_annotation_pipeline(self, graph_module: GraphModule):
200+
self.add_pass(DecomposeScaledDotProductAttention())
197201
self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
198202
self.add_pass(ScalarsToAttributePass())
199203
self.add_pass(DecomposeLayerNormPass())

backends/arm/_passes/decompose_softmax_pass.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
from executorch.exir.pass_base import ExportPass
99

1010
# For BI case
11-
torch_softmax = (torch.ops.aten.softmax.int, torch.ops.aten.log_softmax.int)
11+
torch_softmax = (
12+
torch.ops.aten.softmax.int,
13+
torch.ops.aten._safe_softmax.default,
14+
torch.ops.aten.log_softmax.int,
15+
)
1216
# For MI case
1317
edge_softmax = (
1418
exir_ops.edge.aten._softmax.default,

backends/arm/operator_support/convolution_support.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@
1111
register_tosa_support_check,
1212
SupportedTOSAOperatorCheck,
1313
)
14-
from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
14+
from executorch.backends.arm.tosa_specification import (
15+
Tosa_0_80,
16+
Tosa_1_00,
17+
TosaSpecification,
18+
)
1519
from executorch.exir.dialects._ops import ops as exir_ops
1620

1721

@@ -43,6 +47,9 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
4347

4448
# Hardware specific constraints
4549
if not (isinstance(tosa_spec, Tosa_0_80) and tosa_spec.is_U55_subset):
50+
# TODO remove this once TOSA 1.0 support for u55 is added.
51+
if isinstance(tosa_spec, Tosa_1_00) and "u55" in tosa_spec.extensions:
52+
return False
4653
return True
4754
else:
4855
return self._is_node_supported_u55(node)

backends/arm/operators/op_abs.py

+129-5
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,11 @@
44
# LICENSE file in the root directory of this source tree.
55

66
# pyre-unsafe
7-
from typing import List
7+
from typing import Any, List
88

99
import executorch.backends.arm.tosa_quant_utils as tqutils
1010
import executorch.backends.arm.tosa_utils as tutils
1111

12-
import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore
1312
from executorch.backends.arm.operators.node_visitor import (
1413
NodeVisitor,
1514
register_node_visitor,
@@ -33,10 +32,13 @@ def __init__(self, *args):
3332
def define_node(
3433
self,
3534
node: Node,
36-
tosa_graph: ts.TosaSerializer,
35+
tosa_graph: Any,
3736
inputs: List[TosaArg],
3837
output: TosaArg,
3938
) -> None:
39+
40+
import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore
41+
4042
# Specification (0.80) states that input and output types
4143
# should all be the same
4244
if not (inputs[0].dtype == output.dtype):
@@ -53,7 +55,7 @@ def define_node(
5355
if inputs[0].dtype == ts.DType.INT8:
5456
rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
5557
tosa_graph, inputs, node
56-
)
58+
) # type: ignore[possibly-undefined]
5759
else:
5860
# input[0].dtype == ts.DType.INT32
5961
# Non-quantized input, natively supported by TOSA.abs
@@ -96,10 +98,13 @@ def __init__(self, *args):
9698
def define_node(
9799
self,
98100
node: Node,
99-
tosa_graph: ts.TosaSerializer,
101+
tosa_graph: Any,
100102
inputs: List[TosaArg],
101103
output: TosaArg,
102104
) -> None:
105+
106+
import tosa_tools.v0_80.serializer.tosa_serializer as ts # type: ignore
107+
103108
# Specification (0.80) states that input and output types
104109
# should all be the same
105110
if not (inputs[0].dtype == output.dtype):
@@ -129,3 +134,122 @@ def define_node(
129134
[output.name],
130135
None,
131136
)
137+
138+
139+
@register_node_visitor
140+
class AbsVisitor_INT(NodeVisitor):
141+
target = "aten.abs.default"
142+
143+
tosa_specs = [
144+
TosaSpecification.create_from_string("TOSA-1.0+INT"),
145+
]
146+
147+
def __init__(self, *args):
148+
super().__init__(*args)
149+
150+
def define_node(
151+
self,
152+
node: Node,
153+
tosa_graph: Any,
154+
inputs: List[TosaArg],
155+
output: TosaArg,
156+
) -> None:
157+
158+
import serializer.tosa_serializer as ts # type: ignore
159+
160+
# Specification (1.0) states that input and output types
161+
# should all be the same
162+
if not (inputs[0].dtype == output.dtype):
163+
raise ValueError(
164+
"All inputs and outputs need same dtype."
165+
f"Got {inputs[0].dtype=}, {output.dtype=}"
166+
)
167+
# Handle int8 (quantized) and int32
168+
if not (inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]):
169+
raise ValueError(
170+
"All inputs need to be INT8 or INT32." f"Got {inputs[0].dtype=}"
171+
)
172+
173+
scale_back = 1.0
174+
if inputs[0].dtype == ts.DType.INT8:
175+
rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
176+
tosa_graph, inputs, node, self.tosa_specs
177+
) # type: ignore[possibly-undefined]
178+
else:
179+
# input[0].dtype == ts.DType.INT32
180+
# Non-quantized input, natively supported by TOSA.abs
181+
rescaled_inputs = inputs
182+
183+
if output.dtype == ts.DType.INT8:
184+
broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order)
185+
abs_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32)
186+
else:
187+
# output.dtype == ts.DType.INT32
188+
abs_output = output
189+
190+
# Do the INT32 Abs
191+
tosa_graph.addOperator(
192+
ts.TosaOp.Op().ABS,
193+
[
194+
rescaled_inputs[0].name,
195+
],
196+
[abs_output.name],
197+
None,
198+
)
199+
200+
if output.dtype == ts.DType.INT8:
201+
# Scale output back to 8 bit
202+
# pyre-ignore
203+
tqutils.insert_rescale_op_to_int8(
204+
tosa_graph, abs_output, scale_back, node, self.tosa_specs
205+
) # type: ignore[possibly-undefined]
206+
207+
208+
@register_node_visitor
209+
class AbsVisitor_FP(AbsVisitor_INT):
210+
# inheriting 'target' from BI class
211+
212+
tosa_specs = [TosaSpecification.create_from_string("TOSA-1.0+FP")]
213+
214+
def __init__(self, *args):
215+
super().__init__(*args)
216+
217+
def define_node(
218+
self,
219+
node: Node,
220+
tosa_graph: Any,
221+
inputs: List[TosaArg],
222+
output: TosaArg,
223+
) -> None:
224+
225+
import serializer.tosa_serializer as ts # type: ignore
226+
227+
# Specification (1.0) states that input and output types
228+
# should all be the same
229+
if not (inputs[0].dtype == output.dtype):
230+
raise ValueError(
231+
"All inputs and output need same dtype."
232+
f"Got {inputs[0].dtype=}, {output.dtype=}"
233+
)
234+
235+
if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
236+
# Call the inherited define_node for handling integers
237+
super().define_node(node, tosa_graph, inputs, output)
238+
else:
239+
# FP32 Abs lowering
240+
241+
if not (inputs[0].dtype == ts.DType.FP32):
242+
raise ValueError(
243+
"All inputs need to be FP32." f"Got {inputs[0].dtype=}"
244+
)
245+
246+
if not (output.dtype == ts.DType.FP32):
247+
raise ValueError("All outputs need to be FP32." f"Got {output.dtype=}")
248+
249+
# MI lowering
250+
tosa_graph.addOperator(
251+
ts.TosaOp.Op().ABS,
252+
[inputs[0].name],
253+
[output.name],
254+
None,
255+
)

0 commit comments

Comments
 (0)