
Commit 185f7b0

Merge branch 'main' into export-D72485973
2 parents e3ef2c7 + 5e9e9d1, commit 185f7b0

File tree: 18 files changed, +142 −55 lines

Diff for: .github/release.yml (+43 −18)

@@ -15,57 +15,82 @@ changelog:
     - title: ARM
       labels:
         - "release notes: arm"
+        - "module: arm"
+        - "partner: arm"
     - title: NXP
-      labels:
+      labels:
         - "release notes: nxp"
+        - "module: nxp"
     - title: Exir
-      labels:
+      labels:
         - "release notes: exir"
+        - "module: exir"
     - title: Misc
-      labels:
+      labels:
         - "release notes: misc"
     - title: Apple
-      labels:
+      labels:
         - "release notes: apple"
+        - "module: coreml"
+        - "module: mps"
+    - title: Android
+      labels:
+        - "module: android"
+    - title: IOS
+      labels:
+        - "module: ios"
     - title: Build
-      labels:
+      labels:
         - "release notes: build"
     - title: Vulkan
-      labels:
+      labels:
         - "release notes: vulkan"
+        - "module: vulkan"
    - title: Cadence
-      labels:
+      labels:
         - "release notes: cadence"
+        - "module: cadence"
     - title: Runtime
-      labels:
+      labels:
         - "release notes: runtime"
+        - "module: runtime"
     - title: XNNPACK
-      labels:
+      labels:
         - "release notes: xnnpack"
+        - "module: xnnpack"
     - title: Devtools
-      labels:
+      labels:
         - "release notes: devtools"
+        - "module: devtools"
     - title: Examples
-      labels:
+      labels:
         - "release notes: examples"
+    - title: LLM
+      labels:
+        - "module: llm"
     - title: Mediatek
-      labels:
+      labels:
         - "release notes: mediatek"
+        - "partner: mediatek"
     - title: Openvino
-      labels:
+      labels:
         - "release notes: openvino"
     - title: Qualcomm
-      labels:
+      labels:
         - "release notes: qualcomm"
+        - "partner: qualcomm"
+        - "module: qnn"
     - title: Training
-      labels:
+      labels:
         - "release notes: training"
+        - "module: training"
     - title: Quantization
-      labels:
+      labels:
         - "release notes: quantization"
     - title: Ops & kernels
-      labels:
-        - "release notes: ops & kernels"
+      labels:
+        - "release notes: ops & kernels"
+        - "module: kernels"
     - title: Other Changes
       labels:
         - "*"

Diff for: backends/arm/operators/op_eq.py (+5 −3)

@@ -34,9 +34,11 @@ def define_node(
         inputs: List[TosaArg],
         output: TosaArg,
     ) -> None:
-        assert (
-            inputs[0].dtype == inputs[1].dtype
-        ), "EQ must have the same dtypes as input"
+        if inputs[0].dtype != inputs[1].dtype:
+            raise TypeError(
+                "All inputs need to have the same data type for operator EQ but got "
+                f"{inputs[0].dtype=}, {inputs[1].dtype=}"
+            )
 
         input_nodes = inputs
         # Handle quantization
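
Note on the pattern: the same assert-to-exception rewrite is applied to GE, GT, LE, and LT in the diffs below. The likely motivation (not stated in the commit) is that `assert` statements are stripped when Python runs with -O, so dtype validation in the lowering path would silently disappear, whereas an explicit raise always fires. A hypothetical shared helper sketching the check each operator now inlines:

def _check_same_dtype(inputs, op_name):
    # Hypothetical helper, not part of this commit: mirrors the inline
    # dtype check that EQ/GE/GT/LE/LT now perform. Raising TypeError
    # (rather than asserting) keeps the check active under `python -O`.
    if inputs[0].dtype != inputs[1].dtype:
        raise TypeError(
            f"All inputs need to have the same data type for operator "
            f"{op_name} but got {inputs[0].dtype=}, {inputs[1].dtype=}"
        )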

Diff for: backends/arm/operators/op_exp.py (+9 −2)

@@ -36,7 +36,14 @@ def define_node(
         output: TosaArg,
     ) -> None:
 
-        assert len(node.all_input_nodes) == 1
-        assert inputs[0].dtype == output.dtype == ts.DType.FP32
+        if len(node.all_input_nodes) != 1:
+            raise ValueError(
+                f"Expected 1 input for {self.target}, got {len(node.all_input_nodes)}"
+            )
+        if inputs[0].dtype != ts.DType.FP32 or output.dtype != ts.DType.FP32:
+            raise ValueError(
+                f"Input and output for {self.target} need to be FP32, got input dtype: "
+                f"{inputs[0].dtype} and output dtype: {output.dtype}"
+            )
 
         tosa_graph.addOperator(TosaOp.Op().EXP, [inputs[0].name], [output.name])

Diff for: backends/arm/operators/op_ge.py (+5 −3)

@@ -34,9 +34,11 @@ def define_node(
         inputs: List[TosaArg],
         output: TosaArg,
     ) -> None:
-        assert (
-            inputs[0].dtype == inputs[1].dtype
-        ), "GE must have the same dtypes as input"
+        if inputs[0].dtype != inputs[1].dtype:
+            raise TypeError(
+                "All inputs need to have the same data type for operator GE but got "
+                f"{inputs[0].dtype=}, {inputs[1].dtype=}"
+            )
 
         input_nodes = inputs
         # Handle quantization

Diff for: backends/arm/operators/op_gt.py (+5 −3)

@@ -34,9 +34,11 @@ def define_node(
         inputs: List[TosaArg],
         output: TosaArg,
     ) -> None:
-        assert (
-            inputs[0].dtype == inputs[1].dtype
-        ), "GT must have the same dtypes as input"
+        if inputs[0].dtype != inputs[1].dtype:
+            raise TypeError(
+                "All inputs need to have the same data type for operator GT but got "
+                f"{inputs[0].dtype=}, {inputs[1].dtype=}"
+            )
 
         input_nodes = inputs
         # Handle quantization

Diff for: backends/arm/operators/op_le.py (+5 −3)

@@ -34,9 +34,11 @@ def define_node(
         inputs: List[TosaArg],
         output: TosaArg,
     ) -> None:
-        assert (
-            inputs[0].dtype == inputs[1].dtype
-        ), "LE must have the same dtypes as input"
+        if inputs[0].dtype != inputs[1].dtype:
+            raise TypeError(
+                "All inputs need to have the same data type for operator LE but got "
+                f"{inputs[0].dtype=}, {inputs[1].dtype=}"
+            )
 
         input_nodes = inputs
         # Handle quantization

Diff for: backends/arm/operators/op_lt.py (+5 −3)

@@ -34,9 +34,11 @@ def define_node(
         inputs: List[TosaArg],
         output: TosaArg,
     ) -> None:
-        assert (
-            inputs[0].dtype == inputs[1].dtype
-        ), "LT must have the same dtypes as input"
+        if inputs[0].dtype != inputs[1].dtype:
+            raise TypeError(
+                "All inputs need to have the same data type for operator LT but got "
+                f"{inputs[0].dtype=}, {inputs[1].dtype=}"
+            )
 
         input_nodes = inputs
         # Handle quantization

Diff for: backends/arm/test/ops/test_mm.py (+3)

@@ -6,6 +6,7 @@
 
 from typing import Callable
 
+import pytest
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -53,6 +54,7 @@ def test_mm_tosa_u55(test_data_generator: Callable[[], tuple]):
 
 
 @parameterized.expand(MM.test_data_generators)
+@pytest.mark.flaky  # Investigate flakiness (MLETORCH-870)
 def test_mm_tosa_u85(test_data_generator: Callable[[], tuple]):
     test_data = test_data_generator()
     EthosU85PipelineBI[test_t](MM(), test_data, MM.aten_op, MM.exir_op).run()
@@ -67,6 +69,7 @@ def test_mm_tosa_u55_on_fvp(test_data_generator: Callable[[], tuple]):
 
 @parameterized.expand(MM.test_data_generators)
 @common.SkipIfNoCorstone320
+@pytest.mark.flaky  # Investigate flakiness (MLETORCH-870)
 def test_mm_tosa_u85_on_fvp(test_data_generator: Callable[[], tuple]):
     test_data = test_data_generator()
     EthosU85PipelineBI[test_t](
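
Note: a bare @pytest.mark.flaky marker like the one above is typically consumed by a rerun plugin. A minimal sketch, assuming the pytest-rerunfailures plugin is installed (both the plugin choice and the rerun count are assumptions, not from this commit):

import pytest

@pytest.mark.flaky(reruns=2)  # assumed rerun count; the diff uses the bare marker
def test_sometimes_flaky():
    # With pytest-rerunfailures, a failure here triggers up to two reruns
    # before the test is finally reported as failed.
    assert True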

Diff for: backends/vulkan/op_registry.py (+1)

@@ -277,6 +277,7 @@ def register_binary_op(features: OpFeatures):
         exir_ops.edge.aten.rsqrt.default,
         exir_ops.edge.aten.tanh.default,
         exir_ops.edge.aten.round.default,
+        exir_ops.edge.aten.leaky_relu.default,
     ]
 )
 def register_unary_op(features: OpFeatures):
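
With aten.leaky_relu.default added to the unary-op registration, graphs containing that op become eligible for the Vulkan partitioner. A minimal sketch of a module that lowers to this edge op (export and partitioning steps omitted; the module name is illustrative):

import torch

class LeakyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Traces to aten.leaky_relu.default in the exported edge graph.
        return torch.nn.functional.leaky_relu(x, negative_slope=0.01)

print(LeakyModel()(torch.randn(1, 8)).shape)  # torch.Size([1, 8])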

Diff for: backends/xnnpack/operators/op_slice_copy.py (+3 −1)

@@ -69,7 +69,9 @@ def define_node(
         output_shape = [output_shape[i] for i in PERM_NCHW_TO_NHWC]
         dim_of_slice = PERM_NHWC_TO_NCHW[dim_of_slice]
 
-        slice_begin_index = cast(int, node.args[2])
+        slice_begin_index = 0
+        if len(node.args) > 2 and node.args[2]:
+            slice_begin_index = cast(int, node.args[2])
         if slice_begin_index < 0:
             slice_begin_index = input_shape[dim_of_slice] + slice_begin_index
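
What the guard handles: a slice exported with a default start carries None in node.args[2] (or omits the argument entirely), which the old unconditional read would crash on. In eager mode the equivalence is easy to check (a standalone sketch, not from the commit):

import torch

x = torch.randn(5, 5)
# dim=0, start=None, end=2: the start defaults to 0, matching x[0:2].
sliced = torch.ops.aten.slice.Tensor(x, 0, None, 2)
assert torch.equal(sliced, x[0:2])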

Diff for: backends/xnnpack/test/ops/test_slice_copy.py (+12)

@@ -69,6 +69,18 @@ def forward(self, x):
         # Note that two of the slices are optimized away as they are identity.
         self._test_slice_copy(ConvSlice(), inputs, 4, 2)
 
+    def test_fp32_slice_copy_default_start(self):
+        """
+        XNNPACK supports default start in slice op.
+        """
+
+        class Slice(torch.nn.Module):
+            def forward(self, x):
+                return torch.ops.aten.slice.Tensor(x, 0, None, 2)
+
+        inputs = (torch.randn(5, 5),)
+        self._test_slice_copy(Slice(), inputs, 1, 1)
+
     def test_fp32_slice_copy_stride_non_1(self):
         """
         XNNPACK does not support strided slicing.

Diff for: examples/qualcomm/scripts/mobilebert_fine_tune.py (+4 −9)

@@ -23,7 +23,6 @@
     make_output_dir,
     make_quantizer,
     parse_skip_delegation_node,
-    QnnPartitioner,
     setup_common_args_and_variables,
     SimpleADB,
 )
@@ -273,19 +272,15 @@ def calibrator(gm):
 
     quantizer = make_quantizer(quant_dtype=quant_dtype)
     backend_options = generate_htp_compiler_spec(quant_dtype is not None)
-    partitioner = QnnPartitioner(
-        generate_qnn_executorch_compiler_spec(
-            soc_model=getattr(QcomChipset, args.model),
-            backend_options=backend_options,
-        ),
-        skip_node_id_set=skip_node_id_set,
-        skip_node_op_set=skip_node_op_set,
+    compiler_specs = generate_qnn_executorch_compiler_spec(
+        soc_model=getattr(QcomChipset, args.model),
+        backend_options=backend_options,
     )
     # skip embedding layer cause it's quantization sensitive
     graph_module, _ = skip_annotation(
         nn_module=model,
         quantizer=quantizer,
-        partitioner=partitioner,
+        compiler_specs=compiler_specs,
         sample_input=inputs[0],
         calibration_cb=calibrator,
         fp_node_op_set={torch.ops.aten.embedding.default},

Diff for: extension/parallel/targets.bzl (+1 −1)

@@ -17,6 +17,6 @@ def define_common_targets():
             "@EXECUTORCH_CLIENTS",
         ],
         deps = [
-            "//executorch/runtime/kernel:thread_parallel_interface",
+            "//executorch/extension/threadpool:threadpool",
         ],
     )

Diff for: extension/threadpool/targets.bzl (+33 −1)

@@ -20,7 +20,7 @@ def define_common_targets():
     ] + (["fb/threadpool_use_n_threads.h"] if not runtime.is_oss else [])
 
     runtime.cxx_library(
-        name = "threadpool",
+        name = "threadpool_lib",
         srcs = _THREADPOOL_SRCS,
         deps = [
             "//executorch/runtime/core:core",
@@ -45,6 +45,38 @@ def define_common_targets():
         ],
     )
 
+    runtime.cxx_library(
+        name = "threadpool",
+        # TODO: OSS doesn't have os:iphoneos. Sync buck2 prelude
+        # update to add it and remove duplication.
+        exported_deps = (select({
+            # Major operating systems should be able to use threadpool.
+            "ovr_config//os:linux": [":threadpool_lib"],
+            "ovr_config//os:macos": [":threadpool_lib"],
+            "ovr_config//os:windows": [":threadpool_lib"],
+            "ovr_config//os:android": [":threadpool_lib"],
+            "ovr_config//os:iphoneos": [":threadpool_lib"],
+            # Machines without an operating system shouldn't.
+            "ovr_config//os:none": ["//executorch/runtime/kernel:thread_parallel_interface"],
+            # If we don't know what it is, disable threadpool out of caution.
+            "DEFAULT": ["//executorch/runtime/kernel:thread_parallel_interface"],
+        }) if not runtime.is_oss else select({
+            # Major operating systems should be able to use threadpool.
+            "ovr_config//os:linux": [":threadpool_lib"],
+            "ovr_config//os:macos": [":threadpool_lib"],
+            "ovr_config//os:windows": [":threadpool_lib"],
+            "ovr_config//os:android": [":threadpool_lib"],
+            # Machines without an operating system shouldn't.
+            "ovr_config//os:none": ["//executorch/runtime/kernel:thread_parallel_interface"],
+            # If we don't know what it is, disable threadpool out of caution.
+            "DEFAULT": ["//executorch/runtime/kernel:thread_parallel_interface"],
+        })),
+        visibility = [
+            "//executorch/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
     runtime.cxx_library(
         name = "cpuinfo_utils",
         srcs = [
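
The select() above resolves at build time: known desktop and mobile OSes get the real threadpool, while bare-metal ("os:none") and unrecognized platforms fall back to the serial thread_parallel_interface. An illustrative Python analogy of that decision (not Buck or ExecuTorch code):

def pick_parallel_dep(os_name: str) -> str:
    # Mirrors the select() branches: real threadpool on known OSes,
    # serial fallback everywhere else (including bare metal).
    known = {"linux", "macos", "windows", "android", "iphoneos"}
    if os_name in known:
        return ":threadpool_lib"
    return "//executorch/runtime/kernel:thread_parallel_interface"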

Diff for: kernels/optimized/cpu/targets.bzl (+1 −1)

@@ -107,8 +107,8 @@ _OPTIMIZED_ATEN_OPS = (
     op_target(
         name = "op_where",
         deps = [
+            "//executorch/extension/threadpool:threadpool",
             "//executorch/kernels/portable/cpu/util:elementwise_util",
-            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
     ),
 )

Diff for: kernels/optimized/lib_defs.bzl (+1 −1)

@@ -232,9 +232,9 @@ def define_libs(is_fbcode=False):
             "DEFAULT": [],
         }) + LIBBLAS_DEPS,
         exported_deps = [
+            "//executorch/extension/threadpool:threadpool",
             "//executorch/kernels/optimized:libutils",
             "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
         **get_apple_framework_deps_kwargs(is_fbcode),
     )
