
Commit 2cb61a6

adrianlizarraga authored and ankitm3k committed
[QDQ Optimizer] Update WeightBiasQuantization to skip Conv/Gemm if downstream node is not QuantizeLinear (microsoft#24537)
### Description

Updates the WeightBiasQuantization optimizer to skip processing on Conv/Gemm nodes if the downstream child node is not a QuantizeLinear.

#### Before this PR

Original graph:
```
input_0 -> DQ -> Conv -> graph_output (or non-Q node)
                  ^  ^
                  |  |
weights_f32-------+  |
bias_f32-------------+
```
Becomes:
```
input_0 -> DQ ------> Conv -> graph_output (or non-Q node)
                       ^  ^
                       |  |
weights_quant -> DQ ---+  |
bias_quant -> DQ ---------+
```
The above is **NOT** a valid QDQ node unit for Conv because the Conv's output is not consumed by a QuantizeLinear node.

#### With this PR

The above example graph remains unchanged after L1 optimizations:
```
input_0 -> DQ -> Conv -> graph_output (or non-Q node)
                  ^  ^
                  |  |
weights_f32-------+  |
bias_f32-------------+
```

### Motivation and Context

Caused inaccuracy for a customer model. Automatically quantizing the weights and biases of a Conv/Gemm is detrimental if the output of the Conv/Gemm is not consumed by a QuantizeLinear node. In this scenario, the whole node group is not considered a valid QDQ node unit, and so the EP has to run the Conv/Gemm as float32/float16 anyway. If the Conv/Gemm is running as float32/float16, then quantizing the weights and biases introduces inaccuracy for no gain.

PR that originally added this optimizer: microsoft#22969
1 parent 9bc471c commit 2cb61a6
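
At its core, the change adds an early-exit guard in `WeightBiasQuantization::ApplyImpl`. Condensed, the new check looks like this (see the first diff below for full context):

```cpp
// Skip this Conv/Gemm unless its output feeds exactly one QuantizeLinear node;
// otherwise the node group could never form a valid QDQ node unit, so quantizing
// only the weights/bias would add error with no benefit.
std::vector<const Node*> children_nodes = graph.GetConsumerNodes(node.OutputDefs()[0]->Name());
if (children_nodes.size() != 1 || children_nodes[0]->OpType() != QDQ::QOpName) {
  continue;
}
```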

File tree

2 files changed: +64 −0 lines changed


onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc

Lines changed: 8 additions & 0 deletions
```
@@ -43,6 +43,14 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph
       continue;
     }
 
+    // Require that the node's output is consumed by a single QuantizeLinear node.
+    // Otherwise, if only the inputs are quantized, but not the output, then this node group would not
+    // be considered a QDQ node unit anyway.
+    std::vector<const Node*> children_nodes = graph.GetConsumerNodes(node.OutputDefs()[0]->Name());
+    if (children_nodes.size() != 1 || children_nodes[0]->OpType() != QDQ::QOpName) {
+      continue;
+    }
+
     Node& dq_0 = *graph.GetNode(parent_node_0->Index());
     Node* dq_1 = nullptr;
     const ONNX_NAMESPACE::TensorProto* weight_proto = nullptr;
```
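
For contrast, here is a minimal sketch (not part of this commit) of the pattern the optimizer does still process: the Conv output is consumed by a single QuantizeLinear, so DQ -> Conv -> Q can form a valid QDQ node unit. It reuses the `ModelTestBuilder` helpers that appear in the test below; `AddQuantizeLinearNode` and the shapes/scales here are illustrative assumptions.

```cpp
// Sketch: Conv output consumed by a QuantizeLinear, so the guard above does NOT
// skip the node and WeightBiasQuantization may still quantize the float weights.
auto build_valid_qdq_case = [](ModelTestBuilder& builder) {
  NodeArg* input_arg = builder.MakeInput<uint8_t>({1, 3, 8, 8}, std::numeric_limits<uint8_t>::min(),
                                                  std::numeric_limits<uint8_t>::max());
  NodeArg* weight_arg = builder.MakeInitializer<float>({4, 3, 3, 3}, -0.1f, 0.1f);
  NodeArg* input_dq_arg = builder.MakeIntermediate();
  NodeArg* conv_output_arg = builder.MakeIntermediate();
  NodeArg* output_arg = builder.MakeOutput();

  builder.AddDequantizeLinearNode<uint8_t>(input_arg, 0.014f, static_cast<uint8_t>(127), input_dq_arg);
  builder.AddNode("Conv", {input_dq_arg, weight_arg}, {conv_output_arg});
  // The Q node below is what the new guard looks for on the Conv output.
  builder.AddQuantizeLinearNode<uint8_t>(conv_output_arg, 0.05f, static_cast<uint8_t>(127), output_arg);
};
```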

onnxruntime/test/optimizer/qdq_transformer_test.cc

Lines changed: 56 additions & 0 deletions
```
@@ -5349,6 +5349,62 @@ TEST(QDQTransformerTests, WeightBiasQuantization_Conv_Weight_Bias) {
 #endif
 }
 
+// Tests that the WeightBiasQuantization optimizer does not process nodes that do not
+// already have an output that is consumed by a single QuantizeLinear node.
+TEST(QDQTransformerTests, WeightBiasQuantization_SkipIfOutputNotQuantized) {
+  auto test_case = [](bool add_final_reshape) {
+    auto build_test_case = [&](ModelTestBuilder& builder) {
+      NodeArg* input_arg = builder.MakeInput<uint8_t>({1, 24, 67, 67}, std::numeric_limits<uint8_t>::min(),
+                                                      std::numeric_limits<uint8_t>::max());
+      NodeArg* weight_arg = builder.MakeInitializer<float>({24, 1, 5, 5}, -0.1f, 0.1f);
+      NodeArg* bias_arg = builder.MakeInitializer<float>({24}, -0.1f, 0.1f);
+      NodeArg* input_dq_arg = builder.MakeIntermediate();
+      NodeArg* conv_output_arg = add_final_reshape ? builder.MakeIntermediate() : builder.MakeOutput();
+
+      builder.AddDequantizeLinearNode<uint8_t>(input_arg, 0.014f, static_cast<uint8_t>(127), input_dq_arg);
+      auto& conv_node = builder.AddNode("Conv", {input_dq_arg, weight_arg, bias_arg}, {conv_output_arg});
+      conv_node.AddAttribute("dilations", std::vector<int64_t>{1, 1});
+      conv_node.AddAttribute("kernel_shape", std::vector<int64_t>{5, 5});
+      conv_node.AddAttribute("strides", std::vector<int64_t>{2, 2});
+      conv_node.AddAttribute("group", static_cast<int64_t>(24));
+      conv_node.AddAttribute("pads", std::vector<int64_t>{0, 0, 0, 0});
+
+      // Make adding a final Reshape node configurable to test two cases:
+      //   - Conv produces a graph output
+      //   - Conv output is consumed by some node that is NOT a QuantizeLinear
+      // In either case, the WeightBiasQuantization optimizer should skip this node.
+      if (add_final_reshape) {
+        NodeArg* reshape_output_arg = builder.MakeOutput();
+        NodeArg* new_shape_arg = builder.Make1DInitializer<int64_t>({1, -1});
+        builder.AddNode("Reshape", {conv_output_arg, new_shape_arg}, {reshape_output_arg});
+      }
+    };
+
+    auto check_graph = [add_final_reshape](InferenceSessionWrapper& session) {
+      auto op_to_count = CountOpsInGraph(session.GetGraph());
+      const QDQOpKeys qdq_keys = GetQDQOpKeys(false);
+
+      // Should retain the same nodes in the original graph.
+      EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1);
+      EXPECT_EQ(op_to_count["Conv"], 1);
+      EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0);
+      EXPECT_EQ(op_to_count["Reshape"], static_cast<int>(add_final_reshape));
+    };
+
+    TransformerTester(build_test_case,
+                      check_graph,
+                      TransformerLevel::Default,
+                      TransformerLevel::Level1,
+                      21,
+                      /*per_sample_tolerance*/ 0.0,
+                      /*relative_per_sample_tolerance*/ 0.0,
+                      std::make_unique<WeightBiasQuantization>());
+  };
+
+  test_case(false);  // Conv produces a graph output directly
+  test_case(true);   // Conv -> Reshape -> graph_output
+}
+
 TEST(QDQTransformerTests, WeightBiasQuantization_ConvTranspose_Weight) {
   auto test_case = [](bool use_contrib_qdq) {
     auto build_test_case = [&](ModelTestBuilder& builder) {
```