Commit 97e8998

[js/webgpu] allows a ProgramInfo's RunData to use zero sized output (microsoft#19614)
### Description

This PR allows zero-sized outputs. To keep the implementation simple, it does not support partially zero-sized outputs: either all outputs of a program are zero-sized, or an error is reported.

Added 2 tests:
- op test of `Add` with inputs T[2,0] and T[2,1]
- `test_split_zero_size_splits`
1 parent 3acaea2 commit 97e8998
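
For context, a zero-sized tensor is one whose shape contains a 0 dimension, so it holds no elements. Below is a minimal sketch of what this commit enables, assuming a model `add.onnx` with a single `Add` node whose inputs are named `A` and `B` and whose output is named `C` (the file name and the tensor names are illustrative, not from the repo):

```typescript
import * as ort from 'onnxruntime-web';

async function runZeroSizedAdd(): Promise<void> {
  // assumed model/EP setup; any model producing a zero-sized output would do
  const session = await ort.InferenceSession.create('add.onnx', {executionProviders: ['webgpu']});

  // dims [2, 0] describe 0 elements, so the data buffer is empty
  const a = new ort.Tensor('float32', new Float32Array(0), [2, 0]);
  const b = new ort.Tensor('float32', new Float32Array([1, 2]), [2, 1]);

  // broadcasting [2, 0] with [2, 1] yields [2, 0]: the output is also zero-sized
  const results = await session.run({A: a, B: b});
  console.log(results.C.dims);  // expected: [2, 0]
}
```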

File tree: 6 files changed (+71, -9 lines)

web/lib/wasm/jsep/backend-webgpu.ts (+28, -4)

@@ -385,11 +385,16 @@ export class WebGpuBackend {
     // create info for inputs
     const inputDatas: GpuData[] = [];
     for (let i = 0; i < inputTensorViews.length; ++i) {
-      const gpuData = this.gpuDataManager.get(inputTensorViews[i].data);
+      const data = inputTensorViews[i].data;
+      // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it.
+      if (data === 0) {
+        continue;
+      }
+      const gpuData = this.gpuDataManager.get(data);
       if (!gpuData) {
-        throw new Error(`no GPU data for input: ${inputTensorViews[i].data}`);
+        throw new Error(`no GPU data for input: ${data}`);
       }
-      inputDatas[i] = gpuData;
+      inputDatas.push(gpuData);
     }

     const {outputs, dispatchGroup, programUniforms} = program.getRunData(inputTensorViews);
@@ -419,6 +424,11 @@
       const tensorView = (isTemporary || isPersistent) ?
           createIntermediateOutput(outputs[i].dataType, outputs[i].dims) :
           createKernelOutput(validatedOutputIndices[i], outputs[i].dataType, outputs[i].dims);
+      outputTensorViews.push(tensorView);
+      // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it.
+      if (tensorView.data === 0) {
+        continue;
+      }
       const gpuData = this.gpuDataManager.get(tensorView.data);
       if (!gpuData) {
         throw new Error(`no GPU data for output: ${tensorView.data}`);
@@ -434,10 +444,24 @@
         }
         persistentData.push(gpuData);
       }
-      outputTensorViews.push(tensorView);
       outputDatas.push(gpuData);
     }

+    // when there are any zero-sized tensor in the inputs or outputs, we should report error unless all outputs are
+    // zero-sized tensors.
+    if (inputDatas.length !== inputTensorViews.length || outputDatas.length !== outputTensorViews.length) {
+      // if all outputs are zero-sized tensors, there is no need to run the program.
+      if (outputDatas.length === 0) {
+        TRACE_FUNC_END(program.name);
+        return outputTensorViews;
+      }
+      // if some outputs are zero-sized tensors, report an error.
+      //
+      // TODO: so far we don't see any use case that outputs include both zero-sized tensors and non-zero-sized tensors.
+      // If we see such use case, we need to make a change here to support it.
+      throw new Error(
+          `Program ${program.name} has zero-sized tensor(s) in inputs or outputs. This is not supported now.`);
+    }

     // load uniforms
     // TODO: add cache for uniform (is it necessary?)
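
The hunks above skip the GPU-data lookup for zero-sized tensor views (data id 0) and then enforce an all-or-nothing rule before dispatch. A standalone sketch of that rule follows; the helper is illustrative, not the backend's actual code:

```typescript
// Decide what to do with a program whose inputs/outputs may include zero-sized tensors:
// 'run' when every tensor has GPU data, 'skip' when all outputs are zero-sized, otherwise throw.
function decideDispatch(
    inputDatas: unknown[], inputCount: number, outputDatas: unknown[], outputCount: number,
    programName: string): 'run'|'skip' {
  if (inputDatas.length === inputCount && outputDatas.length === outputCount) {
    return 'run';  // no zero-sized tensors involved
  }
  if (outputDatas.length === 0) {
    return 'skip';  // all outputs are zero-sized, so there is nothing to compute
  }
  // mixed zero-sized and non-zero-sized outputs are not supported by this commit
  throw new Error(`Program ${programName} has zero-sized tensor(s) in inputs or outputs. This is not supported now.`);
}
```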

web/lib/wasm/jsep/init.ts (+2, -1)

@@ -104,7 +104,8 @@ class ComputeContextImpl implements ComputeContext {
         throw new Error(`Unsupported data type: ${dataType}`);
       }
       const bufferSize = elementSize * ShapeUtil.size(dims);
-      return new TensorViewImpl(this.module, dataType, this.backend.gpuDataManager.create(bufferSize).id, dims);
+      const gpuDataId = bufferSize > 0 ? this.backend.gpuDataManager.create(bufferSize).id : 0;
+      return new TensorViewImpl(this.module, dataType, gpuDataId, dims);
     };
     return this.backend.run(program, mappedInputs, outputIndices, createKernelOutput, createTemporaryOutput);
   }
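
The change above establishes the sentinel used throughout this commit: a GPU data id of 0 means the tensor is zero-sized and no buffer was allocated for it. A minimal sketch of that convention, with illustrative helper and parameter names:

```typescript
// Allocate GPU data only for non-empty tensors; 0 is the sentinel id for zero-sized ones.
function resolveGpuDataId(
    gpuDataManager: {create(byteSize: number): {id: number}},
    elementSize: number, dims: readonly number[]): number {
  const elementCount = dims.reduce((a, d) => a * d, 1);  // 0 if any dim is 0
  const bufferSize = elementSize * elementCount;
  return bufferSize > 0 ? gpuDataManager.create(bufferSize).id : 0;
}
```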

web/lib/wasm/jsep/util.ts (+10, -1)

@@ -56,7 +56,16 @@ export class BroadcastUtil {
         if (aLen !== bLen && aLen > 1 && bLen > 1) {
           return undefined;
         }
-        cdims[crank - i] = Math.max(aLen, bLen);
+        const max = Math.max(aLen, bLen);
+        if (aLen && bLen) {
+          cdims[crank - i] = Math.max(aLen, bLen);
+        } else {
+          // when either aLen or bLen is 0, the other should be either 0 or 1, otherwise it is not broadcastable.
+          if (max > 1) {
+            return undefined;
+          }
+          cdims[crank - i] = 0;
+        }
       }

     return cdims;
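
The `BroadcastUtil` change extends the per-dimension broadcasting rule: a 0-length dimension broadcasts only against 0 or 1, and the resulting dimension is 0. A standalone sketch of just that rule (an illustrative helper, not the repo's shape calculation):

```typescript
// Broadcast a single dimension pair; returns undefined when the pair is not broadcastable.
function broadcastDim(aLen: number, bLen: number): number|undefined {
  if (aLen !== bLen && aLen > 1 && bLen > 1) {
    return undefined;  // e.g. 2 vs 3: incompatible
  }
  if (aLen === 0 || bLen === 0) {
    // 0 broadcasts only with 0 or 1, and the resulting dimension is 0
    return Math.max(aLen, bLen) > 1 ? undefined : 0;
  }
  return Math.max(aLen, bLen);
}

// Example matching the new Add test: [2, 0] and [2, 1] broadcast to [2, 0].
console.log([broadcastDim(2, 2), broadcastDim(0, 1)]);  // [2, 0]
```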

web/test/data/ops/add.jsonc (+22)

@@ -157,6 +157,28 @@
           "type": "float32"
         }
       ]
+    },
+    {
+      "name": "T[2,0] T[2,1]",
+      "inputs": [
+        {
+          "data": [],
+          "dims": [2, 0],
+          "type": "float32"
+        },
+        {
+          "data": [1, 2],
+          "dims": [2, 1],
+          "type": "float32"
+        }
+      ],
+      "outputs": [
+        {
+          "data": [],
+          "dims": [2, 0],
+          "type": "float32"
+        }
+      ]
     }
   ]
 }

web/test/suite-test-list.jsonc (+1, -1)

@@ -1231,7 +1231,7 @@
       "test_split_variable_parts_1d",
       "test_split_variable_parts_2d",
       "test_split_variable_parts_default_axis",
-      // // "test_split_zero_size_splits",
+      "test_split_zero_size_splits",
       "test_sqrt_example",
       "test_sqrt",
       "test_squeeze_negative_axes",

web/test/test-runner.ts (+8, -2)

@@ -573,7 +573,9 @@ export async function sessionRun(options: {
       // replace the CPU tensors in feeds into GPU tensors
       for (const name in feeds) {
         if (Object.hasOwnProperty.call(feeds, name)) {
-          feeds[name] = createGpuTensorForInput(feeds[name]);
+          if (feeds[name].size > 0) {
+            feeds[name] = createGpuTensorForInput(feeds[name]);
+          }
         }
       }
     }
@@ -582,7 +584,11 @@
       for (const name in options.outputsMetaInfo) {
        if (Object.hasOwnProperty.call(options.outputsMetaInfo, name)) {
           const {type, dims} = options.outputsMetaInfo[name];
-          fetches[name] = createGpuTensorForOutput(type, dims);
+          if (dims.some(d => d === 0)) {
+            fetches[name] = new ort.Tensor(type, [], dims);
+          } else {
+            fetches[name] = createGpuTensorForOutput(type, dims);
+          }
         }
       }
     }
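
In the test runner, GPU IO-binding is bypassed for zero-sized tensors: feeds stay on CPU when their `size` is 0, and pre-allocated fetches fall back to a plain CPU `ort.Tensor` whenever an output dimension is 0, since there is no WebGPU buffer to wrap. A small illustration of such a zero-sized fetch (the dims are an assumed example):

```typescript
import * as ort from 'onnxruntime-web';

// A zero-sized output slot: empty data, no WebGPU buffer involved.
const zeroSizedFetch = new ort.Tensor('float32', [], [2, 0]);
console.log(zeroSizedFetch.size);      // 0
console.log(zeroSizedFetch.location);  // 'cpu'

// Non-zero-sized outputs would instead be wrapped as GPU tensors
// (createGpuTensorForOutput in the test runner), which requires a real WebGPU buffer.
```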
