Skip to content

Commit b54dd28

Browse files
Yang Gu (fs-eire)
Yang Gu
authored and committed
[js/webgpu] Enable GroupedConvVectorize path (#19791)
Vectorize met 2 failed cases in a CI bot with NVIDIA GPU, but we couldn't repro with all the GPUs at hand, including NVIDIA GPUs. This PR introduces GPUAdapterInfo and enables this opt on non-NVIDIA GPUs to make the bots happy. No obivous perf gain can be seen if we enable vectorize on NVIDIA. However, it shows big perf improvement on Intel. On my Gen12 Intel GPU, mobilenetv2-12 perf was improved from 11.14ms to 7.1ms.
1 parent 18027be commit b54dd28

File tree

4 files changed

+42
-5
lines changed

4 files changed

+42
-5
lines changed

js/web/lib/wasm/jsep/backend-webgpu.ts

+23-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {createView, TensorView} from './tensor-view';
1010
import {createGpuDataManager, downloadGpuData, GpuDataManager} from './webgpu/gpu-data-manager';
1111
import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules';
1212
import {ProgramManager} from './webgpu/program-manager';
13-
import {ComputeContext, GpuData, ProgramInfo, ProgramInputTensorInfoDependency, SessionState, TimestampQuery} from './webgpu/types';
13+
import {AdapterInfo, ComputeContext, GpuArchitecture, GpuData, GpuVendor, ProgramInfo, ProgramInputTensorInfoDependency, SessionState, TimestampQuery} from './webgpu/types';
1414

1515
interface CommandInfo {
1616
readonly kernelId: number;
@@ -94,11 +94,32 @@ const getProgramInfoUniqueKey =
9494
return key;
9595
};
9696

97+
class AdapterInfoImpl implements AdapterInfo {
98+
readonly architecture?: string;
99+
readonly vendor?: string;
100+
101+
constructor(adapterInfo: GPUAdapterInfo) {
102+
if (adapterInfo) {
103+
this.architecture = adapterInfo.architecture;
104+
this.vendor = adapterInfo.vendor;
105+
}
106+
}
107+
108+
isArchitecture(architecture: GpuArchitecture): boolean {
109+
return this.architecture === architecture;
110+
}
111+
112+
isVendor(vendor: GpuVendor): boolean {
113+
return this.vendor === vendor;
114+
}
115+
}
116+
97117
/**
98118
* this class is designed to store status and being used as a singleton for JSEP. It will be passed to jsepInit() as
99119
* the first parameter so that it is stored for future use.
100120
*/
101121
export class WebGpuBackend {
122+
adapterInfo: AdapterInfoImpl;
102123
device: GPUDevice;
103124
/**
104125
* an instance of GpuDataManager to manage a GpuDataId -> GpuBuffer mapping
@@ -212,6 +233,7 @@ export class WebGpuBackend {
212233
}
213234

214235
this.device = await adapter.requestDevice(deviceDescriptor);
236+
this.adapterInfo = new AdapterInfoImpl(await adapter.requestAdapterInfo());
215237
this.gpuDataManager = createGpuDataManager(this);
216238
this.programManager = new ProgramManager(this);
217239
this.kernels = new Map();

js/web/lib/wasm/jsep/init.ts

+3-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {WebGpuBackend} from './backend-webgpu';
1010
import {LOG_DEBUG} from './log';
1111
import {TensorView} from './tensor-view';
1212
import {ShapeUtil} from './util';
13-
import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo} from './webgpu/types';
13+
import {AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo} from './webgpu/types';
1414

1515
/* eslint-disable no-bitwise */
1616

@@ -54,6 +54,7 @@ class TensorViewImpl implements TensorView {
5454
}
5555

5656
class ComputeContextImpl implements ComputeContext {
57+
readonly adapterInfo: AdapterInfo;
5758
readonly opKernelContext: number;
5859
readonly inputs: readonly TensorView[];
5960
readonly outputCount: number;
@@ -66,6 +67,7 @@ class ComputeContextImpl implements ComputeContext {
6667
private customDataOffset = 0;
6768
private customDataSize = 0;
6869
constructor(private module: OrtWasmModule, private backend: WebGpuBackend, contextDataOffset: number) {
70+
this.adapterInfo = backend.adapterInfo;
6971
const heapU32 = module.HEAPU32;
7072

7173
// extract context data

js/web/lib/wasm/jsep/webgpu/ops/conv.ts

+4-3
Original file line numberDiff line numberDiff line change
@@ -148,11 +148,12 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut
148148
// const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */
149149
const isChannelsLast = attributes.format === 'NHWC';
150150
if (attributes.group !== 1) {
151-
// Temporarily disable createGroupedConvVectorizeProgramInfo path due to bots failures with below two cases:
151+
// NVIDIA GPU with ampere architecture fails with below 2 cases, but we couldn't repro them with any other
152+
// GPUs. So just disable vectorize on NVIDIA ampere to ensure always correct outputs.
152153
// [webgpu]Conv - conv - vectorize group - B
153154
// [webgpu]Conv - conv - vectorize group - D
154-
const disableGroupedConvVectorize = true;
155-
if (!disableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group &&
155+
const enableGroupedConvVectorize = !context.adapterInfo.isArchitecture('ampere');
156+
if (enableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group &&
156157
inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) {
157158
const outputShape = calculateOutputShape(
158159
inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides,

js/web/lib/wasm/jsep/webgpu/types.ts

+12
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ export enum GpuDataType {
1515
}
1616
export type GpuDataId = number;
1717

18+
export type GpuArchitecture = 'ampere';
19+
export type GpuVendor = 'amd'|'intel'|'nvidia';
20+
export interface AdapterInfo {
21+
isArchitecture: (architecture: GpuArchitecture) => boolean;
22+
isVendor: (vendor: GpuVendor) => boolean;
23+
}
24+
1825
export interface GpuData {
1926
type: GpuDataType;
2027
id: GpuDataId;
@@ -146,6 +153,11 @@ export interface ComputeContextInputsOutputsMapping {
146153
* A ComputeContext instance carries the states that representing the current running of a kernel.
147154
*/
148155
export interface ComputeContext {
156+
/**
157+
* gpu adapter info
158+
*/
159+
readonly adapterInfo: AdapterInfo;
160+
149161
/**
150162
* stores the pointer to OpKernelContext
151163
*/

0 commit comments

Comments
 (0)