diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
index 5d7c69ab65..dd4bcb89a8 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -14,7 +14,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils_u16.h"
+#include "indexing_utils.h"
 
 layout(std430) buffer;
 
@@ -35,7 +35,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * output at a single output location.
  */
 void main() {
-  const ivec3 pos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits.y);
+  const ivec3 pos = idx_to_ipos_x_wise(gl_GlobalInvocationID.x, out_limits.x, out_limits.y);
 
   if (any(greaterThanEqual(pos, out_limits))) {
     return;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
index ad5d4adb13..91a067c690 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -16,7 +16,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils_u16.h"
+#include "indexing_utils.h"
 
 layout(std430) buffer;
 
@@ -32,10 +32,8 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
-
 // shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
+shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
 
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
@@ -46,7 +44,7 @@ void main() {
   const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
   const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
 
-  const u16vec3 gpos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
+  const ivec3 gpos = idx_to_ipos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
 
   // Output position for TILE_SIZE = 2
   // +--------+--------+
@@ -54,10 +52,10 @@
   // | pos[0] | pos[1] |
   // +--------+--------+
   // | pos[2] | pos[3] |
   // +--------+--------+
-  u16vec2 pos[TILE_SIZE * TILE_SIZE];
+  ivec2 pos[TILE_SIZE * TILE_SIZE];
   for (int y = 0, i = 0; y < TILE_SIZE; ++y) {
     for (int x = 0; x < TILE_SIZE; ++x) {
-      pos[i] = u16vec2(
+      pos[i] = ivec2(
           gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
       pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];
       i++;
@@ -66,38 +64,38 @@
 
   // If the top left position is out of bounds, then this invocation will have
   // no work to do.
-  if (any(greaterThanEqual(u16vec3(pos[0], gpos.z), out_limits))) {
+  if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits))) {
     return;
   }
 
   // Compute the index of the input texture that needs to be loaded for each
   // output position. Note that negative indices can be produced indicating that
   // the top-left element is in a region added by padding.
-  u16vec2 ipos[TILE_SIZE * TILE_SIZE];
+  ivec2 ipos[TILE_SIZE * TILE_SIZE];
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    ipos[i] = pos[i] * u16vec2(stride) - u16vec2(padding);
+    ipos[i] = pos[i] * stride - padding;
   }
 
   vec4 sum[TILE_SIZE * TILE_SIZE];
-  sum[0] = texelFetch(t_bias, u16vec2(gpos.z, 0), 0);
+  sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
   for (int i = 1; i < TILE_SIZE * TILE_SIZE; ++i) {
     sum[i] = sum[0];
   }
 
   int z4 = 0;
   // Since the kernel is 1x1, we only have to loop over the depth dimension.
-  for (uint16_t z = uint16_t(0); z < uint16_t(in_group_size); z += uint16_t(4), ++z4) {
+  for (int z = 0; z < in_group_size; z += 4, ++z4) {
     // During prepacking, the weight tensor has been permuted so that the
     // channel (IC) dim is along the x-axis, and the batch (OC) dim is along
     // the z-axis.
-    const vec4 ktex_0 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(0, 0));
-    const vec4 ktex_1 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(1, 0));
-    const vec4 ktex_2 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(2, 0));
-    const vec4 ktex_3 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(3, 0));
+    const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0));
+    const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0));
+    const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0));
+    const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
 
 #pragma unroll
     for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-      const vec4 in_tex = texelFetch(t_in, u16vec3(ipos[i], z4), 0);
+      const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
       // For 2x2 tile size algorithm works as follows.
       // To explain the calculations below, the contents of one in_tex and the
       // group of 4 texels loaded from t_kernel are shown:
@@ -139,9 +137,9 @@
   }
 
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    const u16vec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
-    if (all(lessThan(u16vec3(pos, gpos.z), out_limits))) {
-      imageStore(t_out, u16vec3(pos, gpos.z), op(sum[i], out_min, out_max));
+    const ivec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
+    if (all(lessThan(ivec3(pos, gpos.z), out_limits))) {
+      imageStore(t_out, ivec3(pos, gpos.z), op(sum[i], out_min, out_max));
     }
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h
deleted file mode 100644
index 6dc59b6303..0000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#ifndef INDEXING_UTILS_U16_H
-#define INDEXING_UTILS_U16_H
-
-#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
-
-u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) {
-  const uint div_by_x = idx / size_x;
-  return u16vec3(idx % size_x, div_by_x % size_y, div_by_x / size_y);
-}
-
-#endif // INDEXING_UTILS_U16_H
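Note on the replacement helper: indexing_utils.h is not shown in this patch, so the exact definition of idx_to_ipos_x_wise is not confirmed here. A minimal sketch, assuming it simply mirrors the deleted u16 helper with 32-bit types:

  // Assumed definition (not part of this patch); mirrors the deleted
  // idx_to_u16pos_x_wise but returns ivec3. Decodes a flat 1D dispatch
  // index into a 3D position with x varying fastest.
  ivec3 idx_to_ipos_x_wise(uint idx, int size_x, int size_y) {
    const uint div_by_x = idx / size_x;
    return ivec3(idx % size_x, div_by_x % size_y, div_by_x / size_y);
  }

For example, with size_x = 4 and size_y = 3, idx = 13 gives div_by_x = 3 and the position (1, 0, 1).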
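The accumulation between the last two hunks of conv2d_pw.glsl is unchanged context and therefore elided above. A hedged sketch of what that unrolled loop body presumably does, given the comment about in_tex and the four kernel texels:

      // Sketch of the assumed per-texel update (unchanged upstream code, not
      // part of this patch): in_tex.x..w are four input channels, and
      // ktex_0..ktex_3 hold those channels' weights for four output channels,
      // so each fma accumulates the 1x1 convolution for a whole output texel.
      sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]);
      sum[i] = fma(in_tex.yyyy, ktex_1, sum[i]);
      sum[i] = fma(in_tex.zzzz, ktex_2, sum[i]);
      sum[i] = fma(in_tex.wwww, ktex_3, sum[i]);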