From: Josh Simmons Date: Sat, 1 Jun 2024 14:36:49 +0000 (+0200) Subject: shark: Improve performance of binning shader X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=483d031bca775327fd9db2b9c8bba2c1ff0011ba;p=josh%2Fnarcissus shark: Improve performance of binning shader --- diff --git a/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl index 894ebb5..277d272 100644 --- a/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl +++ b/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl @@ -10,21 +10,32 @@ #include "primitive_2d.h" +layout(std430, set = 0, binding = 3) readonly coherent buffer glyphInstanceBufferCoherent { + GlyphInstance glyph_instances_coherent[]; +}; + +#define SUBGROUP_SIZE 64 +#define NUM_PRIMS_WG (SUBGROUP_SIZE * 32) + // TODO: Spec constant support for different subgroup sizes. -layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +layout (local_size_x = SUBGROUP_SIZE, local_size_y = 1, local_size_z = 1) in; -shared uint bitmap_0[64]; +shared uint bitmap_0[SUBGROUP_SIZE]; void main() { const uvec2 bin_coord = gl_GlobalInvocationID.yz; const uvec2 bin_min = bin_coord * TILE_SIZE * 8; const uvec2 bin_max = min(bin_min + TILE_SIZE * 8, primitive_uniforms.screen_resolution); - for (uint i = 0; i < 2048; i += 64) { - const uint prim_index = gl_WorkGroupID.x * 2048 + i + gl_SubgroupInvocationID; + for (uint i = 0; i < NUM_PRIMS_WG; i += gl_SubgroupSize.x) { + const uint prim_index = gl_WorkGroupID.x * NUM_PRIMS_WG + i + gl_SubgroupInvocationID; bool intersects = false; if (prim_index < primitive_uniforms.num_primitives) { - intersects = test_glyph(prim_index, bin_min, bin_max); + const GlyphInstance gi = glyph_instances_coherent[prim_index]; + const Glyph gl = glyphs[gi.index]; + const vec2 glyph_min = gi.position + gl.offset_min; + const vec2 glyph_max = gi.position + gl.offset_max; + intersects = !(any(lessThan(bin_max, glyph_min)) || any(greaterThan(bin_min, glyph_max))); } const uvec4 ballot = subgroupBallot(intersects); bitmap_0[i / 32 + 0] = ballot.x; @@ -33,54 +44,39 @@ void main() { memoryBarrierShared(); - uint bitmap_1[2]; - { - const uvec4 ballot = subgroupBallot(bitmap_0[gl_SubgroupInvocationID] != 0); - bitmap_1[0] = ballot.x; - bitmap_1[1] = ballot.y; - } - - for (uint y = 0; y < 8; y++) { - for (uint x = 0; x < 8; x++) { - const uvec2 tile_coord = gl_GlobalInvocationID.yz * 8 + uvec2(x, y); - const uvec2 tile_min = tile_coord * TILE_SIZE; - const uvec2 tile_max = min(tile_min + TILE_SIZE, primitive_uniforms.screen_resolution); - [[branch]] - if (any(greaterThanEqual(tile_min, tile_max))) { - continue; - } - - const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride + tile_coord.x; - - for (uint i = 0; i < 2; i++) { - uint out_1 = 0; - - uint word_1 = bitmap_1[i]; - while (word_1 != 0) { - const uint bit_1 = findLSB(word_1); - word_1 ^= word_1 & -word_1; - - uint out_0 = 0; - uint index_0 = i * 32 + bit_1; - uint word_0 = bitmap_0[index_0]; - while (word_0 != 0) { - const uint bit_0 = findLSB(word_0); - word_0 ^= word_0 & -word_0; - - const uint prim_index = gl_WorkGroupID.x * 2048 + index_0 * 32 + bit_0; - if (test_glyph(prim_index, tile_min, tile_max)) { - out_0 |= 1 << bit_0; - } + const uint x = gl_SubgroupInvocationID.x & 7; + const uint y = gl_SubgroupInvocationID.x >> 3; + const uvec2 tile_coord = gl_GlobalInvocationID.yz * 8 + uvec2(x, y); + const uvec2 tile_min = tile_coord * TILE_SIZE; + const uvec2 tile_max = min(tile_min + TILE_SIZE, primitive_uniforms.screen_resolution); + + if (all(lessThan(tile_min, tile_max))) { + const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride + tile_coord.x; + + for (uint i = 0; i < 2; i++) { + uint out_1 = 0; + + for (uint j = 0; j < 32; j++) { + uint out_0 = 0; + uint index_0 = i * 32 + j; + uint word_0 = bitmap_0[index_0]; + while (word_0 != 0) { + const uint bit_0 = findLSB(word_0); + word_0 ^= word_0 & -word_0; + + const uint prim_index = gl_WorkGroupID.x * NUM_PRIMS_WG + index_0 * 32 + bit_0; + if (test_glyph(prim_index, tile_min, tile_max)) { + out_0 |= 1 << bit_0; } + } - if (out_0 != 0) { - out_1 |= 1 << bit_1; - } + if (out_0 != 0) { + out_1 |= 1 << j; tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_L0_OFFSET + gl_WorkGroupID.x * 64 + index_0] = out_0; } - - tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_L1_OFFSET + gl_WorkGroupID.x * 2 + i] = out_1; } + + tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_L1_OFFSET + gl_WorkGroupID.x * 2 + i] = out_1; } } }