From 558f1fbbf3ba641090e60530cfdb8aa97951b6dd Mon Sep 17 00:00:00 2001 From: Joshua Simmons Date: Sat, 9 Nov 2024 17:20:06 +0100 Subject: [PATCH] shark-shaders: Improve scatter performance Use a bitmap in LDS to represent sparse scattered tiles. We can write arbitrary geometry into the bitmap (e.g. lines) and then count and emit to vram. This improves performance from ~600us to ~300us on my 6800, and allows us to handle line geometry in the future without a large performance hit. --- .../shaders/draw_2d_bin_1_scatter.comp | 111 +++++++++--------- 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp b/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp index df750ed..bff9642 100644 --- a/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp +++ b/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp @@ -29,16 +29,20 @@ layout(std430, push_constant) uniform Draw2dScatterConstantsBlock { Draw2dScatterConstants constants; }; +const uint MAX_TILES = 256; +const uint BITMAP_STRIDE = MAX_TILES / 32; +const uint BITMAP_SIZE = MAX_TILES * BITMAP_STRIDE; + +shared uint intersected_tiles[BITMAP_SIZE]; + layout (local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; void main() { const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; const uint draw_index = gl_WorkGroupID.x * gl_WorkGroupSize.x + local_id; - // Bounds for this command, any tiles which intersect this AABB will be written. vec2 cmd_min = vec2(99999.9); vec2 cmd_max = vec2(-99999.9); - if (draw_index < constants.draw_buffer_len) { const Draw2dCmd cmd = constants.draw_buffer.values[draw_index]; const uint type = cmd.type; @@ -64,66 +68,17 @@ void main() { } } - const vec2 cmds_min = subgroupMin(cmd_min); - const vec2 cmds_max = subgroupMax(cmd_max); - - // Are all our commands entirely offscreen? - if (any(greaterThan(cmds_min, constants.screen_resolution)) || any(lessThan(cmds_max, vec2(0.0)))) { - return; - } - const uvec2 cmd_min_tile = uvec2(floor(max(min(cmd_min, constants.screen_resolution), 0.0) / TILE_SIZE)); const uvec2 cmd_max_tile = uvec2(floor(max(min(cmd_max, constants.screen_resolution), 0.0) / TILE_SIZE)); const uvec2 cmds_min_tile = subgroupMin(cmd_min_tile); const uvec2 cmds_max_tile = subgroupMax(cmd_max_tile); - // Are any single commands responsible for the entire bounds? + // Is any single command responsible for the entire bounds? const bool cmd_absolute_min = cmd_min_tile == cmds_min_tile; const bool cmd_absolute_max = cmd_max_tile == cmds_max_tile; - const bool use_individual_bounds = !any(notEqual(subgroupBallot(cmd_absolute_min) & subgroupBallot(cmd_absolute_max), uvec4(0))); - - if (use_individual_bounds) { - uint count = 0; - - for (uint y = cmds_min_tile.y; y <= cmds_max_tile.y; y++) { - for (uint x = cmds_min_tile.x; x <= cmds_max_tile.x; x++) { - const vec2 tile_min = uvec2(x, y) * TILE_SIZE; - const vec2 tile_max = min(tile_min + TILE_SIZE, constants.screen_resolution); - - const bool intersects = !(any(lessThan(tile_max, cmd_min)) || any(greaterThan(tile_min, cmd_max))); - const uvec4 ballot = subgroupBallot(intersects); - - if (subgroupElect()) { - count += uint(ballot.x != 0); - } - } - } + const bool use_combined_bounds = subgroupAny(cmd_absolute_min && cmd_absolute_max); - uint offset; - if (subgroupElect()) { - offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1; - } - offset = subgroupBroadcastFirst(offset); - - if (offset >= constants.coarse_buffer_len) { - return; - } - - for (uint y = cmds_min_tile.y; y <= cmds_max_tile.y; y++) { - for (uint x = cmds_min_tile.x; x <= cmds_max_tile.x; x++) { - const vec2 tile_min = uvec2(x, y) * TILE_SIZE; - const vec2 tile_max = min(tile_min + TILE_SIZE, constants.screen_resolution); - - const bool intersects = !(any(lessThan(tile_max, cmd_min)) || any(greaterThan(tile_min, cmd_max))); - const uvec4 ballot = subgroupBallot(intersects); - - if (subgroupElect() && ballot.x != 0 && offset < constants.coarse_buffer_len) { - const uint packed = ((y & 0xff) << 24) | ((x & 0xff) << 16) | (gl_WorkGroupID.x & 0xffff); - constants.coarse_buffer.values[offset++] = packed; - } - } - } - } else { + if (use_combined_bounds) { const uvec2 tile_count = cmds_max_tile - cmds_min_tile + uvec2(1); const uint count = tile_count.x * tile_count.y; @@ -150,5 +105,53 @@ void main() { } offset += tile_count.x; } + } else { + for (uint i = 0; i < BITMAP_SIZE; i += gl_SubgroupSize) { + intersected_tiles[i + gl_SubgroupInvocationID] = 0; + } + + subgroupBarrier(); + + for (uint y = cmd_min_tile.y; y <= cmd_max_tile.y; y++) { + for (uint x = cmd_min_tile.x; x <= cmd_max_tile.x; x++) { + atomicOr(intersected_tiles[y * BITMAP_STRIDE + x / 32], 1 << (x & 31)); + } + } + + subgroupBarrier(); + + uint count = 0; + for (uint i = 0; i < BITMAP_SIZE; i += gl_SubgroupSize) { + count += subgroupAdd(bitCount(intersected_tiles[i + gl_SubgroupInvocationID])); + } + + if (count == 0) { + return; + } + + uint offset; + if (subgroupElect()) { + offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1; + } + offset = subgroupBroadcastFirst(offset); + + for (uint i = 0; i < BITMAP_SIZE; i += gl_SubgroupSize) { + const uint ii = i + gl_SubgroupInvocationID; + uint bitmap = intersected_tiles[ii]; + const uint count = bitCount(bitmap); + uint scan = subgroupExclusiveAdd(count); + + while (bitmap != 0) { + const uint index = findLSB(bitmap); + bitmap ^= bitmap & -bitmap; + const uint y = (ii * 32 + index) / MAX_TILES; + const uint x = (ii * 32 + index) & (MAX_TILES - 1); + const uint packed = (y << 24) | (x << 16) | (gl_WorkGroupID.x & 0xffff); + constants.coarse_buffer.values[offset + scan] = packed; + scan++; + } + + offset += subgroupAdd(count); + } } } -- 2.49.0