From 558f1fbbf3ba641090e60530cfdb8aa97951b6dd Mon Sep 17 00:00:00 2001
From: Joshua Simmons <josh@nega.tv>
Date: Sat, 9 Nov 2024 17:20:06 +0100
Subject: [PATCH] shark-shaders: Improve scatter performance

Use a bitmap in LDS to represent the sparse set of scattered tiles. We
can write arbitrary geometry into the bitmap (e.g. lines), then count
the set bits and emit the corresponding tiles to VRAM.

This reduces the time taken from ~600us to ~300us on my 6800, and
allows us to handle line geometry in the future without a large
performance hit.
---
 .../shaders/draw_2d_bin_1_scatter.comp        | 111 +++++++++---------
 1 file changed, 57 insertions(+), 54 deletions(-)

diff --git a/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp b/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp
index df750ed..bff9642 100644
--- a/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp
+++ b/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp
@@ -29,16 +29,20 @@ layout(std430, push_constant) uniform Draw2dScatterConstantsBlock {
     Draw2dScatterConstants constants;
 };
 
+const uint MAX_TILES = 256;
+const uint BITMAP_STRIDE = MAX_TILES / 32;
+const uint BITMAP_SIZE = MAX_TILES * BITMAP_STRIDE;
+
+shared uint intersected_tiles[BITMAP_SIZE];
+
 layout (local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 void main() {
     const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
     const uint draw_index = gl_WorkGroupID.x * gl_WorkGroupSize.x + local_id;
 
-    // Bounds for this command, any tiles which intersect this AABB will be written.
     vec2 cmd_min = vec2(99999.9);
     vec2 cmd_max = vec2(-99999.9);
-
     if (draw_index < constants.draw_buffer_len) {
         const Draw2dCmd cmd = constants.draw_buffer.values[draw_index];
         const uint type = cmd.type;
@@ -64,66 +68,17 @@ void main() {
         }
     }
 
-    const vec2 cmds_min = subgroupMin(cmd_min);
-    const vec2 cmds_max = subgroupMax(cmd_max);
-
-    // Are all our commands entirely offscreen?
-    if (any(greaterThan(cmds_min, constants.screen_resolution)) || any(lessThan(cmds_max, vec2(0.0)))) {
-        return;
-    }
-
     const uvec2 cmd_min_tile = uvec2(floor(max(min(cmd_min, constants.screen_resolution), 0.0) / TILE_SIZE));
     const uvec2 cmd_max_tile = uvec2(floor(max(min(cmd_max, constants.screen_resolution), 0.0) / TILE_SIZE));
     const uvec2 cmds_min_tile = subgroupMin(cmd_min_tile);
     const uvec2 cmds_max_tile = subgroupMax(cmd_max_tile);
 
-    // Are any single commands responsible for the entire bounds?
+    // Is any single command responsible for the entire bounds?
     const bool cmd_absolute_min = cmd_min_tile == cmds_min_tile;
     const bool cmd_absolute_max = cmd_max_tile == cmds_max_tile;
-    const bool use_individual_bounds = !any(notEqual(subgroupBallot(cmd_absolute_min) & subgroupBallot(cmd_absolute_max), uvec4(0)));
-
-    if (use_individual_bounds) {
-        uint count = 0;
-
-        for (uint y = cmds_min_tile.y; y <= cmds_max_tile.y; y++) {
-            for (uint x = cmds_min_tile.x; x <= cmds_max_tile.x; x++) {
-                const vec2 tile_min = uvec2(x, y) * TILE_SIZE;
-                const vec2 tile_max = min(tile_min + TILE_SIZE, constants.screen_resolution);
-
-                const bool intersects = !(any(lessThan(tile_max, cmd_min)) || any(greaterThan(tile_min, cmd_max)));
-                const uvec4 ballot = subgroupBallot(intersects);
-
-                if (subgroupElect()) {
-                    count += uint(ballot.x != 0);
-                }
-            }
-        }
+    const bool use_combined_bounds = subgroupAny(cmd_absolute_min && cmd_absolute_max);
 
-        uint offset;
-        if (subgroupElect()) {
-            offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1;
-        }
-        offset = subgroupBroadcastFirst(offset);
-
-        if (offset >= constants.coarse_buffer_len) {
-            return;
-        }
-
-        for (uint y = cmds_min_tile.y; y <= cmds_max_tile.y; y++) {
-            for (uint x = cmds_min_tile.x; x <= cmds_max_tile.x; x++) {
-                const vec2 tile_min = uvec2(x, y) * TILE_SIZE;
-                const vec2 tile_max = min(tile_min + TILE_SIZE, constants.screen_resolution);
-
-                const bool intersects = !(any(lessThan(tile_max, cmd_min)) || any(greaterThan(tile_min, cmd_max)));
-                const uvec4 ballot = subgroupBallot(intersects);
-
-                if (subgroupElect() && ballot.x != 0 && offset < constants.coarse_buffer_len) {
-                    const uint packed = ((y & 0xff) << 24) | ((x & 0xff) << 16) | (gl_WorkGroupID.x & 0xffff);
-                    constants.coarse_buffer.values[offset++] = packed;
-                }
-            }
-        }
-    } else {
+    if (use_combined_bounds) {
         const uvec2 tile_count = cmds_max_tile - cmds_min_tile + uvec2(1);
         const uint count = tile_count.x * tile_count.y;
 
@@ -150,5 +105,53 @@ void main() {
             }
             offset += tile_count.x;
         }
+    } else {
+        for (uint i = 0; i < BITMAP_SIZE; i += gl_SubgroupSize) {
+            intersected_tiles[i + gl_SubgroupInvocationID] = 0;
+        }
+
+        subgroupBarrier();
+
+        for (uint y = cmd_min_tile.y; y <= cmd_max_tile.y; y++) {
+            for (uint x = cmd_min_tile.x; x <= cmd_max_tile.x; x++) {
+                atomicOr(intersected_tiles[y * BITMAP_STRIDE + x / 32], 1 << (x & 31));
+            }
+        }
+
+        subgroupBarrier();
+
+        uint count = 0;
+        for (uint i = 0; i < BITMAP_SIZE; i += gl_SubgroupSize) {
+            count += subgroupAdd(bitCount(intersected_tiles[i + gl_SubgroupInvocationID]));
+        }
+
+        if (count == 0) {
+            return;
+        }
+
+        uint offset;
+        if (subgroupElect()) {
+            offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1;
+        }
+        offset = subgroupBroadcastFirst(offset);
+
+        for (uint i = 0; i < BITMAP_SIZE; i += gl_SubgroupSize) {
+            const uint ii = i + gl_SubgroupInvocationID;
+            uint bitmap = intersected_tiles[ii];
+            const uint count = bitCount(bitmap);
+            uint scan = subgroupExclusiveAdd(count);
+
+            while (bitmap != 0) {
+                const uint index = findLSB(bitmap);
+                bitmap ^= bitmap & -bitmap;
+                const uint y = (ii * 32 + index) / MAX_TILES;
+                const uint x = (ii * 32 + index) & (MAX_TILES - 1);
+                const uint packed = (y << 24) | (x << 16) | (gl_WorkGroupID.x & 0xffff);
+                constants.coarse_buffer.values[offset + scan] = packed;
+                scan++;
+            }
+
+            offset += subgroupAdd(count);
+        }
     }
 }
-- 
2.49.0