From: Josh Simmons <josh@nega.tv>
Date: Mon, 11 Nov 2024 18:50:30 +0000 (+0100)
Subject: shark-shaders: Improve scatter robustness and perf
X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=01df40d53f6d57f9773b5952573ed4e8311d21ed;p=josh%2Fnarcissus

shark-shaders: Improve scatter robustness and perf
---

diff --git a/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp b/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp
index 935c380..c9cfbb6 100644
--- a/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp
+++ b/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp
@@ -70,51 +70,62 @@ void main() {
         }
     }
 
-    const uvec2 cmd_min_tile = uvec2(floor(max(min(cmd_min, constants.screen_resolution), 0.0) / TILE_SIZE));
-    const uvec2 cmd_max_tile = uvec2(floor(max(min(cmd_max, constants.screen_resolution), 0.0) / TILE_SIZE));
-    const uvec2 cmds_min_tile = subgroupMin(cmd_min_tile);
-    const uvec2 cmds_max_tile = subgroupMax(cmd_max_tile);
+    // For any out-of-bounds draws, cmd_min and cmd_max keep their defaults of 99999.9 and -99999.9,
+    // so the check below classifies those draws as off-screen. Well, so long as you don't have 27 4k
+    // monitors arranged horizontally.
+    const bool offscreen = any(greaterThan(cmd_min, constants.screen_resolution)) || any(lessThan(cmd_max, vec2(0.0)));
+
+    // Are all draws off-screen?
+    if (subgroupAll(offscreen)) {
+        return;
+    }
+
+    // Make sure off-screen commands don't contribute to the bounds.
+    const uvec2 cmds_min_tile = uvec2(clamp(subgroupMin(offscreen ? ivec2(999999) : ivec2(floor(cmd_min / TILE_SIZE))), ivec2(0), ivec2(constants.tile_resolution)));
+    const uvec2 cmds_max_tile = uvec2(clamp(subgroupMax(offscreen ? ivec2(-999999) : ivec2(floor(cmd_max / TILE_SIZE))), ivec2(0), ivec2(constants.tile_resolution)));
+    const uvec2 cmd_min_tile = uvec2(clamp(ivec2(floor(cmd_min / TILE_SIZE)), ivec2(0), ivec2(constants.tile_resolution)));
+    const uvec2 cmd_max_tile = uvec2(clamp(ivec2(floor(cmd_max / TILE_SIZE)), ivec2(0), ivec2(constants.tile_resolution)));
 
-    // Is any single command responsible for the entire bounds?
-    const bool cmd_absolute_min = cmd_min_tile == cmds_min_tile;
-    const bool cmd_absolute_max = cmd_max_tile == cmds_max_tile;
-    const bool use_combined_bounds = subgroupAny(cmd_absolute_min && cmd_absolute_max);
+    const bool cmd_dominates_bounds = all(equal(cmd_min_tile, cmds_min_tile)) && all(equal(cmd_max_tile, cmds_max_tile));
+    const bool use_combined_bounds = subgroupAny(cmd_dominates_bounds);
 
     if (use_combined_bounds) {
-        const uvec2 tile_count = cmds_max_tile - cmds_min_tile + uvec2(1);
-        const uint count = tile_count.x * tile_count.y;
+        const uvec2 tile_count = cmds_max_tile - cmds_min_tile + ivec2(1);
 
         uint offset;
         if (subgroupElect()) {
-            offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1;
+            offset = atomicAdd(constants.coarse_buffer.values[0], tile_count.x * tile_count.y) + 1;
         }
         offset = subgroupBroadcastFirst(offset);
 
-        if (offset >= constants.coarse_buffer_len) {
-            return;
-        }
-
-        for (uint y = 0; y < tile_count.y; y++) {
-            for (uint x = 0; x < tile_count.x; x += gl_SubgroupSize) {
-                const uint local_x = x + gl_SubgroupInvocationID;
-                if (local_x < tile_count.x) {
-                    const uint yy = cmds_min_tile.y + y;
-                    const uint xx = cmds_min_tile.x + local_x;
-                    const uint packed = ((yy & 0xff) << 24) | ((xx & 0xff) << 16) | (gl_WorkGroupID.x & 0xffff);
-                    constants.coarse_buffer.values[offset + local_x] = packed;
+        for (uint i = 0; i < tile_count.y; i++) {
+            for (uint j = 0; j < tile_count.x; j += gl_SubgroupSize) {
+                const uint jj = j + gl_SubgroupInvocationID;
+                const uint y = cmds_min_tile.y + i;
+                const uint x = cmds_min_tile.x + jj;
+                if (jj < tile_count.x) {
+                    const uint packed = ((y & 0xff) << 24) | ((x & 0xff) << 16) | (gl_WorkGroupID.x & 0xffff);
+                    const uint index = offset + i * tile_count.x + jj;
+                    if (index < constants.coarse_buffer_len) {
+                        constants.coarse_buffer.values[index] = packed;
+                    }
                 }
-                subgroupBarrier();
             }
-            offset += tile_count.x;
         }
     } else {
-        for (uint i = 0; i < BITMAP_SIZE; i += gl_SubgroupSize) {
-            intersected_tiles[i + gl_SubgroupInvocationID] = 0;
+        const uint start = cmds_min_tile.y * BITMAP_STRIDE + cmds_min_tile.x / 32;
+        const uint end = cmds_max_tile.y * BITMAP_STRIDE + cmds_max_tile.x / 32;
+
+        for (uint i = start; i <= end; i += gl_SubgroupSize) {
+            const uint ii = i + gl_SubgroupInvocationID;
+            if (ii < BITMAP_SIZE) {
+                intersected_tiles[ii] = 0;
+            }
         }
 
         subgroupBarrier();
 
-        {
+        if (!offscreen) {
             const uint min_word = cmd_min_tile.x / 32;
             const uint max_word = cmd_max_tile.x / 32;
             const uint min_bit = cmd_min_tile.x & 31;
@@ -139,8 +150,9 @@ void main() {
         subgroupBarrier();
 
         uint count = 0;
-        for (uint i = 0; i < BITMAP_SIZE; i += gl_SubgroupSize) {
-            count += subgroupAdd(bitCount(intersected_tiles[i + gl_SubgroupInvocationID]));
+        for (uint i = start; i <= end; i += gl_SubgroupSize) {
+            const uint ii = i + gl_SubgroupInvocationID;
+            count += subgroupAdd(ii < BITMAP_SIZE ? bitCount(intersected_tiles[ii]) : 0);
         }
 
         if (count == 0) {
@@ -153,11 +165,12 @@ void main() {
         }
         offset = subgroupBroadcastFirst(offset);
 
-        const uint start = cmds_min_tile.y * BITMAP_STRIDE + cmds_min_tile.x / 32;
-        const uint end = cmds_max_tile.y * BITMAP_STRIDE + cmds_max_tile.x / 32;
-
         for (uint i = start; i <= end; i += gl_SubgroupSize) {
             const uint ii = i + gl_SubgroupInvocationID;
+            if (ii >= BITMAP_SIZE) {
+                continue;
+            }
+
             uint bitmap = intersected_tiles[ii];
             const uint count = bitCount(bitmap);
             uint scan = subgroupExclusiveAdd(count);
@@ -168,7 +181,9 @@ void main() {
                 const uint y = (ii * 32 + index) / MAX_TILES;
                 const uint x = (ii * 32 + index) & (MAX_TILES - 1);
                 const uint packed = (y << 24) | (x << 16) | (gl_WorkGroupID.x & 0xffff);
-                constants.coarse_buffer.values[offset + scan] = packed;
+                if (offset + scan < constants.coarse_buffer_len) {
+                    constants.coarse_buffer.values[offset + scan] = packed;
+                }
                 scan++;
             }
 
diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs
index 2aab991..d0eed25 100644
--- a/title/shark/src/main.rs
+++ b/title/shark/src/main.rs
@@ -1305,7 +1305,7 @@ impl<'gpu> DrawState<'gpu> {
                     touched_glyphs,
                 );
 
-                const COARSE_BUFFER_LEN: usize = 1 << 18;
+                const COARSE_BUFFER_LEN: usize = 1 << 20;
                 let coarse_buffer = gpu.request_transient_buffer(
                     frame,
                     thread_token,