From: Josh Simmons <josh@nega.tv>
Date: Mon, 27 May 2024 06:43:34 +0000 (+0200)
Subject: shark: Try single pass for coarse culling
X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=874cbd727a3879d08ce2fe157046aeb16052dc55;p=josh%2Fnarcissus

shark: Try single pass for coarse culling
---

diff --git a/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl
index 6be407c..00fa31c 100644
--- a/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl
+++ b/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl
@@ -29,7 +29,7 @@ void main() {
 
     uvec4 ballot_result = subgroupBallot(intersects);
     if (subgroupElect()) { // managed democracy wins again
-        const uint tile_index = tile_coord.y * TILE_DISPATCH_X + tile_coord.x;
+        const uint tile_index = tile_coord.y * (primitive_uniforms.tile_stride_fine / TILE_SIZE_MUL)  + tile_coord.x;
         const uint tile_offset = tile_index * TILE_STRIDE_COARSE;
         coarse_bitmap_wo[tile_offset + 2 * gl_WorkGroupID.x + 0] = ballot_result.x;
         coarse_bitmap_wo[tile_offset + 2 * gl_WorkGroupID.x + 1] = ballot_result.y;
diff --git a/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl
index b4c8c33..06c9a7b 100644
--- a/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl
+++ b/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl
@@ -27,8 +27,8 @@ void main() {
 
     uint bitmap_l0 = 0;
     if (index < primitive_uniforms.num_primitives_32) {
-        const uvec2 tile_coord_coarse = tile_coord / TILE_SIZE_MUL;
-        const uint tile_index_coarse = tile_coord_coarse.y * TILE_DISPATCH_X + tile_coord_coarse.x;
+        const uvec2 tile_coord_coarse = (tile_coord / TILE_SIZE_MUL) + primitive_uniforms.tile_offset_coarse;
+        const uint tile_index_coarse = tile_coord_coarse.y * (primitive_uniforms.tile_stride_fine / TILE_SIZE_MUL) + tile_coord_coarse.x;
         const uint tile_base_coarse = tile_index_coarse * TILE_STRIDE_COARSE;
         const uint tile_bitmap_base_coarse = tile_base_coarse + TILE_BITMAP_OFFSET_COARSE;
 
diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs
index c5b378b..e50ab41 100644
--- a/title/shark/src/main.rs
+++ b/title/shark/src/main.rs
@@ -885,23 +885,11 @@ impl<'gpu> DrawState<'gpu> {
         let models = Models::load(gpu);
         let images = Images::load(gpu, thread_token);
 
-        let coarse_bitmap_buffer_size = TILE_DISPATCH_COARSE_X
-            * TILE_DISPATCH_COARSE_Y
-            * TILE_STRIDE_COARSE
-            * std::mem::size_of::<u32>() as u32;
-
         let fine_bitmap_buffer_size = TILE_DISPATCH_FINE_X
             * TILE_DISPATCH_FINE_Y
             * TILE_STRIDE_FINE
             * std::mem::size_of::<u32>() as u32;
 
-        let coarse_tile_bitmap_buffer = gpu.create_buffer(&BufferDesc {
-            memory_location: MemoryLocation::Device,
-            host_mapped: false,
-            usage: BufferUsageFlags::STORAGE,
-            size: coarse_bitmap_buffer_size.widen(),
-        });
-
         let fine_tile_bitmap_buffer = gpu.create_buffer(&BufferDesc {
             memory_location: MemoryLocation::Device,
             host_mapped: false,
@@ -923,7 +911,7 @@ impl<'gpu> DrawState<'gpu> {
             depth_image: default(),
             rt_image: default(),
             ui_image: default(),
-            coarse_tile_bitmap_buffer,
+            coarse_tile_bitmap_buffer: default(),
             fine_tile_bitmap_buffer,
             fine_tile_color_buffer: default(),
             glyph_atlas_image: default(),
@@ -1047,6 +1035,19 @@ impl<'gpu> DrawState<'gpu> {
                     || tile_resolution_fine_y != self.tile_resolution_fine_y
                 {
                     gpu.destroy_buffer(frame, self.fine_tile_color_buffer);
+                    gpu.destroy_buffer(frame, self.coarse_tile_bitmap_buffer);
+
+                    let coarse_bitmap_buffer_size = tile_resolution_coarse_x
+                        * tile_resolution_coarse_y
+                        * TILE_STRIDE_COARSE
+                        * std::mem::size_of::<u32>() as u32;
+
+                    self.coarse_tile_bitmap_buffer = gpu.create_buffer(&BufferDesc {
+                        memory_location: MemoryLocation::Device,
+                        host_mapped: false,
+                        usage: BufferUsageFlags::STORAGE,
+                        size: coarse_bitmap_buffer_size.widen(),
+                    });
 
                     // align to the workgroup size to simplify shader
                     let fine_color_buffer_size =
@@ -1418,19 +1419,48 @@ impl<'gpu> DrawState<'gpu> {
 
                 ui_state.primitive_instances.clear();
 
+                gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.coarse_bin_pipeline);
+
+                gpu.cmd_push_constants(
+                    cmd_encoder,
+                    ShaderStageFlags::COMPUTE,
+                    0,
+                    &PrimitiveUniforms {
+                        screen_resolution_x: self.width,
+                        screen_resolution_y: self.height,
+                        atlas_resolution_x: atlas_width,
+                        atlas_resolution_y: atlas_height,
+                        num_primitives,
+                        num_primitives_32,
+                        num_primitives_1024,
+                        tile_stride_fine: self.tile_resolution_fine_x,
+                        tile_offset_x: 0,
+                        tile_offset_y: 0,
+                    },
+                );
+
+                gpu.cmd_dispatch(
+                    cmd_encoder,
+                    (num_primitives + 63) / 64,
+                    self.tile_resolution_coarse_x,
+                    self.tile_resolution_coarse_y,
+                );
+
+                gpu.cmd_barrier(
+                    cmd_encoder,
+                    Some(&GlobalBarrier {
+                        prev_access: &[Access::ShaderWrite],
+                        next_access: &[Access::ShaderOtherRead],
+                    }),
+                    &[],
+                );
+
                 for tile_offset_y in
                     (0..self.tile_resolution_coarse_y).step_by(TILE_DISPATCH_COARSE_Y as usize)
                 {
                     for tile_offset_x in
                         (0..self.tile_resolution_coarse_x).step_by(TILE_DISPATCH_COARSE_X as usize)
                     {
-                        let coarse_dispatch_x = (tile_offset_x + TILE_DISPATCH_COARSE_X)
-                            .min(self.tile_resolution_coarse_x)
-                            - tile_offset_x;
-                        let coarse_dispatch_y = (tile_offset_y + TILE_DISPATCH_COARSE_Y)
-                            .min(self.tile_resolution_coarse_y)
-                            - tile_offset_y;
-
                         let tile_offset_fine_x =
                             tile_offset_x * (TILE_SIZE_COARSE / TILE_SIZE_FINE);
 
@@ -1445,11 +1475,6 @@ impl<'gpu> DrawState<'gpu> {
                             .min(self.tile_resolution_fine_y)
                             - tile_offset_fine_y;
 
-                        gpu.cmd_set_pipeline(
-                            cmd_encoder,
-                            self.primitive_2d_pipeline.coarse_bin_pipeline,
-                        );
-
                         gpu.cmd_push_constants(
                             cmd_encoder,
                             ShaderStageFlags::COMPUTE,
@@ -1468,22 +1493,6 @@ impl<'gpu> DrawState<'gpu> {
                             },
                         );
 
-                        gpu.cmd_dispatch(
-                            cmd_encoder,
-                            (num_primitives + 63) / 64,
-                            coarse_dispatch_x,
-                            coarse_dispatch_y,
-                        );
-
-                        gpu.cmd_barrier(
-                            cmd_encoder,
-                            Some(&GlobalBarrier {
-                                prev_access: &[Access::ShaderWrite],
-                                next_access: &[Access::ShaderOtherRead],
-                            }),
-                            &[],
-                        );
-
                         gpu.cmd_set_pipeline(
                             cmd_encoder,
                             self.primitive_2d_pipeline.fine_bin_pipeline,