git.nega.tv - josh/narcissus/commitdiff
shark: Track range of non-zero L1 buckets
author Josh Simmons <josh@nega.tv>
Sun, 2 Jun 2024 12:52:54 +0000 (14:52 +0200)
committer Josh Simmons <josh@nega.tv>
Sun, 2 Jun 2024 12:54:05 +0000 (14:54 +0200)
title/shark-shaders/build.rs
title/shark-shaders/shaders/display_transform.comp.glsl
title/shark-shaders/shaders/primitive_2d.h
title/shark-shaders/shaders/primitive_2d_bin.comp.glsl
title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl [new file with mode: 0644]
title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl
title/shark/src/main.rs
title/shark/src/pipelines/display_transform.rs
title/shark/src/pipelines/primitive_2d.rs

index af405407d052fdb37013c06e989409f43bb7d5a5..623cd4da120cee4d6aa1183675108b96723f2774 100644 (file)
@@ -22,6 +22,10 @@ const SHADERS: &[Shader] = &[
         stage: "comp",
         name: "primitive_2d_bin",
     },
+    Shader {
+        stage: "comp",
+        name: "primitive_2d_bin_clear",
+    },
     Shader {
         stage: "comp",
         name: "primitive_2d_rasterize",
index f9277132a9afc93bfdcce3d3793f7bea928ef27a..5f1bf321c6cc9606d8ae1b2c04c0a71e6075593a 100644 (file)
@@ -2,6 +2,12 @@
 
 #extension GL_EXT_control_flow_attributes : require
 
+const uint MAX_PRIMS = 1 << 18;
+const uint TILE_BITMAP_L1_WORDS = (MAX_PRIMS / 32 / 32);
+const uint TILE_BITMAP_L0_WORDS = (MAX_PRIMS / 32);
+const uint TILE_STRIDE = (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS + 2);
+const uint TILE_BITMAP_RANGE_OFFSET = 0;
+
 struct PrimitiveUniforms {
     uvec2 screen_resolution;
     uvec2 atlas_resolution;
@@ -20,10 +26,14 @@ layout (set = 0, binding = 0) uniform sampler bilinear_sampler;
 
 layout (set = 0, binding = 1) uniform texture3D tony_mc_mapface_lut;
 
-layout (set = 0, binding = 2, rgba16f) uniform readonly image2D layer_rt;
-layout (set = 0, binding = 3, rgba16f) uniform readonly image2D layer_ui;
+layout(std430, set = 0, binding = 2) readonly buffer tileBufferRead {
+    uint tile_bitmap_ro[];
+};
+
+layout (set = 0, binding = 3, rgba16f) uniform readonly image2D layer_rt;
+layout (set = 0, binding = 4, rgba16f) uniform readonly image2D layer_ui;
 
-layout (set = 0, binding = 4, rgba16f) uniform writeonly image2D composited_output;
+layout (set = 0, binding = 5, rgba16f) uniform writeonly image2D composited_output;
 
 float srgb_oetf(float a) {
     return (.0031308f >= a) ? 12.92f * a : 1.055f * pow(a, .4166666666666667f) - .055f;
@@ -47,8 +57,16 @@ void main() {
     const vec3 transformed = tony_mc_mapface(stimulus);
     vec3 composited = srgb_oetf(transformed);
 
-    const vec4 ui = imageLoad(layer_ui, ivec2(gl_GlobalInvocationID.xy)).rgba;
-    composited = ui.rgb + (composited * (1.0 - ui.a));
+    const uvec2 tile_coord = gl_WorkGroupID.xy / 4;
+    const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride + tile_coord.x;
+    const uint tile_base = tile_index * TILE_STRIDE;
+
+    const uint first = tile_bitmap_ro[tile_base + TILE_BITMAP_RANGE_OFFSET + 0];
+    const uint last = tile_bitmap_ro[tile_base + TILE_BITMAP_RANGE_OFFSET + 1];
+    if (first <= last) {
+        const vec4 ui = imageLoad(layer_ui, ivec2(gl_GlobalInvocationID.xy)).rgba;
+        composited = ui.rgb + (composited * (1.0 - ui.a));
+    }
 
     imageStore(composited_output, ivec2(gl_GlobalInvocationID.xy), vec4(composited, 1.0));
 }
index 46df2fd096e497c730d40034ff59ac92db318d6f..231cc6fc1ecf8feebef4fc03a192dbb78ea30c23 100644 (file)
@@ -3,9 +3,10 @@ const uint TILE_SIZE = 32;
 const uint MAX_PRIMS = 1 << 18;
 const uint TILE_BITMAP_L1_WORDS = (MAX_PRIMS / 32 / 32);
 const uint TILE_BITMAP_L0_WORDS = (MAX_PRIMS / 32);
-const uint TILE_STRIDE = (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS);
-const uint TILE_BITMAP_L1_OFFSET = 0;
-const uint TILE_BITMAP_L0_OFFSET = TILE_BITMAP_L1_WORDS;
+const uint TILE_STRIDE = (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS + 2);
+const uint TILE_BITMAP_RANGE_OFFSET = 0;
+const uint TILE_BITMAP_L1_OFFSET = 2;
+const uint TILE_BITMAP_L0_OFFSET = TILE_BITMAP_L1_OFFSET + TILE_BITMAP_L1_WORDS;
 
 struct PrimitiveUniforms {
     uvec2 screen_resolution;
index 12ddab41202ab410c67ca8593e2edc4e629b5937..b2e3e9f0f0b7679e84dc2a0741710f52ae39b612 100644 (file)
@@ -73,6 +73,11 @@ void main() {
             }
 
             tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_L1_OFFSET + gl_WorkGroupID.x * 2 + i] = out_1;
+
+            if (out_1 != 0) {
+                atomicMin(tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_OFFSET + 0], gl_WorkGroupID.x * 2 + i);
+                atomicMax(tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_OFFSET + 1], gl_WorkGroupID.x * 2 + i);
+            }
         }
     }
 }
diff --git a/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl
new file mode 100644 (file)
index 0000000..9913ec0
--- /dev/null
@@ -0,0 +1,19 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_ballot : require
+
+#include "primitive_2d.h"
+
+// TODO: Spec constant support for different subgroup sizes.
+layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+void main() { // Each invocation resets one tile's [first, last] L1-word range. NOTE(review): the tile index is not bounds-checked — confirm the buffer covers the dispatch rounded up to 64 invocations.
+    tile_bitmap_wo[gl_GlobalInvocationID.x * TILE_STRIDE + TILE_BITMAP_RANGE_OFFSET + 0] = 0xffffffff; // first = max uint, so the bin pass's atomicMin can only lower it
+    tile_bitmap_wo[gl_GlobalInvocationID.x * TILE_STRIDE + TILE_BITMAP_RANGE_OFFSET + 1] = 0; // last = 0, so the bin pass's atomicMax can only raise it; first > last is what consumers test to skip an untouched tile
+}
index 01bb88224fe9d66848d1ab3b9f8d96d56a6d445f..2bd3cd56b4a4ebaba7c0286fc965f2465f8060e4 100644 (file)
@@ -34,16 +34,22 @@ void main() {
     const uvec2 tile_coord = gl_WorkGroupID.xy / 4;
     const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride + tile_coord.x;
     const uint tile_base = tile_index * TILE_STRIDE;
-    const uint tile_bitmap_l1_base_fine = tile_base + TILE_BITMAP_L1_OFFSET;
-    const uint tile_bitmap_l0_base_fine = tile_base + TILE_BITMAP_L0_OFFSET;
+
+    const uint first = tile_bitmap_ro[tile_base + TILE_BITMAP_RANGE_OFFSET + 0];
+    const uint last = tile_bitmap_ro[tile_base + TILE_BITMAP_RANGE_OFFSET + 1];
+
+    [[branch]]
+    if (last < first) {
+        return;
+    }
 
 #if DEBUG_SHOW_TILES == 1
 
     int count = 0;
     // For each tile, iterate over all words in the L1 bitmap.
-    for (int index_l1 = 0; index_l1 < primitive_uniforms.num_primitives_1024; index_l1++) {
+    for (uint index_l1 = first; index_l1 <= last; index_l1++) {
         // For each word, iterate all set bits.
-        uint bitmap_l1 = tile_bitmap_ro[tile_bitmap_l1_base_fine + index_l1];
+        uint bitmap_l1 = tile_bitmap_ro[tile_base + TILE_BITMAP_L1_OFFSET + index_l1];
 
         while (bitmap_l1 != 0) {
             const uint i = findLSB(bitmap_l1);
@@ -52,7 +58,7 @@ void main() {
             // For each set bit in the L1 bitmap, iterate the set bits in the
             // corresponding L0 bitmap.
             const uint index_l0 = index_l1 * 32 + i;
-            uint bitmap_l0 = tile_bitmap_ro[tile_bitmap_l0_base_fine + index_l0];
+            uint bitmap_l0 = tile_bitmap_ro[tile_base + TILE_BITMAP_L0_OFFSET + index_l0];
 
             count += bitCount(bitmap_l0);
         }
@@ -66,9 +72,9 @@ void main() {
     vec4 accum = vec4(0.0);
 
     // For each tile, iterate over all words in the L1 bitmap. 
-    for (int index_l1 = 0; index_l1 < primitive_uniforms.num_primitives_1024; index_l1++) {
+    for (uint index_l1 = first; index_l1 <= last; index_l1++) {
         // For each word, iterate all set bits.
-        uint bitmap_l1 = tile_bitmap_ro[tile_bitmap_l1_base_fine + index_l1];
+        uint bitmap_l1 = tile_bitmap_ro[tile_base + TILE_BITMAP_L1_OFFSET + index_l1];
 
         while (bitmap_l1 != 0) {
             const uint i = findLSB(bitmap_l1);
@@ -77,7 +83,7 @@ void main() {
             // For each set bit in the L1 bitmap, iterate the set bits in the
             // corresponding L0 bitmap.
             const uint index_l0 = index_l1 * 32 + i;
-            uint bitmap_l0 = tile_bitmap_ro[tile_bitmap_l0_base_fine + index_l0];
+            uint bitmap_l0 = tile_bitmap_ro[tile_base + TILE_BITMAP_L0_OFFSET + index_l0];
             while (bitmap_l0 != 0) {
                 const uint j = findLSB(bitmap_l0);
                 bitmap_l0 ^= bitmap_l0 & -bitmap_l0;
index fd621b6b3115523e7c3fbc79b83ba2c69ee18ce4..0742a0cda7a8af5fea1fc03825d079b4995be67e 100644 (file)
@@ -1309,7 +1309,7 @@ impl<'gpu> DrawState<'gpu> {
 
                 ui_state.primitive_instances.clear();
 
-                gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.bin_pipeline);
+                gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.bin_clear_pipeline);
 
                 gpu.cmd_set_bind_group(
                     frame,
@@ -1372,6 +1372,24 @@ impl<'gpu> DrawState<'gpu> {
                     },
                 );
 
+                gpu.cmd_dispatch(
+                    cmd_encoder,
+                    (self.tile_resolution_y * self.tile_resolution_x + 63) / 64,
+                    1,
+                    1,
+                );
+
+                gpu.cmd_barrier(
+                    cmd_encoder,
+                    Some(&GlobalBarrier {
+                        prev_access: &[Access::ComputeWrite],
+                        next_access: &[Access::ComputeOtherRead],
+                    }),
+                    &[],
+                );
+
+                gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.bin_pipeline);
+
                 gpu.cmd_dispatch(
                     cmd_encoder,
                     (num_primitives + 2047) / 2048,
@@ -1382,8 +1400,8 @@ impl<'gpu> DrawState<'gpu> {
                 gpu.cmd_barrier(
                     cmd_encoder,
                     Some(&GlobalBarrier {
-                        prev_access: &[Access::ShaderWrite],
-                        next_access: &[Access::ShaderOtherRead],
+                        prev_access: &[Access::ComputeWrite],
+                        next_access: &[Access::ComputeOtherRead],
                     }),
                     &[],
                 );
@@ -1444,13 +1462,18 @@ impl<'gpu> DrawState<'gpu> {
                         Bind {
                             binding: 2,
                             array_element: 0,
+                            typed: TypedBind::StorageBuffer(&[self.tile_bitmap_buffer.to_arg()]),
+                        },
+                        Bind {
+                            binding: 3,
+                            array_element: 0,
                             typed: TypedBind::StorageImage(&[(
                                 ImageLayout::General,
                                 self.rt_image,
                             )]),
                         },
                         Bind {
-                            binding: 3,
+                            binding: 4,
                             array_element: 0,
                             typed: TypedBind::StorageImage(&[(
                                 ImageLayout::General,
@@ -1458,7 +1481,7 @@ impl<'gpu> DrawState<'gpu> {
                             )]),
                         },
                         Bind {
-                            binding: 4,
+                            binding: 5,
                             array_element: 0,
                             typed: TypedBind::StorageImage(&[(
                                 ImageLayout::General,
index 7a602c2c8f2b603ee7778f995d7f4b3fee04a40d..4405b4e86f6b1a2cbd8478a280947574e5d78880 100644 (file)
@@ -19,6 +19,8 @@ impl DisplayTransformPipeline {
             BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::Sampler),
             // Tony Mc'mapface LUT
             BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage),
+            // Tiles
+            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageBuffer),
             // Layer RT
             BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage),
             // Layer UI
index fcf828b62f9007d3dbf98c54efafe9f16241876f..2259d7723670dd24ed54c6ba53a51cb29cb21312 100644 (file)
@@ -10,7 +10,7 @@ pub const TILE_SIZE: u32 = 32;
 pub const MAX_PRIMS: u32 = 1 << 18;
 pub const TILE_BITMAP_WORDS_L1: u32 = MAX_PRIMS / 32 / 32;
 pub const TILE_BITMAP_WORDS_L0: u32 = MAX_PRIMS / 32;
-pub const TILE_STRIDE: u32 = TILE_BITMAP_WORDS_L0 + TILE_BITMAP_WORDS_L1;
+pub const TILE_STRIDE: u32 = TILE_BITMAP_WORDS_L0 + TILE_BITMAP_WORDS_L1 + 2;
 
 #[allow(unused)]
 #[repr(C)]
@@ -38,6 +38,7 @@ pub struct GlyphInstance {
 
 pub struct Primitive2dPipeline {
     pub bind_group_layout: BindGroupLayout,
+    pub bin_clear_pipeline: Pipeline,
     pub bin_pipeline: Pipeline,
     pub rasterize_pipeline: Pipeline,
 }
@@ -68,6 +69,14 @@ impl Primitive2dPipeline {
             }],
         };
 
+        let bin_clear_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc {
+            shader: ShaderDesc {
+                entry: c"main",
+                code: shark_shaders::PRIMITIVE_2D_BIN_CLEAR_COMP_SPV,
+            },
+            layout,
+        });
+
         let bin_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc {
             shader: ShaderDesc {
                 entry: c"main",
@@ -86,6 +95,7 @@ impl Primitive2dPipeline {
 
         Self {
             bind_group_layout,
+            bin_clear_pipeline,
             bin_pipeline,
             rasterize_pipeline,
         }