git.nega.tv - josh/narcissus/commitdiff
shark-shaders: Remove one dispatch from radix sort
author Josh Simmons <josh@nega.tv>
Sun, 10 Nov 2024 15:00:02 +0000 (16:00 +0100)
committer Josh Simmons <josh@nega.tv>
Sun, 10 Nov 2024 15:00:02 +0000 (16:00 +0100)
Calculate the prefix sum for the spine with the last workgroup in the
upsweep dispatch.

title/shark-shaders/build.rs
title/shark-shaders/shaders/draw_2d_bin_0_clear.comp
title/shark-shaders/shaders/radix_sort.h
title/shark-shaders/shaders/radix_sort_0_upsweep.comp
title/shark-shaders/shaders/radix_sort_1_downsweep.comp [moved from title/shark-shaders/shaders/radix_sort_2_downsweep.comp with 100% similarity]
title/shark-shaders/shaders/radix_sort_1_spine.comp [deleted file]
title/shark-shaders/src/pipelines.rs
title/shark/src/main.rs
title/shark/tests/radix_sort.rs

index a9acb52939dfb2984308f2e87a861be6e79f0777..077cb66fdd6b6e2e4ba30b1c5b904194585fd81a 100644 (file)
@@ -44,11 +44,7 @@ const SHADERS: &[Shader] = &[
     },
     Shader {
         stage: "comp",
-        name: "radix_sort_1_spine",
-    },
-    Shader {
-        stage: "comp",
-        name: "radix_sort_2_downsweep",
+        name: "radix_sort_1_downsweep",
     },
     Shader {
         stage: "comp",
index 8d5e613d52108127b180a5e0f588643a665d7d24..9f3501b5fbd0ddae52aa332ac34909f2bdb2c94c 100644 (file)
@@ -14,6 +14,7 @@
 #include "radix_sort.h"
 
 struct Draw2dClearConstants {
+    FinishedRef finished_buffer;
     CoarseRef coarse_buffer;
 };
 
@@ -24,5 +25,6 @@ layout(std430, push_constant) uniform Draw2dClearConstantsBlock {
 layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
 
 void main() {
+    constants.finished_buffer.value = 0;
     constants.coarse_buffer.values[0] = 0;
 }
index 2a57798beac308e0a95050b8b43dfe7c671a3c93..edf5a6cc41ca03d8c4dd0cf076965809ad5a3549 100644 (file)
@@ -9,6 +9,10 @@ const uint RADIX_WGP_SIZE = 256;
 const uint RADIX_ITEMS_PER_INVOCATION = 16;
 const uint RADIX_ITEMS_PER_WGP = RADIX_WGP_SIZE * RADIX_ITEMS_PER_INVOCATION;
 
+layout(buffer_reference, std430, buffer_reference_align = 4) coherent buffer FinishedRef {
+    coherent uint value;
+};
+
 layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer CountRef {
     uint value;
 };
index 4bff3a4346f787a0c51a800a3586eabf6db2e239..c68fed9fc3b436becfa6c1c82e2bb714911e068c 100644 (file)
@@ -7,6 +7,11 @@
 #extension GL_EXT_scalar_block_layout : require
 #extension GL_EXT_control_flow_attributes : require
 
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_shuffle_relative: enable
+#extension GL_KHR_shader_subgroup_vote : require
+
 #include "compute_bindings.h"
 
 #include "radix_sort.h"
 #include "draw_2d.h"
 #include "indirect.h"
 
+layout (constant_id = 0) const uint SUBGROUP_SIZE = 64;
+
+const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE;
+
 layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer ValuesRef {
     uint values[];
 };
 
-layout(buffer_reference, std430, buffer_reference_align = 4) writeonly buffer SpineRef {
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer SpineRef {
     uint values[];
 };
 
 struct RadixSortUpsweepConstants {
     uint shift;
     uint _pad;
+    FinishedRef finished_buffer;
     CountRef count_buffer;
     ValuesRef src_buffer;
     SpineRef spine_buffer;
@@ -36,14 +46,18 @@ layout(std430, push_constant) uniform RadixSortUpsweepConstantsBlock {
 
 shared uint histogram[RADIX_DIGITS];
 
+shared bool finished;
+shared uint carry;
+shared uint sums[NUM_SUBGROUPS];
+
 layout (local_size_x = RADIX_DIGITS, local_size_y = 1, local_size_z = 1) in;
 
 void main() {
     const uint shift = constants.shift;
     const uint count = constants.count_buffer.value;
+    const uint workgroup_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
 
-    const uint wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
-    const bool needs_bounds_check = gl_WorkGroupID.x == wgp_count - 1;
+    const bool needs_bounds_check = gl_WorkGroupID.x == workgroup_count - 1;
 
     // Clear local histogram
     histogram[gl_LocalInvocationID.x] = 0;
@@ -73,5 +87,52 @@ void main() {
     // Scatter to the spine, this is a striped layout so we can efficiently
     // calculate the prefix sum. Re-calculate how many workgroups we dispatched
     // to determine the stride we need to write at.
-    constants.spine_buffer.values[(gl_LocalInvocationID.x * wgp_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x];
+    constants.spine_buffer.values[(gl_LocalInvocationID.x * workgroup_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x];
+
+    barrier();
+
+    if (gl_SubgroupID == 0 && subgroupElect()) {
+        finished = atomicAdd(constants.finished_buffer.value, 1) < workgroup_count - 1;
+    }
+
+    barrier();
+
+    if (finished) {
+        return;
+    }
+
+    // reset for the next pass
+    constants.finished_buffer.value = 0;
+
+    const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
+
+    carry = 0;
+    for (uint i = 0; i < workgroup_count; i++) {
+        // Load values and calculate partial sums
+        const uint value = constants.spine_buffer.values[i * RADIX_DIGITS + local_id];
+        const uint sum = subgroupAdd(value);
+        const uint scan = subgroupExclusiveAdd(value);
+
+        if (subgroupElect()) {
+            sums[gl_SubgroupID] = sum;
+        }
+
+        barrier();
+
+        const uint carry_in = carry;
+
+        // Scan partials
+        if (local_id < NUM_SUBGROUPS) {
+            sums[local_id] = subgroupExclusiveAdd(sums[local_id]);
+        }
+
+        barrier();
+
+        // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan
+        constants.spine_buffer.values[i * RADIX_DIGITS + local_id] = carry_in + sums[gl_SubgroupID] + scan;
+
+        if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) {
+            atomicAdd(carry, sums[gl_NumSubgroups - 1] + sum);
+        }
+    }
 }
\ No newline at end of file
diff --git a/title/shark-shaders/shaders/radix_sort_1_spine.comp b/title/shark-shaders/shaders/radix_sort_1_spine.comp
deleted file mode 100644 (file)
index a02949e..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-#version 460
-
-#extension GL_GOOGLE_include_directive : require
-
-#extension GL_EXT_buffer_reference : require
-#extension GL_EXT_buffer_reference2 : require
-#extension GL_EXT_scalar_block_layout : require
-#extension GL_EXT_control_flow_attributes : require
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_KHR_shader_subgroup_ballot : require
-#extension GL_KHR_shader_subgroup_shuffle_relative: enable
-#extension GL_KHR_shader_subgroup_vote : require
-
-#include "compute_bindings.h"
-
-#include "radix_sort.h"
-
-#include "draw_2d.h"
-#include "indirect.h"
-
-layout(buffer_reference, std430, buffer_reference_align = 4) buffer SpineRef {
-    uint values[];
-};
-
-struct RadixSortSpineConstants {
-    CountRef count_buffer;
-    SpineRef spine_buffer;
-};
-
-layout(std430, push_constant) uniform RadixSortSpineConstantsBlock {
-    RadixSortSpineConstants constants;
-};
-
-layout (constant_id = 0) const uint SUBGROUP_SIZE = 64;
-
-const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE;
-
-shared uint carry;
-shared uint sums[NUM_SUBGROUPS];
-
-layout (local_size_x = RADIX_WGP_SIZE, local_size_y = 1, local_size_z = 1) in;
-
-void main() {
-    const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
-
-    const uint count = constants.count_buffer.value;
-
-    // Re-calculate how many workgroups pushed data into the spine
-    const uint upsweep_wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
-
-    carry = 0;
-    for (uint i = 0; i < upsweep_wgp_count; i++) {
-        // Load values and calculate partial sums
-        const uint value = constants.spine_buffer.values[i * RADIX_DIGITS + local_id];
-        const uint sum = subgroupAdd(value);
-        const uint scan = subgroupExclusiveAdd(value);
-
-        if (subgroupElect()) {
-            sums[gl_SubgroupID] = sum;
-        }
-
-        barrier();
-
-        const uint carry_in = carry;
-
-        // Scan partials
-        if (local_id < NUM_SUBGROUPS) {
-            sums[local_id] = subgroupExclusiveAdd(sums[local_id]);
-        }
-
-        barrier();
-
-        // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan
-        constants.spine_buffer.values[i * RADIX_DIGITS + local_id] = carry_in + sums[gl_SubgroupID] + scan;
-
-        if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) {
-            atomicAdd(carry, sums[gl_NumSubgroups - 1] + sum);
-        }
-    }
-}
index 5a4fb366410f00782e5ca11bf8b9ab83be182ad7..9809d0f4d807c5323c93b5075432872205a7809b 100644 (file)
@@ -142,6 +142,7 @@ pub struct BasicConstants<'a> {
 
 #[repr(C)]
 pub struct Draw2dClearConstants<'a> {
+    pub finished_buffer_address: BufferAddress<'a>,
     pub coarse_buffer_address: BufferAddress<'a>,
 }
 
@@ -212,17 +213,12 @@ pub struct CompositeConstants<'a> {
 pub struct RadixSortUpsweepConstants<'a> {
     pub shift: u32,
     pub _pad: u32,
+    pub finished_buffer_address: BufferAddress<'a>,
     pub count_buffer_address: BufferAddress<'a>,
     pub src_buffer_address: BufferAddress<'a>,
     pub spine_buffer_address: BufferAddress<'a>,
 }
 
-#[repr(C)]
-pub struct RadixSortSpineConstants<'a> {
-    pub count_buffer_address: BufferAddress<'a>,
-    pub spine_buffer_address: BufferAddress<'a>,
-}
-
 #[repr(C)]
 pub struct RadixSortDownsweepConstants<'a> {
     pub shift: u32,
@@ -261,8 +257,7 @@ pub struct Pipelines {
     pub draw_2d_rasterize_pipeline: Pipeline,
 
     pub radix_sort_0_upsweep_pipeline: Pipeline,
-    pub radix_sort_1_spine_pipeline: Pipeline,
-    pub radix_sort_2_downsweep_pipeline: Pipeline,
+    pub radix_sort_1_downsweep_pipeline: Pipeline,
 
     pub composite_pipeline: Pipeline,
 }
@@ -425,16 +420,8 @@ impl Pipelines {
             std::mem::size_of::<RadixSortUpsweepConstants>(),
         );
 
-        let radix_sort_1_spine_pipeline = create_compute_pipeline(
-            crate::RADIX_SORT_1_SPINE_COMP_SPV,
-            "radix_sort_spine",
-            32,
-            true,
-            std::mem::size_of::<RadixSortSpineConstants>(),
-        );
-
-        let radix_sort_2_downsweep_pipeline = create_compute_pipeline(
-            crate::RADIX_SORT_2_DOWNSWEEP_COMP_SPV,
+        let radix_sort_1_downsweep_pipeline = create_compute_pipeline(
+            crate::RADIX_SORT_1_DOWNSWEEP_COMP_SPV,
             "radix_sort_downsweep",
             32,
             true,
@@ -465,8 +452,7 @@ impl Pipelines {
             draw_2d_rasterize_pipeline,
 
             radix_sort_0_upsweep_pipeline,
-            radix_sort_1_spine_pipeline,
-            radix_sort_2_downsweep_pipeline,
+            radix_sort_1_downsweep_pipeline,
 
             composite_pipeline,
         }
index a637c0a4305bbc3977f7d2fccc64a124641116dd..2aab99142d7503e60c3cf4d2c0c1133bcb2b85d2 100644 (file)
@@ -9,7 +9,7 @@ use shark_shaders::pipelines::{
     calculate_spine_size, BasicConstants, CompositeConstants, ComputeBinds, Draw2dClearConstants,
     Draw2dCmd, Draw2dRasterizeConstants, Draw2dResolveConstants, Draw2dScatterConstants,
     Draw2dSortConstants, GraphicsBinds, Pipelines, RadixSortDownsweepConstants,
-    RadixSortSpineConstants, RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE,
+    RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE,
 };
 
 use renderdoc_sys as rdoc;
@@ -18,7 +18,7 @@ use fonts::{FontFamily, Fonts};
 use helpers::load_obj;
 use narcissus_app::{create_app, Event, Key, PressedState, WindowDesc};
 use narcissus_core::{box_assume_init, default, rand::Pcg64, zeroed_box, BitIter};
-use narcissus_font::{FontCollection, GlyphCache, GlyphIndex, HorizontalMetrics};
+use narcissus_font::{FontCollection, GlyphCache, HorizontalMetrics};
 use narcissus_gpu::{
     create_device, Access, Bind, BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder,
     ColorSpace, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Gpu, Image, ImageAspectFlags,
@@ -1320,6 +1320,13 @@ impl<'gpu> DrawState<'gpu> {
                     3 * std::mem::size_of::<u32>(),
                 );
 
+                let finished_buffer = gpu.request_transient_buffer(
+                    frame,
+                    thread_token,
+                    BufferUsageFlags::INDIRECT,
+                    std::mem::size_of::<u32>(),
+                );
+
                 let tmp_buffer = gpu.request_transient_buffer(
                     frame,
                     thread_token,
@@ -1339,6 +1346,7 @@ impl<'gpu> DrawState<'gpu> {
                 let coarse_buffer_address = gpu.get_buffer_address(coarse_buffer.to_arg());
                 let indirect_dispatch_buffer_address =
                     gpu.get_buffer_address(indirect_dispatch_buffer.to_arg());
+                let finished_buffer_address = gpu.get_buffer_address(finished_buffer.to_arg());
                 let tmp_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg());
                 let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg());
 
@@ -1349,6 +1357,7 @@ impl<'gpu> DrawState<'gpu> {
                     ShaderStageFlags::COMPUTE,
                     0,
                     &Draw2dClearConstants {
+                        finished_buffer_address,
                         coarse_buffer_address,
                     },
                 );
@@ -1450,6 +1459,7 @@ impl<'gpu> DrawState<'gpu> {
                         &RadixSortUpsweepConstants {
                             shift,
                             _pad: 0,
+                            finished_buffer_address,
                             count_buffer_address,
                             src_buffer_address,
                             spine_buffer_address,
@@ -1466,33 +1476,10 @@ impl<'gpu> DrawState<'gpu> {
                         &[],
                     );
 
-                    // Exclusive sum of the spine
-                    gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.radix_sort_1_spine_pipeline);
-                    gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
-                    gpu.cmd_push_constants(
-                        cmd_encoder,
-                        ShaderStageFlags::COMPUTE,
-                        0,
-                        &RadixSortSpineConstants {
-                            count_buffer_address,
-                            spine_buffer_address,
-                        },
-                    );
-                    gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
-
-                    gpu.cmd_barrier(
-                        cmd_encoder,
-                        Some(&GlobalBarrier {
-                            prev_access: &[Access::ComputeWrite],
-                            next_access: &[Access::ComputeOtherRead],
-                        }),
-                        &[],
-                    );
-
                     // Downsweep
                     gpu.cmd_set_pipeline(
                         cmd_encoder,
-                        self.pipelines.radix_sort_2_downsweep_pipeline,
+                        self.pipelines.radix_sort_1_downsweep_pipeline,
                     );
                     gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
                     gpu.cmd_push_constants(
index 64e1897625f3de098f2a95eff7d4481a35948c1f..e1be7aaf46746184490a1b2a2369ce1cf30ba182 100644 (file)
@@ -5,7 +5,7 @@ use narcissus_gpu::{
 };
 use shark_shaders::pipelines::{
     calcuate_workgroup_count, calculate_spine_size, Pipelines, RadixSortDownsweepConstants,
-    RadixSortSpineConstants, RadixSortUpsweepConstants,
+    RadixSortUpsweepConstants,
 };
 
 fn gpu_sort(values: &mut [u32]) {
@@ -33,6 +33,13 @@ fn gpu_sort(values: &mut [u32]) {
         size: std::mem::size_of_val(values),
     });
 
+    let finished_buffer = gpu.create_buffer(&BufferDesc {
+        memory_location: MemoryLocation::Device,
+        host_mapped: false,
+        usage: BufferUsageFlags::STORAGE,
+        size: std::mem::size_of::<u32>(),
+    });
+
     let spine_buffer = gpu.create_buffer(&BufferDesc {
         memory_location: MemoryLocation::Device,
         host_mapped: false,
@@ -41,6 +48,7 @@ fn gpu_sort(values: &mut [u32]) {
     });
 
     let count_buffer_address = gpu.get_buffer_address(count_buffer.to_arg());
+    let finished_buffer_address = gpu.get_buffer_address(finished_buffer.to_arg());
     let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg());
     let mut src_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg());
     let mut dst_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg());
@@ -67,6 +75,7 @@ fn gpu_sort(values: &mut [u32]) {
                     &RadixSortUpsweepConstants {
                         shift,
                         _pad: 0,
+                        finished_buffer_address,
                         count_buffer_address,
                         src_buffer_address,
                         spine_buffer_address,
@@ -88,30 +97,8 @@ fn gpu_sort(values: &mut [u32]) {
                     &[],
                 );
 
-                // Exclusive sum of the spine
-                gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_spine_pipeline);
-                gpu.cmd_push_constants(
-                    cmd_encoder,
-                    ShaderStageFlags::COMPUTE,
-                    0,
-                    &RadixSortSpineConstants {
-                        count_buffer_address,
-                        spine_buffer_address,
-                    },
-                );
-                gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
-
-                gpu.cmd_barrier(
-                    cmd_encoder,
-                    Some(&GlobalBarrier {
-                        prev_access: &[Access::ComputeWrite],
-                        next_access: &[Access::ComputeOtherRead],
-                    }),
-                    &[],
-                );
-
                 // Downsweep
-                gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_2_downsweep_pipeline);
+                gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_downsweep_pipeline);
                 gpu.cmd_push_constants(
                     cmd_encoder,
                     ShaderStageFlags::COMPUTE,