shark-shaders: Remove one dispatch from radix sort

author Josh Simmons <josh@nega.tv>

Sun, 10 Nov 2024 15:00:02 +0000 (16:00 +0100)

committer Josh Simmons <josh@nega.tv>

Sun, 10 Nov 2024 15:00:02 +0000 (16:00 +0100)
author Josh Simmons <josh@nega.tv>
Sun, 10 Nov 2024 15:00:02 +0000 (16:00 +0100)
committer Josh Simmons <josh@nega.tv>
Sun, 10 Nov 2024 15:00:02 +0000 (16:00 +0100)
diff --git a/title/shark-shaders/build.rs b/title/shark-shaders/build.rs

index a9acb52939dfb2984308f2e87a861be6e79f0777..077cb66fdd6b6e2e4ba30b1c5b904194585fd81a 100644 (file)
--- a/title/shark-shaders/build.rs
+++ b/title/shark-shaders/build.rs
@@ -44,11 +44,7 @@ const SHADERS: &[Shader] = &[
      },
      Shader {
          stage: "comp",
-        name: "radix_sort_1_spine",
-    },
-    Shader {
-        stage: "comp",
-        name: "radix_sort_2_downsweep",
+        name: "radix_sort_1_downsweep",
      },
      Shader {
          stage: "comp",
diff --git a/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp b/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp

index 8d5e613d52108127b180a5e0f588643a665d7d24..9f3501b5fbd0ddae52aa332ac34909f2bdb2c94c 100644 (file)
--- a/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp
+++ b/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp
@@ -14,6 +14,7 @@
  #include "radix_sort.h"
  
  struct Draw2dClearConstants {
+    FinishedRef finished_buffer;
      CoarseRef coarse_buffer;
  };
  
@@ -24,5 +25,6 @@ layout(std430, push_constant) uniform Draw2dClearConstantsBlock {
  layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
  
  void main() {
+    constants.finished_buffer.value = 0;
      constants.coarse_buffer.values[0] = 0;
  }
diff --git a/title/shark-shaders/shaders/radix_sort.h b/title/shark-shaders/shaders/radix_sort.h

index 2a57798beac308e0a95050b8b43dfe7c671a3c93..edf5a6cc41ca03d8c4dd0cf076965809ad5a3549 100644 (file)
--- a/title/shark-shaders/shaders/radix_sort.h
+++ b/title/shark-shaders/shaders/radix_sort.h
@@ -9,6 +9,10 @@ const uint RADIX_WGP_SIZE = 256;
  const uint RADIX_ITEMS_PER_INVOCATION = 16;
  const uint RADIX_ITEMS_PER_WGP = RADIX_WGP_SIZE * RADIX_ITEMS_PER_INVOCATION;
  
+layout(buffer_reference, std430, buffer_reference_align = 4) coherent buffer FinishedRef {
+    coherent uint value;
+};
+
  layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer CountRef {
      uint value;
  };
diff --git a/title/shark-shaders/shaders/radix_sort_0_upsweep.comp b/title/shark-shaders/shaders/radix_sort_0_upsweep.comp

index 4bff3a4346f787a0c51a800a3586eabf6db2e239..c68fed9fc3b436becfa6c1c82e2bb714911e068c 100644 (file)
--- a/title/shark-shaders/shaders/radix_sort_0_upsweep.comp
+++ b/title/shark-shaders/shaders/radix_sort_0_upsweep.comp
@@ -7,6 +7,11 @@
  #extension GL_EXT_scalar_block_layout : require
  #extension GL_EXT_control_flow_attributes : require
  
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_shuffle_relative: enable
+#extension GL_KHR_shader_subgroup_vote : require
+
  #include "compute_bindings.h"
  
  #include "radix_sort.h"
@@ -14,17 +19,22 @@
  #include "draw_2d.h"
  #include "indirect.h"
  
+layout (constant_id = 0) const uint SUBGROUP_SIZE = 64;
+
+const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE;
+
  layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer ValuesRef {
      uint values[];
  };
  
-layout(buffer_reference, std430, buffer_reference_align = 4) writeonly buffer SpineRef {
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer SpineRef {
      uint values[];
  };
  
  struct RadixSortUpsweepConstants {
      uint shift;
      uint _pad;
+    FinishedRef finished_buffer;
      CountRef count_buffer;
      ValuesRef src_buffer;
      SpineRef spine_buffer;
@@ -36,14 +46,18 @@ layout(std430, push_constant) uniform RadixSortUpsweepConstantsBlock {
  
  shared uint histogram[RADIX_DIGITS];
  
+shared bool finished;
+shared uint carry;
+shared uint sums[NUM_SUBGROUPS];
+
  layout (local_size_x = RADIX_DIGITS, local_size_y = 1, local_size_z = 1) in;
  
  void main() {
      const uint shift = constants.shift;
      const uint count = constants.count_buffer.value;
+    const uint workgroup_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
  
-    const uint wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
-    const bool needs_bounds_check = gl_WorkGroupID.x == wgp_count - 1;
+    const bool needs_bounds_check = gl_WorkGroupID.x == workgroup_count - 1;
  
      // Clear local histogram
      histogram[gl_LocalInvocationID.x] = 0;
@@ -73,5 +87,52 @@ void main() {
      // Scatter to the spine, this is a striped layout so we can efficiently
      // calculate the prefix sum. Re-calculate how many workgroups we dispatched
      // to determine the stride we need to write at.
-    constants.spine_buffer.values[(gl_LocalInvocationID.x * wgp_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x];
+    constants.spine_buffer.values[(gl_LocalInvocationID.x * workgroup_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x];
+
+    barrier();
+
+    if (gl_SubgroupID == 0 && subgroupElect()) {
+        finished = atomicAdd(constants.finished_buffer.value, 1) < workgroup_count - 1;
+    }
+
+    barrier();
+
+    if (finished) {
+        return;
+    }
+
+    // reset for the next pass
+    constants.finished_buffer.value = 0;
+
+    const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
+
+    carry = 0;
+    for (uint i = 0; i < workgroup_count; i++) {
+        // Load values and calculate partial sums
+        const uint value = constants.spine_buffer.values[i * RADIX_DIGITS + local_id];
+        const uint sum = subgroupAdd(value);
+        const uint scan = subgroupExclusiveAdd(value);
+
+        if (subgroupElect()) {
+            sums[gl_SubgroupID] = sum;
+        }
+
+        barrier();
+
+        const uint carry_in = carry;
+
+        // Scan partials
+        if (local_id < NUM_SUBGROUPS) {
+            sums[local_id] = subgroupExclusiveAdd(sums[local_id]);
+        }
+
+        barrier();
+
+        // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan
+        constants.spine_buffer.values[i * RADIX_DIGITS + local_id] = carry_in + sums[gl_SubgroupID] + scan;
+
+        if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) {
+            atomicAdd(carry, sums[gl_NumSubgroups - 1] + sum);
+        }
+    }
  }
 \ No newline at end of file
diff --git a/title/shark-shaders/shaders/radix_sort_2_downsweep.comp b/title/shark-shaders/shaders/radix_sort_1_downsweep.comp

similarity index 100%

rename from title/shark-shaders/shaders/radix_sort_2_downsweep.comp

rename to title/shark-shaders/shaders/radix_sort_1_downsweep.comp
diff --git a/title/shark-shaders/shaders/radix_sort_1_spine.comp b/title/shark-shaders/shaders/radix_sort_1_spine.comp

deleted file mode 100644 (file)

index a02949e..0000000
--- a/title/shark-shaders/shaders/radix_sort_1_spine.comp
+++ /dev/null
@@ -1,81 +0,0 @@
-#version 460
-
-#extension GL_GOOGLE_include_directive : require
-
-#extension GL_EXT_buffer_reference : require
-#extension GL_EXT_buffer_reference2 : require
-#extension GL_EXT_scalar_block_layout : require
-#extension GL_EXT_control_flow_attributes : require
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_KHR_shader_subgroup_ballot : require
-#extension GL_KHR_shader_subgroup_shuffle_relative: enable
-#extension GL_KHR_shader_subgroup_vote : require
-
-#include "compute_bindings.h"
-
-#include "radix_sort.h"
-
-#include "draw_2d.h"
-#include "indirect.h"
-
-layout(buffer_reference, std430, buffer_reference_align = 4) buffer SpineRef {
-    uint values[];
-};
-
-struct RadixSortSpineConstants {
-    CountRef count_buffer;
-    SpineRef spine_buffer;
-};
-
-layout(std430, push_constant) uniform RadixSortSpineConstantsBlock {
-    RadixSortSpineConstants constants;
-};
-
-layout (constant_id = 0) const uint SUBGROUP_SIZE = 64;
-
-const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE;
-
-shared uint carry;
-shared uint sums[NUM_SUBGROUPS];
-
-layout (local_size_x = RADIX_WGP_SIZE, local_size_y = 1, local_size_z = 1) in;
-
-void main() {
-    const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
-
-    const uint count = constants.count_buffer.value;
-
-    // Re-calculate how many workgroups pushed data into the spine
-    const uint upsweep_wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
-
-    carry = 0;
-    for (uint i = 0; i < upsweep_wgp_count; i++) {
-        // Load values and calculate partial sums
-        const uint value = constants.spine_buffer.values[i * RADIX_DIGITS + local_id];
-        const uint sum = subgroupAdd(value);
-        const uint scan = subgroupExclusiveAdd(value);
-
-        if (subgroupElect()) {
-            sums[gl_SubgroupID] = sum;
-        }
-
-        barrier();
-
-        const uint carry_in = carry;
-
-        // Scan partials
-        if (local_id < NUM_SUBGROUPS) {
-            sums[local_id] = subgroupExclusiveAdd(sums[local_id]);
-        }
-
-        barrier();
-
-        // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan
-        constants.spine_buffer.values[i * RADIX_DIGITS + local_id] = carry_in + sums[gl_SubgroupID] + scan;
-
-        if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) {
-            atomicAdd(carry, sums[gl_NumSubgroups - 1] + sum);
-        }
-    }
-}
diff --git a/title/shark-shaders/src/pipelines.rs b/title/shark-shaders/src/pipelines.rs

index 5a4fb366410f00782e5ca11bf8b9ab83be182ad7..9809d0f4d807c5323c93b5075432872205a7809b 100644 (file)
--- a/title/shark-shaders/src/pipelines.rs
+++ b/title/shark-shaders/src/pipelines.rs
@@ -142,6 +142,7 @@ pub struct BasicConstants<'a> {
  
  #[repr(C)]
  pub struct Draw2dClearConstants<'a> {
+    pub finished_buffer_address: BufferAddress<'a>,
      pub coarse_buffer_address: BufferAddress<'a>,
  }
  
@@ -212,17 +213,12 @@ pub struct CompositeConstants<'a> {
  pub struct RadixSortUpsweepConstants<'a> {
      pub shift: u32,
      pub _pad: u32,
+    pub finished_buffer_address: BufferAddress<'a>,
      pub count_buffer_address: BufferAddress<'a>,
      pub src_buffer_address: BufferAddress<'a>,
      pub spine_buffer_address: BufferAddress<'a>,
  }
  
-#[repr(C)]
-pub struct RadixSortSpineConstants<'a> {
-    pub count_buffer_address: BufferAddress<'a>,
-    pub spine_buffer_address: BufferAddress<'a>,
-}
-
  #[repr(C)]
  pub struct RadixSortDownsweepConstants<'a> {
      pub shift: u32,
@@ -261,8 +257,7 @@ pub struct Pipelines {
      pub draw_2d_rasterize_pipeline: Pipeline,
  
      pub radix_sort_0_upsweep_pipeline: Pipeline,
-    pub radix_sort_1_spine_pipeline: Pipeline,
-    pub radix_sort_2_downsweep_pipeline: Pipeline,
+    pub radix_sort_1_downsweep_pipeline: Pipeline,
  
      pub composite_pipeline: Pipeline,
  }
@@ -425,16 +420,8 @@ impl Pipelines {
              std::mem::size_of::<RadixSortUpsweepConstants>(),
          );
  
-        let radix_sort_1_spine_pipeline = create_compute_pipeline(
-            crate::RADIX_SORT_1_SPINE_COMP_SPV,
-            "radix_sort_spine",
-            32,
-            true,
-            std::mem::size_of::<RadixSortSpineConstants>(),
-        );
-
-        let radix_sort_2_downsweep_pipeline = create_compute_pipeline(
-            crate::RADIX_SORT_2_DOWNSWEEP_COMP_SPV,
+        let radix_sort_1_downsweep_pipeline = create_compute_pipeline(
+            crate::RADIX_SORT_1_DOWNSWEEP_COMP_SPV,
              "radix_sort_downsweep",
              32,
              true,
@@ -465,8 +452,7 @@ impl Pipelines {
              draw_2d_rasterize_pipeline,
  
              radix_sort_0_upsweep_pipeline,
-            radix_sort_1_spine_pipeline,
-            radix_sort_2_downsweep_pipeline,
+            radix_sort_1_downsweep_pipeline,
  
              composite_pipeline,
          }
diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs

index a637c0a4305bbc3977f7d2fccc64a124641116dd..2aab99142d7503e60c3cf4d2c0c1133bcb2b85d2 100644 (file)
--- a/title/shark/src/main.rs
+++ b/title/shark/src/main.rs
@@ -9,7 +9,7 @@ use shark_shaders::pipelines::{
      calculate_spine_size, BasicConstants, CompositeConstants, ComputeBinds, Draw2dClearConstants,
      Draw2dCmd, Draw2dRasterizeConstants, Draw2dResolveConstants, Draw2dScatterConstants,
      Draw2dSortConstants, GraphicsBinds, Pipelines, RadixSortDownsweepConstants,
-    RadixSortSpineConstants, RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE,
+    RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE,
  };
  
  use renderdoc_sys as rdoc;
@@ -18,7 +18,7 @@ use fonts::{FontFamily, Fonts};
  use helpers::load_obj;
  use narcissus_app::{create_app, Event, Key, PressedState, WindowDesc};
  use narcissus_core::{box_assume_init, default, rand::Pcg64, zeroed_box, BitIter};
-use narcissus_font::{FontCollection, GlyphCache, GlyphIndex, HorizontalMetrics};
+use narcissus_font::{FontCollection, GlyphCache, HorizontalMetrics};
  use narcissus_gpu::{
      create_device, Access, Bind, BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder,
      ColorSpace, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Gpu, Image, ImageAspectFlags,
@@ -1320,6 +1320,13 @@ impl<'gpu> DrawState<'gpu> {
                      3 * std::mem::size_of::<u32>(),
                  );
  
+                let finished_buffer = gpu.request_transient_buffer(
+                    frame,
+                    thread_token,
+                    BufferUsageFlags::INDIRECT,
+                    std::mem::size_of::<u32>(),
+                );
+
                  let tmp_buffer = gpu.request_transient_buffer(
                      frame,
                      thread_token,
@@ -1339,6 +1346,7 @@ impl<'gpu> DrawState<'gpu> {
                  let coarse_buffer_address = gpu.get_buffer_address(coarse_buffer.to_arg());
                  let indirect_dispatch_buffer_address =
                      gpu.get_buffer_address(indirect_dispatch_buffer.to_arg());
+                let finished_buffer_address = gpu.get_buffer_address(finished_buffer.to_arg());
                  let tmp_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg());
                  let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg());
  
@@ -1349,6 +1357,7 @@ impl<'gpu> DrawState<'gpu> {
                      ShaderStageFlags::COMPUTE,
                      0,
                      &Draw2dClearConstants {
+                        finished_buffer_address,
                          coarse_buffer_address,
                      },
                  );
@@ -1450,6 +1459,7 @@ impl<'gpu> DrawState<'gpu> {
                          &RadixSortUpsweepConstants {
                              shift,
                              _pad: 0,
+                            finished_buffer_address,
                              count_buffer_address,
                              src_buffer_address,
                              spine_buffer_address,
@@ -1466,33 +1476,10 @@ impl<'gpu> DrawState<'gpu> {
                          &[],
                      );
  
-                    // Exclusive sum of the spine
-                    gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.radix_sort_1_spine_pipeline);
-                    gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
-                    gpu.cmd_push_constants(
-                        cmd_encoder,
-                        ShaderStageFlags::COMPUTE,
-                        0,
-                        &RadixSortSpineConstants {
-                            count_buffer_address,
-                            spine_buffer_address,
-                        },
-                    );
-                    gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
-
-                    gpu.cmd_barrier(
-                        cmd_encoder,
-                        Some(&GlobalBarrier {
-                            prev_access: &[Access::ComputeWrite],
-                            next_access: &[Access::ComputeOtherRead],
-                        }),
-                        &[],
-                    );
-
                      // Downsweep
                      gpu.cmd_set_pipeline(
                          cmd_encoder,
-                        self.pipelines.radix_sort_2_downsweep_pipeline,
+                        self.pipelines.radix_sort_1_downsweep_pipeline,
                      );
                      gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
                      gpu.cmd_push_constants(
diff --git a/title/shark/tests/radix_sort.rs b/title/shark/tests/radix_sort.rs

index 64e1897625f3de098f2a95eff7d4481a35948c1f..e1be7aaf46746184490a1b2a2369ce1cf30ba182 100644 (file)
--- a/title/shark/tests/radix_sort.rs
+++ b/title/shark/tests/radix_sort.rs
@@ -5,7 +5,7 @@ use narcissus_gpu::{
  };
  use shark_shaders::pipelines::{
      calcuate_workgroup_count, calculate_spine_size, Pipelines, RadixSortDownsweepConstants,
-    RadixSortSpineConstants, RadixSortUpsweepConstants,
+    RadixSortUpsweepConstants,
  };
  
  fn gpu_sort(values: &mut [u32]) {
@@ -33,6 +33,13 @@ fn gpu_sort(values: &mut [u32]) {
          size: std::mem::size_of_val(values),
      });
  
+    let finished_buffer = gpu.create_buffer(&BufferDesc {
+        memory_location: MemoryLocation::Device,
+        host_mapped: false,
+        usage: BufferUsageFlags::STORAGE,
+        size: std::mem::size_of::<u32>(),
+    });
+
      let spine_buffer = gpu.create_buffer(&BufferDesc {
          memory_location: MemoryLocation::Device,
          host_mapped: false,
@@ -41,6 +48,7 @@ fn gpu_sort(values: &mut [u32]) {
      });
  
      let count_buffer_address = gpu.get_buffer_address(count_buffer.to_arg());
+    let finished_buffer_address = gpu.get_buffer_address(finished_buffer.to_arg());
      let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg());
      let mut src_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg());
      let mut dst_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg());
@@ -67,6 +75,7 @@ fn gpu_sort(values: &mut [u32]) {
                      &RadixSortUpsweepConstants {
                          shift,
                          _pad: 0,
+                        finished_buffer_address,
                          count_buffer_address,
                          src_buffer_address,
                          spine_buffer_address,
@@ -88,30 +97,8 @@ fn gpu_sort(values: &mut [u32]) {
                      &[],
                  );
  
-                // Exclusive sum of the spine
-                gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_spine_pipeline);
-                gpu.cmd_push_constants(
-                    cmd_encoder,
-                    ShaderStageFlags::COMPUTE,
-                    0,
-                    &RadixSortSpineConstants {
-                        count_buffer_address,
-                        spine_buffer_address,
-                    },
-                );
-                gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
-
-                gpu.cmd_barrier(
-                    cmd_encoder,
-                    Some(&GlobalBarrier {
-                        prev_access: &[Access::ComputeWrite],
-                        next_access: &[Access::ComputeOtherRead],
-                    }),
-                    &[],
-                );
-
                  // Downsweep
-                gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_2_downsweep_pipeline);
+                gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_downsweep_pipeline);
                  gpu.cmd_push_constants(
                      cmd_encoder,
                      ShaderStageFlags::COMPUTE,
author	Josh Simmons <josh@nega.tv>
	Sun, 10 Nov 2024 15:00:02 +0000 (16:00 +0100)
committer	Josh Simmons <josh@nega.tv>
	Sun, 10 Nov 2024 15:00:02 +0000 (16:00 +0100)
title/shark-shaders/build.rs		patch \| blob \| blame \| history
title/shark-shaders/shaders/draw_2d_bin_0_clear.comp		patch \| blob \| blame \| history
title/shark-shaders/shaders/radix_sort.h		patch \| blob \| blame \| history
title/shark-shaders/shaders/radix_sort_0_upsweep.comp		patch \| blob \| blame \| history
title/shark-shaders/shaders/radix_sort_1_downsweep.comp	[moved from title/shark-shaders/shaders/radix_sort_2_downsweep.comp with 100% similarity]	patch \| blob \| blame \| history
title/shark-shaders/shaders/radix_sort_1_spine.comp	[deleted file]	patch \| blob \| blame \| history
title/shark-shaders/src/pipelines.rs		patch \| blob \| blame \| history
title/shark/src/main.rs		patch \| blob \| blame \| history
title/shark/tests/radix_sort.rs		patch \| blob \| blame \| history