From: Josh Simmons Date: Sun, 10 Nov 2024 15:00:02 +0000 (+0100) Subject: shark-shaders: Remove one dispatch from radix sort X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=e4ad6df21d934609088aaa17c8b4ca81adff2135;p=josh%2Fnarcissus shark-shaders: Remove one dispatch from radix sort Calculate the prefix sum for the spine with the last workgroup in the upsweep dispatch. --- diff --git a/title/shark-shaders/build.rs b/title/shark-shaders/build.rs index a9acb52..077cb66 100644 --- a/title/shark-shaders/build.rs +++ b/title/shark-shaders/build.rs @@ -44,11 +44,7 @@ const SHADERS: &[Shader] = &[ }, Shader { stage: "comp", - name: "radix_sort_1_spine", - }, - Shader { - stage: "comp", - name: "radix_sort_2_downsweep", + name: "radix_sort_1_downsweep", }, Shader { stage: "comp", diff --git a/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp b/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp index 8d5e613..9f3501b 100644 --- a/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp +++ b/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp @@ -14,6 +14,7 @@ #include "radix_sort.h" struct Draw2dClearConstants { + FinishedRef finished_buffer; CoarseRef coarse_buffer; }; @@ -24,5 +25,6 @@ layout(std430, push_constant) uniform Draw2dClearConstantsBlock { layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in; void main() { + constants.finished_buffer.value = 0; constants.coarse_buffer.values[0] = 0; } diff --git a/title/shark-shaders/shaders/radix_sort.h b/title/shark-shaders/shaders/radix_sort.h index 2a57798..edf5a6c 100644 --- a/title/shark-shaders/shaders/radix_sort.h +++ b/title/shark-shaders/shaders/radix_sort.h @@ -9,6 +9,10 @@ const uint RADIX_WGP_SIZE = 256; const uint RADIX_ITEMS_PER_INVOCATION = 16; const uint RADIX_ITEMS_PER_WGP = RADIX_WGP_SIZE * RADIX_ITEMS_PER_INVOCATION; +layout(buffer_reference, std430, buffer_reference_align = 4) coherent buffer FinishedRef { + coherent uint value; +}; + layout(buffer_reference, std430, 
buffer_reference_align = 4) readonly buffer CountRef { uint value; }; diff --git a/title/shark-shaders/shaders/radix_sort_0_upsweep.comp b/title/shark-shaders/shaders/radix_sort_0_upsweep.comp index 4bff3a4..c68fed9 100644 --- a/title/shark-shaders/shaders/radix_sort_0_upsweep.comp +++ b/title/shark-shaders/shaders/radix_sort_0_upsweep.comp @@ -7,6 +7,11 @@ #extension GL_EXT_scalar_block_layout : require #extension GL_EXT_control_flow_attributes : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_shuffle_relative: enable +#extension GL_KHR_shader_subgroup_vote : require + #include "compute_bindings.h" #include "radix_sort.h" @@ -14,17 +19,22 @@ #include "draw_2d.h" #include "indirect.h" +layout (constant_id = 0) const uint SUBGROUP_SIZE = 64; + +const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE; + layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer ValuesRef { uint values[]; }; -layout(buffer_reference, std430, buffer_reference_align = 4) writeonly buffer SpineRef { +layout(buffer_reference, std430, buffer_reference_align = 4) buffer SpineRef { uint values[]; }; struct RadixSortUpsweepConstants { uint shift; uint _pad; + FinishedRef finished_buffer; CountRef count_buffer; ValuesRef src_buffer; SpineRef spine_buffer; @@ -36,14 +46,18 @@ layout(std430, push_constant) uniform RadixSortUpsweepConstantsBlock { shared uint histogram[RADIX_DIGITS]; +shared bool finished; +shared uint carry; +shared uint sums[NUM_SUBGROUPS]; + layout (local_size_x = RADIX_DIGITS, local_size_y = 1, local_size_z = 1) in; void main() { const uint shift = constants.shift; const uint count = constants.count_buffer.value; + const uint workgroup_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP; - const uint wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP; - const bool needs_bounds_check = gl_WorkGroupID.x == wgp_count - 1; + 
const bool needs_bounds_check = gl_WorkGroupID.x == workgroup_count - 1; // Clear local histogram histogram[gl_LocalInvocationID.x] = 0; @@ -73,5 +87,52 @@ void main() { // Scatter to the spine, this is a striped layout so we can efficiently // calculate the prefix sum. Re-calculate how many workgroups we dispatched // to determine the stride we need to write at. - constants.spine_buffer.values[(gl_LocalInvocationID.x * wgp_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x]; + constants.spine_buffer.values[(gl_LocalInvocationID.x * workgroup_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x]; + + barrier(); + + if (gl_SubgroupID == 0 && subgroupElect()) { + finished = atomicAdd(constants.finished_buffer.value, 1) < workgroup_count - 1; + } + + barrier(); + + if (finished) { + return; + } + + // reset for the next pass + constants.finished_buffer.value = 0; + + const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; + + carry = 0; + for (uint i = 0; i < workgroup_count; i++) { + // Load values and calculate partial sums + const uint value = constants.spine_buffer.values[i * RADIX_DIGITS + local_id]; + const uint sum = subgroupAdd(value); + const uint scan = subgroupExclusiveAdd(value); + + if (subgroupElect()) { + sums[gl_SubgroupID] = sum; + } + + barrier(); + + const uint carry_in = carry; + + // Scan partials + if (local_id < NUM_SUBGROUPS) { + sums[local_id] = subgroupExclusiveAdd(sums[local_id]); + } + + barrier(); + + // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan + constants.spine_buffer.values[i * RADIX_DIGITS + local_id] = carry_in + sums[gl_SubgroupID] + scan; + + if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) { + atomicAdd(carry, sums[gl_NumSubgroups - 1] + sum); + } + } } \ No newline at end of file diff --git a/title/shark-shaders/shaders/radix_sort_2_downsweep.comp b/title/shark-shaders/shaders/radix_sort_1_downsweep.comp similarity index 100% 
rename from title/shark-shaders/shaders/radix_sort_2_downsweep.comp rename to title/shark-shaders/shaders/radix_sort_1_downsweep.comp diff --git a/title/shark-shaders/shaders/radix_sort_1_spine.comp b/title/shark-shaders/shaders/radix_sort_1_spine.comp deleted file mode 100644 index a02949e..0000000 --- a/title/shark-shaders/shaders/radix_sort_1_spine.comp +++ /dev/null @@ -1,81 +0,0 @@ -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_buffer_reference : require -#extension GL_EXT_buffer_reference2 : require -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_control_flow_attributes : require - -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_ballot : require -#extension GL_KHR_shader_subgroup_shuffle_relative: enable -#extension GL_KHR_shader_subgroup_vote : require - -#include "compute_bindings.h" - -#include "radix_sort.h" - -#include "draw_2d.h" -#include "indirect.h" - -layout(buffer_reference, std430, buffer_reference_align = 4) buffer SpineRef { - uint values[]; -}; - -struct RadixSortSpineConstants { - CountRef count_buffer; - SpineRef spine_buffer; -}; - -layout(std430, push_constant) uniform RadixSortSpineConstantsBlock { - RadixSortSpineConstants constants; -}; - -layout (constant_id = 0) const uint SUBGROUP_SIZE = 64; - -const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE; - -shared uint carry; -shared uint sums[NUM_SUBGROUPS]; - -layout (local_size_x = RADIX_WGP_SIZE, local_size_y = 1, local_size_z = 1) in; - -void main() { - const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; - - const uint count = constants.count_buffer.value; - - // Re-calculate how many workgroups pushed data into the spine - const uint upsweep_wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP; - - carry = 0; - for (uint i = 0; i < upsweep_wgp_count; i++) { - // Load values and calculate partial sums - const uint value = 
constants.spine_buffer.values[i * RADIX_DIGITS + local_id]; - const uint sum = subgroupAdd(value); - const uint scan = subgroupExclusiveAdd(value); - - if (subgroupElect()) { - sums[gl_SubgroupID] = sum; - } - - barrier(); - - const uint carry_in = carry; - - // Scan partials - if (local_id < NUM_SUBGROUPS) { - sums[local_id] = subgroupExclusiveAdd(sums[local_id]); - } - - barrier(); - - // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan - constants.spine_buffer.values[i * RADIX_DIGITS + local_id] = carry_in + sums[gl_SubgroupID] + scan; - - if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) { - atomicAdd(carry, sums[gl_NumSubgroups - 1] + sum); - } - } -} diff --git a/title/shark-shaders/src/pipelines.rs b/title/shark-shaders/src/pipelines.rs index 5a4fb36..9809d0f 100644 --- a/title/shark-shaders/src/pipelines.rs +++ b/title/shark-shaders/src/pipelines.rs @@ -142,6 +142,7 @@ pub struct BasicConstants<'a> { #[repr(C)] pub struct Draw2dClearConstants<'a> { + pub finished_buffer_address: BufferAddress<'a>, pub coarse_buffer_address: BufferAddress<'a>, } @@ -212,17 +213,12 @@ pub struct CompositeConstants<'a> { pub struct RadixSortUpsweepConstants<'a> { pub shift: u32, pub _pad: u32, + pub finished_buffer_address: BufferAddress<'a>, pub count_buffer_address: BufferAddress<'a>, pub src_buffer_address: BufferAddress<'a>, pub spine_buffer_address: BufferAddress<'a>, } -#[repr(C)] -pub struct RadixSortSpineConstants<'a> { - pub count_buffer_address: BufferAddress<'a>, - pub spine_buffer_address: BufferAddress<'a>, -} - #[repr(C)] pub struct RadixSortDownsweepConstants<'a> { pub shift: u32, @@ -261,8 +257,7 @@ pub struct Pipelines { pub draw_2d_rasterize_pipeline: Pipeline, pub radix_sort_0_upsweep_pipeline: Pipeline, - pub radix_sort_1_spine_pipeline: Pipeline, - pub radix_sort_2_downsweep_pipeline: Pipeline, + pub radix_sort_1_downsweep_pipeline: Pipeline, pub composite_pipeline: Pipeline, } @@ -425,16 +420,8 @@ impl 
Pipelines { std::mem::size_of::(), ); - let radix_sort_1_spine_pipeline = create_compute_pipeline( - crate::RADIX_SORT_1_SPINE_COMP_SPV, - "radix_sort_spine", - 32, - true, - std::mem::size_of::(), - ); - - let radix_sort_2_downsweep_pipeline = create_compute_pipeline( - crate::RADIX_SORT_2_DOWNSWEEP_COMP_SPV, + let radix_sort_1_downsweep_pipeline = create_compute_pipeline( + crate::RADIX_SORT_1_DOWNSWEEP_COMP_SPV, "radix_sort_downsweep", 32, true, @@ -465,8 +452,7 @@ impl Pipelines { draw_2d_rasterize_pipeline, radix_sort_0_upsweep_pipeline, - radix_sort_1_spine_pipeline, - radix_sort_2_downsweep_pipeline, + radix_sort_1_downsweep_pipeline, composite_pipeline, } diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs index a637c0a..2aab991 100644 --- a/title/shark/src/main.rs +++ b/title/shark/src/main.rs @@ -9,7 +9,7 @@ use shark_shaders::pipelines::{ calculate_spine_size, BasicConstants, CompositeConstants, ComputeBinds, Draw2dClearConstants, Draw2dCmd, Draw2dRasterizeConstants, Draw2dResolveConstants, Draw2dScatterConstants, Draw2dSortConstants, GraphicsBinds, Pipelines, RadixSortDownsweepConstants, - RadixSortSpineConstants, RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE, + RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE, }; use renderdoc_sys as rdoc; @@ -18,7 +18,7 @@ use fonts::{FontFamily, Fonts}; use helpers::load_obj; use narcissus_app::{create_app, Event, Key, PressedState, WindowDesc}; use narcissus_core::{box_assume_init, default, rand::Pcg64, zeroed_box, BitIter}; -use narcissus_font::{FontCollection, GlyphCache, GlyphIndex, HorizontalMetrics}; +use narcissus_font::{FontCollection, GlyphCache, HorizontalMetrics}; use narcissus_gpu::{ create_device, Access, Bind, BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder, ColorSpace, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Gpu, Image, ImageAspectFlags, @@ -1320,6 +1320,13 @@ impl<'gpu> DrawState<'gpu> { 3 * std::mem::size_of::(), ); + let finished_buffer = gpu.request_transient_buffer( + 
frame, + thread_token, + BufferUsageFlags::INDIRECT, + std::mem::size_of::(), + ); + let tmp_buffer = gpu.request_transient_buffer( frame, thread_token, @@ -1339,6 +1346,7 @@ impl<'gpu> DrawState<'gpu> { let coarse_buffer_address = gpu.get_buffer_address(coarse_buffer.to_arg()); let indirect_dispatch_buffer_address = gpu.get_buffer_address(indirect_dispatch_buffer.to_arg()); + let finished_buffer_address = gpu.get_buffer_address(finished_buffer.to_arg()); let tmp_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg()); let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg()); @@ -1349,6 +1357,7 @@ impl<'gpu> DrawState<'gpu> { ShaderStageFlags::COMPUTE, 0, &Draw2dClearConstants { + finished_buffer_address, coarse_buffer_address, }, ); @@ -1450,6 +1459,7 @@ impl<'gpu> DrawState<'gpu> { &RadixSortUpsweepConstants { shift, _pad: 0, + finished_buffer_address, count_buffer_address, src_buffer_address, spine_buffer_address, @@ -1466,33 +1476,10 @@ impl<'gpu> DrawState<'gpu> { &[], ); - // Exclusive sum of the spine - gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.radix_sort_1_spine_pipeline); - gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); - gpu.cmd_push_constants( - cmd_encoder, - ShaderStageFlags::COMPUTE, - 0, - &RadixSortSpineConstants { - count_buffer_address, - spine_buffer_address, - }, - ); - gpu.cmd_dispatch(cmd_encoder, 1, 1, 1); - - gpu.cmd_barrier( - cmd_encoder, - Some(&GlobalBarrier { - prev_access: &[Access::ComputeWrite], - next_access: &[Access::ComputeOtherRead], - }), - &[], - ); - // Downsweep gpu.cmd_set_pipeline( cmd_encoder, - self.pipelines.radix_sort_2_downsweep_pipeline, + self.pipelines.radix_sort_1_downsweep_pipeline, ); gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); gpu.cmd_push_constants( diff --git a/title/shark/tests/radix_sort.rs b/title/shark/tests/radix_sort.rs index 64e1897..e1be7aa 100644 --- a/title/shark/tests/radix_sort.rs +++ b/title/shark/tests/radix_sort.rs @@ -5,7 +5,7 @@ use 
narcissus_gpu::{ }; use shark_shaders::pipelines::{ calcuate_workgroup_count, calculate_spine_size, Pipelines, RadixSortDownsweepConstants, - RadixSortSpineConstants, RadixSortUpsweepConstants, + RadixSortUpsweepConstants, }; fn gpu_sort(values: &mut [u32]) { @@ -33,6 +33,13 @@ fn gpu_sort(values: &mut [u32]) { size: std::mem::size_of_val(values), }); + let finished_buffer = gpu.create_buffer(&BufferDesc { + memory_location: MemoryLocation::Device, + host_mapped: false, + usage: BufferUsageFlags::STORAGE, + size: std::mem::size_of::(), + }); + let spine_buffer = gpu.create_buffer(&BufferDesc { memory_location: MemoryLocation::Device, host_mapped: false, @@ -41,6 +48,7 @@ fn gpu_sort(values: &mut [u32]) { }); let count_buffer_address = gpu.get_buffer_address(count_buffer.to_arg()); + let finished_buffer_address = gpu.get_buffer_address(finished_buffer.to_arg()); let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg()); let mut src_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg()); let mut dst_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg()); @@ -67,6 +75,7 @@ fn gpu_sort(values: &mut [u32]) { &RadixSortUpsweepConstants { shift, _pad: 0, + finished_buffer_address, count_buffer_address, src_buffer_address, spine_buffer_address, @@ -88,30 +97,8 @@ fn gpu_sort(values: &mut [u32]) { &[], ); - // Exclusive sum of the spine - gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_spine_pipeline); - gpu.cmd_push_constants( - cmd_encoder, - ShaderStageFlags::COMPUTE, - 0, - &RadixSortSpineConstants { - count_buffer_address, - spine_buffer_address, - }, - ); - gpu.cmd_dispatch(cmd_encoder, 1, 1, 1); - - gpu.cmd_barrier( - cmd_encoder, - Some(&GlobalBarrier { - prev_access: &[Access::ComputeWrite], - next_access: &[Access::ComputeOtherRead], - }), - &[], - ); - // Downsweep - gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_2_downsweep_pipeline); + gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_downsweep_pipeline); 
gpu.cmd_push_constants( cmd_encoder, ShaderStageFlags::COMPUTE,