},
Shader {
stage: "comp",
- name: "radix_sort_1_spine",
- },
- Shader {
- stage: "comp",
- name: "radix_sort_2_downsweep",
+ name: "radix_sort_1_downsweep",
},
Shader {
stage: "comp",
#include "radix_sort.h"
struct Draw2dClearConstants {
+ // Device address of the radix-sort cross-workgroup completion counter;
+ // zeroed here so the first fused upsweep dispatch starts from a clean count.
+ FinishedRef finished_buffer;
CoarseRef coarse_buffer;
};
layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
void main() {
+ // Reset the "workgroups finished" counter used by the upsweep's
+ // last-workgroup election before any sort pass runs.
+ constants.finished_buffer.value = 0;
constants.coarse_buffer.values[0] = 0;
}
const uint RADIX_ITEMS_PER_INVOCATION = 16;
const uint RADIX_ITEMS_PER_WGP = RADIX_WGP_SIZE * RADIX_ITEMS_PER_INVOCATION;
+// Single uint incremented atomically by each upsweep workgroup as it finishes
+// its histogram pass; the workgroup that observes the final value runs the
+// spine scan inline. `coherent` so the counter is visible across workgroups
+// within a single dispatch.
+layout(buffer_reference, std430, buffer_reference_align = 4) coherent buffer FinishedRef {
+ coherent uint value;
+};
+
layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer CountRef {
uint value;
};
#extension GL_EXT_scalar_block_layout : require
#extension GL_EXT_control_flow_attributes : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_shuffle_relative: enable
+#extension GL_KHR_shader_subgroup_vote : require
+
#include "compute_bindings.h"
#include "radix_sort.h"
#include "draw_2d.h"
#include "indirect.h"
+layout (constant_id = 0) const uint SUBGROUP_SIZE = 64;
+
+const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE;
+
layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer ValuesRef {
uint values[];
};
-layout(buffer_reference, std430, buffer_reference_align = 4) writeonly buffer SpineRef {
+// The spine is now produced AND consumed within a single dispatch: every
+// workgroup scatters its histogram here, then the last-finishing workgroup
+// reads all of those values back to compute the prefix sum. Dropping
+// `writeonly` is necessary but not sufficient -- cross-workgroup visibility
+// within one dispatch also requires the `coherent` qualifier.
+layout(buffer_reference, std430, buffer_reference_align = 4) coherent buffer SpineRef {
uint values[];
};
struct RadixSortUpsweepConstants {
uint shift;
uint _pad;
+ FinishedRef finished_buffer;
CountRef count_buffer;
ValuesRef src_buffer;
SpineRef spine_buffer;
shared uint histogram[RADIX_DIGITS];
+// True when this workgroup was NOT the last to finish its upsweep and should
+// exit before the spine scan. NOTE(review): the name reads inverted -- the
+// workgroup that is "finished" is the one that keeps running; consider
+// `should_exit` or `not_last` for clarity.
+shared bool finished;
+// Running carry of the exclusive scan across spine rows, advanced by the
+// last subgroup at the end of each loop iteration.
+shared uint carry;
+// Per-subgroup partial sums used to stitch subgroup-local scans into a
+// workgroup-wide scan.
+shared uint sums[NUM_SUBGROUPS];
+
layout (local_size_x = RADIX_DIGITS, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint shift = constants.shift;
const uint count = constants.count_buffer.value;
+ const uint workgroup_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
- const uint wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
- const bool needs_bounds_check = gl_WorkGroupID.x == wgp_count - 1;
+ const bool needs_bounds_check = gl_WorkGroupID.x == workgroup_count - 1;
// Clear local histogram
histogram[gl_LocalInvocationID.x] = 0;
// Scatter to the spine, this is a striped layout so we can efficiently
// calculate the prefix sum. Re-calculate how many workgroups we dispatched
// to determine the stride we need to write at.
- constants.spine_buffer.values[(gl_LocalInvocationID.x * wgp_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x];
+ constants.spine_buffer.values[(gl_LocalInvocationID.x * workgroup_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x];
+
+ barrier();
+
+ // One invocation bumps the global completion counter; every workgroup that
+ // was not the last to arrive exits, leaving the final workgroup to perform
+ // the spine scan that used to be a separate dispatch.
+ // NOTE(review): plain GLSL atomicAdd is relaxed under the Vulkan memory
+ // model -- confirm that the coherent qualifiers on the finished/spine
+ // buffers (or explicit acquire/release semantics via
+ // GL_KHR_memory_scope_semantics) make the other workgroups' spine writes
+ // visible to the surviving workgroup before the loop below reads them.
+ if (gl_SubgroupID == 0 && subgroupElect()) {
+ finished = atomicAdd(constants.finished_buffer.value, 1) < workgroup_count - 1;
+ }
+
+ barrier();
+
+ if (finished) {
+ return;
+ }
+
+ // reset for the next pass
+ // (every surviving invocation stores 0 here -- redundant but benign, since
+ // they all write the same value)
+ constants.finished_buffer.value = 0;
+
+ const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
+
+ carry = 0;
+ for (uint i = 0; i < workgroup_count; i++) {
+ // Load values and calculate partial sums
+ const uint value = constants.spine_buffer.values[i * RADIX_DIGITS + local_id];
+ const uint sum = subgroupAdd(value);
+ const uint scan = subgroupExclusiveAdd(value);
+
+ if (subgroupElect()) {
+ sums[gl_SubgroupID] = sum;
+ }
+
+ barrier();
+
+ // Snapshot the carry before the last subgroup advances it below.
+ const uint carry_in = carry;
+
+ // Scan partials
+ if (local_id < NUM_SUBGROUPS) {
+ sums[local_id] = subgroupExclusiveAdd(sums[local_id]);
+ }
+
+ barrier();
+
+ // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan
+ constants.spine_buffer.values[i * RADIX_DIGITS + local_id] = carry_in + sums[gl_SubgroupID] + scan;
+
+ if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) {
+ atomicAdd(carry, sums[gl_NumSubgroups - 1] + sum);
+ }
+ }
}
\ No newline at end of file
+++ /dev/null
-#version 460
-
-#extension GL_GOOGLE_include_directive : require
-
-#extension GL_EXT_buffer_reference : require
-#extension GL_EXT_buffer_reference2 : require
-#extension GL_EXT_scalar_block_layout : require
-#extension GL_EXT_control_flow_attributes : require
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_KHR_shader_subgroup_ballot : require
-#extension GL_KHR_shader_subgroup_shuffle_relative: enable
-#extension GL_KHR_shader_subgroup_vote : require
-
-#include "compute_bindings.h"
-
-#include "radix_sort.h"
-
-#include "draw_2d.h"
-#include "indirect.h"
-
-layout(buffer_reference, std430, buffer_reference_align = 4) buffer SpineRef {
- uint values[];
-};
-
-struct RadixSortSpineConstants {
- CountRef count_buffer;
- SpineRef spine_buffer;
-};
-
-layout(std430, push_constant) uniform RadixSortSpineConstantsBlock {
- RadixSortSpineConstants constants;
-};
-
-layout (constant_id = 0) const uint SUBGROUP_SIZE = 64;
-
-const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE;
-
-shared uint carry;
-shared uint sums[NUM_SUBGROUPS];
-
-layout (local_size_x = RADIX_WGP_SIZE, local_size_y = 1, local_size_z = 1) in;
-
-void main() {
- const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
-
- const uint count = constants.count_buffer.value;
-
- // Re-calculate how many workgroups pushed data into the spine
- const uint upsweep_wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
-
- carry = 0;
- for (uint i = 0; i < upsweep_wgp_count; i++) {
- // Load values and calculate partial sums
- const uint value = constants.spine_buffer.values[i * RADIX_DIGITS + local_id];
- const uint sum = subgroupAdd(value);
- const uint scan = subgroupExclusiveAdd(value);
-
- if (subgroupElect()) {
- sums[gl_SubgroupID] = sum;
- }
-
- barrier();
-
- const uint carry_in = carry;
-
- // Scan partials
- if (local_id < NUM_SUBGROUPS) {
- sums[local_id] = subgroupExclusiveAdd(sums[local_id]);
- }
-
- barrier();
-
- // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan
- constants.spine_buffer.values[i * RADIX_DIGITS + local_id] = carry_in + sums[gl_SubgroupID] + scan;
-
- if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) {
- atomicAdd(carry, sums[gl_NumSubgroups - 1] + sum);
- }
- }
-}
#[repr(C)]
pub struct Draw2dClearConstants<'a> {
+ /// Device address of the radix-sort completion counter, zeroed by the
+ /// draw-2d clear shader before the sort passes run.
+ pub finished_buffer_address: BufferAddress<'a>,
pub coarse_buffer_address: BufferAddress<'a>,
}
pub struct RadixSortUpsweepConstants<'a> {
pub shift: u32,
pub _pad: u32,
+ /// Device address of the cross-workgroup completion counter used to elect
+ /// the last upsweep workgroup, which performs the spine scan inline.
+ /// Field order must mirror the GLSL `RadixSortUpsweepConstants` struct.
+ pub finished_buffer_address: BufferAddress<'a>,
pub count_buffer_address: BufferAddress<'a>,
pub src_buffer_address: BufferAddress<'a>,
pub spine_buffer_address: BufferAddress<'a>,
}
-#[repr(C)]
-pub struct RadixSortSpineConstants<'a> {
- pub count_buffer_address: BufferAddress<'a>,
- pub spine_buffer_address: BufferAddress<'a>,
-}
-
#[repr(C)]
pub struct RadixSortDownsweepConstants<'a> {
pub shift: u32,
pub draw_2d_rasterize_pipeline: Pipeline,
pub radix_sort_0_upsweep_pipeline: Pipeline,
- pub radix_sort_1_spine_pipeline: Pipeline,
- pub radix_sort_2_downsweep_pipeline: Pipeline,
+ pub radix_sort_1_downsweep_pipeline: Pipeline,
pub composite_pipeline: Pipeline,
}
std::mem::size_of::<RadixSortUpsweepConstants>(),
);
- let radix_sort_1_spine_pipeline = create_compute_pipeline(
- crate::RADIX_SORT_1_SPINE_COMP_SPV,
- "radix_sort_spine",
- 32,
- true,
- std::mem::size_of::<RadixSortSpineConstants>(),
- );
-
- let radix_sort_2_downsweep_pipeline = create_compute_pipeline(
- crate::RADIX_SORT_2_DOWNSWEEP_COMP_SPV,
+ let radix_sort_1_downsweep_pipeline = create_compute_pipeline(
+ crate::RADIX_SORT_1_DOWNSWEEP_COMP_SPV,
"radix_sort_downsweep",
32,
true,
draw_2d_rasterize_pipeline,
radix_sort_0_upsweep_pipeline,
- radix_sort_1_spine_pipeline,
- radix_sort_2_downsweep_pipeline,
+ radix_sort_1_downsweep_pipeline,
composite_pipeline,
}
calculate_spine_size, BasicConstants, CompositeConstants, ComputeBinds, Draw2dClearConstants,
Draw2dCmd, Draw2dRasterizeConstants, Draw2dResolveConstants, Draw2dScatterConstants,
Draw2dSortConstants, GraphicsBinds, Pipelines, RadixSortDownsweepConstants,
- RadixSortSpineConstants, RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE,
+ RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE,
};
use renderdoc_sys as rdoc;
use helpers::load_obj;
use narcissus_app::{create_app, Event, Key, PressedState, WindowDesc};
use narcissus_core::{box_assume_init, default, rand::Pcg64, zeroed_box, BitIter};
-use narcissus_font::{FontCollection, GlyphCache, GlyphIndex, HorizontalMetrics};
+use narcissus_font::{FontCollection, GlyphCache, HorizontalMetrics};
use narcissus_gpu::{
create_device, Access, Bind, BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder,
ColorSpace, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Gpu, Image, ImageAspectFlags,
3 * std::mem::size_of::<u32>(),
);
+ // The finished counter is only ever accessed as a storage buffer (via its
+ // device address) by the clear and upsweep shaders; it is never consumed by
+ // an indirect command. INDIRECT looks copy-pasted from the
+ // indirect-dispatch buffer above -- the gpu_sort path creates the same
+ // buffer with STORAGE.
+ let finished_buffer = gpu.request_transient_buffer(
+ frame,
+ thread_token,
+ BufferUsageFlags::STORAGE,
+ std::mem::size_of::<u32>(),
+ );
+
let tmp_buffer = gpu.request_transient_buffer(
frame,
thread_token,
let coarse_buffer_address = gpu.get_buffer_address(coarse_buffer.to_arg());
let indirect_dispatch_buffer_address =
gpu.get_buffer_address(indirect_dispatch_buffer.to_arg());
+ let finished_buffer_address = gpu.get_buffer_address(finished_buffer.to_arg());
let tmp_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg());
let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg());
ShaderStageFlags::COMPUTE,
0,
&Draw2dClearConstants {
+ finished_buffer_address,
coarse_buffer_address,
},
);
&RadixSortUpsweepConstants {
shift,
_pad: 0,
+ finished_buffer_address,
count_buffer_address,
src_buffer_address,
spine_buffer_address,
&[],
);
- // Exclusive sum of the spine
- gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.radix_sort_1_spine_pipeline);
- gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
- gpu.cmd_push_constants(
- cmd_encoder,
- ShaderStageFlags::COMPUTE,
- 0,
- &RadixSortSpineConstants {
- count_buffer_address,
- spine_buffer_address,
- },
- );
- gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
-
- gpu.cmd_barrier(
- cmd_encoder,
- Some(&GlobalBarrier {
- prev_access: &[Access::ComputeWrite],
- next_access: &[Access::ComputeOtherRead],
- }),
- &[],
- );
-
// Downsweep
gpu.cmd_set_pipeline(
cmd_encoder,
- self.pipelines.radix_sort_2_downsweep_pipeline,
+ self.pipelines.radix_sort_1_downsweep_pipeline,
);
gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
gpu.cmd_push_constants(
};
use shark_shaders::pipelines::{
calcuate_workgroup_count, calculate_spine_size, Pipelines, RadixSortDownsweepConstants,
- RadixSortSpineConstants, RadixSortUpsweepConstants,
+ RadixSortUpsweepConstants,
};
fn gpu_sort(values: &mut [u32]) {
size: std::mem::size_of_val(values),
});
+ // Cross-workgroup completion counter for the fused upsweep/spine pass.
+ // NOTE(review): freshly created device-local memory is not guaranteed to be
+ // zeroed, and unlike the draw-2d path (which clears it in a shader) no
+ // zero-fill of this buffer is visible in this function -- verify it is
+ // cleared before the first upsweep dispatch.
+ let finished_buffer = gpu.create_buffer(&BufferDesc {
+ memory_location: MemoryLocation::Device,
+ host_mapped: false,
+ usage: BufferUsageFlags::STORAGE,
+ size: std::mem::size_of::<u32>(),
+ });
+
let spine_buffer = gpu.create_buffer(&BufferDesc {
memory_location: MemoryLocation::Device,
host_mapped: false,
});
let count_buffer_address = gpu.get_buffer_address(count_buffer.to_arg());
+ let finished_buffer_address = gpu.get_buffer_address(finished_buffer.to_arg());
let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg());
let mut src_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg());
let mut dst_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg());
&RadixSortUpsweepConstants {
shift,
_pad: 0,
+ finished_buffer_address,
count_buffer_address,
src_buffer_address,
spine_buffer_address,
&[],
);
- // Exclusive sum of the spine
- gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_spine_pipeline);
- gpu.cmd_push_constants(
- cmd_encoder,
- ShaderStageFlags::COMPUTE,
- 0,
- &RadixSortSpineConstants {
- count_buffer_address,
- spine_buffer_address,
- },
- );
- gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
-
- gpu.cmd_barrier(
- cmd_encoder,
- Some(&GlobalBarrier {
- prev_access: &[Access::ComputeWrite],
- next_access: &[Access::ComputeOtherRead],
- }),
- &[],
- );
-
// Downsweep
- gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_2_downsweep_pipeline);
+ gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_downsweep_pipeline);
gpu.cmd_push_constants(
cmd_encoder,
ShaderStageFlags::COMPUTE,