From: Josh Simmons <josh@nega.tv>
Date: Mon, 4 Nov 2024 20:23:11 +0000 (+0100)
Subject: shark: Add basic tests for gpu radix sorting
X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=76542513e2e3670f37edffc8aa338e731cc83e57;p=josh%2Fnarcissus

shark: Add basic tests for gpu radix sorting
---

diff --git a/Cargo.lock b/Cargo.lock
index 0f9129a..a891808 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -191,6 +191,12 @@ dependencies = [
 [[package]]
 name = "shark-shaders"
 version = "0.1.0"
+dependencies = [
+ "narcissus-core",
+ "narcissus-font",
+ "narcissus-gpu",
+ "narcissus-maths",
+]
 
 [[package]]
 name = "sqlite-sys"
diff --git a/engine/narcissus-gpu/src/lib.rs b/engine/narcissus-gpu/src/lib.rs
index 1ae5d81..7cab81b 100644
--- a/engine/narcissus-gpu/src/lib.rs
+++ b/engine/narcissus-gpu/src/lib.rs
@@ -13,6 +13,8 @@ mod tlsf;
 
 pub use mapped_buffer::{PersistentBuffer, TransientBuffer};
 
+pub type Gpu = dyn Device + 'static;
+
 pub enum DeviceBackend {
     Vulkan,
 }
diff --git a/title/shark-shaders/Cargo.toml b/title/shark-shaders/Cargo.toml
index ccd34fd..6f57c70 100644
--- a/title/shark-shaders/Cargo.toml
+++ b/title/shark-shaders/Cargo.toml
@@ -6,3 +6,7 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+narcissus-core = { path = "../../engine/narcissus-core" }
+narcissus-font = { path = "../../engine/narcissus-font" }
+narcissus-maths = { path = "../../engine/narcissus-maths" }
+narcissus-gpu = { path = "../../engine/narcissus-gpu" }
\ No newline at end of file
diff --git a/title/shark-shaders/src/lib.rs b/title/shark-shaders/src/lib.rs
index 9f574b0..e24a302 100644
--- a/title/shark-shaders/src/lib.rs
+++ b/title/shark-shaders/src/lib.rs
@@ -1 +1,3 @@
+pub mod pipelines;
+
 include!(concat!(env!("OUT_DIR"), "/shaders.rs"));
diff --git a/title/shark/src/pipelines.rs b/title/shark-shaders/src/pipelines.rs
similarity index 89%
rename from title/shark/src/pipelines.rs
rename to title/shark-shaders/src/pipelines.rs
index 5b893b4..9bc69e3 100644
--- a/title/shark/src/pipelines.rs
+++ b/title/shark-shaders/src/pipelines.rs
@@ -2,15 +2,13 @@ use narcissus_core::default;
 use narcissus_font::TouchedGlyphIndex;
 use narcissus_gpu::{
     BindDesc, BindGroupLayout, BindingType, BlendMode, BufferAddress, CompareOp,
-    ComputePipelineDesc, CullingMode, FrontFace, GraphicsPipelineAttachments, GraphicsPipelineDesc,
-    ImageFormat, Pipeline, PipelineLayout, PolygonMode, PushConstantRange, Sampler,
-    SamplerAddressMode, SamplerDesc, SamplerFilter, ShaderDesc, ShaderStageFlags, SpecConstant,
-    Topology,
+    ComputePipelineDesc, CullingMode, FrontFace, Gpu, GraphicsPipelineAttachments,
+    GraphicsPipelineDesc, ImageFormat, Pipeline, PipelineLayout, PolygonMode, PushConstantRange,
+    Sampler, SamplerAddressMode, SamplerDesc, SamplerFilter, ShaderDesc, ShaderStageFlags,
+    SpecConstant, Topology,
 };
 use narcissus_maths::Mat4;
 
-use crate::Gpu;
-
 pub const DRAW_2D_TILE_SIZE: u32 = 32;
 
 #[allow(unused)]
@@ -196,6 +194,18 @@ pub struct RadixSortDownsweepConstants<'a> {
     pub dst_buffer_address: BufferAddress<'a>,
 }
 
+pub const RADIX_ITEMS_PER_WGP: usize = 4096;
+pub const RADIX_DIGITS: usize = 256;
+
+pub fn calcuate_workgroup_count(count: usize) -> usize {
+    (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP
+}
+
+/// Returns the size of the spine required to sort the given count in units of u32 words.
+pub fn calculate_spine_size(count: usize) -> usize {
+    calcuate_workgroup_count(count) * RADIX_DIGITS
+}
+
 pub struct Pipelines {
     _samplers: Samplers,
 
@@ -251,11 +261,11 @@ impl Pipelines {
 
         let basic_pipeline = gpu.create_graphics_pipeline(&GraphicsPipelineDesc {
             vertex_shader: ShaderDesc {
-                code: shark_shaders::BASIC_VERT_SPV,
+                code: crate::BASIC_VERT_SPV,
                 ..default()
             },
             fragment_shader: ShaderDesc {
-                code: shark_shaders::BASIC_FRAG_SPV,
+                code: crate::BASIC_FRAG_SPV,
                 ..default()
             },
             layout: PipelineLayout {
@@ -327,7 +337,7 @@ impl Pipelines {
         };
 
         let draw_2d_bin_0_clear_pipeline = create_compute_pipeline(
-            shark_shaders::DRAW_2D_BIN_0_CLEAR_COMP_SPV,
+            crate::DRAW_2D_BIN_0_CLEAR_COMP_SPV,
             "draw2d_bin_clear",
             0,
             std::mem::size_of::<Draw2dClearConstants>(),
@@ -335,56 +345,52 @@ impl Pipelines {
 
         let draw_2d_bin_1_scatter_pipeline_workgroup_size = 32;
         let draw_2d_bin_1_scatter_pipeline = create_compute_pipeline(
-            shark_shaders::DRAW_2D_BIN_1_SCATTER_COMP_SPV,
+            crate::DRAW_2D_BIN_1_SCATTER_COMP_SPV,
             "draw2d_bin_scatter",
             draw_2d_bin_1_scatter_pipeline_workgroup_size,
             std::mem::size_of::<Draw2dScatterConstants>(),
         );
 
         let draw_2d_bin_2_sort_pipeline = create_compute_pipeline(
-            shark_shaders::DRAW_2D_BIN_2_SORT_COMP_SPV,
+            crate::DRAW_2D_BIN_2_SORT_COMP_SPV,
             "draw2d_bin_sort",
             0,
             std::mem::size_of::<Draw2dSortConstants>(),
         );
 
         let draw_2d_bin_3_resolve_pipeline = create_compute_pipeline(
-            shark_shaders::DRAW_2D_BIN_3_RESOLVE_COMP_SPV,
+            crate::DRAW_2D_BIN_3_RESOLVE_COMP_SPV,
             "draw2d_bin_resolve",
             0,
             0,
         );
 
-        let draw_2d_rasterize_pipeline = create_compute_pipeline(
-            shark_shaders::DRAW_2D_RASTERIZE_COMP_SPV,
-            "draw2d_rasterize",
-            0,
-            0,
-        );
+        let draw_2d_rasterize_pipeline =
+            create_compute_pipeline(crate::DRAW_2D_RASTERIZE_COMP_SPV, "draw2d_rasterize", 0, 0);
 
         let radix_sort_0_upsweep_pipeline = create_compute_pipeline(
-            shark_shaders::RADIX_SORT_0_UPSWEEP_COMP_SPV,
+            crate::RADIX_SORT_0_UPSWEEP_COMP_SPV,
             "radix_sort_upsweep",
             32,
             std::mem::size_of::<RadixSortUpsweepConstants>(),
         );
 
         let radix_sort_1_spine_pipeline = create_compute_pipeline(
-            shark_shaders::RADIX_SORT_1_SPINE_COMP_SPV,
+            crate::RADIX_SORT_1_SPINE_COMP_SPV,
             "radix_sort_spine",
             32,
             std::mem::size_of::<RadixSortSpineConstants>(),
         );
 
         let radix_sort_2_downsweep_pipeline = create_compute_pipeline(
-            shark_shaders::RADIX_SORT_2_DOWNSWEEP_COMP_SPV,
+            crate::RADIX_SORT_2_DOWNSWEEP_COMP_SPV,
             "radix_sort_downsweep",
             32,
             std::mem::size_of::<RadixSortDownsweepConstants>(),
         );
 
         let composite_pipeline =
-            create_compute_pipeline(shark_shaders::COMPOSITE_COMP_SPV, "composite", 0, 0);
+            create_compute_pipeline(crate::COMPOSITE_COMP_SPV, "composite", 0, 0);
 
         Self {
             _samplers: samplers,
diff --git a/title/shark/src/helpers.rs b/title/shark/src/helpers.rs
index e45e0ba..000300f 100644
--- a/title/shark/src/helpers.rs
+++ b/title/shark/src/helpers.rs
@@ -3,7 +3,7 @@ use std::path::Path;
 use narcissus_core::{obj, Widen};
 use narcissus_maths::{vec2, vec3, vec4, Vec2, Vec3};
 
-use crate::pipelines::Vertex;
+use shark_shaders::pipelines::Vertex;
 
 pub fn load_obj<P: AsRef<Path>>(path: P) -> (Vec<Vertex>, Vec<u16>) {
     #[derive(Default)]
diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs
index 8986482..5441cb5 100644
--- a/title/shark/src/main.rs
+++ b/title/shark/src/main.rs
@@ -5,10 +5,11 @@ use std::time::{Duration, Instant};
 
 use narcissus_core::dds;
 
-use pipelines::{
-    BasicConstants, ComputeBinds, Draw2dClearConstants, Draw2dCmd, Draw2dScatterConstants,
-    Draw2dSortConstants, GraphicsBinds, Pipelines, RadixSortDownsweepConstants,
-    RadixSortSpineConstants, RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE,
+use shark_shaders::pipelines::{
+    calculate_spine_size, BasicConstants, ComputeBinds, Draw2dClearConstants, Draw2dCmd,
+    Draw2dScatterConstants, Draw2dSortConstants, GraphicsBinds, Pipelines,
+    RadixSortDownsweepConstants, RadixSortSpineConstants, RadixSortUpsweepConstants,
+    DRAW_2D_TILE_SIZE,
 };
 
 use renderdoc_sys as rdoc;
@@ -20,12 +21,11 @@ use narcissus_core::{box_assume_init, default, rand::Pcg64, zeroed_box, BitIter}
 use narcissus_font::{FontCollection, GlyphCache, HorizontalMetrics};
 use narcissus_gpu::{
     create_device, Access, Bind, BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder,
-    ColorSpace, Device, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Image,
-    ImageAspectFlags, ImageBarrier, ImageDesc, ImageDimension, ImageFormat, ImageLayout,
-    ImageSubresourceRange, ImageTiling, ImageUsageFlags, IndexType, LoadOp, MemoryLocation,
-    Offset2d, PersistentBuffer, PresentMode, RenderingAttachment, RenderingDesc, Scissor,
-    ShaderStageFlags, StoreOp, SwapchainConfigurator, SwapchainImage, ThreadToken, TypedBind,
-    Viewport,
+    ColorSpace, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Gpu, Image, ImageAspectFlags,
+    ImageBarrier, ImageDesc, ImageDimension, ImageFormat, ImageLayout, ImageSubresourceRange,
+    ImageTiling, ImageUsageFlags, IndexType, LoadOp, MemoryLocation, Offset2d, PersistentBuffer,
+    PresentMode, RenderingAttachment, RenderingDesc, Scissor, ShaderStageFlags, StoreOp,
+    SwapchainConfigurator, SwapchainImage, ThreadToken, TypedBind, Viewport,
 };
 use narcissus_image as image;
 use narcissus_maths::{
@@ -37,7 +37,6 @@ use spring::simple_spring_damper_exact;
 mod fonts;
 mod helpers;
 pub mod microshades;
-mod pipelines;
 mod spring;
 
 const SQRT_2: f32 = 0.70710677;
@@ -836,8 +835,6 @@ impl Images {
     }
 }
 
-type Gpu = dyn Device + 'static;
-
 struct DrawState<'gpu> {
     gpu: &'gpu Gpu,
 
@@ -1312,7 +1309,7 @@ impl<'gpu> DrawState<'gpu> {
                     3 * std::mem::size_of::<u32>(),
                 );
 
-                let sort_tmp_buffer = gpu.request_transient_buffer(
+                let tmp_buffer = gpu.request_transient_buffer(
                     frame,
                     thread_token,
                     BufferUsageFlags::STORAGE,
@@ -1323,7 +1320,7 @@ impl<'gpu> DrawState<'gpu> {
                     frame,
                     thread_token,
                     BufferUsageFlags::STORAGE,
-                    (COARSE_BUFFER_LEN / (32 * 16)) * 256 * std::mem::size_of::<u32>(), // TODO: Fix size
+                    calculate_spine_size(COARSE_BUFFER_LEN) * std::mem::size_of::<u32>(), // TODO: Fix size
                 );
 
                 let draw_buffer_address = gpu.get_buffer_address(draw_buffer.to_arg());
@@ -1331,7 +1328,7 @@ impl<'gpu> DrawState<'gpu> {
                 let coarse_buffer_address = gpu.get_buffer_address(coarse_buffer.to_arg());
                 let indirect_dispatch_buffer_address =
                     gpu.get_buffer_address(indirect_dispatch_buffer.to_arg());
-                let sort_tmp_buffer_address = gpu.get_buffer_address(sort_tmp_buffer.to_arg());
+                let tmp_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg());
                 let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg());
 
                 gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_0_clear_pipeline);
@@ -1394,21 +1391,6 @@ impl<'gpu> DrawState<'gpu> {
                     &[],
                 );
 
-                // let mut sort_data = Vec::new();
-                // let count = 8192u32;
-                // sort_data.push(count);
-                // for i in 0..count {
-                //     sort_data.push(255 - i / 32);
-                // }
-
-                // let sort_buffer = gpu.request_transient_buffer_with_data(
-                //     frame,
-                //     thread_token,
-                //     BufferUsageFlags::STORAGE,
-                //     sort_data.as_slice(),
-                // );
-                // let sort_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg());
-
                 gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_2_sort_pipeline);
                 gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
                 gpu.cmd_push_constants(
@@ -1444,7 +1426,7 @@ impl<'gpu> DrawState<'gpu> {
                 let count_buffer_address = coarse_buffer_address;
                 // Then the elements we want to sort follow.
                 let mut src_buffer_address = count_buffer_address.byte_add(4);
-                let mut dst_buffer_address = sort_tmp_buffer_address;
+                let mut dst_buffer_address = tmp_buffer_address;
 
                 for pass in 0..4 {
                     let shift = pass * 8;
diff --git a/title/shark/tests/radix_sort.rs b/title/shark/tests/radix_sort.rs
new file mode 100644
index 0000000..64e1897
--- /dev/null
+++ b/title/shark/tests/radix_sort.rs
@@ -0,0 +1,240 @@
+use narcissus_core::rand::Pcg64;
+use narcissus_gpu::{
+    create_device, Access, BufferDesc, BufferUsageFlags, DeviceExt, GlobalBarrier, MemoryLocation,
+    ShaderStageFlags, ThreadToken,
+};
+use shark_shaders::pipelines::{
+    calcuate_workgroup_count, calculate_spine_size, Pipelines, RadixSortDownsweepConstants,
+    RadixSortSpineConstants, RadixSortUpsweepConstants,
+};
+
+fn gpu_sort(values: &mut [u32]) {
+    let gpu = create_device(narcissus_gpu::DeviceBackend::Vulkan);
+    let gpu = gpu.as_ref();
+
+    let pipelines = Pipelines::load(gpu);
+
+    let count_buffer = gpu.create_persistent_buffer_with_data(
+        MemoryLocation::Device,
+        BufferUsageFlags::STORAGE,
+        &(values.len() as u32),
+    );
+
+    let sort_buffer = gpu.create_persistent_buffer_with_data(
+        MemoryLocation::Device,
+        BufferUsageFlags::STORAGE,
+        values,
+    );
+
+    let tmp_buffer = gpu.create_buffer(&BufferDesc {
+        memory_location: MemoryLocation::Device,
+        host_mapped: false,
+        usage: BufferUsageFlags::STORAGE,
+        size: std::mem::size_of_val(values),
+    });
+
+    let spine_buffer = gpu.create_buffer(&BufferDesc {
+        memory_location: MemoryLocation::Device,
+        host_mapped: false,
+        usage: BufferUsageFlags::STORAGE,
+        size: calculate_spine_size(values.len()) * std::mem::size_of::<u32>(),
+    });
+
+    let count_buffer_address = gpu.get_buffer_address(count_buffer.to_arg());
+    let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg());
+    let mut src_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg());
+    let mut dst_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg());
+
+    let thread_token = ThreadToken::new();
+    let thread_token = &thread_token;
+    let frame = gpu.begin_frame();
+    {
+        let frame = &frame;
+        let mut cmd_encoder = gpu.request_cmd_encoder(frame, thread_token);
+
+        {
+            let cmd_encoder = &mut cmd_encoder;
+
+            for pass in 0..4 {
+                let shift = pass * 8;
+
+                // Upsweep
+                gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_0_upsweep_pipeline);
+                gpu.cmd_push_constants(
+                    cmd_encoder,
+                    ShaderStageFlags::COMPUTE,
+                    0,
+                    &RadixSortUpsweepConstants {
+                        shift,
+                        _pad: 0,
+                        count_buffer_address,
+                        src_buffer_address,
+                        spine_buffer_address,
+                    },
+                );
+                gpu.cmd_dispatch(
+                    cmd_encoder,
+                    calcuate_workgroup_count(values.len()) as u32,
+                    1,
+                    1,
+                );
+
+                gpu.cmd_barrier(
+                    cmd_encoder,
+                    Some(&GlobalBarrier {
+                        prev_access: &[Access::ComputeWrite],
+                        next_access: &[Access::ComputeOtherRead],
+                    }),
+                    &[],
+                );
+
+                // Exclusive sum of the spine
+                gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_spine_pipeline);
+                gpu.cmd_push_constants(
+                    cmd_encoder,
+                    ShaderStageFlags::COMPUTE,
+                    0,
+                    &RadixSortSpineConstants {
+                        count_buffer_address,
+                        spine_buffer_address,
+                    },
+                );
+                gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
+
+                gpu.cmd_barrier(
+                    cmd_encoder,
+                    Some(&GlobalBarrier {
+                        prev_access: &[Access::ComputeWrite],
+                        next_access: &[Access::ComputeOtherRead],
+                    }),
+                    &[],
+                );
+
+                // Downsweep
+                gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_2_downsweep_pipeline);
+                gpu.cmd_push_constants(
+                    cmd_encoder,
+                    ShaderStageFlags::COMPUTE,
+                    0,
+                    &RadixSortDownsweepConstants {
+                        shift,
+                        _pad: 0,
+                        count_buffer_address,
+                        spine_buffer_address,
+                        src_buffer_address,
+                        dst_buffer_address,
+                    },
+                );
+                gpu.cmd_dispatch(
+                    cmd_encoder,
+                    calcuate_workgroup_count(values.len()) as u32,
+                    1,
+                    1,
+                );
+
+                gpu.cmd_barrier(
+                    cmd_encoder,
+                    Some(&GlobalBarrier {
+                        prev_access: &[Access::ComputeWrite],
+                        next_access: &[Access::ComputeOtherRead],
+                    }),
+                    &[],
+                );
+
+                std::mem::swap(&mut src_buffer_address, &mut dst_buffer_address);
+            }
+        }
+
+        gpu.submit(frame, cmd_encoder);
+    }
+
+    gpu.end_frame(frame);
+
+    gpu.wait_idle();
+
+    unsafe { sort_buffer.copy_to_slice(values) };
+}
+
+// This test requires a GPU, so ignore the test by default.
+#[ignore]
+#[test]
+pub fn sort_random_input() {
+    let mut rng = Pcg64::new();
+
+    let count = 15 * 1024 * 1024 + 3;
+
+    let mut values = vec![];
+    values.reserve_exact(count);
+    for _ in 0..count / 2 {
+        let i = rng.next_u64();
+        values.push((i & 0xffff_ffff) as u32);
+        values.push(((i >> 32) & 0xffff_ffff) as u32);
+    }
+
+    values.push((rng.next_u64() & 0xffff_ffff) as u32);
+
+    gpu_sort(&mut values);
+
+    assert!(values.is_sorted());
+}
+
+#[ignore]
+#[test]
+pub fn sort_single() {
+    let mut values = vec![5];
+    let mut sorted = values.clone();
+    sorted.sort();
+
+    gpu_sort(&mut values);
+
+    assert!(values == sorted);
+}
+
+#[ignore]
+#[test]
+pub fn sort_double() {
+    let mut values = vec![u32::MAX, 0];
+    let mut sorted = values.clone();
+    sorted.sort();
+
+    gpu_sort(&mut values);
+
+    assert!(values.is_sorted());
+}
+
+#[ignore]
+#[test]
+pub fn sort_short_input() {
+    let mut values = vec![5, 4, 3, 2, 1];
+    let mut sorted = values.clone();
+    sorted.sort();
+
+    assert!(!values.is_sorted());
+
+    gpu_sort(&mut values);
+
+    assert!(values.is_sorted());
+    assert!(values == sorted);
+}
+
+#[ignore]
+#[test]
+pub fn sort_u32_max() {
+    let mut values = vec![u32::MAX; 10_000];
+
+    gpu_sort(&mut values);
+
+    assert!(values.is_sorted());
+    assert!(values.iter().all(|&x| x == u32::MAX));
+}
+
+#[ignore]
+#[test]
+pub fn sort_u32_zero() {
+    let mut values = vec![0; 10_000];
+
+    gpu_sort(&mut values);
+
+    assert!(values.is_sorted());
+    assert!(values.iter().all(|&x| x == 0));
+}