From: Josh Simmons Date: Mon, 4 Nov 2024 20:23:11 +0000 (+0100) Subject: shark: Add basic tests for gpu radix sorting X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=76542513e2e3670f37edffc8aa338e731cc83e57;p=josh%2Fnarcissus shark: Add basic tests for gpu radix sorting --- diff --git a/Cargo.lock b/Cargo.lock index 0f9129a..a891808 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -191,6 +191,12 @@ dependencies = [ [[package]] name = "shark-shaders" version = "0.1.0" +dependencies = [ + "narcissus-core", + "narcissus-font", + "narcissus-gpu", + "narcissus-maths", +] [[package]] name = "sqlite-sys" diff --git a/engine/narcissus-gpu/src/lib.rs b/engine/narcissus-gpu/src/lib.rs index 1ae5d81..7cab81b 100644 --- a/engine/narcissus-gpu/src/lib.rs +++ b/engine/narcissus-gpu/src/lib.rs @@ -13,6 +13,8 @@ mod tlsf; pub use mapped_buffer::{PersistentBuffer, TransientBuffer}; +pub type Gpu = dyn Device + 'static; + pub enum DeviceBackend { Vulkan, } diff --git a/title/shark-shaders/Cargo.toml b/title/shark-shaders/Cargo.toml index ccd34fd..6f57c70 100644 --- a/title/shark-shaders/Cargo.toml +++ b/title/shark-shaders/Cargo.toml @@ -6,3 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +narcissus-core = { path = "../../engine/narcissus-core" } +narcissus-font = { path = "../../engine/narcissus-font" } +narcissus-maths = { path = "../../engine/narcissus-maths" } +narcissus-gpu = { path = "../../engine/narcissus-gpu" } \ No newline at end of file diff --git a/title/shark-shaders/src/lib.rs b/title/shark-shaders/src/lib.rs index 9f574b0..e24a302 100644 --- a/title/shark-shaders/src/lib.rs +++ b/title/shark-shaders/src/lib.rs @@ -1 +1,3 @@ +pub mod pipelines; + include!(concat!(env!("OUT_DIR"), "/shaders.rs")); diff --git a/title/shark/src/pipelines.rs b/title/shark-shaders/src/pipelines.rs similarity index 89% rename from title/shark/src/pipelines.rs rename to title/shark-shaders/src/pipelines.rs index 5b893b4..9bc69e3 100644 --- a/title/shark/src/pipelines.rs +++ b/title/shark-shaders/src/pipelines.rs @@ -2,15 +2,13 @@ use narcissus_core::default; use narcissus_font::TouchedGlyphIndex; use narcissus_gpu::{ BindDesc, BindGroupLayout, BindingType, BlendMode, BufferAddress, CompareOp, - ComputePipelineDesc, CullingMode, FrontFace, GraphicsPipelineAttachments, GraphicsPipelineDesc, - ImageFormat, Pipeline, PipelineLayout, PolygonMode, PushConstantRange, Sampler, - SamplerAddressMode, SamplerDesc, SamplerFilter, ShaderDesc, ShaderStageFlags, SpecConstant, - Topology, + ComputePipelineDesc, CullingMode, FrontFace, Gpu, GraphicsPipelineAttachments, + GraphicsPipelineDesc, ImageFormat, Pipeline, PipelineLayout, PolygonMode, PushConstantRange, + Sampler, SamplerAddressMode, SamplerDesc, SamplerFilter, ShaderDesc, ShaderStageFlags, + SpecConstant, Topology, }; use narcissus_maths::Mat4; -use crate::Gpu; - pub const DRAW_2D_TILE_SIZE: u32 = 32; #[allow(unused)] @@ -196,6 +194,18 @@ pub struct RadixSortDownsweepConstants<'a> { pub dst_buffer_address: BufferAddress<'a>, } +pub const RADIX_ITEMS_PER_WGP: usize = 4096; +pub const RADIX_DIGITS: usize = 256; + +pub fn calcuate_workgroup_count(count: usize) -> usize { + (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP +} + +/// Returns the size of the spine required to sort the given count in units of u32 words. +pub fn calculate_spine_size(count: usize) -> usize { + calcuate_workgroup_count(count) * RADIX_DIGITS +} + pub struct Pipelines { _samplers: Samplers, @@ -251,11 +261,11 @@ impl Pipelines { let basic_pipeline = gpu.create_graphics_pipeline(&GraphicsPipelineDesc { vertex_shader: ShaderDesc { - code: shark_shaders::BASIC_VERT_SPV, + code: crate::BASIC_VERT_SPV, ..default() }, fragment_shader: ShaderDesc { - code: shark_shaders::BASIC_FRAG_SPV, + code: crate::BASIC_FRAG_SPV, ..default() }, layout: PipelineLayout { @@ -327,7 +337,7 @@ impl Pipelines { }; let draw_2d_bin_0_clear_pipeline = create_compute_pipeline( - shark_shaders::DRAW_2D_BIN_0_CLEAR_COMP_SPV, + crate::DRAW_2D_BIN_0_CLEAR_COMP_SPV, "draw2d_bin_clear", 0, std::mem::size_of::(), @@ -335,56 +345,52 @@ impl Pipelines { let draw_2d_bin_1_scatter_pipeline_workgroup_size = 32; let draw_2d_bin_1_scatter_pipeline = create_compute_pipeline( - shark_shaders::DRAW_2D_BIN_1_SCATTER_COMP_SPV, + crate::DRAW_2D_BIN_1_SCATTER_COMP_SPV, "draw2d_bin_scatter", draw_2d_bin_1_scatter_pipeline_workgroup_size, std::mem::size_of::(), ); let draw_2d_bin_2_sort_pipeline = create_compute_pipeline( - shark_shaders::DRAW_2D_BIN_2_SORT_COMP_SPV, + crate::DRAW_2D_BIN_2_SORT_COMP_SPV, "draw2d_bin_sort", 0, std::mem::size_of::(), ); let draw_2d_bin_3_resolve_pipeline = create_compute_pipeline( - shark_shaders::DRAW_2D_BIN_3_RESOLVE_COMP_SPV, + crate::DRAW_2D_BIN_3_RESOLVE_COMP_SPV, "draw2d_bin_resolve", 0, 0, ); - let draw_2d_rasterize_pipeline = create_compute_pipeline( - shark_shaders::DRAW_2D_RASTERIZE_COMP_SPV, - "draw2d_rasterize", - 0, - 0, - ); + let draw_2d_rasterize_pipeline = + create_compute_pipeline(crate::DRAW_2D_RASTERIZE_COMP_SPV, "draw2d_rasterize", 0, 0); let radix_sort_0_upsweep_pipeline = create_compute_pipeline( - shark_shaders::RADIX_SORT_0_UPSWEEP_COMP_SPV, + crate::RADIX_SORT_0_UPSWEEP_COMP_SPV, "radix_sort_upsweep", 32, std::mem::size_of::(), ); let radix_sort_1_spine_pipeline = create_compute_pipeline( - shark_shaders::RADIX_SORT_1_SPINE_COMP_SPV, + crate::RADIX_SORT_1_SPINE_COMP_SPV, "radix_sort_spine", 32, std::mem::size_of::(), ); let radix_sort_2_downsweep_pipeline = create_compute_pipeline( - shark_shaders::RADIX_SORT_2_DOWNSWEEP_COMP_SPV, + crate::RADIX_SORT_2_DOWNSWEEP_COMP_SPV, "radix_sort_downsweep", 32, std::mem::size_of::(), ); let composite_pipeline = - create_compute_pipeline(shark_shaders::COMPOSITE_COMP_SPV, "composite", 0, 0); + create_compute_pipeline(crate::COMPOSITE_COMP_SPV, "composite", 0, 0); Self { _samplers: samplers, diff --git a/title/shark/src/helpers.rs b/title/shark/src/helpers.rs index e45e0ba..000300f 100644 --- a/title/shark/src/helpers.rs +++ b/title/shark/src/helpers.rs @@ -3,7 +3,7 @@ use std::path::Path; use narcissus_core::{obj, Widen}; use narcissus_maths::{vec2, vec3, vec4, Vec2, Vec3}; -use crate::pipelines::Vertex; +use shark_shaders::pipelines::Vertex; pub fn load_obj>(path: P) -> (Vec, Vec) { #[derive(Default)] diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs index 8986482..5441cb5 100644 --- a/title/shark/src/main.rs +++ b/title/shark/src/main.rs @@ -5,10 +5,11 @@ use std::time::{Duration, Instant}; use narcissus_core::dds; -use pipelines::{ - BasicConstants, ComputeBinds, Draw2dClearConstants, Draw2dCmd, Draw2dScatterConstants, - Draw2dSortConstants, GraphicsBinds, Pipelines, RadixSortDownsweepConstants, - RadixSortSpineConstants, RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE, +use shark_shaders::pipelines::{ + calculate_spine_size, BasicConstants, ComputeBinds, Draw2dClearConstants, Draw2dCmd, + Draw2dScatterConstants, Draw2dSortConstants, GraphicsBinds, Pipelines, + RadixSortDownsweepConstants, RadixSortSpineConstants, RadixSortUpsweepConstants, + DRAW_2D_TILE_SIZE, }; use renderdoc_sys as rdoc; @@ -20,12 +21,11 @@ use narcissus_core::{box_assume_init, default, rand::Pcg64, zeroed_box, BitIter} use narcissus_font::{FontCollection, GlyphCache, HorizontalMetrics}; use narcissus_gpu::{ create_device, Access, Bind, BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder, - ColorSpace, Device, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Image, - ImageAspectFlags, ImageBarrier, ImageDesc, ImageDimension, ImageFormat, ImageLayout, - ImageSubresourceRange, ImageTiling, ImageUsageFlags, IndexType, LoadOp, MemoryLocation, - Offset2d, PersistentBuffer, PresentMode, RenderingAttachment, RenderingDesc, Scissor, - ShaderStageFlags, StoreOp, SwapchainConfigurator, SwapchainImage, ThreadToken, TypedBind, - Viewport, + ColorSpace, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Gpu, Image, ImageAspectFlags, + ImageBarrier, ImageDesc, ImageDimension, ImageFormat, ImageLayout, ImageSubresourceRange, + ImageTiling, ImageUsageFlags, IndexType, LoadOp, MemoryLocation, Offset2d, PersistentBuffer, + PresentMode, RenderingAttachment, RenderingDesc, Scissor, ShaderStageFlags, StoreOp, + SwapchainConfigurator, SwapchainImage, ThreadToken, TypedBind, Viewport, }; use narcissus_image as image; use narcissus_maths::{ @@ -37,7 +37,6 @@ use spring::simple_spring_damper_exact; mod fonts; mod helpers; pub mod microshades; -mod pipelines; mod spring; const SQRT_2: f32 = 0.70710677; @@ -836,8 +835,6 @@ impl Images { } } -type Gpu = dyn Device + 'static; - struct DrawState<'gpu> { gpu: &'gpu Gpu, @@ -1312,7 +1309,7 @@ impl<'gpu> DrawState<'gpu> { 3 * std::mem::size_of::(), ); - let sort_tmp_buffer = gpu.request_transient_buffer( + let tmp_buffer = gpu.request_transient_buffer( frame, thread_token, BufferUsageFlags::STORAGE, @@ -1323,7 +1320,7 @@ impl<'gpu> DrawState<'gpu> { frame, thread_token, BufferUsageFlags::STORAGE, - (COARSE_BUFFER_LEN / (32 * 16)) * 256 * std::mem::size_of::(), // TODO: Fix size + calculate_spine_size(COARSE_BUFFER_LEN) * std::mem::size_of::(), // TODO: Fix size ); let draw_buffer_address = gpu.get_buffer_address(draw_buffer.to_arg()); @@ -1331,7 +1328,7 @@ impl<'gpu> DrawState<'gpu> { let coarse_buffer_address = gpu.get_buffer_address(coarse_buffer.to_arg()); let indirect_dispatch_buffer_address = gpu.get_buffer_address(indirect_dispatch_buffer.to_arg()); - let sort_tmp_buffer_address = gpu.get_buffer_address(sort_tmp_buffer.to_arg()); + let tmp_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg()); let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg()); gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_0_clear_pipeline); @@ -1394,21 +1391,6 @@ impl<'gpu> DrawState<'gpu> { &[], ); - // let mut sort_data = Vec::new(); - // let count = 8192u32; - // sort_data.push(count); - // for i in 0..count { - // sort_data.push(255 - i / 32); - // } - - // let sort_buffer = gpu.request_transient_buffer_with_data( - // frame, - // thread_token, - // BufferUsageFlags::STORAGE, - // sort_data.as_slice(), - // ); - // let sort_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg()); - gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_2_sort_pipeline); gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); gpu.cmd_push_constants( @@ -1444,7 +1426,7 @@ impl<'gpu> DrawState<'gpu> { let count_buffer_address = coarse_buffer_address; // Then the elements we want to sort follow. let mut src_buffer_address = count_buffer_address.byte_add(4); - let mut dst_buffer_address = sort_tmp_buffer_address; + let mut dst_buffer_address = tmp_buffer_address; for pass in 0..4 { let shift = pass * 8; diff --git a/title/shark/tests/radix_sort.rs b/title/shark/tests/radix_sort.rs new file mode 100644 index 0000000..64e1897 --- /dev/null +++ b/title/shark/tests/radix_sort.rs @@ -0,0 +1,240 @@ +use narcissus_core::rand::Pcg64; +use narcissus_gpu::{ + create_device, Access, BufferDesc, BufferUsageFlags, DeviceExt, GlobalBarrier, MemoryLocation, + ShaderStageFlags, ThreadToken, +}; +use shark_shaders::pipelines::{ + calcuate_workgroup_count, calculate_spine_size, Pipelines, RadixSortDownsweepConstants, + RadixSortSpineConstants, RadixSortUpsweepConstants, +}; + +fn gpu_sort(values: &mut [u32]) { + let gpu = create_device(narcissus_gpu::DeviceBackend::Vulkan); + let gpu = gpu.as_ref(); + + let pipelines = Pipelines::load(gpu); + + let count_buffer = gpu.create_persistent_buffer_with_data( + MemoryLocation::Device, + BufferUsageFlags::STORAGE, + &(values.len() as u32), + ); + + let sort_buffer = gpu.create_persistent_buffer_with_data( + MemoryLocation::Device, + BufferUsageFlags::STORAGE, + values, + ); + + let tmp_buffer = gpu.create_buffer(&BufferDesc { + memory_location: MemoryLocation::Device, + host_mapped: false, + usage: BufferUsageFlags::STORAGE, + size: std::mem::size_of_val(values), + }); + + let spine_buffer = gpu.create_buffer(&BufferDesc { + memory_location: MemoryLocation::Device, + host_mapped: false, + usage: BufferUsageFlags::STORAGE, + size: calculate_spine_size(values.len()) * std::mem::size_of::(), + }); + + let count_buffer_address = gpu.get_buffer_address(count_buffer.to_arg()); + let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg()); + let mut src_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg()); + let mut dst_buffer_address = gpu.get_buffer_address(tmp_buffer.to_arg()); + + let thread_token = ThreadToken::new(); + let thread_token = &thread_token; + let frame = gpu.begin_frame(); + { + let frame = &frame; + let mut cmd_encoder = gpu.request_cmd_encoder(frame, thread_token); + + { + let cmd_encoder = &mut cmd_encoder; + + for pass in 0..4 { + let shift = pass * 8; + + // Upsweep + gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_0_upsweep_pipeline); + gpu.cmd_push_constants( + cmd_encoder, + ShaderStageFlags::COMPUTE, + 0, + &RadixSortUpsweepConstants { + shift, + _pad: 0, + count_buffer_address, + src_buffer_address, + spine_buffer_address, + }, + ); + gpu.cmd_dispatch( + cmd_encoder, + calcuate_workgroup_count(values.len()) as u32, + 1, + 1, + ); + + gpu.cmd_barrier( + cmd_encoder, + Some(&GlobalBarrier { + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], + }), + &[], + ); + + // Exclusive sum of the spine + gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_1_spine_pipeline); + gpu.cmd_push_constants( + cmd_encoder, + ShaderStageFlags::COMPUTE, + 0, + &RadixSortSpineConstants { + count_buffer_address, + spine_buffer_address, + }, + ); + gpu.cmd_dispatch(cmd_encoder, 1, 1, 1); + + gpu.cmd_barrier( + cmd_encoder, + Some(&GlobalBarrier { + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], + }), + &[], + ); + + // Downsweep + gpu.cmd_set_pipeline(cmd_encoder, pipelines.radix_sort_2_downsweep_pipeline); + gpu.cmd_push_constants( + cmd_encoder, + ShaderStageFlags::COMPUTE, + 0, + &RadixSortDownsweepConstants { + shift, + _pad: 0, + count_buffer_address, + spine_buffer_address, + src_buffer_address, + dst_buffer_address, + }, + ); + gpu.cmd_dispatch( + cmd_encoder, + calcuate_workgroup_count(values.len()) as u32, + 1, + 1, + ); + + gpu.cmd_barrier( + cmd_encoder, + Some(&GlobalBarrier { + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], + }), + &[], + ); + + std::mem::swap(&mut src_buffer_address, &mut dst_buffer_address); + } + } + + gpu.submit(frame, cmd_encoder); + } + + gpu.end_frame(frame); + + gpu.wait_idle(); + + unsafe { sort_buffer.copy_to_slice(values) }; +} + +// This test requires a GPU, so ignore the test by default. +#[ignore] +#[test] +pub fn sort_random_input() { + let mut rng = Pcg64::new(); + + let count = 15 * 1024 * 1024 + 3; + + let mut values = vec![]; + values.reserve_exact(count); + for _ in 0..count / 2 { + let i = rng.next_u64(); + values.push((i & 0xffff_ffff) as u32); + values.push(((i >> 32) & 0xffff_ffff) as u32); + } + + values.push((rng.next_u64() & 0xffff_ffff) as u32); + + gpu_sort(&mut values); + + assert!(values.is_sorted()); +} + +#[ignore] +#[test] +pub fn sort_single() { + let mut values = vec![5]; + let mut sorted = values.clone(); + sorted.sort(); + + gpu_sort(&mut values); + + assert!(values == sorted); +} + +#[ignore] +#[test] +pub fn sort_double() { + let mut values = vec![u32::MAX, 0]; + let mut sorted = values.clone(); + sorted.sort(); + + gpu_sort(&mut values); + + assert!(values.is_sorted()); +} + +#[ignore] +#[test] +pub fn sort_short_input() { + let mut values = vec![5, 4, 3, 2, 1]; + let mut sorted = values.clone(); + sorted.sort(); + + assert!(!values.is_sorted()); + + gpu_sort(&mut values); + + assert!(values.is_sorted()); + assert!(values == sorted); +} + +#[ignore] +#[test] +pub fn sort_u32_max() { + let mut values = vec![u32::MAX; 10_000]; + + gpu_sort(&mut values); + + assert!(values.is_sorted()); + assert!(values.iter().all(|&x| x == u32::MAX)); +} + +#[ignore] +#[test] +pub fn sort_u32_zero() { + let mut values = vec![0; 10_000]; + + gpu_sort(&mut values); + + assert!(values.is_sorted()); + assert!(values.iter().all(|&x| x == 0)); +}