From 050cc2d4308aaf8b4cd8c940eb23e1b145801959 Mon Sep 17 00:00:00 2001 From: Josh Simmons Date: Mon, 4 Nov 2024 08:33:52 +0100 Subject: [PATCH] shark: Re-work UI Add radix sort pipelines. Add buffer address abstraction. Add spec constants. Add subgroup size control. Add `cmd_dispatch_indirect`. Change bind group API to separate allocation and binding. Surely I won't regret the mega-commit... --- Cargo.lock | 4 +- .../narcissus-gpu/src/backend/vulkan/mod.rs | 596 ++++++++++------ engine/narcissus-gpu/src/lib.rs | 90 ++- engine/narcissus-gpu/src/mapped_buffer.rs | 10 +- external/vulkan-sys/src/structs.rs | 21 +- title/shark-shaders/build.rs | 28 +- .../shaders/{basic.frag.glsl => basic.frag} | 4 +- title/shark-shaders/shaders/basic.vert | 55 ++ title/shark-shaders/shaders/basic.vert.glsl | 47 -- ...lay_transform.comp.glsl => composite.comp} | 24 +- .../shark-shaders/shaders/compute_bindings.h | 47 -- title/shark-shaders/shaders/draw_2d.h | 69 ++ .../shaders/draw_2d_bin_0_clear.comp | 28 + .../shaders/draw_2d_bin_1_scatter.comp | 189 ++++++ .../shaders/draw_2d_bin_2_sort.comp | 49 ++ .../shaders/draw_2d_bin_3_resolve.comp | 22 + .../shaders/draw_2d_rasterize.comp | 145 ++++ title/shark-shaders/shaders/indirect.h | 10 + title/shark-shaders/shaders/primitive_2d.h | 38 -- .../shaders/primitive_2d_bin.comp.glsl | 106 --- .../shaders/primitive_2d_bin_clear.comp.glsl | 27 - .../shaders/primitive_2d_rasterize.comp.glsl | 143 ---- title/shark-shaders/shaders/radix_sort.h | 14 + .../shaders/radix_sort_0_upsweep.comp | 84 +++ .../shaders/radix_sort_1_spine.comp | 91 +++ .../shaders/radix_sort_2_downsweep.comp | 115 ++++ title/shark/src/helpers.rs | 2 +- title/shark/src/main.rs | 639 +++++++++--------- title/shark/src/pipelines.rs | 411 +++++++++++ title/shark/src/pipelines/basic.rs | 90 --- title/shark/src/pipelines/mod.rs | 79 --- 31 files changed, 2156 insertions(+), 1121 deletions(-) rename title/shark-shaders/shaders/{basic.frag.glsl => basic.frag} (75%) create mode 100644 title/shark-shaders/shaders/basic.vert delete mode 100644 title/shark-shaders/shaders/basic.vert.glsl rename title/shark-shaders/shaders/{display_transform.comp.glsl => composite.comp} (64%) create mode 100644 title/shark-shaders/shaders/draw_2d.h create mode 100644 title/shark-shaders/shaders/draw_2d_bin_0_clear.comp create mode 100644 title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp create mode 100644 title/shark-shaders/shaders/draw_2d_bin_2_sort.comp create mode 100644 title/shark-shaders/shaders/draw_2d_bin_3_resolve.comp create mode 100644 title/shark-shaders/shaders/draw_2d_rasterize.comp create mode 100644 title/shark-shaders/shaders/indirect.h delete mode 100644 title/shark-shaders/shaders/primitive_2d.h delete mode 100644 title/shark-shaders/shaders/primitive_2d_bin.comp.glsl delete mode 100644 title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl delete mode 100644 title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl create mode 100644 title/shark-shaders/shaders/radix_sort.h create mode 100644 title/shark-shaders/shaders/radix_sort_0_upsweep.comp create mode 100644 title/shark-shaders/shaders/radix_sort_1_spine.comp create mode 100644 title/shark-shaders/shaders/radix_sort_2_downsweep.comp create mode 100644 title/shark/src/pipelines.rs delete mode 100644 title/shark/src/pipelines/basic.rs delete mode 100644 title/shark/src/pipelines/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 8b2c2c0..0f9129a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -52,9 +52,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.150" +version = "0.2.161" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "memchr" diff --git a/engine/narcissus-gpu/src/backend/vulkan/mod.rs b/engine/narcissus-gpu/src/backend/vulkan/mod.rs index 7c07dc9..8beb545 100644 --- a/engine/narcissus-gpu/src/backend/vulkan/mod.rs +++ b/engine/narcissus-gpu/src/backend/vulkan/mod.rs @@ -9,21 +9,22 @@ use std::{ }; use narcissus_core::{ - box_assume_init, default, is_aligned_to, + default, is_aligned_to, manual_arc::{self, ManualArc}, raw_window::AsRawWindow, - zeroed_box, Arc, Arena, HybridArena, Mutex, PhantomUnsend, Pool, Widen, + Arc, Arena, HybridArena, Mutex, PhantomUnsend, Pool, Widen, }; -use vulkan_sys as vk; +use vulkan_sys::{self as vk}; use crate::{ - frame_counter::FrameCounter, Bind, BindDesc, BindGroupLayout, BindingType, Buffer, BufferArg, - BufferDesc, BufferImageCopy, BufferUsageFlags, CmdEncoder, ComputePipelineDesc, Device, - Extent2d, Extent3d, Frame, GlobalBarrier, GpuConcurrent, GraphicsPipelineDesc, Image, - ImageBarrier, ImageBlit, ImageDesc, ImageDimension, ImageLayout, ImageTiling, ImageViewDesc, - IndexType, MemoryLocation, Offset2d, Offset3d, PersistentBuffer, Pipeline, PipelineLayout, - Sampler, SamplerAddressMode, SamplerCompareOp, SamplerDesc, SamplerFilter, ShaderStageFlags, + frame_counter::FrameCounter, mapped_buffer::TransientBindGroup, Bind, BindDesc, + BindGroupLayout, BindingType, Buffer, BufferAddress, BufferArg, BufferDesc, BufferImageCopy, + BufferUsageFlags, CmdEncoder, ComputePipelineDesc, Device, Extent2d, Extent3d, Frame, + GlobalBarrier, GpuConcurrent, GraphicsPipelineDesc, Image, ImageBarrier, ImageBlit, ImageDesc, + ImageDimension, ImageLayout, ImageTiling, ImageViewDesc, IndexType, MemoryLocation, Offset2d, + Offset3d, PersistentBuffer, Pipeline, PipelineLayout, Sampler, SamplerAddressMode, + SamplerCompareOp, SamplerDesc, SamplerFilter, ShaderStageFlags, SpecConstant, SwapchainConfigurator, SwapchainImage, SwapchainOutOfDateError, ThreadToken, TransientBuffer, TypedBind, }; @@ -384,12 +385,13 @@ pub(crate) struct VulkanDevice { physical_device_properties: Box, _physical_device_properties_11: Box, _physical_device_properties_12: Box, - _physical_device_properties_13: Box, + physical_device_properties_13: Box, + physical_device_memory_properties: Box, + _physical_device_features: Box, _physical_device_features_11: Box, _physical_device_features_12: Box, _physical_device_features_13: Box, - physical_device_memory_properties: Box, _global_fn: vk::GlobalFunctions, instance_fn: vk::InstanceFunctions, @@ -508,53 +510,31 @@ impl VulkanDevice { instance_fn.enumerate_physical_devices(instance, count, ptr) }); - let mut physical_device_properties = - unsafe { box_assume_init(zeroed_box::()) }; - let mut physical_device_properties_11 = - unsafe { box_assume_init(zeroed_box::()) }; - let mut physical_device_properties_12 = - unsafe { box_assume_init(zeroed_box::()) }; - let mut physical_device_properties_13 = - unsafe { box_assume_init(zeroed_box::()) }; - - physical_device_properties._type = vk::StructureType::PhysicalDeviceProperties2; - physical_device_properties_11._type = vk::StructureType::PhysicalDeviceVulkan11Properties; - physical_device_properties_12._type = vk::StructureType::PhysicalDeviceVulkan12Properties; - physical_device_properties_13._type = vk::StructureType::PhysicalDeviceVulkan13Properties; - - physical_device_properties_12._next = physical_device_properties_13.as_mut() - as *mut vk::PhysicalDeviceVulkan13Properties - as *mut _; - physical_device_properties_11._next = physical_device_properties_12.as_mut() - as *mut vk::PhysicalDeviceVulkan12Properties - as *mut _; - physical_device_properties._next = physical_device_properties_11.as_mut() - as *mut vk::PhysicalDeviceVulkan11Properties - as *mut _; - - let mut physical_device_features = - unsafe { box_assume_init(zeroed_box::()) }; - let mut physical_device_features_11 = - unsafe { box_assume_init(zeroed_box::()) }; - let mut physical_device_features_12 = - unsafe { box_assume_init(zeroed_box::()) }; - let mut physical_device_features_13 = - unsafe { box_assume_init(zeroed_box::()) }; - - physical_device_features._type = vk::StructureType::PhysicalDeviceFeatures2; - physical_device_features_11._type = vk::StructureType::PhysicalDeviceVulkan11Features; - physical_device_features_12._type = vk::StructureType::PhysicalDeviceVulkan12Features; - physical_device_features_13._type = vk::StructureType::PhysicalDeviceVulkan13Features; - - physical_device_features_12._next = physical_device_features_13.as_mut() - as *mut vk::PhysicalDeviceVulkan13Features - as *mut _; - physical_device_features_11._next = physical_device_features_12.as_mut() - as *mut vk::PhysicalDeviceVulkan12Features - as *mut _; - physical_device_features._next = physical_device_features_11.as_mut() - as *mut vk::PhysicalDeviceVulkan11Features - as *mut _; + let mut physical_device_properties: Box = default(); + let mut physical_device_properties_11: Box = + default(); + let mut physical_device_properties_12: Box = + default(); + let mut physical_device_properties_13: Box = + default(); + + physical_device_properties_12._next = + physical_device_properties_13.as_mut() as *mut _ as *mut _; + physical_device_properties_11._next = + physical_device_properties_12.as_mut() as *mut _ as *mut _; + physical_device_properties._next = + physical_device_properties_11.as_mut() as *mut _ as *mut _; + + let mut physical_device_features: Box = default(); + let mut physical_device_features_11: Box = default(); + let mut physical_device_features_12: Box = default(); + let mut physical_device_features_13: Box = default(); + + physical_device_features_12._next = + physical_device_features_13.as_mut() as *mut _ as *mut _; + physical_device_features_11._next = + physical_device_features_12.as_mut() as *mut _ as *mut _; + physical_device_features._next = physical_device_features_11.as_mut() as *mut _ as *mut _; let physical_device = physical_devices .iter() @@ -573,6 +553,9 @@ impl VulkanDevice { physical_device_properties.properties.api_version >= vk::VERSION_1_3 && physical_device_features_13.dynamic_rendering == vk::Bool32::True + && physical_device_features_13.subgroup_size_control == vk::Bool32::True + && physical_device_features_13.maintenance4 == vk::Bool32::True + && physical_device_features_13.compute_full_subgroups == vk::Bool32::True && physical_device_features_12.timeline_semaphore == vk::Bool32::True && physical_device_features_12.descriptor_indexing == vk::Bool32::True && physical_device_features_12.descriptor_binding_partially_bound @@ -636,6 +619,9 @@ impl VulkanDevice { let enabled_features_13 = vk::PhysicalDeviceVulkan13Features { dynamic_rendering: vk::Bool32::True, synchronization2: vk::Bool32::True, + subgroup_size_control: vk::Bool32::True, + compute_full_subgroups: vk::Bool32::True, + maintenance4: vk::Bool32::True, ..default() }; let enabled_features_12 = vk::PhysicalDeviceVulkan12Features { @@ -790,12 +776,13 @@ impl VulkanDevice { physical_device_properties, _physical_device_properties_11: physical_device_properties_11, _physical_device_properties_12: physical_device_properties_12, - _physical_device_properties_13: physical_device_properties_13, + physical_device_properties_13, + physical_device_memory_properties, + _physical_device_features: physical_device_features, _physical_device_features_11: physical_device_features_11, _physical_device_features_12: physical_device_features_12, _physical_device_features_13: physical_device_features_13, - physical_device_memory_properties, _global_fn: global_fn, instance_fn, @@ -1251,8 +1238,8 @@ impl Device for VulkanDevice { hasher.update(&bind_desc.count.to_le_bytes()); } - let layout_bindings = - arena.alloc_slice_fill_iter(binds_desc.iter().enumerate().map(|(i, bind_desc)| { + let layout_bindings: &mut [vulkan_sys::DescriptorSetLayoutBinding] = arena + .alloc_slice_fill_iter(binds_desc.iter().enumerate().map(|(i, bind_desc)| { let immutable_samplers = if !bind_desc.immutable_samplers.is_empty() { assert_eq!( bind_desc.binding_type, @@ -1323,7 +1310,7 @@ impl Device for VulkanDevice { } fn create_graphics_pipeline(&self, pipeline_desc: &GraphicsPipelineDesc) -> Pipeline { - let pipeline_layout = self.cache_pipeline_layout(pipeline_desc.layout); + let pipeline_layout = self.cache_pipeline_layout(&pipeline_desc.layout); let arena = HybridArena::<1024>::new(); @@ -1338,6 +1325,17 @@ impl Device for VulkanDevice { pipeline_desc.fragment_shader.code, ); + assert!( + !(pipeline_desc.vertex_shader.required_subgroup_size.is_some() + || pipeline_desc + .fragment_shader + .required_subgroup_size + .is_some() + || pipeline_desc.vertex_shader.allow_varying_subgroup_size + || pipeline_desc.fragment_shader.allow_varying_subgroup_size), + "subgroup size control features not implemented for graphics shader stages" + ); + let stages = &[ vk::PipelineShaderStageCreateInfo { stage: vk::ShaderStageFlags::VERTEX, @@ -1492,28 +1490,137 @@ impl Device for VulkanDevice { } fn create_compute_pipeline(&self, pipeline_desc: &ComputePipelineDesc) -> Pipeline { - let pipeline_layout = self.cache_pipeline_layout(pipeline_desc.layout); + let arena = HybridArena::<1024>::new(); + + let pipeline_layout = self.cache_pipeline_layout(&pipeline_desc.layout); let module = vulkan_shader_module(&self.device_fn, self.device, pipeline_desc.shader.code); - let stage = vk::PipelineShaderStageCreateInfo { - stage: vk::ShaderStageFlags::COMPUTE, - name: pipeline_desc.shader.entry.as_ptr(), - module, - ..default() - }; + let mut shader_stage_create_flags = default(); + + if pipeline_desc.shader.require_full_subgroups { + shader_stage_create_flags |= vk::PipelineShaderStageCreateFlags::REQUIRE_FULL_SUBGROUPS + } + + if pipeline_desc.shader.allow_varying_subgroup_size { + shader_stage_create_flags |= + vk::PipelineShaderStageCreateFlags::ALLOW_VARYING_SUBGROUP_SIZE; + } - let create_infos = &[vk::ComputePipelineCreateInfo { + let specialization_info: Option<&vk::SpecializationInfo> = + if !pipeline_desc.shader.spec_constants.is_empty() { + let block_len = pipeline_desc + .shader + .spec_constants + .iter() + .map(|spec_constant| match spec_constant { + SpecConstant::Bool { id: _, value: _ } + | SpecConstant::U32 { id: _, value: _ } + | SpecConstant::I32 { id: _, value: _ } + | SpecConstant::F32 { id: _, value: _ } => 4, + }) + .sum::(); + + let block = arena.alloc_slice_fill_copy(block_len, 0u8); + + let mut offset = 0; + let map_entries = + arena.alloc_slice_fill_iter(pipeline_desc.shader.spec_constants.iter().map( + |spec_constant| { + let constant_id; + let value_size; + match *spec_constant { + SpecConstant::Bool { id, value } => { + constant_id = id; + let value = if value { + vk::Bool32::True + } else { + vk::Bool32::False + } as u32; + value_size = std::mem::size_of_val(&value); + block[offset..offset + value_size] + .copy_from_slice(&value.to_ne_bytes()) + } + SpecConstant::U32 { id, value } => { + constant_id = id; + value_size = std::mem::size_of_val(&value); + block[offset..offset + value_size] + .copy_from_slice(&value.to_ne_bytes()); + } + SpecConstant::I32 { id, value } => { + constant_id = id; + value_size = std::mem::size_of_val(&value); + block[offset..offset + value_size] + .copy_from_slice(&value.to_ne_bytes()); + } + SpecConstant::F32 { id, value } => { + constant_id = id; + value_size = std::mem::size_of_val(&value); + block[offset..offset + value_size] + .copy_from_slice(&value.to_ne_bytes()); + } + } + + let map_entry = vk::SpecializationMapEntry { + constant_id, + offset: offset as u32, + size: value_size, + }; + + offset += value_size; + + map_entry + }, + )); + + Some(arena.alloc(vk::SpecializationInfo { + data: block.into(), + map_entries: map_entries.into(), + })) + } else { + None + }; + + let compute_pipeline_create_info = arena.alloc(vk::ComputePipelineCreateInfo { layout: pipeline_layout.pipeline_layout, - stage, + stage: vk::PipelineShaderStageCreateInfo { + stage: vk::ShaderStageFlags::COMPUTE, + name: pipeline_desc.shader.entry.as_ptr(), + module, + flags: shader_stage_create_flags, + specialization_info, + ..default() + }, ..default() - }]; + }); + + if let Some(required_subgroup_size) = pipeline_desc.shader.required_subgroup_size { + assert!(self + .physical_device_properties_13 + .required_subgroup_size_stages + .contains(vk::ShaderStageFlags::COMPUTE)); + assert!( + required_subgroup_size >= self.physical_device_properties_13.min_subgroup_size + && required_subgroup_size + <= self.physical_device_properties_13.max_subgroup_size + ); + + let shader_stage_required_subgroup_size_create_info = + arena.alloc(vk::PipelineShaderStageRequiredSubgroupSizeCreateInfo { + required_subgroup_size, + ..default() + }); + + // SAFETY: Both are arena allocations and therefore have identical lifetimes. + compute_pipeline_create_info.stage._next = + shader_stage_required_subgroup_size_create_info as *const _ as *const _; + } let mut pipelines = [vk::Pipeline::null()]; vk_check!(self.device_fn.create_compute_pipelines( self.device, vk::PipelineCache::null(), - create_infos, + std::slice::from_ref(compute_pipeline_create_info), None, &mut pipelines )); @@ -1615,6 +1722,71 @@ impl Device for VulkanDevice { } } + fn debug_name_bind_group_layout(&self, bind_group_layout: BindGroupLayout, name: &str) { + #[cfg(feature = "debug_markers")] + if let Some(debug_utils_fn) = &self.debug_utils_fn { + let descriptor_set_layout; + { + let bind_group_layout_pool = self.bind_group_layout_pool.lock(); + let Some(bind_group_layout) = bind_group_layout_pool.get(bind_group_layout.0) + else { + return; + }; + + descriptor_set_layout = bind_group_layout.descriptor_set_layout; + } + + let arena = HybridArena::<512>::new(); + let object_name = arena.alloc_cstr_from_str(name); + + let image_name_info = vk::DebugUtilsObjectNameInfoExt { + object_type: vk::ObjectType::DescriptorSetLayout, + object_handle: descriptor_set_layout.as_raw(), + object_name: object_name.as_ptr(), + ..default() + }; + unsafe { debug_utils_fn.set_debug_utils_object_name_ext(self.device, &image_name_info) } + } + } + + fn debug_name_pipeline(&self, pipeline: Pipeline, name: &str) { + #[cfg(feature = "debug_markers")] + if let Some(debug_utils_fn) = &self.debug_utils_fn { + let pipeline_handle; + let pipeline_layout_handle; + { + let pipeline_pool = self.pipeline_pool.lock(); + let Some(pipeline) = pipeline_pool.get(pipeline.0) else { + return; + }; + + pipeline_handle = pipeline.pipeline; + pipeline_layout_handle = pipeline.pipeline_layout.pipeline_layout; + } + + let arena = HybridArena::<512>::new(); + let object_name = arena.alloc_cstr_from_str(name); + + let image_name_info = vk::DebugUtilsObjectNameInfoExt { + object_type: vk::ObjectType::Pipeline, + object_handle: pipeline_handle.as_raw(), + object_name: object_name.as_ptr(), + ..default() + }; + unsafe { debug_utils_fn.set_debug_utils_object_name_ext(self.device, &image_name_info) } + + let image_view_name_info = vk::DebugUtilsObjectNameInfoExt { + object_type: vk::ObjectType::PipelineLayout, + object_handle: pipeline_layout_handle.as_raw(), + object_name: object_name.as_ptr(), + ..default() + }; + unsafe { + debug_utils_fn.set_debug_utils_object_name_ext(self.device, &image_view_name_info) + } + } + } + fn destroy_buffer(&self, frame: &Frame, buffer: Buffer) { if let Some(buffer) = self.buffer_pool.lock().remove(buffer.0) { assert_eq!( @@ -1733,118 +1905,13 @@ impl Device for VulkanDevice { self.request_transient_buffer(frame, thread_token, usage, size as u64) } - fn request_cmd_encoder<'a, 'thread>( + fn request_transient_bind_group<'a>( &self, - frame: &'a Frame, + frame: &'a Frame<'a>, thread_token: &'a ThreadToken, - ) -> CmdEncoder<'a> { - let frame = self.frame(frame); - let per_thread = frame.per_thread.get(thread_token); - let mut cmd_buffer_pool = per_thread.cmd_buffer_pool.borrow_mut(); - - // We have consumed all available command buffers, need to allocate a new one. - if cmd_buffer_pool.next_free_index >= cmd_buffer_pool.command_buffers.len() { - let mut cmd_buffers = [vk::CommandBuffer::null(); 4]; - let allocate_info = vk::CommandBufferAllocateInfo { - command_pool: cmd_buffer_pool.command_pool, - level: vk::CommandBufferLevel::Primary, - command_buffer_count: cmd_buffers.len() as u32, - ..default() - }; - vk_check!(self.device_fn.allocate_command_buffers( - self.device, - &allocate_info, - cmd_buffers.as_mut_ptr() - )); - cmd_buffer_pool.command_buffers.extend(cmd_buffers.iter()); - } - - let index = cmd_buffer_pool.next_free_index; - cmd_buffer_pool.next_free_index += 1; - let command_buffer = cmd_buffer_pool.command_buffers[index]; - - vk_check!(self.device_fn.begin_command_buffer( - command_buffer, - &vk::CommandBufferBeginInfo { - flags: vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT, - ..default() - } - )); - - let vulkan_cmd_encoder = per_thread.arena.alloc(VulkanCmdEncoder { - command_buffer, - ..default() - }); - - CmdEncoder { - cmd_encoder_addr: vulkan_cmd_encoder as *mut _ as usize, - thread_token, - phantom_unsend: PhantomUnsend {}, - } - } - - fn cmd_insert_debug_marker( - &self, - cmd_encoder: &mut CmdEncoder, - label_name: &str, - color: [f32; 4], - ) { - #[cfg(feature = "debug_markers")] - if let Some(debug_utils_fn) = &self.debug_utils_fn { - let arena = HybridArena::<256>::new(); - let label_name = arena.alloc_cstr_from_str(label_name); - - let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer; - let label_info = vk::DebugUtilsLabelExt { - label_name: label_name.as_ptr(), - color, - ..default() - }; - unsafe { - debug_utils_fn.cmd_insert_debug_utils_label_ext(command_buffer, &label_info); - } - } - } - - fn cmd_begin_debug_marker( - &self, - cmd_encoder: &mut CmdEncoder, - label_name: &str, - color: [f32; 4], - ) { - #[cfg(feature = "debug_markers")] - if let Some(debug_utils_fn) = &self.debug_utils_fn { - let arena = HybridArena::<256>::new(); - let label_name = arena.alloc_cstr_from_str(label_name); - - let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer; - let label_info = vk::DebugUtilsLabelExt { - label_name: label_name.as_ptr(), - color, - ..default() - }; - unsafe { - debug_utils_fn.cmd_begin_debug_utils_label_ext(command_buffer, &label_info); - } - } - } - - fn cmd_end_debug_marker(&self, cmd_encoder: &mut CmdEncoder) { - #[cfg(feature = "debug_markers")] - if let Some(debug_utils_fn) = &self.debug_utils_fn { - let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer; - unsafe { debug_utils_fn.cmd_end_debug_utils_label_ext(command_buffer) } - } - } - - fn cmd_set_bind_group( - &self, - frame: &Frame, - cmd_encoder: &mut CmdEncoder, layout: BindGroupLayout, - bind_group_index: u32, bindings: &[Bind], - ) { + ) -> TransientBindGroup<'a> { let arena = HybridArena::<4096>::new(); let descriptor_set_layout = self @@ -1855,7 +1922,7 @@ impl Device for VulkanDevice { .descriptor_set_layout; let frame = self.frame(frame); - let per_thread = frame.per_thread.get(cmd_encoder.thread_token); + let per_thread = frame.per_thread.get(thread_token); let mut descriptor_pool = per_thread.descriptor_pool.get(); let mut allocated_pool = false; @@ -2006,23 +2073,137 @@ impl Device for VulkanDevice { .update_descriptor_sets(self.device, write_descriptors, &[]) }; + TransientBindGroup { + bind_group: descriptor_set.as_raw(), + phantom: PhantomData, + } + } + + fn request_cmd_encoder<'a, 'thread>( + &self, + frame: &'a Frame, + thread_token: &'a ThreadToken, + ) -> CmdEncoder<'a> { + let frame = self.frame(frame); + let per_thread = frame.per_thread.get(thread_token); + let mut cmd_buffer_pool = per_thread.cmd_buffer_pool.borrow_mut(); + + // We have consumed all available command buffers, need to allocate a new one. + if cmd_buffer_pool.next_free_index >= cmd_buffer_pool.command_buffers.len() { + let mut cmd_buffers = [vk::CommandBuffer::null(); 4]; + let allocate_info = vk::CommandBufferAllocateInfo { + command_pool: cmd_buffer_pool.command_pool, + level: vk::CommandBufferLevel::Primary, + command_buffer_count: cmd_buffers.len() as u32, + ..default() + }; + vk_check!(self.device_fn.allocate_command_buffers( + self.device, + &allocate_info, + cmd_buffers.as_mut_ptr() + )); + cmd_buffer_pool.command_buffers.extend(cmd_buffers.iter()); + } + + let index = cmd_buffer_pool.next_free_index; + cmd_buffer_pool.next_free_index += 1; + let command_buffer = cmd_buffer_pool.command_buffers[index]; + + vk_check!(self.device_fn.begin_command_buffer( + command_buffer, + &vk::CommandBufferBeginInfo { + flags: vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT, + ..default() + } + )); + + let vulkan_cmd_encoder = per_thread.arena.alloc(VulkanCmdEncoder { + command_buffer, + ..default() + }); + + CmdEncoder { + cmd_encoder_addr: vulkan_cmd_encoder as *mut _ as usize, + phantom: PhantomData, + phantom_unsend: PhantomUnsend {}, + } + } + + fn cmd_insert_debug_marker( + &self, + cmd_encoder: &mut CmdEncoder, + label_name: &str, + color: [f32; 4], + ) { + #[cfg(feature = "debug_markers")] + if let Some(debug_utils_fn) = &self.debug_utils_fn { + let arena = HybridArena::<256>::new(); + let label_name = arena.alloc_cstr_from_str(label_name); + + let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer; + let label_info = vk::DebugUtilsLabelExt { + label_name: label_name.as_ptr(), + color, + ..default() + }; + unsafe { + debug_utils_fn.cmd_insert_debug_utils_label_ext(command_buffer, &label_info); + } + } + } + + fn cmd_begin_debug_marker( + &self, + cmd_encoder: &mut CmdEncoder, + label_name: &str, + color: [f32; 4], + ) { + #[cfg(feature = "debug_markers")] + if let Some(debug_utils_fn) = &self.debug_utils_fn { + let arena = HybridArena::<256>::new(); + let label_name = arena.alloc_cstr_from_str(label_name); + + let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer; + let label_info = vk::DebugUtilsLabelExt { + label_name: label_name.as_ptr(), + color, + ..default() + }; + unsafe { + debug_utils_fn.cmd_begin_debug_utils_label_ext(command_buffer, &label_info); + } + } + } + + fn cmd_end_debug_marker(&self, cmd_encoder: &mut CmdEncoder) { + #[cfg(feature = "debug_markers")] + if let Some(debug_utils_fn) = &self.debug_utils_fn { + let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer; + unsafe { debug_utils_fn.cmd_end_debug_utils_label_ext(command_buffer) } + } + } + + fn cmd_set_bind_group( + &self, + cmd_encoder: &mut CmdEncoder, + bind_group_index: u32, + bind_group: &TransientBindGroup, + ) { let cmd_encoder = self.cmd_encoder_mut(cmd_encoder); - let VulkanBoundPipeline { - pipeline_layout, - pipeline_bind_point, - } = cmd_encoder + + let bound_pipeline = cmd_encoder .bound_pipeline .as_ref() - .expect("cannot set bind groups without a pipeline bound") - .clone(); + .expect("cannot set bind group without a pipeline bound"); let command_buffer = cmd_encoder.command_buffer; + let descriptor_set = vk::DescriptorSet::from_raw(bind_group.bind_group); unsafe { self.device_fn.cmd_bind_descriptor_sets( command_buffer, - pipeline_bind_point, - pipeline_layout, + bound_pipeline.pipeline_bind_point, + bound_pipeline.pipeline_layout, bind_group_index, &[descriptor_set], &[], @@ -2519,6 +2700,16 @@ impl Device for VulkanDevice { } } + fn cmd_dispatch_indirect(&self, cmd_encoder: &mut CmdEncoder, buffer: BufferArg, offset: u64) { + let (buffer, base_offset, _range) = self.unwrap_buffer_arg(&buffer); + + let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer; + unsafe { + self.device_fn + .cmd_dispatch_indirect(command_buffer, buffer, base_offset + offset); + } + } + fn submit(&self, frame: &Frame, mut cmd_encoder: CmdEncoder) { let fence = self.universal_queue_fence.fetch_add(1, Ordering::SeqCst) + 1; @@ -2694,7 +2885,7 @@ impl Device for VulkanDevice { self.frame_counter.release(frame); } - fn get_buffer_address(&self, buffer: BufferArg) -> u64 { + fn get_buffer_address<'a>(&self, buffer: BufferArg<'a>) -> BufferAddress<'a> { let buffer = match buffer { BufferArg::Unmanaged(buffer) => buffer.0, BufferArg::Persistent(buffer) => buffer.buffer.0, @@ -2702,7 +2893,10 @@ impl Device for VulkanDevice { }; let buffer_pool = self.buffer_pool.lock(); let buffer = buffer_pool.get(buffer).unwrap(); - buffer.address + BufferAddress { + value: buffer.address, + phantom: PhantomData, + } } } @@ -2761,6 +2955,11 @@ impl VulkanDevice { ) }; + let address = BufferAddress { + value: address, + phantom: PhantomData, + }; + let ptr = NonNull::new(memory.mapped_ptr()).unwrap(); frame.destroyed_buffers.lock().push_back(buffer); @@ -2772,7 +2971,6 @@ impl VulkanDevice { buffer: buffer.as_raw(), address, offset: 0, - phantom: PhantomData, }; } @@ -2828,6 +3026,13 @@ impl VulkanDevice { let current = allocator.current.as_ref().unwrap(); + let address = BufferAddress { + value: current.address, + phantom: PhantomData, + }; + + let address = address.byte_add(allocator.offset); + TransientBuffer { ptr: NonNull::new( current @@ -2838,9 +3043,8 @@ impl VulkanDevice { .unwrap(), len: size as usize, buffer: current.buffer.as_raw(), - address: current.address + allocator.offset, + address, offset: allocator.offset, - phantom: PhantomData, } } diff --git a/engine/narcissus-gpu/src/lib.rs b/engine/narcissus-gpu/src/lib.rs index cb8a85d..054365b 100644 --- a/engine/narcissus-gpu/src/lib.rs +++ b/engine/narcissus-gpu/src/lib.rs @@ -1,6 +1,7 @@ use std::{ffi::CStr, marker::PhantomData}; use backend::vulkan; +use mapped_buffer::TransientBindGroup; use narcissus_core::{ default, flags_def, raw_window::AsRawWindow, thread_token_def, Handle, PhantomUnsend, }; @@ -117,6 +118,12 @@ impl Scissor { } } +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum PipelineBindPoint { + Graphics, + Compute, +} + flags_def!(ShaderStageFlags); impl ShaderStageFlags { pub const VERTEX: Self = Self(1 << 0); @@ -265,11 +272,35 @@ pub struct ImageBlit { pub dst_offset_max: Offset3d, } +pub enum SpecConstant { + Bool { id: u32, value: bool }, + U32 { id: u32, value: u32 }, + I32 { id: u32, value: i32 }, + F32 { id: u32, value: f32 }, +} + pub struct ShaderDesc<'a> { pub entry: &'a CStr, + pub require_full_subgroups: bool, + pub allow_varying_subgroup_size: bool, + pub required_subgroup_size: Option, + pub spec_constants: &'a [SpecConstant], pub code: &'a [u8], } +impl<'a> Default for ShaderDesc<'a> { + fn default() -> Self { + Self { + entry: c"main", + require_full_subgroups: false, + allow_varying_subgroup_size: false, + required_subgroup_size: None, + spec_constants: &[], + code: &[], + } + } +} + #[derive(Clone, Copy, PartialEq, Eq)] pub enum SamplerFilter { Point, @@ -417,7 +448,7 @@ pub struct GraphicsPipelineAttachments<'a> { pub struct GraphicsPipelineDesc<'a> { pub vertex_shader: ShaderDesc<'a>, pub fragment_shader: ShaderDesc<'a>, - pub layout: &'a PipelineLayout<'a>, + pub layout: PipelineLayout<'a>, pub attachments: GraphicsPipelineAttachments<'a>, pub topology: Topology, pub primitive_restart: bool, @@ -436,7 +467,7 @@ pub struct GraphicsPipelineDesc<'a> { pub struct ComputePipelineDesc<'a> { pub shader: ShaderDesc<'a>, - pub layout: &'a PipelineLayout<'a>, + pub layout: PipelineLayout<'a>, } #[derive(Clone, Copy, Debug)] @@ -546,6 +577,39 @@ pub enum TypedBind<'a> { StorageBuffer(&'a [BufferArg<'a>]), } +#[repr(C)] +#[derive(Clone, Copy)] +pub struct BufferAddress<'a> { + value: u64, + phantom: PhantomData<&'a [u8]>, +} + +impl<'a> BufferAddress<'a> { + #[inline(always)] + #[must_use] + pub fn as_raw(self) -> u64 { + self.value + } + + #[inline(always)] + #[must_use] + pub fn byte_add(self, count: u64) -> Self { + Self { + value: self.value.wrapping_add(count), + phantom: self.phantom, + } + } + + #[inline(always)] + #[must_use] + pub fn byte_offset(self, count: i64) -> Self { + Self { + value: self.value.wrapping_add_signed(count), + phantom: self.phantom, + } + } +} + #[derive(Clone, Copy, PartialEq, Eq)] pub enum Access { /// No access. @@ -743,7 +807,7 @@ impl<'a> Frame<'a> { pub struct CmdEncoder<'a> { cmd_encoder_addr: usize, - thread_token: &'a ThreadToken, + phantom: PhantomData<&'a ()>, phantom_unsend: PhantomUnsend, } @@ -788,9 +852,10 @@ pub trait Device { fn debug_name_buffer(&self, buffer: BufferArg, label_name: &str); fn debug_name_image(&self, image: Image, label_name: &str); + fn debug_name_bind_group_layout(&self, bind_group_layout: BindGroupLayout, label_name: &str); + fn debug_name_pipeline(&self, pipeline: Pipeline, label_name: &str); - // Danger Zone - fn get_buffer_address(&self, buffer: BufferArg) -> u64; + fn get_buffer_address<'a>(&self, buffer: BufferArg<'a>) -> BufferAddress<'a>; fn destroy_buffer(&self, frame: &Frame, buffer: Buffer); fn destroy_persistent_buffer(&self, frame: &Frame, buffer: PersistentBuffer); @@ -834,6 +899,15 @@ pub trait Device { size: usize, ) -> TransientBuffer<'a>; + #[must_use] + fn request_transient_bind_group<'a>( + &self, + frame: &'a Frame<'a>, + thread_token: &'a ThreadToken, + layout: BindGroupLayout, + bindings: &[Bind], + ) -> TransientBindGroup<'a>; + #[must_use] fn request_cmd_encoder<'a>( &'a self, @@ -859,11 +933,9 @@ pub trait Device { fn cmd_set_bind_group( &self, - frame: &Frame, cmd_encoder: &mut CmdEncoder, - layout: BindGroupLayout, bind_group_index: u32, - bindings: &[Bind], + bind_group: &TransientBindGroup, ); fn cmd_set_index_buffer( @@ -948,6 +1020,8 @@ pub trait Device { group_count_z: u32, ); + fn cmd_dispatch_indirect(&self, cmd_encoder: &mut CmdEncoder, buffer: BufferArg, offset: u64); + fn submit(&self, frame: &Frame, cmd_encoder: CmdEncoder); fn begin_frame(&self) -> Frame; diff --git a/engine/narcissus-gpu/src/mapped_buffer.rs b/engine/narcissus-gpu/src/mapped_buffer.rs index e3e8e84..cdafabf 100644 --- a/engine/narcissus-gpu/src/mapped_buffer.rs +++ b/engine/narcissus-gpu/src/mapped_buffer.rs @@ -1,6 +1,6 @@ use std::{marker::PhantomData, ptr::NonNull}; -use crate::{Buffer, BufferArg}; +use crate::{Buffer, BufferAddress, BufferArg}; #[cold] fn overflow() -> ! { @@ -85,8 +85,7 @@ pub struct TransientBuffer<'a> { pub(crate) offset: u64, pub(crate) len: usize, pub(crate) buffer: u64, - pub(crate) address: u64, - pub(crate) phantom: PhantomData<&'a u8>, + pub(crate) address: BufferAddress<'a>, } impl<'a> TransientBuffer<'a> { @@ -102,3 +101,8 @@ impl<'a> TransientBuffer<'a> { unsafe { copy_from_with_offset(self.ptr, self.len, offset, src) } } } + +pub struct TransientBindGroup<'a> { + pub(crate) bind_group: u64, + pub(crate) phantom: PhantomData<&'a ()>, +} diff --git a/external/vulkan-sys/src/structs.rs b/external/vulkan-sys/src/structs.rs index 0ba737d..d901985 100644 --- a/external/vulkan-sys/src/structs.rs +++ b/external/vulkan-sys/src/structs.rs @@ -1364,10 +1364,7 @@ pub struct SpecializationMapEntry { #[repr(C)] pub struct SpecializationInfo<'a> { pub map_entries: VulkanSlice1<'a, u32, SpecializationMapEntry, 4>, - /// Size in bytes of pData - pub data_size: usize, - /// Pointer to SpecConstant data - pub data: *const c_void, + pub data: VulkanSlice1<'a, usize, u8, 0>, } impl<'a> Default for SpecializationInfo<'a> { @@ -1398,6 +1395,21 @@ impl<'a> Default for PipelineShaderStageCreateInfo<'a> { } } +#[repr(C)] +pub struct PipelineShaderStageRequiredSubgroupSizeCreateInfo { + pub _type: StructureType, + pub _next: *const c_void, + pub required_subgroup_size: u32, +} + +impl Default for PipelineShaderStageRequiredSubgroupSizeCreateInfo { + fn default() -> Self { + let mut x = unsafe { MaybeUninit::::zeroed().assume_init() }; + x._type = StructureType::PipelineShaderStageRequiredSubgroupSizeCreateInfo; + x + } +} + #[repr(C)] pub struct ComputePipelineCreateInfo<'a> { pub _type: StructureType, @@ -2750,6 +2762,7 @@ impl Default for PhysicalDeviceVulkan13Features { } } +#[repr(C)] pub struct PhysicalDeviceSwapchainMaintenance1FeaturesEXT { pub _type: StructureType, pub _next: *mut c_void, diff --git a/title/shark-shaders/build.rs b/title/shark-shaders/build.rs index 85d5639..a9acb52 100644 --- a/title/shark-shaders/build.rs +++ b/title/shark-shaders/build.rs @@ -20,19 +20,39 @@ const SHADERS: &[Shader] = &[ }, Shader { stage: "comp", - name: "primitive_2d_bin", + name: "draw_2d_bin_0_clear", }, Shader { stage: "comp", - name: "primitive_2d_bin_clear", + name: "draw_2d_bin_1_scatter", }, Shader { stage: "comp", - name: "primitive_2d_rasterize", + name: "draw_2d_bin_2_sort", }, Shader { stage: "comp", - name: "display_transform", + name: "draw_2d_bin_3_resolve", + }, + Shader { + stage: "comp", + name: "draw_2d_rasterize", + }, + Shader { + stage: "comp", + name: "radix_sort_0_upsweep", + }, + Shader { + stage: "comp", + name: "radix_sort_1_spine", + }, + Shader { + stage: "comp", + name: "radix_sort_2_downsweep", + }, + Shader { + stage: "comp", + name: "composite", }, ]; diff --git a/title/shark-shaders/shaders/basic.frag.glsl b/title/shark-shaders/shaders/basic.frag similarity index 75% rename from title/shark-shaders/shaders/basic.frag.glsl rename to title/shark-shaders/shaders/basic.frag index e179a7f..1693e95 100644 --- a/title/shark-shaders/shaders/basic.frag.glsl +++ b/title/shark-shaders/shaders/basic.frag @@ -1,7 +1,7 @@ #version 460 -layout(set = 0, binding = 1) uniform sampler bilinear_sampler; -layout(set = 1, binding = 2) uniform texture2D albedo; +layout(set = 0, binding = 0) uniform sampler bilinear_sampler; +layout(set = 0, binding = 1) uniform texture2D albedo; layout(location = 0) in vec2 tex_coord; layout(location = 1) in vec3 normal; diff --git a/title/shark-shaders/shaders/basic.vert b/title/shark-shaders/shaders/basic.vert new file mode 100644 index 0000000..a763036 --- /dev/null +++ b/title/shark-shaders/shaders/basic.vert @@ -0,0 +1,55 @@ +#version 460 + +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_scalar_block_layout : require + +struct Vertex { + vec4 position; + vec4 normal; + vec4 texcoord; +}; + +struct Transform { + vec4 transform[3]; +}; + +layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer VertexRef { + Vertex values[]; +}; + +layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer TransformRef { + Transform values[]; +}; + +struct BasicConstants { + mat4 clip_from_camera; + VertexRef vertex_buffer; + TransformRef transform_buffer; +}; + +layout(std430, row_major, push_constant) uniform BasicConstantsBlock { + BasicConstants constants; +}; + +layout(location = 0) out vec2 out_texcoord; +layout(location = 1) out vec3 out_normal; + +void main() { + const Transform td = constants.transform_buffer.values[gl_InstanceIndex]; + const Vertex vd = constants.vertex_buffer.values[gl_VertexIndex]; + + const mat4 camera_from_model = mat4( + td.transform[0].x, td.transform[0].w, td.transform[1].z, 0.0, + td.transform[0].y, td.transform[1].x, td.transform[1].w, 0.0, + td.transform[0].z, td.transform[1].y, td.transform[2].x, 0.0, + td.transform[2].y, td.transform[2].z, td.transform[2].w, 1.0 + ); + + const vec4 position_clip = constants.clip_from_camera * camera_from_model * vec4(vd.position.xyz, 1.0); + + gl_Position = position_clip; + + out_normal = vd.normal.xyz; + out_texcoord = vec2(vd.texcoord.x, 1.0 - vd.texcoord.y); +} diff --git a/title/shark-shaders/shaders/basic.vert.glsl b/title/shark-shaders/shaders/basic.vert.glsl deleted file mode 100644 index 7fab7f1..0000000 --- a/title/shark-shaders/shaders/basic.vert.glsl +++ /dev/null @@ -1,47 +0,0 @@ -#version 460 - -#extension GL_EXT_scalar_block_layout : require - -struct VertexData { - vec4 position; - vec4 normal; - vec4 texcoord; -}; - -struct TransformData { - vec4 transform[3]; -}; - -layout(std430, row_major, set = 0, binding = 0) uniform uniformBuffer { - mat4 clip_from_camera; -}; - -layout(std430, set = 1, binding = 0) readonly buffer vertexBuffer { - VertexData vertices[]; -}; - -layout(std430, set = 1, binding = 1) readonly buffer transformBuffer { - TransformData transforms[]; -}; - -layout(location = 0) out vec2 out_texcoord; -layout(location = 1) out vec3 out_normal; - -void main() { - const TransformData td = transforms[gl_InstanceIndex]; - const VertexData vd = vertices[gl_VertexIndex]; - - const mat4 camera_from_model = mat4( - td.transform[0].x, td.transform[0].w, td.transform[1].z, 0.0, - td.transform[0].y, td.transform[1].x, td.transform[1].w, 0.0, - td.transform[0].z, td.transform[1].y, td.transform[2].x, 0.0, - td.transform[2].y, td.transform[2].z, td.transform[2].w, 1.0 - ); - - const vec4 position_clip = clip_from_camera * camera_from_model * vec4(vd.position.xyz, 1.0); - - gl_Position = position_clip; - - out_normal = vd.normal.xyz; - out_texcoord = vec2(vd.texcoord.x, 1.0 - vd.texcoord.y); -} diff --git a/title/shark-shaders/shaders/display_transform.comp.glsl b/title/shark-shaders/shaders/composite.comp similarity index 64% rename from title/shark-shaders/shaders/display_transform.comp.glsl rename to title/shark-shaders/shaders/composite.comp index 465f44a..0e6211d 100644 --- a/title/shark-shaders/shaders/display_transform.comp.glsl +++ b/title/shark-shaders/shaders/composite.comp @@ -26,21 +26,23 @@ vec3 tony_mc_mapface(vec3 stimulus) { layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in; void main() { - const uvec2 tile_coord = gl_WorkGroupID.xy / 4; - const uint tile_index = tile_coord.y * uniforms.tile_stride + tile_coord.x; - const uint tile_base = tile_index * TILE_STRIDE; - TilesRead tiles_read = TilesRead(uniforms.tiles); - const uint lo = tiles_read.values[tile_base + TILE_BITMAP_RANGE_LO_OFFSET]; - const uint hi = tiles_read.values[tile_base + TILE_BITMAP_RANGE_HI_OFFSET]; - + // const uvec2 tile_coord = gl_WorkGroupID.xy / 4; + // const uint tile_index = tile_coord.y * uniforms.tile_resolution.x + tile_coord.x; + + // TilesRead tiles_read = TilesRead(uniforms.tiles); + // const uint lo = tiles_read.values[tile_base + TILE_BITMAP_RANGE_LO_OFFSET]; + // const uint hi = tiles_read.values[tile_base + TILE_BITMAP_RANGE_HI_OFFSET]; + + // Display transform const vec3 stimulus = imageLoad(color_layer, ivec2(gl_GlobalInvocationID.xy)).rgb; const vec3 transformed = tony_mc_mapface(stimulus); vec3 composited = srgb_oetf(transformed); - if (lo <= hi) { - const vec4 ui = imageLoad(ui_layer_read, ivec2(gl_GlobalInvocationID.xy)).rgba; - composited = ui.rgb + (composited * (1.0 - ui.a)); - } + // UI Composite + // if (lo <= hi) { + // const vec4 ui = imageLoad(ui_layer_read, ivec2(gl_GlobalInvocationID.xy)).rgba; + // composited = ui.rgb + (composited * (1.0 - ui.a)); + // } imageStore(composited_output, ivec2(gl_GlobalInvocationID.xy), vec4(composited, 1.0)); } diff --git a/title/shark-shaders/shaders/compute_bindings.h b/title/shark-shaders/shaders/compute_bindings.h index bbf7cf6..dc694d5 100644 --- a/title/shark-shaders/shaders/compute_bindings.h +++ b/title/shark-shaders/shaders/compute_bindings.h @@ -1,53 +1,6 @@ #ifndef COMPUTE_BINDINGS_INCLUDE #define COMPUTE_BINDINGS_INCLUDE -#include "primitive_2d.h" - -layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer PrimitiveInstances -{ - PrimitiveInstance values[]; -}; - -layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer Rects -{ - Rect values[]; -}; - -layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer Glyphs -{ - Glyph values[]; -}; - -layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer TilesRead -{ - uint values[]; -}; - -layout(buffer_reference, std430, buffer_reference_align = 4) writeonly buffer TilesWrite -{ - uint values[]; -}; - -struct ComputeUniforms { - uvec2 screen_resolution; - uvec2 atlas_resolution; - uvec2 tile_resolution; - - uint num_primitives; - uint num_primitives_32; - uint num_primitives_1024; - uint tile_stride; - - PrimitiveInstances primitive_instances; - Rects rects; - Glyphs glyphs; - TilesWrite tiles; -}; - -layout(std430, push_constant) uniform UniformBuffer { - ComputeUniforms uniforms; -}; - layout (set = 0, binding = 0) uniform sampler bilinear_sampler; layout (set = 0, binding = 1) uniform texture3D tony_mc_mapface_lut; layout (set = 0, binding = 2) uniform texture2D glyph_atlas; diff --git a/title/shark-shaders/shaders/draw_2d.h b/title/shark-shaders/shaders/draw_2d.h new file mode 100644 index 0000000..43181eb --- /dev/null +++ b/title/shark-shaders/shaders/draw_2d.h @@ -0,0 +1,69 @@ +#ifndef DRAW_2D_H +#define DRAW_2D_H + +const uint TILE_SIZE = 32; + +const uint DRAW_2D_CMD_RECT = 0; +const uint DRAW_2D_CMD_GLYPH = 1; + +struct Glyph { + ivec2 atlas_min; + ivec2 atlas_max; + + vec2 offset_min; + vec2 offset_max; +}; + +struct Draw2dCmd { + uint type; + uint words[7]; +}; + +struct Draw2dCmdRect { + uint border_width; + vec2 position; + vec2 half_extent; + uint background_color; + uint border_color; +}; + +struct Draw2dCmdGlyph { + uint index; + vec2 position; + uint color; +}; + +Draw2dCmdRect decode_rect(Draw2dCmd cmd) { + return Draw2dCmdRect( + cmd.words[0], + vec2(uintBitsToFloat(cmd.words[1]), uintBitsToFloat(cmd.words[2])), + vec2(uintBitsToFloat(cmd.words[3]), uintBitsToFloat(cmd.words[4])), + cmd.words[5], + cmd.words[6] + ); +} + +Draw2dCmdGlyph decode_glyph(Draw2dCmd cmd) { + return Draw2dCmdGlyph( + cmd.words[0], + vec2(uintBitsToFloat(cmd.words[1]), uintBitsToFloat(cmd.words[2])), + cmd.words[3] + ); +} + +layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer Draw2dCommandRef +{ + Draw2dCmd values[]; +}; + +layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer GlyphRef +{ + Glyph values[]; +}; + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer CoarseRef +{ + uint values[]; +}; + +#endif \ No newline at end of file diff --git a/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp b/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp new file mode 100644 index 0000000..8d5e613 --- /dev/null +++ b/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp @@ -0,0 +1,28 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_ballot : require + +#include "draw_2d.h" +#include "radix_sort.h" + +struct Draw2dClearConstants { + CoarseRef coarse_buffer; +}; + +layout(std430, push_constant) uniform Draw2dClearConstantsBlock { + Draw2dClearConstants constants; +}; + +layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +void main() { + constants.coarse_buffer.values[0] = 0; +} diff --git a/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp b/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp new file mode 100644 index 0000000..d991eb1 --- /dev/null +++ b/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp @@ -0,0 +1,189 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_vote : require + +#include "draw_2d.h" + +struct Draw2dScatterConstants { + uvec2 screen_resolution; + uvec2 tile_resolution; + + uint draw_buffer_len; + uint coarse_buffer_len; + + Draw2dCommandRef draw_buffer; + GlyphRef glyph_buffer; + CoarseRef coarse_buffer; +}; + +layout(std430, push_constant) uniform Draw2dScatterConstantsBlock { + Draw2dScatterConstants constants; +}; + +layout (local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; + const uint draw_index = gl_WorkGroupID.x * gl_WorkGroupSize.x + local_id; + + // Bounds for this command, any tiles which intersect this AABB will be written. + vec2 cmd_min = vec2(99999.9); + vec2 cmd_max = vec2(-99999.9); + + if (draw_index < constants.draw_buffer_len) { + const Draw2dCmd cmd = constants.draw_buffer.values[draw_index]; + const uint type = cmd.type; + for (;;) { + const uint scalar_type = subgroupBroadcastFirst(type); + [[branch]] + if (scalar_type == type) { + switch (scalar_type) { + case DRAW_2D_CMD_RECT: + const Draw2dCmdRect cmd_rect = decode_rect(cmd); + cmd_min = cmd_rect.position - cmd_rect.half_extent - cmd_rect.border_width; + cmd_max = cmd_rect.position + cmd_rect.half_extent + cmd_rect.border_width; + break; + case DRAW_2D_CMD_GLYPH: + const Draw2dCmdGlyph cmd_glyph = decode_glyph(cmd); + const Glyph glyph = constants.glyph_buffer.values[cmd_glyph.index]; + cmd_min = cmd_glyph.position + glyph.offset_min; + cmd_max = cmd_glyph.position + glyph.offset_max; + break; + } + break; + } + } + } + + const vec2 cmds_min = subgroupMin(cmd_min); + const vec2 cmds_max = subgroupMax(cmd_max); + + // Are all our commands entirely offscreen? + if (any(greaterThan(cmds_min, constants.screen_resolution)) || any(lessThan(cmds_max, vec2(0.0)))) { + return; + } + + const uvec2 cmd_min_tile = uvec2(floor(max(min(cmd_min, constants.screen_resolution), 0.0) / TILE_SIZE)); + const uvec2 cmd_max_tile = uvec2(floor(max(min(cmd_max, constants.screen_resolution), 0.0) / TILE_SIZE)); + const uvec2 cmds_min_tile = subgroupMin(cmd_min_tile); + const uvec2 cmds_max_tile = subgroupMax(cmd_max_tile); + + // Are any single commands responsible for the entire bounds? + const bool cmd_absolute_min = cmd_min_tile == cmds_min_tile; + const bool cmd_absolute_max = cmd_max_tile == cmds_max_tile; + const bool use_individual_bounds = !any(notEqual(subgroupBallot(cmd_absolute_min) & subgroupBallot(cmd_absolute_max), uvec4(0))); + + if (false && use_individual_bounds) { + + } else { + const uvec2 tile_count = cmds_max_tile - cmds_min_tile + uvec2(1); + const uint count = tile_count.x * tile_count.y; + + uint offset; + if (subgroupElect()) { + offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1; + } + offset = subgroupBroadcastFirst(offset); + + if (offset >= constants.coarse_buffer_len) { + return; + } + + for (uint y = 0; y < tile_count.y; y++) { + for (uint x = 0; x < tile_count.x; x += gl_SubgroupSize) { + const uint local_x = x + gl_SubgroupInvocationID; + if (local_x < tile_count.x) { + const uint yy = cmds_min_tile.y + y; + const uint xx = cmds_min_tile.x + local_x; + const uint packed = ((yy & 0xff) << 24) | ((xx & 0xff) << 16) | (gl_WorkGroupID.x & 0xffff); + constants.coarse_buffer.values[offset + local_x] = packed; + } + subgroupBarrier(); + } + offset += tile_count.x; + } + } + + // if (gl_SubgroupSize == 32 && fullscreen_ballot.x != 0) { + // uint offset; + // if (subgroupElect()) { + // const uint count = constants.tile_resolution.x * constants.tile_resolution.y; + // offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1; + // } + // offset = subgroupBroadcastFirst(offset); + + // if (offset >= constants.coarse_buffer_len) { + // return; + // } + + // const uint word_index = gl_WorkGroupID.x; + + // for (uint y = 0; y < constants.tile_resolution.y; y++) { + // for (uint x = 0; x < constants.tile_resolution.x; x++) { + // const uint tile_index = y * constants.tile_resolution.x + x; + // const uint packed = (tile_index << 16) | word_index; + // if (subgroupElect() && offset + tile_index < constants.coarse_buffer_len) { + // constants.coarse_buffer.values[offset + tile_index] = packed; + // } + // } + // } + + // return; + // } + + // uint count = 0; + + // for (uint y = cmds_min_tile.y; y <= cmds_max_tile.y; y++) { + // for (uint x = cmds_min_tile.x; x <= cmds_max_tile.x; x++) { + // const vec2 tile_min = uvec2(x, y) * TILE_SIZE; + // const vec2 tile_max = min(tile_min + TILE_SIZE, constants.screen_resolution); + + // const bool intersects = !(any(lessThan(tile_max, cmd_min)) || any(greaterThan(tile_min, cmd_max))); + // const uvec4 ballot = subgroupBallot(intersects); + + // if (subgroupElect()) { + // count += uint(ballot.x != 0); + // } + // } + // } + + // if (count == 0) { + // return; + // } + + // uint offset; + // if (subgroupElect()) { + // offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1; + // } + // offset = subgroupBroadcastFirst(offset); + + // if (offset >= constants.coarse_buffer_len) { + // return; + // } + + // for (uint y = cmds_min_tile.y; y <= cmds_max_tile.y; y++) { + // for (uint x = cmds_min_tile.x; x <= cmds_max_tile.x; x++) { + // const vec2 tile_min = uvec2(x, y) * TILE_SIZE; + // const vec2 tile_max = min(tile_min + TILE_SIZE, constants.screen_resolution); + // const uint tile_index = y * constants.tile_resolution.x + x; + + // const bool intersects = !(any(lessThan(tile_max, cmd_min)) || any(greaterThan(tile_min, cmd_max))); + // const uvec4 ballot = subgroupBallot(intersects); + + // if (subgroupElect() && ballot.x != 0 && offset < constants.coarse_buffer_len) { + // const uint word_index = gl_WorkGroupID.x; + // const uint packed = (tile_index << 16) | word_index; + // constants.coarse_buffer.values[offset++] = packed; + // } + // } + // } +} diff --git a/title/shark-shaders/shaders/draw_2d_bin_2_sort.comp b/title/shark-shaders/shaders/draw_2d_bin_2_sort.comp new file mode 100644 index 0000000..5a99c22 --- /dev/null +++ b/title/shark-shaders/shaders/draw_2d_bin_2_sort.comp @@ -0,0 +1,49 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_shuffle_relative: enable +#extension GL_KHR_shader_subgroup_vote : require + +#include "compute_bindings.h" + +#include "draw_2d.h" +#include "indirect.h" +#include "radix_sort.h" + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer VkDispatchIndirectCommandRef { + VkDispatchIndirectCommand dimensions; +}; + +struct Draw2dSortConstants { + uint coarse_buffer_len; + uint _pad; + VkDispatchIndirectCommandRef indirect_dispatch_buffer; + CoarseRef coarse_buffer; +}; + +layout(std430, push_constant) uniform Draw2dSortConstantsBlock { + Draw2dSortConstants constants; +}; + +layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +void main() { + // We shouldn't overflow the coarse buffer in the scatter phase, but we can + // still end up with a count that's larger than the buffer size since we + // unconditionally atomicAdd. So we need to clamp to the actual size now + // before dispatching sort work. + const uint count = min(constants.coarse_buffer_len, constants.coarse_buffer.values[0]); + constants.coarse_buffer.values[0] = count; + + constants.indirect_dispatch_buffer.dimensions.x = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP; + constants.indirect_dispatch_buffer.dimensions.y = 1; + constants.indirect_dispatch_buffer.dimensions.z = 1; +} diff --git a/title/shark-shaders/shaders/draw_2d_bin_3_resolve.comp b/title/shark-shaders/shaders/draw_2d_bin_3_resolve.comp new file mode 100644 index 0000000..718563d --- /dev/null +++ b/title/shark-shaders/shaders/draw_2d_bin_3_resolve.comp @@ -0,0 +1,22 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_vote : require + +#include "compute_bindings.h" + +const uint SUBGROUP_SIZE = 64; + +// TODO: Spec constant support for different subgroup sizes. +layout (local_size_x = SUBGROUP_SIZE, local_size_y = 1, local_size_z = 1) in; + +void main() { +} diff --git a/title/shark-shaders/shaders/draw_2d_rasterize.comp b/title/shark-shaders/shaders/draw_2d_rasterize.comp new file mode 100644 index 0000000..dfa9b7d --- /dev/null +++ b/title/shark-shaders/shaders/draw_2d_rasterize.comp @@ -0,0 +1,145 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_ballot : require + +#include "compute_bindings.h" + +layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in; + +/// x = (((index >> 2) & 0x0007) & 0xFFFE) | index & 0x0001 +/// y = ((index >> 1) & 0x0003) | (((index >> 3) & 0x0007) & 0xFFFC) + +#define DEBUG_SHOW_TILES 0 + +#if DEBUG_SHOW_TILES != 0 + +vec3 plasma_quintic(float x) +{ + x = clamp(x, 0.0, 1.0); + vec4 x1 = vec4(1.0, x, x * x, x * x * x); // 1 x x2 x3 + vec4 x2 = x1 * x1.w * x; // x4 x5 x6 x7 + return vec3( + dot(x1.xyzw, vec4(+0.063861086, +1.992659096, -1.023901152, -0.490832805)) + dot(x2.xy, vec2(+1.308442123, -0.914547012)), + dot(x1.xyzw, vec4(+0.049718590, -0.791144343, +2.892305078, +0.811726816)) + dot(x2.xy, vec2(-4.686502417, +2.717794514)), + dot(x1.xyzw, vec4(+0.513275779, +1.580255060, -5.164414457, +4.559573646)) + dot(x2.xy, vec2(-1.916810682, +0.570638854)) + ); +} + +#endif + +void main() { + // const uvec2 tile_coord = gl_WorkGroupID.xy / (TILE_SIZE / gl_WorkGroupSize.xy); + // const uint tile_index = tile_coord.y * uniforms.tile_resolution.x + tile_coord.x; + + // TilesRead tiles_read = TilesRead(uniforms.tiles); + + // const uint lo = tiles_read.values[tile_base + TILE_BITMAP_RANGE_LO_OFFSET]; + // const uint hi = tiles_read.values[tile_base + TILE_BITMAP_RANGE_HI_OFFSET]; + + // if (hi < lo) { + // return; + // } + +// #if DEBUG_SHOW_TILES == 1 + +// uint count = 0; +// // For each tile, iterate over all words in the L1 bitmap. +// for (uint index_l1 = lo; index_l1 <= hi; index_l1++) { +// // For each word, iterate all set bits. +// uint bitmap_l1 = tiles_read.values[tile_base + TILE_BITMAP_L1_OFFSET + index_l1]; + +// while (bitmap_l1 != 0) { +// const uint i = findLSB(bitmap_l1); +// bitmap_l1 ^= bitmap_l1 & -bitmap_l1; + +// // For each set bit in the L1 bitmap, iterate the set bits in the +// // corresponding L0 bitmap. +// const uint index_l0 = index_l1 * 32 + i; +// uint bitmap_l0 = tiles_read.values[tile_base + TILE_BITMAP_L0_OFFSET + index_l0]; + +// count += bitCount(bitmap_l0); +// } +// } + +// const vec3 color = plasma_quintic(float(count) / 100.0); +// imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), vec4(color, 1.0)); + +// #elif DEBUG_SHOW_TILES == 2 + +// uint count = hi - lo; +// const vec3 color = plasma_quintic(float(count) / 100.0); +// imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), vec4(color, 1.0)); + +// #else + +// const vec2 sample_center = gl_GlobalInvocationID.xy + vec2(0.5); + +// vec4 accum = vec4(0.0); + +// // For each tile, iterate over all words in the L1 bitmap. +// for (uint index_l1 = lo; index_l1 <= hi; index_l1++) { +// // For each word, iterate all set bits. +// uint bitmap_l1 = tiles_read.values[tile_base + TILE_BITMAP_L1_OFFSET + index_l1]; + +// while (bitmap_l1 != 0) { +// const uint i = findLSB(bitmap_l1); +// bitmap_l1 ^= bitmap_l1 & -bitmap_l1; + +// // For each set bit in the L1 bitmap, iterate the set bits in the +// // corresponding L0 bitmap. +// const uint index_l0 = index_l1 * 32 + i; +// uint bitmap_l0 = tiles_read.values[tile_base + TILE_BITMAP_L0_OFFSET + index_l0]; +// while (bitmap_l0 != 0) { +// const uint j = findLSB(bitmap_l0); +// bitmap_l0 ^= bitmap_l0 & -bitmap_l0; + +// // Set bits in the L0 bitmap indicate binned primitives for this tile. +// const uint primitive_index = index_l0 * 32 + j; +// const Primitive2d primitive = uniforms.primitives.values[primitive_index]; +// const uint type = primitive.type; + +// vec4 primitive_color = vec4(0.0); + +// switch (type) { +// case PRIMITIVE_TYPE_RECT: { +// // const Rect rect = uniforms.rects.values[offset]; +// // const vec2 rect_min = primitive_instance.position - rect.half_extent - rect.border_width; +// // const vec2 rect_max = primitive_instance.position + rect.half_extent + rect.border_width; +// if (all(greaterThanEqual(sample_center, rect_min)) && all(lessThanEqual(sample_center, rect_max))) { +// primitive_color = unpackUnorm4x8(primitive_instance.color).bgra; +// } +// break; +// } +// case PRIMITIVE_TYPE_GLYPH: { +// const Glyph glyph = uniforms.glyphs.values[offset]; +// const vec2 glyph_min = primitive_instance.position + glyph.offset_min; +// const vec2 glyph_max = primitive_instance.position + glyph.offset_max; +// if (all(greaterThanEqual(sample_center, glyph_min)) && all(lessThanEqual(sample_center, glyph_max))) { +// const vec2 glyph_size = glyph.offset_max - glyph.offset_min; +// const vec2 uv = mix(glyph.atlas_min, glyph.atlas_max, (sample_center - glyph_min) / glyph_size) / uniforms.atlas_resolution; +// const vec4 color = unpackUnorm4x8(primitive_instance.color).bgra; +// const float coverage = textureLod(sampler2D(glyph_atlas, bilinear_sampler), uv, 0.0).r * color.a; +// primitive_color = color * coverage; +// } +// break; +// } +// } + +// // does it blend? +// accum.rgba = primitive_color.rgba + accum.rgba * (1.0 - primitive_color.a); +// } +// } +// } + +// imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), accum); + +// #endif +} diff --git a/title/shark-shaders/shaders/indirect.h b/title/shark-shaders/shaders/indirect.h new file mode 100644 index 0000000..d409cf2 --- /dev/null +++ b/title/shark-shaders/shaders/indirect.h @@ -0,0 +1,10 @@ +#ifndef INDIRECT_H +#define INDIRECT_H + +struct VkDispatchIndirectCommand { + uint x; + uint y; + uint z; +}; + +#endif \ No newline at end of file diff --git a/title/shark-shaders/shaders/primitive_2d.h b/title/shark-shaders/shaders/primitive_2d.h deleted file mode 100644 index ae6a2ec..0000000 --- a/title/shark-shaders/shaders/primitive_2d.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef PRIMITIVE_2D_INCLUDE -#define PRIMITIVE_2D_INCLUDE - -const uint TILE_SIZE = 32; - -const uint MAX_PRIMS = 1 << 18; -const uint TILE_BITMAP_L1_WORDS = (MAX_PRIMS / 32 / 32); -const uint TILE_BITMAP_L0_WORDS = (MAX_PRIMS / 32); -const uint TILE_STRIDE = (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS + 2); -const uint TILE_BITMAP_RANGE_LO_OFFSET = 0; -const uint TILE_BITMAP_RANGE_HI_OFFSET = (TILE_BITMAP_RANGE_LO_OFFSET + 1); -const uint TILE_BITMAP_L1_OFFSET = (TILE_BITMAP_RANGE_HI_OFFSET + 1); -const uint TILE_BITMAP_L0_OFFSET = (TILE_BITMAP_L1_OFFSET + TILE_BITMAP_L1_WORDS); - -const uint PRIMITIVE_TYPE_RECT = 0; -const uint PRIMITIVE_TYPE_GLYPH = 1; - -struct PrimitiveInstance { - uint packed; - uint color; - vec2 position; -}; - -struct Rect { - vec2 half_extent; - float border_width; - float border_radius; -}; - -struct Glyph { - ivec2 atlas_min; - ivec2 atlas_max; - - vec2 offset_min; - vec2 offset_max; -}; - -#endif \ No newline at end of file diff --git a/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl deleted file mode 100644 index 6398615..0000000 --- a/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl +++ /dev/null @@ -1,106 +0,0 @@ -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_buffer_reference : require -#extension GL_EXT_buffer_reference2 : require -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_control_flow_attributes : require - -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_KHR_shader_subgroup_ballot : require -#extension GL_KHR_shader_subgroup_vote : require - -#include "compute_bindings.h" - -const uint SUBGROUP_SIZE = 64; -const uint NUM_SUBGROUPS = 16; -const uint NUM_PRIMITIVES_WG = (SUBGROUP_SIZE * NUM_SUBGROUPS); - -// TODO: Spec constant support for different subgroup sizes. -layout (local_size_x = SUBGROUP_SIZE, local_size_y = 1, local_size_z = 1) in; - -void main() { - uint word_index = 0; - - for (uint i = 0; i < NUM_PRIMITIVES_WG; i += gl_SubgroupSize.x) { - const uint primitive_index = gl_WorkGroupID.x * NUM_PRIMITIVES_WG + i + gl_SubgroupInvocationID; - - // Bounds for this primitive, any tiles which intersect this AABB will be written. - vec2 primitive_min = vec2(99999.9); - vec2 primitive_max = vec2(-99999.9); - - if (primitive_index < uniforms.num_primitives) { - const PrimitiveInstance primitive_instance = uniforms.primitive_instances.values[primitive_index]; - const uint type = bitfieldExtract(primitive_instance.packed, 30, 2); - const uint offset = bitfieldExtract(primitive_instance.packed, 0, 20); - - for (;;) { - const uint scalar_type = subgroupBroadcastFirst(type); - [[branch]] - if (scalar_type == type) { - switch (type) { - case PRIMITIVE_TYPE_RECT: - const Rect rect = uniforms.rects.values[offset]; - primitive_min = primitive_instance.position - rect.half_extent; - primitive_max = primitive_instance.position + rect.half_extent; - break; - case PRIMITIVE_TYPE_GLYPH: - const Glyph glyph = uniforms.glyphs.values[offset]; - primitive_min = primitive_instance.position + glyph.offset_min; - primitive_max = primitive_instance.position + glyph.offset_max; - break; - } - break; - } - } - } - - const vec2 primitives_min = subgroupMin(primitive_min); - const vec2 primitives_max = subgroupMax(primitive_max); - - if (any(greaterThan(primitives_min, uniforms.screen_resolution)) || any(lessThan(primitives_max, vec2(0.0)))) { - word_index += 2; - continue; - } - - const ivec2 tile_start = ivec2(floor(max(min(primitives_min, uniforms.screen_resolution), 0.0) / TILE_SIZE)); - const ivec2 tile_end = ivec2(floor((max(min(primitives_max, uniforms.screen_resolution), 0.0) + (TILE_SIZE - 1)) / TILE_SIZE)); - - for (int y = tile_start.y; y < tile_end.y; y++) { - for (int x = tile_start.x; x < tile_end.x; x++) { - const uvec2 tile_coord = uvec2(x, y); - const vec2 tile_min = tile_coord * TILE_SIZE; - const vec2 tile_max = min(tile_min + TILE_SIZE, uniforms.screen_resolution); - - const bool intersects = !(any(lessThan(tile_max, primitive_min)) || any(greaterThan(tile_min, primitive_max))); - const uvec4 ballot = subgroupBallot(intersects); - - if (ballot.x == 0 && ballot.y == 0) { - continue; - } - - const uint tile_index = tile_coord.y * uniforms.tile_stride + tile_coord.x; - - if (ballot.x != 0) { - uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_L0_OFFSET + gl_WorkGroupID.x * 32 + word_index + 0] = ballot.x; - } - - if (ballot.y != 0) { - uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_L0_OFFSET + gl_WorkGroupID.x * 32 + word_index + 1] = ballot.y; - } - - if (subgroupElect()) { - uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_L1_OFFSET + gl_WorkGroupID.x] |= - (uint(ballot.x != 0) << (word_index + 0)) | - (uint(ballot.y != 0) << (word_index + 1)); - - atomicMin(uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_LO_OFFSET], gl_WorkGroupID.x); - atomicMax(uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_HI_OFFSET], gl_WorkGroupID.x); - } - } - } - - word_index += 2; - } -} diff --git a/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl deleted file mode 100644 index 8d443a2..0000000 --- a/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl +++ /dev/null @@ -1,27 +0,0 @@ -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_buffer_reference : require -#extension GL_EXT_buffer_reference2 : require -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_control_flow_attributes : require - -#extension GL_KHR_shader_subgroup_vote : require -#extension GL_KHR_shader_subgroup_ballot : require - -#include "compute_bindings.h" - -// TODO: Spec constant support for different subgroup sizes. -layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; - -void main() { - const uint tile_index = gl_GlobalInvocationID.z * uniforms.tile_stride + gl_GlobalInvocationID.y; - - uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_LO_OFFSET] = 0xffffffff; - uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_HI_OFFSET] = 0; - - if (gl_GlobalInvocationID.x < TILE_BITMAP_L1_WORDS) { - uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_L1_OFFSET + gl_GlobalInvocationID.x] = 0; - } -} diff --git a/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl b/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl deleted file mode 100644 index c61ce61..0000000 --- a/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl +++ /dev/null @@ -1,143 +0,0 @@ -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_buffer_reference : require -#extension GL_EXT_buffer_reference2 : require -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_control_flow_attributes : require - -#extension GL_KHR_shader_subgroup_vote : require -#extension GL_KHR_shader_subgroup_ballot : require - -#include "compute_bindings.h" -#include "primitive_2d.h" - -layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in; - -#define DEBUG_SHOW_TILES 0 - -#if DEBUG_SHOW_TILES != 0 - -vec3 plasma_quintic(float x) -{ - x = clamp(x, 0.0, 1.0); - vec4 x1 = vec4(1.0, x, x * x, x * x * x); // 1 x x2 x3 - vec4 x2 = x1 * x1.w * x; // x4 x5 x6 x7 - return vec3( - dot(x1.xyzw, vec4(+0.063861086, +1.992659096, -1.023901152, -0.490832805)) + dot(x2.xy, vec2(+1.308442123, -0.914547012)), - dot(x1.xyzw, vec4(+0.049718590, -0.791144343, +2.892305078, +0.811726816)) + dot(x2.xy, vec2(-4.686502417, +2.717794514)), - dot(x1.xyzw, vec4(+0.513275779, +1.580255060, -5.164414457, +4.559573646)) + dot(x2.xy, vec2(-1.916810682, +0.570638854)) - ); -} - -#endif - -void main() { - const uvec2 tile_coord = gl_WorkGroupID.xy / (TILE_SIZE / gl_WorkGroupSize.xy); - const uint tile_index = tile_coord.y * uniforms.tile_stride + tile_coord.x; - const uint tile_base = tile_index * TILE_STRIDE; - - TilesRead tiles_read = TilesRead(uniforms.tiles); - - const uint lo = tiles_read.values[tile_base + TILE_BITMAP_RANGE_LO_OFFSET]; - const uint hi = tiles_read.values[tile_base + TILE_BITMAP_RANGE_HI_OFFSET]; - - if (hi < lo) { - return; - } - -#if DEBUG_SHOW_TILES == 1 - - uint count = 0; - // For each tile, iterate over all words in the L1 bitmap. - for (uint index_l1 = lo; index_l1 <= hi; index_l1++) { - // For each word, iterate all set bits. - uint bitmap_l1 = tiles_read.values[tile_base + TILE_BITMAP_L1_OFFSET + index_l1]; - - while (bitmap_l1 != 0) { - const uint i = findLSB(bitmap_l1); - bitmap_l1 ^= bitmap_l1 & -bitmap_l1; - - // For each set bit in the L1 bitmap, iterate the set bits in the - // corresponding L0 bitmap. - const uint index_l0 = index_l1 * 32 + i; - uint bitmap_l0 = tiles_read.values[tile_base + TILE_BITMAP_L0_OFFSET + index_l0]; - - count += bitCount(bitmap_l0); - } - } - - const vec3 color = plasma_quintic(float(count) / 100.0); - imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), vec4(color, 1.0)); - -#elif DEBUG_SHOW_TILES == 2 - - uint count = hi - lo; - const vec3 color = plasma_quintic(float(count) / 100.0); - imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), vec4(color, 1.0)); - -#else - - const vec2 sample_center = gl_GlobalInvocationID.xy + vec2(0.5); - - vec4 accum = vec4(0.0); - - // For each tile, iterate over all words in the L1 bitmap. - for (uint index_l1 = lo; index_l1 <= hi; index_l1++) { - // For each word, iterate all set bits. - uint bitmap_l1 = tiles_read.values[tile_base + TILE_BITMAP_L1_OFFSET + index_l1]; - - while (bitmap_l1 != 0) { - const uint i = findLSB(bitmap_l1); - bitmap_l1 ^= bitmap_l1 & -bitmap_l1; - - // For each set bit in the L1 bitmap, iterate the set bits in the - // corresponding L0 bitmap. - const uint index_l0 = index_l1 * 32 + i; - uint bitmap_l0 = tiles_read.values[tile_base + TILE_BITMAP_L0_OFFSET + index_l0]; - while (bitmap_l0 != 0) { - const uint j = findLSB(bitmap_l0); - bitmap_l0 ^= bitmap_l0 & -bitmap_l0; - - // Set bits in the L0 bitmap indicate binned primitives for this tile. - const uint primitive_index = index_l0 * 32 + j; - const PrimitiveInstance primitive_instance = uniforms.primitive_instances.values[primitive_index]; - const uint type = bitfieldExtract(primitive_instance.packed, 30, 2); - const uint offset = bitfieldExtract(primitive_instance.packed, 0, 20); - - switch (type) { - case PRIMITIVE_TYPE_RECT: { - const Rect rect = uniforms.rects.values[offset]; - const vec2 rect_min = primitive_instance.position - rect.half_extent; - const vec2 rect_max = primitive_instance.position + rect.half_extent; - if (all(greaterThanEqual(sample_center, rect_min)) && all(lessThanEqual(sample_center, rect_max))) { - const vec4 color = unpackUnorm4x8(primitive_instance.color).bgra; - accum.rgb = color.rgb * color.a + accum.rgb * (1.0 - color.a); - accum.a = color.a + accum.a * (1.0 - color.a); - } - break; - } - case PRIMITIVE_TYPE_GLYPH: { - const Glyph glyph = uniforms.glyphs.values[offset]; - const vec2 glyph_min = primitive_instance.position + glyph.offset_min; - const vec2 glyph_max = primitive_instance.position + glyph.offset_max; - if (all(greaterThanEqual(sample_center, glyph_min)) && all(lessThanEqual(sample_center, glyph_max))) { - const vec2 glyph_size = glyph.offset_max - glyph.offset_min; - const vec2 uv = mix(glyph.atlas_min, glyph.atlas_max, (sample_center - glyph_min) / glyph_size) / uniforms.atlas_resolution; - const vec4 color = unpackUnorm4x8(primitive_instance.color).bgra; - const float coverage = textureLod(sampler2D(glyph_atlas, bilinear_sampler), uv, 0.0).r * color.a; - accum.rgb = color.rgb * coverage + accum.rgb * (1.0 - coverage); - accum.a = coverage + accum.a * (1.0 - coverage); - } - break; - } - } - } - } - } - - imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), accum); - -#endif -} diff --git a/title/shark-shaders/shaders/radix_sort.h b/title/shark-shaders/shaders/radix_sort.h new file mode 100644 index 0000000..52ce630 --- /dev/null +++ b/title/shark-shaders/shaders/radix_sort.h @@ -0,0 +1,14 @@ +#ifndef RADIX_SORT_H +#define RADIX_SORT_H + +const uint RADIX_BITS = 8; +const uint RADIX_DIGITS = 1 << RADIX_BITS; +const uint RADIX_MASK = RADIX_DIGITS - 1; + +const uint RADIX_WGP_SIZE = 256; +const uint RADIX_ITEMS_PER_INVOCATION = 16; +const uint RADIX_ITEMS_PER_WGP = RADIX_WGP_SIZE * RADIX_ITEMS_PER_INVOCATION; + +const uint RADIX_SPINE_WGP_SIZE = 256; + +#endif diff --git a/title/shark-shaders/shaders/radix_sort_0_upsweep.comp b/title/shark-shaders/shaders/radix_sort_0_upsweep.comp new file mode 100644 index 0000000..69f4904 --- /dev/null +++ b/title/shark-shaders/shaders/radix_sort_0_upsweep.comp @@ -0,0 +1,84 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#include "compute_bindings.h" + +#include "radix_sort.h" + +#include "draw_2d.h" +#include "indirect.h" + +layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer CountRef { + uint value; +}; + +layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer ValuesRef { + uint values[]; +}; + +layout(buffer_reference, std430, buffer_reference_align = 4) writeonly buffer SpineRef { + uint values[]; +}; + +struct RadixSortUpsweepConstants { + uint shift; + uint _pad; + CountRef count_buffer; + ValuesRef src_buffer; + SpineRef spine_buffer; +}; + +layout(std430, push_constant) uniform RadixSortUpsweepConstantsBlock { + RadixSortUpsweepConstants constants; +}; + +shared uint histogram[RADIX_DIGITS]; + +layout (local_size_x = RADIX_DIGITS, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint count = constants.count_buffer.value; + const uint shift = constants.shift; + + // Clear local histogram + histogram[gl_LocalInvocationID.x] = 0; + + barrier(); + + const uint start = gl_WorkGroupID.x * RADIX_ITEMS_PER_WGP; + const uint end = start + RADIX_ITEMS_PER_WGP; + + const bool skip_bounds_check = end <= count; + + if (skip_bounds_check) { + for (uint i = start; i < end; i += RADIX_DIGITS) { + const uint index = i + gl_LocalInvocationID.x; + const uint value = constants.src_buffer.values[index]; + const uint digit = (value >> shift) & RADIX_MASK; + atomicAdd(histogram[digit], 1); + } + } else { + for (uint i = start; i < end; i += RADIX_DIGITS) { + const uint index = i + gl_LocalInvocationID.x; + if (index < count) { + const uint value = constants.src_buffer.values[index]; + const uint digit = (value >> shift) & RADIX_MASK; + atomicAdd(histogram[digit], 1); + } + } + } + + barrier(); + + // Scatter to the spine, this is a striped layout so we can efficiently + // calculate the prefix sum. Re-calculate how many workgroups we dispatched + // to determine the stride we need to write at. + const uint wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP; + constants.spine_buffer.values[(gl_LocalInvocationID.x * wgp_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x]; +} \ No newline at end of file diff --git a/title/shark-shaders/shaders/radix_sort_1_spine.comp b/title/shark-shaders/shaders/radix_sort_1_spine.comp new file mode 100644 index 0000000..0b1dfc4 --- /dev/null +++ b/title/shark-shaders/shaders/radix_sort_1_spine.comp @@ -0,0 +1,91 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_shuffle_relative: enable +#extension GL_KHR_shader_subgroup_vote : require + +//#extension GL_EXT_debug_printf : enable + +#include "compute_bindings.h" + +#include "radix_sort.h" + +#include "draw_2d.h" +#include "indirect.h" + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer CountRef { + uint value; +}; + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer SpineRef { + uint values[]; +}; + +struct RadixSortSpineConstants { + CountRef count_buffer; + SpineRef spine_buffer; +}; + +layout(std430, push_constant) uniform RadixSortSpineConstantsBlock { + RadixSortSpineConstants constants; +}; + +layout (constant_id = 0) const uint SUBGROUP_SIZE = 64; + +const uint NUM_SUBGROUPS = RADIX_SPINE_WGP_SIZE / SUBGROUP_SIZE; + +shared uint sums[NUM_SUBGROUPS]; +shared uint carry_in; + +layout (local_size_x = RADIX_SPINE_WGP_SIZE, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; + + const uint count = constants.count_buffer.value; + + // Re-calculate how many workgroups pushed data into the spine + const uint upsweep_wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP; + + carry_in = 0; + for (uint i = 0; i < upsweep_wgp_count; i++) { + const uint spine_index = i * RADIX_DIGITS + local_id; + + // Load values and calculate partial sums + const uint value = constants.spine_buffer.values[spine_index]; + const uint sum = subgroupAdd(value); + const uint scan = subgroupExclusiveAdd(value); + + if (subgroupElect()) { + sums[gl_SubgroupID] = sum; + } + + barrier(); + + // Scan partials + if (local_id < NUM_SUBGROUPS) { + sums[local_id] = subgroupExclusiveAdd(sums[local_id]); + } + + const uint carry = carry_in; + + barrier(); + + // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan + constants.spine_buffer.values[spine_index] = carry + sums[gl_SubgroupID] + scan; + + if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) { + carry_in += sums[gl_SubgroupID] + sum; + } + + memoryBarrierShared(); + } +} diff --git a/title/shark-shaders/shaders/radix_sort_2_downsweep.comp b/title/shark-shaders/shaders/radix_sort_2_downsweep.comp new file mode 100644 index 0000000..4a1f1c8 --- /dev/null +++ b/title/shark-shaders/shaders/radix_sort_2_downsweep.comp @@ -0,0 +1,115 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_buffer_reference : require +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_shuffle_relative: enable +#extension GL_KHR_shader_subgroup_vote : require + +//#extension GL_EXT_debug_printf : enable + +#include "compute_bindings.h" + +#include "radix_sort.h" + +#include "draw_2d.h" +#include "indirect.h" + +layout (constant_id = 0) const uint SUBGROUP_SIZE = 64; + +const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE; + +layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer CountRef { + uint value; +}; + +layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer SpineRef { + uint values[]; +}; + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer ValuesRef { + uint values[]; +}; + +struct RadixSortDownsweepConstants { + uint shift; + uint _pad; + CountRef count_buffer; + SpineRef spine_buffer; + ValuesRef src_buffer; + ValuesRef dst_buffer; +}; + +layout(std430, push_constant) uniform RadixSortDownsweepConstantsBlock { + RadixSortDownsweepConstants constants; +}; + +shared uint values[RADIX_WGP_SIZE]; +shared uint spine[RADIX_DIGITS]; +shared uint match_masks[NUM_SUBGROUPS][RADIX_DIGITS]; + +layout (local_size_x = RADIX_WGP_SIZE, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint shift = constants.shift; + const uint count = constants.count_buffer.value; + const uint wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP; + + const uint start = gl_WorkGroupID.x * RADIX_ITEMS_PER_WGP; + const uint end = min(start + RADIX_ITEMS_PER_WGP, count); + + const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; + + // Gather from spine. + spine[local_id] = constants.spine_buffer.values[(local_id * wgp_count) + gl_WorkGroupID.x]; + + for (uint value_base = start; value_base < end; value_base += RADIX_WGP_SIZE) { + // Clear shared memory and load values from src buffer. + for (uint i = 0; i < NUM_SUBGROUPS; i++) { + match_masks[i][local_id] = 0; + } + + barrier(); + + const uint global_offset = value_base + local_id; + uint value = 0xffffffff; + uint digit = 0xff; + + if (global_offset < end) { + value = constants.src_buffer.values[global_offset]; + digit = (value >> shift) & RADIX_MASK; + atomicOr(match_masks[gl_SubgroupID][digit], 1 << gl_SubgroupInvocationID); + } + + barrier(); + + if (global_offset < end) { + const uint peer_mask = match_masks[gl_SubgroupID][digit]; + + uint peer_scan = bitCount(peer_mask & gl_SubgroupLtMask.x); + for (uint i = 0; i < gl_NumSubgroups; i++) { + if (i < gl_SubgroupID) { + peer_scan += bitCount(match_masks[i][digit]); + } + } + + const uint dst_index = spine[digit] + peer_scan; + constants.dst_buffer.values[dst_index] = value; + } + + barrier(); + + // Increment the spine with the counts for the workgroup we just wrote out. + for (uint i = 0; i < NUM_SUBGROUPS; i++) { + atomicAdd(spine[local_id], bitCount(match_masks[i][local_id])); + } + } +} diff --git a/title/shark/src/helpers.rs b/title/shark/src/helpers.rs index d211c6f..e45e0ba 100644 --- a/title/shark/src/helpers.rs +++ b/title/shark/src/helpers.rs @@ -3,7 +3,7 @@ use std::path::Path; use narcissus_core::{obj, Widen}; use narcissus_maths::{vec2, vec3, vec4, Vec2, Vec3}; -use crate::pipelines::basic::Vertex; +use crate::pipelines::Vertex; pub fn load_obj>(path: P) -> (Vec, Vec) { #[derive(Default)] diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs index 1adfc5f..8986482 100644 --- a/title/shark/src/main.rs +++ b/title/shark/src/main.rs @@ -3,9 +3,14 @@ use std::ops::Index; use std::path::Path; use std::time::{Duration, Instant}; -use narcissus_core::{dds, Widen as _}; -use pipelines::basic::BasicPipeline; -use pipelines::{PrimitiveInstance, PrimitiveUniforms, Rect, TILE_SIZE, TILE_STRIDE}; +use narcissus_core::dds; + +use pipelines::{ + BasicConstants, ComputeBinds, Draw2dClearConstants, Draw2dCmd, Draw2dScatterConstants, + Draw2dSortConstants, GraphicsBinds, Pipelines, RadixSortDownsweepConstants, + RadixSortSpineConstants, RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE, +}; + use renderdoc_sys as rdoc; use fonts::{FontFamily, Fonts}; @@ -14,14 +19,13 @@ use narcissus_app::{create_app, Event, Key, PressedState, WindowDesc}; use narcissus_core::{box_assume_init, default, rand::Pcg64, zeroed_box, BitIter}; use narcissus_font::{FontCollection, GlyphCache, HorizontalMetrics}; use narcissus_gpu::{ - create_device, Access, Bind, BindDesc, BindGroupLayout, BindingType, Buffer, BufferDesc, - BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder, ColorSpace, ComputePipelineDesc, - Device, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Image, ImageAspectFlags, - ImageBarrier, ImageDesc, ImageDimension, ImageFormat, ImageLayout, ImageSubresourceRange, - ImageTiling, ImageUsageFlags, IndexType, LoadOp, MemoryLocation, Offset2d, PersistentBuffer, - Pipeline, PipelineLayout, PresentMode, PushConstantRange, RenderingAttachment, RenderingDesc, - Sampler, SamplerAddressMode, SamplerDesc, SamplerFilter, Scissor, ShaderDesc, ShaderStageFlags, - StoreOp, SwapchainConfigurator, SwapchainImage, ThreadToken, TypedBind, Viewport, + create_device, Access, Bind, BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder, + ColorSpace, Device, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Image, + ImageAspectFlags, ImageBarrier, ImageDesc, ImageDimension, ImageFormat, ImageLayout, + ImageSubresourceRange, ImageTiling, ImageUsageFlags, IndexType, LoadOp, MemoryLocation, + Offset2d, PersistentBuffer, PresentMode, RenderingAttachment, RenderingDesc, Scissor, + ShaderStageFlags, StoreOp, SwapchainConfigurator, SwapchainImage, ThreadToken, TypedBind, + Viewport, }; use narcissus_image as image; use narcissus_maths::{ @@ -30,8 +34,6 @@ use narcissus_maths::{ }; use spring::simple_spring_damper_exact; -use crate::pipelines::basic::BasicUniforms; - mod fonts; mod helpers; pub mod microshades; @@ -73,9 +75,9 @@ static GAME_VARIABLES: GameVariables = GameVariables { player_speed: 10.0, - weapon_cooldown: 0.0, + weapon_cooldown: 0.2, weapon_projectile_speed: 20.0, - weapon_projectile_lifetime: 6.0, + weapon_projectile_lifetime: 3.0, }; #[derive(Clone, Copy, Debug)] @@ -452,8 +454,7 @@ struct UiState<'a> { tmp_string: String, - primitive_instances: Vec, - rects: Vec, + draw_cmds: Vec, } impl<'a> UiState<'a> { @@ -465,8 +466,7 @@ impl<'a> UiState<'a> { fonts, glyph_cache, tmp_string: default(), - primitive_instances: vec![], - rects: vec![], + draw_cmds: vec![], } } @@ -476,17 +476,14 @@ impl<'a> UiState<'a> { let center_x = x + half_extent_x; let center_y = y + half_extent_y; - let rect_index = self.rects.len() as u32; - - self.rects.push(Rect { + self.draw_cmds.push(Draw2dCmd::rect( + center_x, + center_y, half_extent_x, half_extent_y, - border_width: 0.0, - border_radius: 0.0, - }); - - self.primitive_instances.push(PrimitiveInstance::rect( - rect_index, 0x4400ff00, center_x, center_y, + 1.0, + 0x4400ff00, + 0xffff0000, )) } @@ -530,50 +527,14 @@ impl<'a> UiState<'a> { x += advance * scale; - self.primitive_instances.push(PrimitiveInstance::glyph( - touched_glyph_index, - 0x880000ff, - x, - y, - )); + self.draw_cmds + .push(Draw2dCmd::glyph(touched_glyph_index, 0x880000ff, x, y)); x += advance_width * scale; } } } -enum SamplerRes { - Bilinear, -} - -pub struct Samplers { - bilinear: Sampler, -} - -impl Index for Samplers { - type Output = Sampler; - - fn index(&self, index: SamplerRes) -> &Self::Output { - match index { - SamplerRes::Bilinear => &self.bilinear, - } - } -} - -impl Samplers { - fn load(gpu: &Gpu) -> Samplers { - let bilinear = gpu.create_sampler(&SamplerDesc { - filter: SamplerFilter::Bilinear, - address_mode: SamplerAddressMode::Clamp, - compare_op: None, - mip_lod_bias: 0.0, - min_lod: 0.0, - max_lod: 0.0, - }); - Samplers { bilinear } - } -} - struct Model<'a> { indices: u32, vertex_buffer: PersistentBuffer<'a>, @@ -880,14 +841,6 @@ type Gpu = dyn Device + 'static; struct DrawState<'gpu> { gpu: &'gpu Gpu, - basic_pipeline: BasicPipeline, - - compute_bind_group_layout: BindGroupLayout, - bin_clear_pipeline: Pipeline, - bin_pipeline: Pipeline, - rasterize_pipeline: Pipeline, - display_transform_pipeline: Pipeline, - width: u32, height: u32, @@ -898,11 +851,10 @@ struct DrawState<'gpu> { color_image: Image, ui_image: Image, - tiles_buffer: Buffer, - glyph_atlas_image: Image, - _samplers: Samplers, + pipelines: Pipelines, + models: Models<'gpu>, images: Images, @@ -911,78 +863,12 @@ struct DrawState<'gpu> { impl<'gpu> DrawState<'gpu> { fn new(gpu: &'gpu Gpu, thread_token: &ThreadToken) -> Self { - let samplers = Samplers::load(gpu); - let immutable_samplers = &[samplers[SamplerRes::Bilinear]]; - - let compute_bind_group_layout = gpu.create_bind_group_layout(&[ - // Samplers - BindDesc::with_immutable_samplers(ShaderStageFlags::COMPUTE, immutable_samplers), - // Tony mc mapface LUT - BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage), - // Glyph Atlas - BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage), - // UI Render Target - BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), - // Color Render Target - BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), - // Composited output - BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), - ]); - - let compute_pipeline_layout = PipelineLayout { - bind_group_layouts: &[compute_bind_group_layout], - push_constant_ranges: &[PushConstantRange { - stage_flags: ShaderStageFlags::COMPUTE, - offset: 0, - size: std::mem::size_of::() as u32, - }], - }; - - let bin_clear_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { - shader: ShaderDesc { - entry: c"main", - code: shark_shaders::PRIMITIVE_2D_BIN_CLEAR_COMP_SPV, - }, - layout: &compute_pipeline_layout, - }); - - let bin_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { - shader: ShaderDesc { - entry: c"main", - code: shark_shaders::PRIMITIVE_2D_BIN_COMP_SPV, - }, - layout: &compute_pipeline_layout, - }); - - let rasterize_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { - shader: ShaderDesc { - entry: c"main", - code: shark_shaders::PRIMITIVE_2D_RASTERIZE_COMP_SPV, - }, - layout: &compute_pipeline_layout, - }); - - let display_transform_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { - shader: ShaderDesc { - entry: c"main", - code: shark_shaders::DISPLAY_TRANSFORM_COMP_SPV, - }, - layout: &compute_pipeline_layout, - }); - - let basic_pipeline = BasicPipeline::new(gpu, immutable_samplers); - + let pipelines = Pipelines::load(gpu); let models = Models::load(gpu); let images = Images::load(gpu, thread_token); Self { gpu, - basic_pipeline, - compute_bind_group_layout, - bin_clear_pipeline, - bin_pipeline, - rasterize_pipeline, - display_transform_pipeline, width: 0, height: 0, tile_resolution_x: 0, @@ -990,9 +876,8 @@ impl<'gpu> DrawState<'gpu> { depth_image: default(), color_image: default(), ui_image: default(), - tiles_buffer: default(), glyph_atlas_image: default(), - _samplers: samplers, + pipelines, models, images, transforms: vec![], @@ -1012,7 +897,7 @@ impl<'gpu> DrawState<'gpu> { let gpu = self.gpu; let half_turn_y = Mat3::from_axis_rotation(Vec3::Y, HalfTurn::new(0.5)); - let scale = Mat3::from_scale(Vec3::splat(0.125)); + let scale = Mat3::from_scale(Vec3::splat(0.4)); fn rotate_dir(dir: Vec3, up: Vec3) -> Mat3 { let f = dir.normalized(); @@ -1103,33 +988,8 @@ impl<'gpu> DrawState<'gpu> { gpu.destroy_image(frame, self.color_image); gpu.destroy_image(frame, self.ui_image); - let tile_resolution_x = (width + (TILE_SIZE - 1)) / TILE_SIZE; - let tile_resolution_y = (height + (TILE_SIZE - 1)) / TILE_SIZE; - - if tile_resolution_x != self.tile_resolution_x - || tile_resolution_y != self.tile_resolution_y - { - gpu.destroy_buffer(frame, self.tiles_buffer); - - let bitmap_buffer_size = tile_resolution_x - * tile_resolution_y - * TILE_STRIDE - * std::mem::size_of::() as u32; - - self.tiles_buffer = gpu.create_buffer(&BufferDesc { - memory_location: MemoryLocation::Device, - host_mapped: false, - usage: BufferUsageFlags::STORAGE, - size: bitmap_buffer_size.widen(), - }); - - gpu.debug_name_buffer(self.tiles_buffer.to_arg(), "tile bitmap"); - - println!("tile_resolution: ({tile_resolution_x},{tile_resolution_y})"); - - self.tile_resolution_x = tile_resolution_x; - self.tile_resolution_y = tile_resolution_y; - } + self.tile_resolution_x = (width + (DRAW_2D_TILE_SIZE - 1)) / DRAW_2D_TILE_SIZE; + self.tile_resolution_y = (height + (DRAW_2D_TILE_SIZE - 1)) / DRAW_2D_TILE_SIZE; self.depth_image = gpu.create_image(&ImageDesc { memory_location: MemoryLocation::Device, @@ -1320,89 +1180,98 @@ impl<'gpu> DrawState<'gpu> { // Render basic stuff. { - gpu.cmd_set_pipeline(cmd_encoder, self.basic_pipeline.pipeline); + let model = &self.models[ModelRes::Shark]; + let image = self.images[ImageRes::Shark]; - let basic_uniforms = BasicUniforms { clip_from_model }; + let instance_count = self.transforms.len() as u32; - let uniform_buffer = gpu.request_transient_buffer_with_data( + let transform_buffer = gpu.request_transient_buffer_with_data( frame, thread_token, - BufferUsageFlags::UNIFORM, - &basic_uniforms, + BufferUsageFlags::STORAGE, + self.transforms.as_slice(), ); - gpu.cmd_set_bind_group( + // We're done with you now! + self.transforms.clear(); + + let graphics_bind_group = gpu.request_transient_bind_group( frame, - cmd_encoder, - self.basic_pipeline.uniforms_bind_group_layout, - 0, + thread_token, + self.pipelines.graphics_bind_group_layout, &[Bind { - binding: 0, + binding: GraphicsBinds::Albedo as u32, array_element: 0, - typed: TypedBind::UniformBuffer(&[uniform_buffer.to_arg()]), + typed: TypedBind::SampledImage(&[(ImageLayout::Optimal, image)]), }], ); - { - let model = &self.models[ModelRes::Shark]; - let image = self.images[ImageRes::Shark]; - - let transform_buffer = gpu.request_transient_buffer_with_data( - frame, - thread_token, - BufferUsageFlags::STORAGE, - self.transforms.as_slice(), - ); - - gpu.cmd_set_bind_group( - frame, - cmd_encoder, - self.basic_pipeline.storage_bind_group_layout, - 1, - &[ - Bind { - binding: 0, - array_element: 0, - typed: TypedBind::StorageBuffer(&[model.vertex_buffer.to_arg()]), - }, - Bind { - binding: 1, - array_element: 0, - typed: TypedBind::StorageBuffer(&[transform_buffer.to_arg()]), - }, - Bind { - binding: 2, - array_element: 0, - typed: TypedBind::SampledImage(&[(ImageLayout::Optimal, image)]), - }, - ], - ); - - gpu.cmd_set_index_buffer( - cmd_encoder, - model.index_buffer.to_arg(), - 0, - IndexType::U16, - ); + gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.basic_pipeline); + gpu.cmd_set_bind_group(cmd_encoder, 0, &graphics_bind_group); + gpu.cmd_push_constants( + cmd_encoder, + ShaderStageFlags::VERTEX, + 0, + &BasicConstants { + clip_from_model, + vertex_buffer_address: gpu.get_buffer_address(model.vertex_buffer.to_arg()), + transform_buffer_address: gpu.get_buffer_address(transform_buffer.to_arg()), + }, + ); - gpu.cmd_draw_indexed( - cmd_encoder, - model.indices, - self.transforms.len() as u32, - 0, - 0, - 0, - ); - } + gpu.cmd_set_index_buffer( + cmd_encoder, + model.index_buffer.to_arg(), + 0, + IndexType::U16, + ); - // We're done with you now! - self.transforms.clear(); + gpu.cmd_draw_indexed(cmd_encoder, model.indices, instance_count, 0, 0, 0); } gpu.cmd_end_rendering(cmd_encoder); gpu.cmd_end_debug_marker(cmd_encoder); + let compute_bind_group = gpu.request_transient_bind_group( + frame, + thread_token, + self.pipelines.compute_bind_group_layout, + &[ + Bind { + binding: ComputeBinds::TonyMcMapfaceLut as u32, + array_element: 0, + typed: TypedBind::SampledImage(&[( + ImageLayout::Optimal, + self.images[ImageRes::TonyMcMapfaceLut], + )]), + }, + Bind { + binding: ComputeBinds::GlyphAtlas as u32, + array_element: 0, + typed: TypedBind::SampledImage(&[( + ImageLayout::Optimal, + self.glyph_atlas_image, + )]), + }, + Bind { + binding: ComputeBinds::UiRenderTarget as u32, + array_element: 0, + typed: TypedBind::StorageImage(&[(ImageLayout::General, self.ui_image)]), + }, + Bind { + binding: ComputeBinds::ColorRenderTarget as u32, + array_element: 0, + typed: TypedBind::StorageImage(&[(ImageLayout::General, self.color_image)]), + }, + Bind { + binding: ComputeBinds::CompositedRenderTarget as u32, + array_element: 0, + typed: TypedBind::StorageImage(&[(ImageLayout::General, swapchain_image)]), + }, + ], + ); + // Render UI { gpu.cmd_begin_debug_marker( @@ -1411,124 +1280,267 @@ impl<'gpu> DrawState<'gpu> { microshades::PURPLE_RGBA_F32[3], ); - let primitive_instance_buffer = gpu.request_transient_buffer_with_data( + let draw_buffer = gpu.request_transient_buffer_with_data( frame, thread_token, BufferUsageFlags::STORAGE, - ui_state.primitive_instances.as_slice(), + ui_state.draw_cmds.as_slice(), ); + + let draw_buffer_len = ui_state.draw_cmds.len() as u32; + ui_state.draw_cmds.clear(); + let glyph_buffer = gpu.request_transient_buffer_with_data( frame, thread_token, BufferUsageFlags::STORAGE, touched_glyphs, ); - let rect_buffer = gpu.request_transient_buffer_with_data( + + const COARSE_BUFFER_LEN: usize = 1 << 18; + let coarse_buffer = gpu.request_transient_buffer( frame, thread_token, BufferUsageFlags::STORAGE, - ui_state.rects.as_slice(), + COARSE_BUFFER_LEN * std::mem::size_of::(), ); - let num_primitives = ui_state.primitive_instances.len() as u32; - let num_primitives_32 = (num_primitives + 31) / 32; - let num_primitives_1024 = (num_primitives_32 + 31) / 32; - - ui_state.primitive_instances.clear(); + let indirect_dispatch_buffer = gpu.request_transient_buffer( + frame, + thread_token, + BufferUsageFlags::INDIRECT, + 3 * std::mem::size_of::(), + ); - gpu.cmd_set_pipeline(cmd_encoder, self.bin_clear_pipeline); + let sort_tmp_buffer = gpu.request_transient_buffer( + frame, + thread_token, + BufferUsageFlags::STORAGE, + COARSE_BUFFER_LEN * std::mem::size_of::(), + ); - gpu.cmd_set_bind_group( + let spine_buffer = gpu.request_transient_buffer( frame, + thread_token, + BufferUsageFlags::STORAGE, + (COARSE_BUFFER_LEN / (32 * 16)) * 256 * std::mem::size_of::(), // TODO: Fix size + ); + + let draw_buffer_address = gpu.get_buffer_address(draw_buffer.to_arg()); + let glyph_buffer_address = gpu.get_buffer_address(glyph_buffer.to_arg()); + let coarse_buffer_address = gpu.get_buffer_address(coarse_buffer.to_arg()); + let indirect_dispatch_buffer_address = + gpu.get_buffer_address(indirect_dispatch_buffer.to_arg()); + let sort_tmp_buffer_address = gpu.get_buffer_address(sort_tmp_buffer.to_arg()); + let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg()); + + gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_0_clear_pipeline); + gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); + gpu.cmd_push_constants( cmd_encoder, - self.compute_bind_group_layout, + ShaderStageFlags::COMPUTE, 0, - &[ - Bind { - binding: 1, - array_element: 0, - typed: TypedBind::SampledImage(&[( - ImageLayout::Optimal, - self.images[ImageRes::TonyMcMapfaceLut], - )]), - }, - Bind { - binding: 2, - array_element: 0, - typed: TypedBind::SampledImage(&[( - ImageLayout::Optimal, - self.glyph_atlas_image, - )]), - }, - Bind { - binding: 3, - array_element: 0, - typed: TypedBind::StorageImage(&[( - ImageLayout::General, - self.ui_image, - )]), - }, - Bind { - binding: 4, - array_element: 0, - typed: TypedBind::StorageImage(&[( - ImageLayout::General, - self.color_image, - )]), - }, - Bind { - binding: 5, - array_element: 0, - typed: TypedBind::StorageImage(&[( - ImageLayout::General, - swapchain_image, - )]), - }, - ], + &Draw2dClearConstants { + coarse_buffer_address, + }, + ); + gpu.cmd_dispatch(cmd_encoder, 1, 1, 1); + + gpu.cmd_barrier( + cmd_encoder, + Some(&GlobalBarrier { + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], + }), + &[], ); + gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_1_scatter_pipeline); + gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); gpu.cmd_push_constants( cmd_encoder, ShaderStageFlags::COMPUTE, 0, - &PrimitiveUniforms { + &Draw2dScatterConstants { screen_resolution_x: self.width, screen_resolution_y: self.height, - atlas_resolution_x: atlas_width, - atlas_resolution_y: atlas_height, - num_primitives, - num_primitives_32, - num_primitives_1024, tile_resolution_x: self.tile_resolution_x, tile_resolution_y: self.tile_resolution_y, - tile_stride: self.tile_resolution_x, - primitives_instances_buffer: gpu - .get_buffer_address(primitive_instance_buffer.to_arg()), - glyphs_buffer: gpu.get_buffer_address(glyph_buffer.to_arg()), - rects_buffer: gpu.get_buffer_address(rect_buffer.to_arg()), - tiles_buffer: gpu.get_buffer_address(self.tiles_buffer.to_arg()), + draw_buffer_len, + coarse_buffer_len: COARSE_BUFFER_LEN as u32, + draw_buffer_address, + glyph_buffer_address, + coarse_buffer_address, }, ); - gpu.cmd_dispatch( + for _ in 0..4 { + gpu.cmd_dispatch( + cmd_encoder, + (draw_buffer_len + + (self.pipelines.draw_2d_bin_1_scatter_pipeline_workgroup_size - 1)) + / self.pipelines.draw_2d_bin_1_scatter_pipeline_workgroup_size, + 1, + 1, + ); + } + + gpu.cmd_barrier( cmd_encoder, - (num_primitives_1024 + 63) / 64, - self.tile_resolution_x, - self.tile_resolution_y, + Some(&GlobalBarrier { + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], + }), + &[], + ); + + // let mut sort_data = Vec::new(); + // let count = 8192u32; + // sort_data.push(count); + // for i in 0..count { + // sort_data.push(255 - i / 32); + // } + + // let sort_buffer = gpu.request_transient_buffer_with_data( + // frame, + // thread_token, + // BufferUsageFlags::STORAGE, + // sort_data.as_slice(), + // ); + // let sort_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg()); + + gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_2_sort_pipeline); + gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); + gpu.cmd_push_constants( + cmd_encoder, + ShaderStageFlags::COMPUTE, + 0, + &Draw2dSortConstants { + // -1 due to the count taking up a single slot in the buffer. + coarse_buffer_len: COARSE_BUFFER_LEN as u32 - 1, + _pad: 0, + indirect_dispatch_buffer_address, + coarse_buffer_address, + }, ); + gpu.cmd_dispatch(cmd_encoder, 1, 1, 1); gpu.cmd_barrier( cmd_encoder, Some(&GlobalBarrier { prev_access: &[Access::ComputeWrite], - next_access: &[Access::ComputeOtherRead], + next_access: &[Access::ComputeOtherRead, Access::IndirectBuffer], }), &[], ); - gpu.cmd_set_pipeline(cmd_encoder, self.bin_pipeline); + gpu.cmd_begin_debug_marker( + cmd_encoder, + "radix sort", + microshades::ORANGE_RGBA_F32[2], + ); - gpu.cmd_dispatch(cmd_encoder, (num_primitives + 1023) / 1024, 1, 1); + // First element in the scratch buffer is the count. + let count_buffer_address = coarse_buffer_address; + // Then the elements we want to sort follow. + let mut src_buffer_address = count_buffer_address.byte_add(4); + let mut dst_buffer_address = sort_tmp_buffer_address; + + for pass in 0..4 { + let shift = pass * 8; + + // Upsweep + gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.radix_sort_0_upsweep_pipeline); + gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); + gpu.cmd_push_constants( + cmd_encoder, + ShaderStageFlags::COMPUTE, + 0, + &RadixSortUpsweepConstants { + shift, + _pad: 0, + count_buffer_address, + src_buffer_address, + spine_buffer_address, + }, + ); + gpu.cmd_dispatch_indirect(cmd_encoder, indirect_dispatch_buffer.to_arg(), 0); + + gpu.cmd_barrier( + cmd_encoder, + Some(&GlobalBarrier { + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], + }), + &[], + ); + + // Exclusive sum of the spine + gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.radix_sort_1_spine_pipeline); + gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); + gpu.cmd_push_constants( + cmd_encoder, + ShaderStageFlags::COMPUTE, + 0, + &RadixSortSpineConstants { + count_buffer_address, + spine_buffer_address, + }, + ); + gpu.cmd_dispatch(cmd_encoder, 1, 1, 1); + + gpu.cmd_barrier( + cmd_encoder, + Some(&GlobalBarrier { + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], + }), + &[], + ); + + // Downsweep + gpu.cmd_set_pipeline( + cmd_encoder, + self.pipelines.radix_sort_2_downsweep_pipeline, + ); + gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); + gpu.cmd_push_constants( + cmd_encoder, + ShaderStageFlags::COMPUTE, + 0, + &RadixSortDownsweepConstants { + shift, + _pad: 0, + count_buffer_address, + src_buffer_address, + dst_buffer_address, + spine_buffer_address, + }, + ); + gpu.cmd_dispatch_indirect(cmd_encoder, indirect_dispatch_buffer.to_arg(), 0); + + gpu.cmd_barrier( + cmd_encoder, + Some(&GlobalBarrier { + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], + }), + &[], + ); + + std::mem::swap(&mut src_buffer_address, &mut dst_buffer_address); + } + + gpu.cmd_end_debug_marker(cmd_encoder); + + gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_3_resolve_pipeline); + gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); + gpu.cmd_dispatch( + cmd_encoder, + self.tile_resolution_x, + self.tile_resolution_y, + 1, + ); gpu.cmd_barrier( cmd_encoder, @@ -1539,8 +1551,8 @@ impl<'gpu> DrawState<'gpu> { &[], ); - gpu.cmd_set_pipeline(cmd_encoder, self.rasterize_pipeline); - + gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_rasterize_pipeline); + gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); gpu.cmd_dispatch(cmd_encoder, (self.width + 7) / 8, (self.height + 7) / 8, 1); gpu.cmd_end_debug_marker(cmd_encoder); @@ -1579,7 +1591,8 @@ impl<'gpu> DrawState<'gpu> { gpu.cmd_compute_touch_swapchain(cmd_encoder, swapchain_image); - gpu.cmd_set_pipeline(cmd_encoder, self.display_transform_pipeline); + gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.composite_pipeline); + gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group); gpu.cmd_dispatch(cmd_encoder, (self.width + 7) / 8, (self.height + 7) / 8, 1); diff --git a/title/shark/src/pipelines.rs b/title/shark/src/pipelines.rs new file mode 100644 index 0000000..5b893b4 --- /dev/null +++ b/title/shark/src/pipelines.rs @@ -0,0 +1,411 @@ +use narcissus_core::default; +use narcissus_font::TouchedGlyphIndex; +use narcissus_gpu::{ + BindDesc, BindGroupLayout, BindingType, BlendMode, BufferAddress, CompareOp, + ComputePipelineDesc, CullingMode, FrontFace, GraphicsPipelineAttachments, GraphicsPipelineDesc, + ImageFormat, Pipeline, PipelineLayout, PolygonMode, PushConstantRange, Sampler, + SamplerAddressMode, SamplerDesc, SamplerFilter, ShaderDesc, ShaderStageFlags, SpecConstant, + Topology, +}; +use narcissus_maths::Mat4; + +use crate::Gpu; + +pub const DRAW_2D_TILE_SIZE: u32 = 32; + +#[allow(unused)] +#[repr(C)] +pub struct Vertex { + pub position: [f32; 4], + pub normal: [f32; 4], + pub texcoord: [f32; 4], +} + +#[repr(u32)] +enum Draw2dCmdType { + Rect, + Glyph, +} + +#[allow(unused)] +#[repr(C)] +pub union Draw2dCmd { + rect: CmdRect, + glyph: CmdGlyph, +} + +#[repr(C)] +#[derive(Clone, Copy)] +struct CmdGlyph { + r#type: u32, + index: u32, + x: f32, + y: f32, + color: u32, + _padding: [u8; 12], +} + +const _: () = assert!(std::mem::size_of::() == std::mem::size_of::()); + +#[repr(C)] +#[derive(Clone, Copy)] +struct CmdRect { + r#type: u32, + border_width: f32, + x: f32, + y: f32, + half_extent_x: f32, + half_extent_y: f32, + background_color: u32, + border_color: u32, +} + +const _: () = assert!(std::mem::size_of::() == std::mem::size_of::()); + +impl Draw2dCmd { + #[inline(always)] + pub fn glyph(glyph_index: TouchedGlyphIndex, color: u32, x: f32, y: f32) -> Self { + Self { + glyph: CmdGlyph { + r#type: Draw2dCmdType::Glyph as u32, + index: glyph_index.as_u32(), + x, + y, + color, + _padding: default(), + }, + } + } + + #[inline(always)] + pub fn rect( + x: f32, + y: f32, + half_extent_x: f32, + half_extent_y: f32, + border_width: f32, + background_color: u32, + border_color: u32, + ) -> Self { + Self { + rect: CmdRect { + r#type: Draw2dCmdType::Rect as u32, + border_width, + x, + y, + half_extent_x, + half_extent_y, + background_color, + border_color, + }, + } + } +} + +pub struct Samplers { + pub bilinear: Sampler, +} + +impl Samplers { + fn load(gpu: &Gpu) -> Samplers { + let bilinear = gpu.create_sampler(&SamplerDesc { + filter: SamplerFilter::Bilinear, + address_mode: SamplerAddressMode::Clamp, + compare_op: None, + mip_lod_bias: 0.0, + min_lod: 0.0, + max_lod: 0.0, + }); + Samplers { bilinear } + } +} + +pub enum GraphicsBinds { + #[allow(unused)] + ImmutableSamplers, + Albedo, +} + +pub enum ComputeBinds { + #[allow(unused)] + ImmutableSamplers, + TonyMcMapfaceLut, + GlyphAtlas, + UiRenderTarget, + ColorRenderTarget, + CompositedRenderTarget, +} + +#[repr(C)] +pub struct BasicConstants<'a> { + pub clip_from_model: Mat4, + pub vertex_buffer_address: BufferAddress<'a>, + pub transform_buffer_address: BufferAddress<'a>, +} + +#[repr(C)] +pub struct Draw2dClearConstants<'a> { + pub coarse_buffer_address: BufferAddress<'a>, +} + +#[repr(C)] +pub struct Draw2dScatterConstants<'a> { + pub screen_resolution_x: u32, + pub screen_resolution_y: u32, + pub tile_resolution_x: u32, + pub tile_resolution_y: u32, + + pub draw_buffer_len: u32, + pub coarse_buffer_len: u32, + + pub draw_buffer_address: BufferAddress<'a>, + pub glyph_buffer_address: BufferAddress<'a>, + pub coarse_buffer_address: BufferAddress<'a>, +} + +#[repr(C)] +pub struct Draw2dSortConstants<'a> { + pub coarse_buffer_len: u32, + pub _pad: u32, + pub indirect_dispatch_buffer_address: BufferAddress<'a>, + pub coarse_buffer_address: BufferAddress<'a>, +} + +#[repr(C)] +pub struct RadixSortUpsweepConstants<'a> { + pub shift: u32, + pub _pad: u32, + pub count_buffer_address: BufferAddress<'a>, + pub src_buffer_address: BufferAddress<'a>, + pub spine_buffer_address: BufferAddress<'a>, +} + +#[repr(C)] +pub struct RadixSortSpineConstants<'a> { + pub count_buffer_address: BufferAddress<'a>, + pub spine_buffer_address: BufferAddress<'a>, +} + +#[repr(C)] +pub struct RadixSortDownsweepConstants<'a> { + pub shift: u32, + pub _pad: u32, + pub count_buffer_address: BufferAddress<'a>, + pub spine_buffer_address: BufferAddress<'a>, + pub src_buffer_address: BufferAddress<'a>, + pub dst_buffer_address: BufferAddress<'a>, +} + +pub struct Pipelines { + _samplers: Samplers, + + pub graphics_bind_group_layout: BindGroupLayout, + pub compute_bind_group_layout: BindGroupLayout, + + pub basic_pipeline: Pipeline, + + pub draw_2d_bin_0_clear_pipeline: Pipeline, + pub draw_2d_bin_1_scatter_pipeline_workgroup_size: u32, + pub draw_2d_bin_1_scatter_pipeline: Pipeline, + pub draw_2d_bin_2_sort_pipeline: Pipeline, + pub draw_2d_bin_3_resolve_pipeline: Pipeline, + pub draw_2d_rasterize_pipeline: Pipeline, + + pub radix_sort_0_upsweep_pipeline: Pipeline, + pub radix_sort_1_spine_pipeline: Pipeline, + pub radix_sort_2_downsweep_pipeline: Pipeline, + + pub composite_pipeline: Pipeline, +} + +impl Pipelines { + pub fn load(gpu: &Gpu) -> Self { + let samplers = Samplers::load(gpu); + let immutable_samplers = &[samplers.bilinear]; + + let graphics_bind_group_layout = gpu.create_bind_group_layout(&[ + // Samplers + BindDesc::with_immutable_samplers(ShaderStageFlags::FRAGMENT, immutable_samplers), + // Albedo + BindDesc::new(ShaderStageFlags::FRAGMENT, BindingType::SampledImage), + ]); + + gpu.debug_name_bind_group_layout(graphics_bind_group_layout, "graphics"); + + let compute_bind_group_layout = gpu.create_bind_group_layout(&[ + // Samplers + BindDesc::with_immutable_samplers(ShaderStageFlags::COMPUTE, immutable_samplers), + // Tony mc mapface LUT + BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage), + // Glyph Atlas + BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage), + // UI Render Target + BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), + // Color Render Target + BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), + // Composited Render Target + BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), + ]); + + gpu.debug_name_bind_group_layout(compute_bind_group_layout, "compute"); + + let basic_pipeline = gpu.create_graphics_pipeline(&GraphicsPipelineDesc { + vertex_shader: ShaderDesc { + code: shark_shaders::BASIC_VERT_SPV, + ..default() + }, + fragment_shader: ShaderDesc { + code: shark_shaders::BASIC_FRAG_SPV, + ..default() + }, + layout: PipelineLayout { + bind_group_layouts: &[graphics_bind_group_layout], + push_constant_ranges: &[PushConstantRange { + stage_flags: ShaderStageFlags::VERTEX, + offset: 0, + size: std::mem::size_of::() as u32, + }], + }, + attachments: GraphicsPipelineAttachments { + color_attachment_formats: &[ImageFormat::RGBA16_FLOAT], + depth_attachment_format: Some(ImageFormat::DEPTH_F32), + stencil_attachment_format: None, + }, + topology: Topology::Triangles, + primitive_restart: false, + polygon_mode: PolygonMode::Fill, + culling_mode: CullingMode::Back, + front_face: FrontFace::CounterClockwise, + blend_mode: BlendMode::Opaque, + depth_bias: None, + depth_compare_op: CompareOp::GreaterOrEqual, + depth_test_enable: true, + depth_write_enable: true, + stencil_test_enable: false, + stencil_back: default(), + stencil_front: default(), + }); + + gpu.debug_name_pipeline(basic_pipeline, "basic"); + + let create_compute_pipeline = |code, name, workgroup_size, push_constant_size| { + let push_constant_range = PushConstantRange { + stage_flags: ShaderStageFlags::COMPUTE, + offset: 0, + size: push_constant_size as u32, + }; + + let pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { + shader: ShaderDesc { + code, + require_full_subgroups: workgroup_size != 0, + required_subgroup_size: if workgroup_size != 0 { + Some(workgroup_size) + } else { + None + }, + spec_constants: &[SpecConstant::U32 { + id: 0, + value: workgroup_size, + }], + ..default() + }, + layout: PipelineLayout { + bind_group_layouts: &[compute_bind_group_layout], + // Validation cries about push constant ranges with zero size. + push_constant_ranges: if push_constant_range.size != 0 { + std::slice::from_ref(&push_constant_range) + } else { + &[] + }, + }, + }); + + gpu.debug_name_pipeline(pipeline, name); + + pipeline + }; + + let draw_2d_bin_0_clear_pipeline = create_compute_pipeline( + shark_shaders::DRAW_2D_BIN_0_CLEAR_COMP_SPV, + "draw2d_bin_clear", + 0, + std::mem::size_of::(), + ); + + let draw_2d_bin_1_scatter_pipeline_workgroup_size = 32; + let draw_2d_bin_1_scatter_pipeline = create_compute_pipeline( + shark_shaders::DRAW_2D_BIN_1_SCATTER_COMP_SPV, + "draw2d_bin_scatter", + draw_2d_bin_1_scatter_pipeline_workgroup_size, + std::mem::size_of::(), + ); + + let draw_2d_bin_2_sort_pipeline = create_compute_pipeline( + shark_shaders::DRAW_2D_BIN_2_SORT_COMP_SPV, + "draw2d_bin_sort", + 0, + std::mem::size_of::(), + ); + + let draw_2d_bin_3_resolve_pipeline = create_compute_pipeline( + shark_shaders::DRAW_2D_BIN_3_RESOLVE_COMP_SPV, + "draw2d_bin_resolve", + 0, + 0, + ); + + let draw_2d_rasterize_pipeline = create_compute_pipeline( + shark_shaders::DRAW_2D_RASTERIZE_COMP_SPV, + "draw2d_rasterize", + 0, + 0, + ); + + let radix_sort_0_upsweep_pipeline = create_compute_pipeline( + shark_shaders::RADIX_SORT_0_UPSWEEP_COMP_SPV, + "radix_sort_upsweep", + 32, + std::mem::size_of::(), + ); + + let radix_sort_1_spine_pipeline = create_compute_pipeline( + shark_shaders::RADIX_SORT_1_SPINE_COMP_SPV, + "radix_sort_spine", + 32, + std::mem::size_of::(), + ); + + let radix_sort_2_downsweep_pipeline = create_compute_pipeline( + shark_shaders::RADIX_SORT_2_DOWNSWEEP_COMP_SPV, + "radix_sort_downsweep", + 32, + std::mem::size_of::(), + ); + + let composite_pipeline = + create_compute_pipeline(shark_shaders::COMPOSITE_COMP_SPV, "composite", 0, 0); + + Self { + _samplers: samplers, + + graphics_bind_group_layout, + compute_bind_group_layout, + + basic_pipeline, + + draw_2d_bin_0_clear_pipeline, + draw_2d_bin_1_scatter_pipeline_workgroup_size, + draw_2d_bin_1_scatter_pipeline, + draw_2d_bin_2_sort_pipeline, + draw_2d_bin_3_resolve_pipeline, + draw_2d_rasterize_pipeline, + + radix_sort_0_upsweep_pipeline, + radix_sort_1_spine_pipeline, + radix_sort_2_downsweep_pipeline, + + composite_pipeline, + } + } +} diff --git a/title/shark/src/pipelines/basic.rs b/title/shark/src/pipelines/basic.rs deleted file mode 100644 index fc4d88a..0000000 --- a/title/shark/src/pipelines/basic.rs +++ /dev/null @@ -1,90 +0,0 @@ -use narcissus_core::default; -use narcissus_gpu::{ - BindDesc, BindGroupLayout, BindingType, BlendMode, CompareOp, CullingMode, FrontFace, - GraphicsPipelineAttachments, GraphicsPipelineDesc, ImageFormat, Pipeline, PipelineLayout, - PolygonMode, Sampler, ShaderDesc, ShaderStageFlags, Topology, -}; -use narcissus_maths::Mat4; - -use crate::Gpu; - -#[allow(unused)] -#[repr(C)] -pub struct BasicUniforms { - pub clip_from_model: Mat4, -} - -#[allow(unused)] -#[repr(C)] -pub struct Vertex { - pub position: [f32; 4], - pub normal: [f32; 4], - pub texcoord: [f32; 4], -} - -pub struct BasicPipeline { - pub uniforms_bind_group_layout: BindGroupLayout, - pub storage_bind_group_layout: BindGroupLayout, - pub pipeline: Pipeline, -} - -impl BasicPipeline { - pub fn new(gpu: &Gpu, immutable_samplers: &[Sampler]) -> Self { - let uniforms_bind_group_layout = gpu.create_bind_group_layout(&[ - // Uniforms - BindDesc::new(ShaderStageFlags::ALL, BindingType::UniformBuffer), - // Samplers - BindDesc::with_immutable_samplers(ShaderStageFlags::ALL, immutable_samplers), - ]); - - let storage_bind_group_layout = gpu.create_bind_group_layout(&[ - // Vertex Buffer - BindDesc::new(ShaderStageFlags::ALL, BindingType::StorageBuffer), - // Transform Buffer - BindDesc::new(ShaderStageFlags::ALL, BindingType::StorageBuffer), - // Albedo - BindDesc::new(ShaderStageFlags::ALL, BindingType::SampledImage), - ]); - - let layout = &PipelineLayout { - bind_group_layouts: &[uniforms_bind_group_layout, storage_bind_group_layout], - push_constant_ranges: &[], - }; - - let pipeline = gpu.create_graphics_pipeline(&GraphicsPipelineDesc { - vertex_shader: ShaderDesc { - entry: c"main", - code: shark_shaders::BASIC_VERT_SPV, - }, - fragment_shader: ShaderDesc { - entry: c"main", - code: shark_shaders::BASIC_FRAG_SPV, - }, - layout, - attachments: GraphicsPipelineAttachments { - color_attachment_formats: &[ImageFormat::RGBA16_FLOAT], - depth_attachment_format: Some(ImageFormat::DEPTH_F32), - stencil_attachment_format: None, - }, - topology: Topology::Triangles, - primitive_restart: false, - polygon_mode: PolygonMode::Fill, - culling_mode: CullingMode::Back, - front_face: FrontFace::CounterClockwise, - blend_mode: BlendMode::Opaque, - depth_bias: None, - depth_compare_op: CompareOp::GreaterOrEqual, - depth_test_enable: true, - depth_write_enable: true, - stencil_test_enable: false, - stencil_back: default(), - stencil_front: default(), - }); - - Self { - uniforms_bind_group_layout, - storage_bind_group_layout, - pipeline, - } - } -} diff --git a/title/shark/src/pipelines/mod.rs b/title/shark/src/pipelines/mod.rs deleted file mode 100644 index d544f07..0000000 --- a/title/shark/src/pipelines/mod.rs +++ /dev/null @@ -1,79 +0,0 @@ -use narcissus_font::TouchedGlyphIndex; - -pub mod basic; - -pub const TILE_SIZE: u32 = 32; -pub const MAX_PRIMS: u32 = 1 << 18; -pub const TILE_BITMAP_WORDS_L1: u32 = MAX_PRIMS / 32 / 32; -pub const TILE_BITMAP_WORDS_L0: u32 = MAX_PRIMS / 32; -pub const TILE_STRIDE: u32 = TILE_BITMAP_WORDS_L0 + TILE_BITMAP_WORDS_L1 + 2; - -#[allow(unused)] -#[repr(C)] -pub struct PrimitiveUniforms { - pub screen_resolution_x: u32, - pub screen_resolution_y: u32, - - pub atlas_resolution_x: u32, - pub atlas_resolution_y: u32, - - pub tile_resolution_x: u32, - pub tile_resolution_y: u32, - - pub num_primitives: u32, - pub num_primitives_32: u32, - pub num_primitives_1024: u32, - pub tile_stride: u32, - - pub primitives_instances_buffer: u64, - pub rects_buffer: u64, - pub glyphs_buffer: u64, - pub tiles_buffer: u64, -} - -#[repr(u32)] -pub enum PrimitiveType { - Rect, - Glyph, -} - -#[allow(unused)] -#[repr(C)] -pub struct PrimitiveInstance { - pub packed: u32, - pub color: u32, - pub x: f32, - pub y: f32, -} - -#[repr(C)] -pub struct Rect { - pub half_extent_x: f32, - pub half_extent_y: f32, - pub border_width: f32, - pub border_radius: f32, -} - -impl PrimitiveInstance { - #[inline(always)] - pub fn glyph(glyph_index: TouchedGlyphIndex, color: u32, x: f32, y: f32) -> Self { - let packed = glyph_index.as_u32() | ((PrimitiveType::Glyph as u32) << 30); - Self { - packed, - color, - x, - y, - } - } - - #[inline(always)] - pub fn rect(rect_index: u32, color: u32, x: f32, y: f32) -> Self { - let packed = rect_index | ((PrimitiveType::Rect as u32) << 30); - Self { - packed, - color, - x, - y, - } - } -} -- 2.49.0