]> git.nega.tv - josh/narcissus/commitdiff
shark: Re-work UI
authorJosh Simmons <josh@nega.tv>
Mon, 4 Nov 2024 07:33:52 +0000 (08:33 +0100)
committerJosh Simmons <josh@nega.tv>
Mon, 4 Nov 2024 21:13:45 +0000 (22:13 +0100)
Add radix sort pipelines.
Add buffer address abstraction.
Add spec constants.
Add subgroup size control.
Add `cmd_dispatch_indirect`.

Change bind group API to separate allocation and binding.

Surely I won't regret the mega-commit...

31 files changed:
Cargo.lock
engine/narcissus-gpu/src/backend/vulkan/mod.rs
engine/narcissus-gpu/src/lib.rs
engine/narcissus-gpu/src/mapped_buffer.rs
external/vulkan-sys/src/structs.rs
title/shark-shaders/build.rs
title/shark-shaders/shaders/basic.frag [moved from title/shark-shaders/shaders/basic.frag.glsl with 75% similarity]
title/shark-shaders/shaders/basic.vert [new file with mode: 0644]
title/shark-shaders/shaders/basic.vert.glsl [deleted file]
title/shark-shaders/shaders/composite.comp [moved from title/shark-shaders/shaders/display_transform.comp.glsl with 64% similarity]
title/shark-shaders/shaders/compute_bindings.h
title/shark-shaders/shaders/draw_2d.h [new file with mode: 0644]
title/shark-shaders/shaders/draw_2d_bin_0_clear.comp [new file with mode: 0644]
title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp [new file with mode: 0644]
title/shark-shaders/shaders/draw_2d_bin_2_sort.comp [new file with mode: 0644]
title/shark-shaders/shaders/draw_2d_bin_3_resolve.comp [new file with mode: 0644]
title/shark-shaders/shaders/draw_2d_rasterize.comp [new file with mode: 0644]
title/shark-shaders/shaders/indirect.h [new file with mode: 0644]
title/shark-shaders/shaders/primitive_2d.h [deleted file]
title/shark-shaders/shaders/primitive_2d_bin.comp.glsl [deleted file]
title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl [deleted file]
title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl [deleted file]
title/shark-shaders/shaders/radix_sort.h [new file with mode: 0644]
title/shark-shaders/shaders/radix_sort_0_upsweep.comp [new file with mode: 0644]
title/shark-shaders/shaders/radix_sort_1_spine.comp [new file with mode: 0644]
title/shark-shaders/shaders/radix_sort_2_downsweep.comp [new file with mode: 0644]
title/shark/src/helpers.rs
title/shark/src/main.rs
title/shark/src/pipelines.rs [new file with mode: 0644]
title/shark/src/pipelines/basic.rs [deleted file]
title/shark/src/pipelines/mod.rs [deleted file]

index 8b2c2c0b1c18b407f34f4f1b0b6c3363efb31fce..0f9129a0861b7b4951824e1fb0fba21bf2a1501d 100644 (file)
@@ -52,9 +52,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.150"
+version = "0.2.161"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
+checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1"
 
 [[package]]
 name = "memchr"
index 7c07dc98ec7d3efbc8420a5f5847a3dae4af1a01..8beb5456c65fa004250eca44312e5b0418dbd8cb 100644 (file)
@@ -9,21 +9,22 @@ use std::{
 };
 
 use narcissus_core::{
-    box_assume_init, default, is_aligned_to,
+    default, is_aligned_to,
     manual_arc::{self, ManualArc},
     raw_window::AsRawWindow,
-    zeroed_box, Arc, Arena, HybridArena, Mutex, PhantomUnsend, Pool, Widen,
+    Arc, Arena, HybridArena, Mutex, PhantomUnsend, Pool, Widen,
 };
 
-use vulkan_sys as vk;
+use vulkan_sys::{self as vk};
 
 use crate::{
-    frame_counter::FrameCounter, Bind, BindDesc, BindGroupLayout, BindingType, Buffer, BufferArg,
-    BufferDesc, BufferImageCopy, BufferUsageFlags, CmdEncoder, ComputePipelineDesc, Device,
-    Extent2d, Extent3d, Frame, GlobalBarrier, GpuConcurrent, GraphicsPipelineDesc, Image,
-    ImageBarrier, ImageBlit, ImageDesc, ImageDimension, ImageLayout, ImageTiling, ImageViewDesc,
-    IndexType, MemoryLocation, Offset2d, Offset3d, PersistentBuffer, Pipeline, PipelineLayout,
-    Sampler, SamplerAddressMode, SamplerCompareOp, SamplerDesc, SamplerFilter, ShaderStageFlags,
+    frame_counter::FrameCounter, mapped_buffer::TransientBindGroup, Bind, BindDesc,
+    BindGroupLayout, BindingType, Buffer, BufferAddress, BufferArg, BufferDesc, BufferImageCopy,
+    BufferUsageFlags, CmdEncoder, ComputePipelineDesc, Device, Extent2d, Extent3d, Frame,
+    GlobalBarrier, GpuConcurrent, GraphicsPipelineDesc, Image, ImageBarrier, ImageBlit, ImageDesc,
+    ImageDimension, ImageLayout, ImageTiling, ImageViewDesc, IndexType, MemoryLocation, Offset2d,
+    Offset3d, PersistentBuffer, Pipeline, PipelineLayout, Sampler, SamplerAddressMode,
+    SamplerCompareOp, SamplerDesc, SamplerFilter, ShaderStageFlags, SpecConstant,
     SwapchainConfigurator, SwapchainImage, SwapchainOutOfDateError, ThreadToken, TransientBuffer,
     TypedBind,
 };
@@ -384,12 +385,13 @@ pub(crate) struct VulkanDevice {
     physical_device_properties: Box<vk::PhysicalDeviceProperties2>,
     _physical_device_properties_11: Box<vk::PhysicalDeviceVulkan11Properties>,
     _physical_device_properties_12: Box<vk::PhysicalDeviceVulkan12Properties>,
-    _physical_device_properties_13: Box<vk::PhysicalDeviceVulkan13Properties>,
+    physical_device_properties_13: Box<vk::PhysicalDeviceVulkan13Properties>,
+    physical_device_memory_properties: Box<vk::PhysicalDeviceMemoryProperties>,
+
     _physical_device_features: Box<vk::PhysicalDeviceFeatures2>,
     _physical_device_features_11: Box<vk::PhysicalDeviceVulkan11Features>,
     _physical_device_features_12: Box<vk::PhysicalDeviceVulkan12Features>,
     _physical_device_features_13: Box<vk::PhysicalDeviceVulkan13Features>,
-    physical_device_memory_properties: Box<vk::PhysicalDeviceMemoryProperties>,
 
     _global_fn: vk::GlobalFunctions,
     instance_fn: vk::InstanceFunctions,
@@ -508,53 +510,31 @@ impl VulkanDevice {
             instance_fn.enumerate_physical_devices(instance, count, ptr)
         });
 
-        let mut physical_device_properties =
-            unsafe { box_assume_init(zeroed_box::<vk::PhysicalDeviceProperties2>()) };
-        let mut physical_device_properties_11 =
-            unsafe { box_assume_init(zeroed_box::<vk::PhysicalDeviceVulkan11Properties>()) };
-        let mut physical_device_properties_12 =
-            unsafe { box_assume_init(zeroed_box::<vk::PhysicalDeviceVulkan12Properties>()) };
-        let mut physical_device_properties_13 =
-            unsafe { box_assume_init(zeroed_box::<vk::PhysicalDeviceVulkan13Properties>()) };
-
-        physical_device_properties._type = vk::StructureType::PhysicalDeviceProperties2;
-        physical_device_properties_11._type = vk::StructureType::PhysicalDeviceVulkan11Properties;
-        physical_device_properties_12._type = vk::StructureType::PhysicalDeviceVulkan12Properties;
-        physical_device_properties_13._type = vk::StructureType::PhysicalDeviceVulkan13Properties;
-
-        physical_device_properties_12._next = physical_device_properties_13.as_mut()
-            as *mut vk::PhysicalDeviceVulkan13Properties
-            as *mut _;
-        physical_device_properties_11._next = physical_device_properties_12.as_mut()
-            as *mut vk::PhysicalDeviceVulkan12Properties
-            as *mut _;
-        physical_device_properties._next = physical_device_properties_11.as_mut()
-            as *mut vk::PhysicalDeviceVulkan11Properties
-            as *mut _;
-
-        let mut physical_device_features =
-            unsafe { box_assume_init(zeroed_box::<vk::PhysicalDeviceFeatures2>()) };
-        let mut physical_device_features_11 =
-            unsafe { box_assume_init(zeroed_box::<vk::PhysicalDeviceVulkan11Features>()) };
-        let mut physical_device_features_12 =
-            unsafe { box_assume_init(zeroed_box::<vk::PhysicalDeviceVulkan12Features>()) };
-        let mut physical_device_features_13 =
-            unsafe { box_assume_init(zeroed_box::<vk::PhysicalDeviceVulkan13Features>()) };
-
-        physical_device_features._type = vk::StructureType::PhysicalDeviceFeatures2;
-        physical_device_features_11._type = vk::StructureType::PhysicalDeviceVulkan11Features;
-        physical_device_features_12._type = vk::StructureType::PhysicalDeviceVulkan12Features;
-        physical_device_features_13._type = vk::StructureType::PhysicalDeviceVulkan13Features;
-
-        physical_device_features_12._next = physical_device_features_13.as_mut()
-            as *mut vk::PhysicalDeviceVulkan13Features
-            as *mut _;
-        physical_device_features_11._next = physical_device_features_12.as_mut()
-            as *mut vk::PhysicalDeviceVulkan12Features
-            as *mut _;
-        physical_device_features._next = physical_device_features_11.as_mut()
-            as *mut vk::PhysicalDeviceVulkan11Features
-            as *mut _;
+        let mut physical_device_properties: Box<vk::PhysicalDeviceProperties2> = default();
+        let mut physical_device_properties_11: Box<vk::PhysicalDeviceVulkan11Properties> =
+            default();
+        let mut physical_device_properties_12: Box<vk::PhysicalDeviceVulkan12Properties> =
+            default();
+        let mut physical_device_properties_13: Box<vk::PhysicalDeviceVulkan13Properties> =
+            default();
+
+        physical_device_properties_12._next =
+            physical_device_properties_13.as_mut() as *mut _ as *mut _;
+        physical_device_properties_11._next =
+            physical_device_properties_12.as_mut() as *mut _ as *mut _;
+        physical_device_properties._next =
+            physical_device_properties_11.as_mut() as *mut _ as *mut _;
+
+        let mut physical_device_features: Box<vk::PhysicalDeviceFeatures2> = default();
+        let mut physical_device_features_11: Box<vk::PhysicalDeviceVulkan11Features> = default();
+        let mut physical_device_features_12: Box<vk::PhysicalDeviceVulkan12Features> = default();
+        let mut physical_device_features_13: Box<vk::PhysicalDeviceVulkan13Features> = default();
+
+        physical_device_features_12._next =
+            physical_device_features_13.as_mut() as *mut _ as *mut _;
+        physical_device_features_11._next =
+            physical_device_features_12.as_mut() as *mut _ as *mut _;
+        physical_device_features._next = physical_device_features_11.as_mut() as *mut _ as *mut _;
 
         let physical_device = physical_devices
             .iter()
@@ -573,6 +553,9 @@ impl VulkanDevice {
 
                 physical_device_properties.properties.api_version >= vk::VERSION_1_3
                     && physical_device_features_13.dynamic_rendering == vk::Bool32::True
+                    && physical_device_features_13.subgroup_size_control == vk::Bool32::True
+                    && physical_device_features_13.maintenance4 == vk::Bool32::True
+                    && physical_device_features_13.compute_full_subgroups == vk::Bool32::True
                     && physical_device_features_12.timeline_semaphore == vk::Bool32::True
                     && physical_device_features_12.descriptor_indexing == vk::Bool32::True
                     && physical_device_features_12.descriptor_binding_partially_bound
@@ -636,6 +619,9 @@ impl VulkanDevice {
             let enabled_features_13 = vk::PhysicalDeviceVulkan13Features {
                 dynamic_rendering: vk::Bool32::True,
                 synchronization2: vk::Bool32::True,
+                subgroup_size_control: vk::Bool32::True,
+                compute_full_subgroups: vk::Bool32::True,
+                maintenance4: vk::Bool32::True,
                 ..default()
             };
             let enabled_features_12 = vk::PhysicalDeviceVulkan12Features {
@@ -790,12 +776,13 @@ impl VulkanDevice {
             physical_device_properties,
             _physical_device_properties_11: physical_device_properties_11,
             _physical_device_properties_12: physical_device_properties_12,
-            _physical_device_properties_13: physical_device_properties_13,
+            physical_device_properties_13,
+            physical_device_memory_properties,
+
             _physical_device_features: physical_device_features,
             _physical_device_features_11: physical_device_features_11,
             _physical_device_features_12: physical_device_features_12,
             _physical_device_features_13: physical_device_features_13,
-            physical_device_memory_properties,
 
             _global_fn: global_fn,
             instance_fn,
@@ -1251,8 +1238,8 @@ impl Device for VulkanDevice {
             hasher.update(&bind_desc.count.to_le_bytes());
         }
 
-        let layout_bindings =
-            arena.alloc_slice_fill_iter(binds_desc.iter().enumerate().map(|(i, bind_desc)| {
+        let layout_bindings: &mut [vulkan_sys::DescriptorSetLayoutBinding] = arena
+            .alloc_slice_fill_iter(binds_desc.iter().enumerate().map(|(i, bind_desc)| {
                 let immutable_samplers = if !bind_desc.immutable_samplers.is_empty() {
                     assert_eq!(
                         bind_desc.binding_type,
@@ -1323,7 +1310,7 @@ impl Device for VulkanDevice {
     }
 
     fn create_graphics_pipeline(&self, pipeline_desc: &GraphicsPipelineDesc) -> Pipeline {
-        let pipeline_layout = self.cache_pipeline_layout(pipeline_desc.layout);
+        let pipeline_layout = self.cache_pipeline_layout(&pipeline_desc.layout);
 
         let arena = HybridArena::<1024>::new();
 
@@ -1338,6 +1325,17 @@ impl Device for VulkanDevice {
             pipeline_desc.fragment_shader.code,
         );
 
+        assert!(
+            !(pipeline_desc.vertex_shader.required_subgroup_size.is_some()
+                || pipeline_desc
+                    .fragment_shader
+                    .required_subgroup_size
+                    .is_some()
+                || pipeline_desc.vertex_shader.allow_varying_subgroup_size
+                || pipeline_desc.fragment_shader.allow_varying_subgroup_size),
+            "subgroup size control features not implemented for graphics shader stages"
+        );
+
         let stages = &[
             vk::PipelineShaderStageCreateInfo {
                 stage: vk::ShaderStageFlags::VERTEX,
@@ -1492,28 +1490,137 @@ impl Device for VulkanDevice {
     }
 
     fn create_compute_pipeline(&self, pipeline_desc: &ComputePipelineDesc) -> Pipeline {
-        let pipeline_layout = self.cache_pipeline_layout(pipeline_desc.layout);
+        let arena = HybridArena::<1024>::new();
+
+        let pipeline_layout = self.cache_pipeline_layout(&pipeline_desc.layout);
 
         let module = vulkan_shader_module(&self.device_fn, self.device, pipeline_desc.shader.code);
 
-        let stage = vk::PipelineShaderStageCreateInfo {
-            stage: vk::ShaderStageFlags::COMPUTE,
-            name: pipeline_desc.shader.entry.as_ptr(),
-            module,
-            ..default()
-        };
+        let mut shader_stage_create_flags = default();
+
+        if pipeline_desc.shader.require_full_subgroups {
+            shader_stage_create_flags |= vk::PipelineShaderStageCreateFlags::REQUIRE_FULL_SUBGROUPS
+        }
+
+        if pipeline_desc.shader.allow_varying_subgroup_size {
+            shader_stage_create_flags |=
+                vk::PipelineShaderStageCreateFlags::ALLOW_VARYING_SUBGROUP_SIZE;
+        }
 
-        let create_infos = &[vk::ComputePipelineCreateInfo {
+        let specialization_info: Option<&vk::SpecializationInfo> =
+            if !pipeline_desc.shader.spec_constants.is_empty() {
+                let block_len = pipeline_desc
+                    .shader
+                    .spec_constants
+                    .iter()
+                    .map(|spec_constant| match spec_constant {
+                        SpecConstant::Bool { id: _, value: _ }
+                        | SpecConstant::U32 { id: _, value: _ }
+                        | SpecConstant::I32 { id: _, value: _ }
+                        | SpecConstant::F32 { id: _, value: _ } => 4,
+                    })
+                    .sum::<usize>();
+
+                let block = arena.alloc_slice_fill_copy(block_len, 0u8);
+
+                let mut offset = 0;
+                let map_entries =
+                    arena.alloc_slice_fill_iter(pipeline_desc.shader.spec_constants.iter().map(
+                        |spec_constant| {
+                            let constant_id;
+                            let value_size;
+                            match *spec_constant {
+                                SpecConstant::Bool { id, value } => {
+                                    constant_id = id;
+                                    let value = if value {
+                                        vk::Bool32::True
+                                    } else {
+                                        vk::Bool32::False
+                                    } as u32;
+                                    value_size = std::mem::size_of_val(&value);
+                                    block[offset..offset + value_size]
+                                        .copy_from_slice(&value.to_ne_bytes())
+                                }
+                                SpecConstant::U32 { id, value } => {
+                                    constant_id = id;
+                                    value_size = std::mem::size_of_val(&value);
+                                    block[offset..offset + value_size]
+                                        .copy_from_slice(&value.to_ne_bytes());
+                                }
+                                SpecConstant::I32 { id, value } => {
+                                    constant_id = id;
+                                    value_size = std::mem::size_of_val(&value);
+                                    block[offset..offset + value_size]
+                                        .copy_from_slice(&value.to_ne_bytes());
+                                }
+                                SpecConstant::F32 { id, value } => {
+                                    constant_id = id;
+                                    value_size = std::mem::size_of_val(&value);
+                                    block[offset..offset + value_size]
+                                        .copy_from_slice(&value.to_ne_bytes());
+                                }
+                            }
+
+                            let map_entry = vk::SpecializationMapEntry {
+                                constant_id,
+                                offset: offset as u32,
+                                size: value_size,
+                            };
+
+                            offset += value_size;
+
+                            map_entry
+                        },
+                    ));
+
+                Some(arena.alloc(vk::SpecializationInfo {
+                    data: block.into(),
+                    map_entries: map_entries.into(),
+                }))
+            } else {
+                None
+            };
+
+        let compute_pipeline_create_info = arena.alloc(vk::ComputePipelineCreateInfo {
             layout: pipeline_layout.pipeline_layout,
-            stage,
+            stage: vk::PipelineShaderStageCreateInfo {
+                stage: vk::ShaderStageFlags::COMPUTE,
+                name: pipeline_desc.shader.entry.as_ptr(),
+                module,
+                flags: shader_stage_create_flags,
+                specialization_info,
+                ..default()
+            },
             ..default()
-        }];
+        });
+
+        if let Some(required_subgroup_size) = pipeline_desc.shader.required_subgroup_size {
+            assert!(self
+                .physical_device_properties_13
+                .required_subgroup_size_stages
+                .contains(vk::ShaderStageFlags::COMPUTE));
+            assert!(
+                required_subgroup_size >= self.physical_device_properties_13.min_subgroup_size
+                    && required_subgroup_size
+                        <= self.physical_device_properties_13.max_subgroup_size
+            );
+
+            let shader_stage_required_subgroup_size_create_info =
+                arena.alloc(vk::PipelineShaderStageRequiredSubgroupSizeCreateInfo {
+                    required_subgroup_size,
+                    ..default()
+                });
+
+            // SAFETY: Both are arena allocations and therefore have identical lifetimes.
+            compute_pipeline_create_info.stage._next =
+                shader_stage_required_subgroup_size_create_info as *const _ as *const _;
+        }
 
         let mut pipelines = [vk::Pipeline::null()];
         vk_check!(self.device_fn.create_compute_pipelines(
             self.device,
             vk::PipelineCache::null(),
-            create_infos,
+            std::slice::from_ref(compute_pipeline_create_info),
             None,
             &mut pipelines
         ));
@@ -1615,6 +1722,71 @@ impl Device for VulkanDevice {
         }
     }
 
+    fn debug_name_bind_group_layout(&self, bind_group_layout: BindGroupLayout, name: &str) {
+        #[cfg(feature = "debug_markers")]
+        if let Some(debug_utils_fn) = &self.debug_utils_fn {
+            let descriptor_set_layout;
+            {
+                let bind_group_layout_pool = self.bind_group_layout_pool.lock();
+                let Some(bind_group_layout) = bind_group_layout_pool.get(bind_group_layout.0)
+                else {
+                    return;
+                };
+
+                descriptor_set_layout = bind_group_layout.descriptor_set_layout;
+            }
+
+            let arena = HybridArena::<512>::new();
+            let object_name = arena.alloc_cstr_from_str(name);
+
+            let image_name_info = vk::DebugUtilsObjectNameInfoExt {
+                object_type: vk::ObjectType::DescriptorSetLayout,
+                object_handle: descriptor_set_layout.as_raw(),
+                object_name: object_name.as_ptr(),
+                ..default()
+            };
+            unsafe { debug_utils_fn.set_debug_utils_object_name_ext(self.device, &image_name_info) }
+        }
+    }
+
+    fn debug_name_pipeline(&self, pipeline: Pipeline, name: &str) {
+        #[cfg(feature = "debug_markers")]
+        if let Some(debug_utils_fn) = &self.debug_utils_fn {
+            let pipeline_handle;
+            let pipeline_layout_handle;
+            {
+                let pipeline_pool = self.pipeline_pool.lock();
+                let Some(pipeline) = pipeline_pool.get(pipeline.0) else {
+                    return;
+                };
+
+                pipeline_handle = pipeline.pipeline;
+                pipeline_layout_handle = pipeline.pipeline_layout.pipeline_layout;
+            }
+
+            let arena = HybridArena::<512>::new();
+            let object_name = arena.alloc_cstr_from_str(name);
+
+            let image_name_info = vk::DebugUtilsObjectNameInfoExt {
+                object_type: vk::ObjectType::Pipeline,
+                object_handle: pipeline_handle.as_raw(),
+                object_name: object_name.as_ptr(),
+                ..default()
+            };
+            unsafe { debug_utils_fn.set_debug_utils_object_name_ext(self.device, &image_name_info) }
+
+            let image_view_name_info = vk::DebugUtilsObjectNameInfoExt {
+                object_type: vk::ObjectType::PipelineLayout,
+                object_handle: pipeline_layout_handle.as_raw(),
+                object_name: object_name.as_ptr(),
+                ..default()
+            };
+            unsafe {
+                debug_utils_fn.set_debug_utils_object_name_ext(self.device, &image_view_name_info)
+            }
+        }
+    }
+
     fn destroy_buffer(&self, frame: &Frame, buffer: Buffer) {
         if let Some(buffer) = self.buffer_pool.lock().remove(buffer.0) {
             assert_eq!(
@@ -1733,118 +1905,13 @@ impl Device for VulkanDevice {
         self.request_transient_buffer(frame, thread_token, usage, size as u64)
     }
 
-    fn request_cmd_encoder<'a, 'thread>(
+    fn request_transient_bind_group<'a>(
         &self,
-        frame: &'a Frame,
+        frame: &'a Frame<'a>,
         thread_token: &'a ThreadToken,
-    ) -> CmdEncoder<'a> {
-        let frame = self.frame(frame);
-        let per_thread = frame.per_thread.get(thread_token);
-        let mut cmd_buffer_pool = per_thread.cmd_buffer_pool.borrow_mut();
-
-        // We have consumed all available command buffers, need to allocate a new one.
-        if cmd_buffer_pool.next_free_index >= cmd_buffer_pool.command_buffers.len() {
-            let mut cmd_buffers = [vk::CommandBuffer::null(); 4];
-            let allocate_info = vk::CommandBufferAllocateInfo {
-                command_pool: cmd_buffer_pool.command_pool,
-                level: vk::CommandBufferLevel::Primary,
-                command_buffer_count: cmd_buffers.len() as u32,
-                ..default()
-            };
-            vk_check!(self.device_fn.allocate_command_buffers(
-                self.device,
-                &allocate_info,
-                cmd_buffers.as_mut_ptr()
-            ));
-            cmd_buffer_pool.command_buffers.extend(cmd_buffers.iter());
-        }
-
-        let index = cmd_buffer_pool.next_free_index;
-        cmd_buffer_pool.next_free_index += 1;
-        let command_buffer = cmd_buffer_pool.command_buffers[index];
-
-        vk_check!(self.device_fn.begin_command_buffer(
-            command_buffer,
-            &vk::CommandBufferBeginInfo {
-                flags: vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT,
-                ..default()
-            }
-        ));
-
-        let vulkan_cmd_encoder = per_thread.arena.alloc(VulkanCmdEncoder {
-            command_buffer,
-            ..default()
-        });
-
-        CmdEncoder {
-            cmd_encoder_addr: vulkan_cmd_encoder as *mut _ as usize,
-            thread_token,
-            phantom_unsend: PhantomUnsend {},
-        }
-    }
-
-    fn cmd_insert_debug_marker(
-        &self,
-        cmd_encoder: &mut CmdEncoder,
-        label_name: &str,
-        color: [f32; 4],
-    ) {
-        #[cfg(feature = "debug_markers")]
-        if let Some(debug_utils_fn) = &self.debug_utils_fn {
-            let arena = HybridArena::<256>::new();
-            let label_name = arena.alloc_cstr_from_str(label_name);
-
-            let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer;
-            let label_info = vk::DebugUtilsLabelExt {
-                label_name: label_name.as_ptr(),
-                color,
-                ..default()
-            };
-            unsafe {
-                debug_utils_fn.cmd_insert_debug_utils_label_ext(command_buffer, &label_info);
-            }
-        }
-    }
-
-    fn cmd_begin_debug_marker(
-        &self,
-        cmd_encoder: &mut CmdEncoder,
-        label_name: &str,
-        color: [f32; 4],
-    ) {
-        #[cfg(feature = "debug_markers")]
-        if let Some(debug_utils_fn) = &self.debug_utils_fn {
-            let arena = HybridArena::<256>::new();
-            let label_name = arena.alloc_cstr_from_str(label_name);
-
-            let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer;
-            let label_info = vk::DebugUtilsLabelExt {
-                label_name: label_name.as_ptr(),
-                color,
-                ..default()
-            };
-            unsafe {
-                debug_utils_fn.cmd_begin_debug_utils_label_ext(command_buffer, &label_info);
-            }
-        }
-    }
-
-    fn cmd_end_debug_marker(&self, cmd_encoder: &mut CmdEncoder) {
-        #[cfg(feature = "debug_markers")]
-        if let Some(debug_utils_fn) = &self.debug_utils_fn {
-            let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer;
-            unsafe { debug_utils_fn.cmd_end_debug_utils_label_ext(command_buffer) }
-        }
-    }
-
-    fn cmd_set_bind_group(
-        &self,
-        frame: &Frame,
-        cmd_encoder: &mut CmdEncoder,
         layout: BindGroupLayout,
-        bind_group_index: u32,
         bindings: &[Bind],
-    ) {
+    ) -> TransientBindGroup<'a> {
         let arena = HybridArena::<4096>::new();
 
         let descriptor_set_layout = self
@@ -1855,7 +1922,7 @@ impl Device for VulkanDevice {
             .descriptor_set_layout;
 
         let frame = self.frame(frame);
-        let per_thread = frame.per_thread.get(cmd_encoder.thread_token);
+        let per_thread = frame.per_thread.get(thread_token);
 
         let mut descriptor_pool = per_thread.descriptor_pool.get();
         let mut allocated_pool = false;
@@ -2006,23 +2073,137 @@ impl Device for VulkanDevice {
                 .update_descriptor_sets(self.device, write_descriptors, &[])
         };
 
+        TransientBindGroup {
+            bind_group: descriptor_set.as_raw(),
+            phantom: PhantomData,
+        }
+    }
+
+    fn request_cmd_encoder<'a, 'thread>(
+        &self,
+        frame: &'a Frame,
+        thread_token: &'a ThreadToken,
+    ) -> CmdEncoder<'a> {
+        let frame = self.frame(frame);
+        let per_thread = frame.per_thread.get(thread_token);
+        let mut cmd_buffer_pool = per_thread.cmd_buffer_pool.borrow_mut();
+
+        // We have consumed all available command buffers, need to allocate a new one.
+        if cmd_buffer_pool.next_free_index >= cmd_buffer_pool.command_buffers.len() {
+            let mut cmd_buffers = [vk::CommandBuffer::null(); 4];
+            let allocate_info = vk::CommandBufferAllocateInfo {
+                command_pool: cmd_buffer_pool.command_pool,
+                level: vk::CommandBufferLevel::Primary,
+                command_buffer_count: cmd_buffers.len() as u32,
+                ..default()
+            };
+            vk_check!(self.device_fn.allocate_command_buffers(
+                self.device,
+                &allocate_info,
+                cmd_buffers.as_mut_ptr()
+            ));
+            cmd_buffer_pool.command_buffers.extend(cmd_buffers.iter());
+        }
+
+        let index = cmd_buffer_pool.next_free_index;
+        cmd_buffer_pool.next_free_index += 1;
+        let command_buffer = cmd_buffer_pool.command_buffers[index];
+
+        vk_check!(self.device_fn.begin_command_buffer(
+            command_buffer,
+            &vk::CommandBufferBeginInfo {
+                flags: vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT,
+                ..default()
+            }
+        ));
+
+        let vulkan_cmd_encoder = per_thread.arena.alloc(VulkanCmdEncoder {
+            command_buffer,
+            ..default()
+        });
+
+        CmdEncoder {
+            cmd_encoder_addr: vulkan_cmd_encoder as *mut _ as usize,
+            phantom: PhantomData,
+            phantom_unsend: PhantomUnsend {},
+        }
+    }
+
+    fn cmd_insert_debug_marker(
+        &self,
+        cmd_encoder: &mut CmdEncoder,
+        label_name: &str,
+        color: [f32; 4],
+    ) {
+        #[cfg(feature = "debug_markers")]
+        if let Some(debug_utils_fn) = &self.debug_utils_fn {
+            let arena = HybridArena::<256>::new();
+            let label_name = arena.alloc_cstr_from_str(label_name);
+
+            let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer;
+            let label_info = vk::DebugUtilsLabelExt {
+                label_name: label_name.as_ptr(),
+                color,
+                ..default()
+            };
+            unsafe {
+                debug_utils_fn.cmd_insert_debug_utils_label_ext(command_buffer, &label_info);
+            }
+        }
+    }
+
+    fn cmd_begin_debug_marker(
+        &self,
+        cmd_encoder: &mut CmdEncoder,
+        label_name: &str,
+        color: [f32; 4],
+    ) {
+        #[cfg(feature = "debug_markers")]
+        if let Some(debug_utils_fn) = &self.debug_utils_fn {
+            let arena = HybridArena::<256>::new();
+            let label_name = arena.alloc_cstr_from_str(label_name);
+
+            let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer;
+            let label_info = vk::DebugUtilsLabelExt {
+                label_name: label_name.as_ptr(),
+                color,
+                ..default()
+            };
+            unsafe {
+                debug_utils_fn.cmd_begin_debug_utils_label_ext(command_buffer, &label_info);
+            }
+        }
+    }
+
+    fn cmd_end_debug_marker(&self, cmd_encoder: &mut CmdEncoder) {
+        #[cfg(feature = "debug_markers")]
+        if let Some(debug_utils_fn) = &self.debug_utils_fn {
+            let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer;
+            unsafe { debug_utils_fn.cmd_end_debug_utils_label_ext(command_buffer) }
+        }
+    }
+
+    fn cmd_set_bind_group(
+        &self,
+        cmd_encoder: &mut CmdEncoder,
+        bind_group_index: u32,
+        bind_group: &TransientBindGroup,
+    ) {
         let cmd_encoder = self.cmd_encoder_mut(cmd_encoder);
-        let VulkanBoundPipeline {
-            pipeline_layout,
-            pipeline_bind_point,
-        } = cmd_encoder
+
+        let bound_pipeline = cmd_encoder
             .bound_pipeline
             .as_ref()
-            .expect("cannot set bind groups without a pipeline bound")
-            .clone();
+            .expect("cannot set bind group without a pipeline bound");
 
         let command_buffer = cmd_encoder.command_buffer;
+        let descriptor_set = vk::DescriptorSet::from_raw(bind_group.bind_group);
 
         unsafe {
             self.device_fn.cmd_bind_descriptor_sets(
                 command_buffer,
-                pipeline_bind_point,
-                pipeline_layout,
+                bound_pipeline.pipeline_bind_point,
+                bound_pipeline.pipeline_layout,
                 bind_group_index,
                 &[descriptor_set],
                 &[],
@@ -2519,6 +2700,16 @@ impl Device for VulkanDevice {
         }
     }
 
+    fn cmd_dispatch_indirect(&self, cmd_encoder: &mut CmdEncoder, buffer: BufferArg, offset: u64) {
+        let (buffer, base_offset, _range) = self.unwrap_buffer_arg(&buffer);
+
+        let command_buffer = self.cmd_encoder_mut(cmd_encoder).command_buffer;
+        unsafe {
+            self.device_fn
+                .cmd_dispatch_indirect(command_buffer, buffer, base_offset + offset);
+        }
+    }
+
     fn submit(&self, frame: &Frame, mut cmd_encoder: CmdEncoder) {
         let fence = self.universal_queue_fence.fetch_add(1, Ordering::SeqCst) + 1;
 
@@ -2694,7 +2885,7 @@ impl Device for VulkanDevice {
         self.frame_counter.release(frame);
     }
 
-    fn get_buffer_address(&self, buffer: BufferArg) -> u64 {
+    fn get_buffer_address<'a>(&self, buffer: BufferArg<'a>) -> BufferAddress<'a> {
         let buffer = match buffer {
             BufferArg::Unmanaged(buffer) => buffer.0,
             BufferArg::Persistent(buffer) => buffer.buffer.0,
@@ -2702,7 +2893,10 @@ impl Device for VulkanDevice {
         };
         let buffer_pool = self.buffer_pool.lock();
         let buffer = buffer_pool.get(buffer).unwrap();
-        buffer.address
+        BufferAddress {
+            value: buffer.address,
+            phantom: PhantomData,
+        }
     }
 }
 
@@ -2761,6 +2955,11 @@ impl VulkanDevice {
                 )
             };
 
+            let address = BufferAddress {
+                value: address,
+                phantom: PhantomData,
+            };
+
             let ptr = NonNull::new(memory.mapped_ptr()).unwrap();
 
             frame.destroyed_buffers.lock().push_back(buffer);
@@ -2772,7 +2971,6 @@ impl VulkanDevice {
                 buffer: buffer.as_raw(),
                 address,
                 offset: 0,
-                phantom: PhantomData,
             };
         }
 
@@ -2828,6 +3026,13 @@ impl VulkanDevice {
 
         let current = allocator.current.as_ref().unwrap();
 
+        let address = BufferAddress {
+            value: current.address,
+            phantom: PhantomData,
+        };
+
+        let address = address.byte_add(allocator.offset);
+
         TransientBuffer {
             ptr: NonNull::new(
                 current
@@ -2838,9 +3043,8 @@ impl VulkanDevice {
             .unwrap(),
             len: size as usize,
             buffer: current.buffer.as_raw(),
-            address: current.address + allocator.offset,
+            address,
             offset: allocator.offset,
-            phantom: PhantomData,
         }
     }
 
index cb8a85d07f631db2e74d078e0095f2f0e34c0899..054365b6772bac53ec32bf6b629bac226733d5ae 100644 (file)
@@ -1,6 +1,7 @@
 use std::{ffi::CStr, marker::PhantomData};
 
 use backend::vulkan;
+use mapped_buffer::TransientBindGroup;
 use narcissus_core::{
     default, flags_def, raw_window::AsRawWindow, thread_token_def, Handle, PhantomUnsend,
 };
@@ -117,6 +118,12 @@ impl Scissor {
     }
 }
 
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum PipelineBindPoint {
+    Graphics,
+    Compute,
+}
+
 flags_def!(ShaderStageFlags);
 impl ShaderStageFlags {
     pub const VERTEX: Self = Self(1 << 0);
@@ -265,11 +272,35 @@ pub struct ImageBlit {
     pub dst_offset_max: Offset3d,
 }
 
+pub enum SpecConstant {
+    Bool { id: u32, value: bool },
+    U32 { id: u32, value: u32 },
+    I32 { id: u32, value: i32 },
+    F32 { id: u32, value: f32 },
+}
+
 pub struct ShaderDesc<'a> {
     pub entry: &'a CStr,
+    pub require_full_subgroups: bool,
+    pub allow_varying_subgroup_size: bool,
+    pub required_subgroup_size: Option<u32>,
+    pub spec_constants: &'a [SpecConstant],
     pub code: &'a [u8],
 }
 
+impl<'a> Default for ShaderDesc<'a> {
+    fn default() -> Self {
+        Self {
+            entry: c"main",
+            require_full_subgroups: false,
+            allow_varying_subgroup_size: false,
+            required_subgroup_size: None,
+            spec_constants: &[],
+            code: &[],
+        }
+    }
+}
+
 #[derive(Clone, Copy, PartialEq, Eq)]
 pub enum SamplerFilter {
     Point,
@@ -417,7 +448,7 @@ pub struct GraphicsPipelineAttachments<'a> {
 pub struct GraphicsPipelineDesc<'a> {
     pub vertex_shader: ShaderDesc<'a>,
     pub fragment_shader: ShaderDesc<'a>,
-    pub layout: &'a PipelineLayout<'a>,
+    pub layout: PipelineLayout<'a>,
     pub attachments: GraphicsPipelineAttachments<'a>,
     pub topology: Topology,
     pub primitive_restart: bool,
@@ -436,7 +467,7 @@ pub struct GraphicsPipelineDesc<'a> {
 
 pub struct ComputePipelineDesc<'a> {
     pub shader: ShaderDesc<'a>,
-    pub layout: &'a PipelineLayout<'a>,
+    pub layout: PipelineLayout<'a>,
 }
 
 #[derive(Clone, Copy, Debug)]
@@ -546,6 +577,39 @@ pub enum TypedBind<'a> {
     StorageBuffer(&'a [BufferArg<'a>]),
 }
 
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct BufferAddress<'a> {
+    value: u64,
+    phantom: PhantomData<&'a [u8]>,
+}
+
+impl<'a> BufferAddress<'a> {
+    #[inline(always)]
+    #[must_use]
+    pub fn as_raw(self) -> u64 {
+        self.value
+    }
+
+    #[inline(always)]
+    #[must_use]
+    pub fn byte_add(self, count: u64) -> Self {
+        Self {
+            value: self.value.wrapping_add(count),
+            phantom: self.phantom,
+        }
+    }
+
+    #[inline(always)]
+    #[must_use]
+    pub fn byte_offset(self, count: i64) -> Self {
+        Self {
+            value: self.value.wrapping_add_signed(count),
+            phantom: self.phantom,
+        }
+    }
+}
+
 #[derive(Clone, Copy, PartialEq, Eq)]
 pub enum Access {
     /// No access.
@@ -743,7 +807,7 @@ impl<'a> Frame<'a> {
 
 pub struct CmdEncoder<'a> {
     cmd_encoder_addr: usize,
-    thread_token: &'a ThreadToken,
+    phantom: PhantomData<&'a ()>,
     phantom_unsend: PhantomUnsend,
 }
 
@@ -788,9 +852,10 @@ pub trait Device {
 
     fn debug_name_buffer(&self, buffer: BufferArg, label_name: &str);
     fn debug_name_image(&self, image: Image, label_name: &str);
+    fn debug_name_bind_group_layout(&self, bind_group_layout: BindGroupLayout, label_name: &str);
+    fn debug_name_pipeline(&self, pipeline: Pipeline, label_name: &str);
 
-    // Danger Zone
-    fn get_buffer_address(&self, buffer: BufferArg) -> u64;
+    fn get_buffer_address<'a>(&self, buffer: BufferArg<'a>) -> BufferAddress<'a>;
 
     fn destroy_buffer(&self, frame: &Frame, buffer: Buffer);
     fn destroy_persistent_buffer(&self, frame: &Frame, buffer: PersistentBuffer);
@@ -834,6 +899,15 @@ pub trait Device {
         size: usize,
     ) -> TransientBuffer<'a>;
 
+    #[must_use]
+    fn request_transient_bind_group<'a>(
+        &self,
+        frame: &'a Frame<'a>,
+        thread_token: &'a ThreadToken,
+        layout: BindGroupLayout,
+        bindings: &[Bind],
+    ) -> TransientBindGroup<'a>;
+
     #[must_use]
     fn request_cmd_encoder<'a>(
         &'a self,
@@ -859,11 +933,9 @@ pub trait Device {
 
     fn cmd_set_bind_group(
         &self,
-        frame: &Frame,
         cmd_encoder: &mut CmdEncoder,
-        layout: BindGroupLayout,
         bind_group_index: u32,
-        bindings: &[Bind],
+        bind_group: &TransientBindGroup,
     );
 
     fn cmd_set_index_buffer(
@@ -948,6 +1020,8 @@ pub trait Device {
         group_count_z: u32,
     );
 
+    fn cmd_dispatch_indirect(&self, cmd_encoder: &mut CmdEncoder, buffer: BufferArg, offset: u64);
+
     fn submit(&self, frame: &Frame, cmd_encoder: CmdEncoder);
 
     fn begin_frame(&self) -> Frame;
index e3e8e849f742b4f54e2ae3b5a4946fc9a2896ef3..cdafabfa3c0d508df9f55f3c0588d8d36be66552 100644 (file)
@@ -1,6 +1,6 @@
 use std::{marker::PhantomData, ptr::NonNull};
 
-use crate::{Buffer, BufferArg};
+use crate::{Buffer, BufferAddress, BufferArg};
 
 #[cold]
 fn overflow() -> ! {
@@ -85,8 +85,7 @@ pub struct TransientBuffer<'a> {
     pub(crate) offset: u64,
     pub(crate) len: usize,
     pub(crate) buffer: u64,
-    pub(crate) address: u64,
-    pub(crate) phantom: PhantomData<&'a u8>,
+    pub(crate) address: BufferAddress<'a>,
 }
 
 impl<'a> TransientBuffer<'a> {
@@ -102,3 +101,8 @@ impl<'a> TransientBuffer<'a> {
         unsafe { copy_from_with_offset(self.ptr, self.len, offset, src) }
     }
 }
+
+pub struct TransientBindGroup<'a> {
+    pub(crate) bind_group: u64,
+    pub(crate) phantom: PhantomData<&'a ()>,
+}
index 0ba737dd5f8706ee305dc62d19786996a417cbaa..d901985b77a8c538b1d00a90a87367a8ff01db19 100644 (file)
@@ -1364,10 +1364,7 @@ pub struct SpecializationMapEntry {
 #[repr(C)]
 pub struct SpecializationInfo<'a> {
     pub map_entries: VulkanSlice1<'a, u32, SpecializationMapEntry, 4>,
-    ///  Size in bytes of pData
-    pub data_size: usize,
-    ///  Pointer to SpecConstant data
-    pub data: *const c_void,
+    pub data: VulkanSlice1<'a, usize, u8, 0>,
 }
 
 impl<'a> Default for SpecializationInfo<'a> {
@@ -1398,6 +1395,21 @@ impl<'a> Default for PipelineShaderStageCreateInfo<'a> {
     }
 }
 
+#[repr(C)]
+pub struct PipelineShaderStageRequiredSubgroupSizeCreateInfo {
+    pub _type: StructureType,
+    pub _next: *const c_void,
+    pub required_subgroup_size: u32,
+}
+
+impl Default for PipelineShaderStageRequiredSubgroupSizeCreateInfo {
+    fn default() -> Self {
+        let mut x = unsafe { MaybeUninit::<Self>::zeroed().assume_init() };
+        x._type = StructureType::PipelineShaderStageRequiredSubgroupSizeCreateInfo;
+        x
+    }
+}
+
 #[repr(C)]
 pub struct ComputePipelineCreateInfo<'a> {
     pub _type: StructureType,
@@ -2750,6 +2762,7 @@ impl Default for PhysicalDeviceVulkan13Features {
     }
 }
 
+#[repr(C)]
 pub struct PhysicalDeviceSwapchainMaintenance1FeaturesEXT {
     pub _type: StructureType,
     pub _next: *mut c_void,
index 85d563950d5ceafe8b92f280ca4aa5f3eeb8839a..a9acb52939dfb2984308f2e87a861be6e79f0777 100644 (file)
@@ -20,19 +20,39 @@ const SHADERS: &[Shader] = &[
     },
     Shader {
         stage: "comp",
-        name: "primitive_2d_bin",
+        name: "draw_2d_bin_0_clear",
     },
     Shader {
         stage: "comp",
-        name: "primitive_2d_bin_clear",
+        name: "draw_2d_bin_1_scatter",
     },
     Shader {
         stage: "comp",
-        name: "primitive_2d_rasterize",
+        name: "draw_2d_bin_2_sort",
     },
     Shader {
         stage: "comp",
-        name: "display_transform",
+        name: "draw_2d_bin_3_resolve",
+    },
+    Shader {
+        stage: "comp",
+        name: "draw_2d_rasterize",
+    },
+    Shader {
+        stage: "comp",
+        name: "radix_sort_0_upsweep",
+    },
+    Shader {
+        stage: "comp",
+        name: "radix_sort_1_spine",
+    },
+    Shader {
+        stage: "comp",
+        name: "radix_sort_2_downsweep",
+    },
+    Shader {
+        stage: "comp",
+        name: "composite",
     },
 ];
 
similarity index 75%
rename from title/shark-shaders/shaders/basic.frag.glsl
rename to title/shark-shaders/shaders/basic.frag
index e179a7fc88ae6add42554b8068796ea3625334b3..1693e95b1a476ea169996f0c4996d06d98535c6f 100644 (file)
@@ -1,7 +1,7 @@
 #version 460
 
-layout(set = 0, binding = 1) uniform sampler bilinear_sampler;
-layout(set = 1, binding = 2) uniform texture2D albedo;
+layout(set = 0, binding = 0) uniform sampler bilinear_sampler;
+layout(set = 0, binding = 1) uniform texture2D albedo;
 
 layout(location = 0) in vec2 tex_coord;
 layout(location = 1) in vec3 normal;
diff --git a/title/shark-shaders/shaders/basic.vert b/title/shark-shaders/shaders/basic.vert
new file mode 100644 (file)
index 0000000..a763036
--- /dev/null
@@ -0,0 +1,55 @@
+#version 460
+
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_scalar_block_layout : require
+
+struct Vertex {
+    vec4 position;
+    vec4 normal;
+    vec4 texcoord;
+};
+
+struct Transform {
+    vec4 transform[3];
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer VertexRef {
+    Vertex values[];
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer TransformRef {
+    Transform values[];
+};
+
+struct BasicConstants {
+    mat4 clip_from_camera;
+    VertexRef vertex_buffer;
+    TransformRef transform_buffer;
+};
+
+layout(std430, row_major, push_constant) uniform BasicConstantsBlock {
+    BasicConstants constants;
+};
+
+layout(location = 0) out vec2 out_texcoord;
+layout(location = 1) out vec3 out_normal;
+
+void main() {
+    const Transform td = constants.transform_buffer.values[gl_InstanceIndex];
+    const Vertex vd = constants.vertex_buffer.values[gl_VertexIndex];
+
+    const mat4 camera_from_model = mat4(
+        td.transform[0].x, td.transform[0].w, td.transform[1].z, 0.0,
+        td.transform[0].y, td.transform[1].x, td.transform[1].w, 0.0, 
+        td.transform[0].z, td.transform[1].y, td.transform[2].x, 0.0, 
+        td.transform[2].y, td.transform[2].z, td.transform[2].w, 1.0
+    );
+
+    const vec4 position_clip = constants.clip_from_camera * camera_from_model * vec4(vd.position.xyz, 1.0);
+
+    gl_Position = position_clip;
+
+    out_normal = vd.normal.xyz;
+    out_texcoord = vec2(vd.texcoord.x, 1.0 - vd.texcoord.y);
+}
diff --git a/title/shark-shaders/shaders/basic.vert.glsl b/title/shark-shaders/shaders/basic.vert.glsl
deleted file mode 100644 (file)
index 7fab7f1..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-#version 460
-
-#extension GL_EXT_scalar_block_layout : require
-
-struct VertexData {
-    vec4 position;
-    vec4 normal;
-    vec4 texcoord;
-};
-
-struct TransformData {
-    vec4 transform[3];
-};
-
-layout(std430, row_major, set = 0, binding = 0) uniform uniformBuffer {
-    mat4 clip_from_camera;
-};
-
-layout(std430, set = 1, binding = 0) readonly buffer vertexBuffer {
-    VertexData vertices[];
-};
-
-layout(std430, set = 1, binding = 1) readonly buffer transformBuffer {
-    TransformData transforms[];
-};
-
-layout(location = 0) out vec2 out_texcoord;
-layout(location = 1) out vec3 out_normal;
-
-void main() {
-    const TransformData td = transforms[gl_InstanceIndex];
-    const VertexData vd = vertices[gl_VertexIndex];
-
-    const mat4 camera_from_model = mat4(
-        td.transform[0].x, td.transform[0].w, td.transform[1].z, 0.0,
-        td.transform[0].y, td.transform[1].x, td.transform[1].w, 0.0, 
-        td.transform[0].z, td.transform[1].y, td.transform[2].x, 0.0, 
-        td.transform[2].y, td.transform[2].z, td.transform[2].w, 1.0
-    );
-
-    const vec4 position_clip = clip_from_camera * camera_from_model * vec4(vd.position.xyz, 1.0);
-
-    gl_Position = position_clip;
-
-    out_normal = vd.normal.xyz;
-    out_texcoord = vec2(vd.texcoord.x, 1.0 - vd.texcoord.y);
-}
similarity index 64%
rename from title/shark-shaders/shaders/display_transform.comp.glsl
rename to title/shark-shaders/shaders/composite.comp
index 465f44a21fbda38001fc0111386e45ea92630bd2..0e6211d16063f7db03395424be44ab2338dd8389 100644 (file)
@@ -26,21 +26,23 @@ vec3 tony_mc_mapface(vec3 stimulus) {
 layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
 void main() {
-    const uvec2 tile_coord = gl_WorkGroupID.xy / 4;
-    const uint tile_index = tile_coord.y * uniforms.tile_stride + tile_coord.x;
-    const uint tile_base = tile_index * TILE_STRIDE;
-    TilesRead tiles_read = TilesRead(uniforms.tiles);
-    const uint lo = tiles_read.values[tile_base + TILE_BITMAP_RANGE_LO_OFFSET];
-    const uint hi = tiles_read.values[tile_base + TILE_BITMAP_RANGE_HI_OFFSET];
-
+    // const uvec2 tile_coord = gl_WorkGroupID.xy / 4;
+    // const uint tile_index = tile_coord.y * uniforms.tile_resolution.x + tile_coord.x;
+    
+    // TilesRead tiles_read = TilesRead(uniforms.tiles);
+    // const uint lo = tiles_read.values[tile_base + TILE_BITMAP_RANGE_LO_OFFSET];
+    // const uint hi = tiles_read.values[tile_base + TILE_BITMAP_RANGE_HI_OFFSET];
+
+    // Display transform
     const vec3 stimulus = imageLoad(color_layer, ivec2(gl_GlobalInvocationID.xy)).rgb;
     const vec3 transformed = tony_mc_mapface(stimulus);
     vec3 composited = srgb_oetf(transformed);
 
-    if (lo <= hi) {
-        const vec4 ui = imageLoad(ui_layer_read, ivec2(gl_GlobalInvocationID.xy)).rgba;
-        composited = ui.rgb + (composited * (1.0 - ui.a));
-    }
+    // UI Composite
+    // if (lo <= hi) {
+    //     const vec4 ui = imageLoad(ui_layer_read, ivec2(gl_GlobalInvocationID.xy)).rgba;
+    //     composited = ui.rgb + (composited * (1.0 - ui.a));
+    // }
 
     imageStore(composited_output, ivec2(gl_GlobalInvocationID.xy), vec4(composited, 1.0));
 }
index bbf7cf62c5b545a2ff64d1710688f2158f4df222..dc694d547ac65e5b3f31daeae9b341e2571a4b2f 100644 (file)
@@ -1,53 +1,6 @@
 #ifndef COMPUTE_BINDINGS_INCLUDE
 #define COMPUTE_BINDINGS_INCLUDE
 
-#include "primitive_2d.h"
-
-layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer PrimitiveInstances
-{
-    PrimitiveInstance values[];
-};
-
-layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer Rects
-{
-    Rect values[];
-};
-
-layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer Glyphs
-{
-    Glyph values[];
-};
-
-layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer TilesRead
-{
-    uint values[];
-};
-
-layout(buffer_reference, std430, buffer_reference_align = 4) writeonly buffer TilesWrite
-{
-    uint values[];
-};
-
-struct ComputeUniforms {
-    uvec2 screen_resolution;
-    uvec2 atlas_resolution;
-    uvec2 tile_resolution;
-
-    uint num_primitives;
-    uint num_primitives_32;
-    uint num_primitives_1024;
-    uint tile_stride;
-
-    PrimitiveInstances primitive_instances;
-    Rects rects;
-    Glyphs glyphs;
-    TilesWrite tiles;
-};
-
-layout(std430, push_constant) uniform UniformBuffer {
-    ComputeUniforms uniforms;
-};
-
 layout (set = 0, binding = 0) uniform sampler bilinear_sampler;
 layout (set = 0, binding = 1) uniform texture3D tony_mc_mapface_lut;
 layout (set = 0, binding = 2) uniform texture2D glyph_atlas;
diff --git a/title/shark-shaders/shaders/draw_2d.h b/title/shark-shaders/shaders/draw_2d.h
new file mode 100644 (file)
index 0000000..43181eb
--- /dev/null
@@ -0,0 +1,69 @@
+#ifndef DRAW_2D_H
+#define DRAW_2D_H
+
+const uint TILE_SIZE = 32;
+
+const uint DRAW_2D_CMD_RECT = 0;
+const uint DRAW_2D_CMD_GLYPH = 1;
+
+struct Glyph {
+    ivec2 atlas_min;
+    ivec2 atlas_max;
+
+    vec2 offset_min;
+    vec2 offset_max;
+};
+
+struct Draw2dCmd {
+    uint type;
+    uint words[7];
+};
+
+struct Draw2dCmdRect {
+    uint border_width;
+    vec2 position;
+    vec2 half_extent;
+    uint background_color;
+    uint border_color;
+};
+
+struct Draw2dCmdGlyph {
+    uint index;
+    vec2 position;
+    uint color;
+};
+
+Draw2dCmdRect decode_rect(Draw2dCmd cmd) {
+    return Draw2dCmdRect(
+        cmd.words[0],
+        vec2(uintBitsToFloat(cmd.words[1]), uintBitsToFloat(cmd.words[2])),
+        vec2(uintBitsToFloat(cmd.words[3]), uintBitsToFloat(cmd.words[4])),
+        cmd.words[5],
+        cmd.words[6]
+    );
+}
+
+Draw2dCmdGlyph decode_glyph(Draw2dCmd cmd) {
+    return Draw2dCmdGlyph(
+        cmd.words[0],
+        vec2(uintBitsToFloat(cmd.words[1]), uintBitsToFloat(cmd.words[2])),
+        cmd.words[3]
+    );
+}
+
+layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer Draw2dCommandRef
+{
+    Draw2dCmd values[];
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 16) readonly buffer GlyphRef
+{
+    Glyph values[];
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer CoarseRef
+{
+    uint values[];
+};
+
+#endif
\ No newline at end of file
diff --git a/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp b/title/shark-shaders/shaders/draw_2d_bin_0_clear.comp
new file mode 100644 (file)
index 0000000..8d5e613
--- /dev/null
@@ -0,0 +1,28 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_ballot : require
+
+#include "draw_2d.h"
+#include "radix_sort.h"
+
+struct Draw2dClearConstants {
+    CoarseRef coarse_buffer;
+};
+
+layout(std430, push_constant) uniform Draw2dClearConstantsBlock {
+    Draw2dClearConstants constants;
+};
+
+layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    constants.coarse_buffer.values[0] = 0;
+}
diff --git a/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp b/title/shark-shaders/shaders/draw_2d_bin_1_scatter.comp
new file mode 100644 (file)
index 0000000..d991eb1
--- /dev/null
@@ -0,0 +1,189 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_vote : require
+
+#include "draw_2d.h"
+
+struct Draw2dScatterConstants {
+    uvec2 screen_resolution;
+    uvec2 tile_resolution;
+
+    uint draw_buffer_len;
+    uint coarse_buffer_len;
+
+    Draw2dCommandRef draw_buffer;
+    GlyphRef glyph_buffer;
+    CoarseRef coarse_buffer;
+};
+
+layout(std430, push_constant) uniform Draw2dScatterConstantsBlock {
+    Draw2dScatterConstants constants;
+};
+
+layout (local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
+    const uint draw_index = gl_WorkGroupID.x * gl_WorkGroupSize.x +  local_id;
+
+    // Bounds for this command, any tiles which intersect this AABB will be written.
+    vec2 cmd_min = vec2(99999.9);
+    vec2 cmd_max = vec2(-99999.9);
+
+    if (draw_index < constants.draw_buffer_len) {
+        const Draw2dCmd cmd = constants.draw_buffer.values[draw_index];
+        const uint type = cmd.type;
+        for (;;) {
+            const uint scalar_type = subgroupBroadcastFirst(type);
+            [[branch]]
+            if (scalar_type == type) {
+                switch (scalar_type) {
+                    case DRAW_2D_CMD_RECT:
+                        const Draw2dCmdRect cmd_rect = decode_rect(cmd);
+                        cmd_min = cmd_rect.position - cmd_rect.half_extent - cmd_rect.border_width;
+                        cmd_max = cmd_rect.position + cmd_rect.half_extent + cmd_rect.border_width;
+                        break;
+                    case DRAW_2D_CMD_GLYPH:
+                        const Draw2dCmdGlyph cmd_glyph = decode_glyph(cmd);
+                        const Glyph glyph = constants.glyph_buffer.values[cmd_glyph.index];
+                        cmd_min = cmd_glyph.position + glyph.offset_min;
+                        cmd_max = cmd_glyph.position + glyph.offset_max;
+                        break;
+                }
+                break;
+            }
+        }
+    }
+
+    const vec2 cmds_min = subgroupMin(cmd_min);
+    const vec2 cmds_max = subgroupMax(cmd_max);
+
+    // Are all our commands entirely offscreen?
+    if (any(greaterThan(cmds_min, constants.screen_resolution)) || any(lessThan(cmds_max, vec2(0.0)))) {
+        return;
+    }
+
+    const uvec2 cmd_min_tile = uvec2(floor(max(min(cmd_min, constants.screen_resolution), 0.0) / TILE_SIZE));
+    const uvec2 cmd_max_tile = uvec2(floor(max(min(cmd_max, constants.screen_resolution), 0.0) / TILE_SIZE));
+    const uvec2 cmds_min_tile = subgroupMin(cmd_min_tile);
+    const uvec2 cmds_max_tile = subgroupMax(cmd_max_tile);
+
+    // Are any single commands responsible for the entire bounds?
+    const bool cmd_absolute_min = cmd_min_tile == cmds_min_tile;
+    const bool cmd_absolute_max = cmd_max_tile == cmds_max_tile;
+    const bool use_individual_bounds = !any(notEqual(subgroupBallot(cmd_absolute_min) & subgroupBallot(cmd_absolute_max), uvec4(0)));
+
+    if (false && use_individual_bounds) {
+
+    } else {
+        const uvec2 tile_count = cmds_max_tile - cmds_min_tile + uvec2(1);
+        const uint count = tile_count.x * tile_count.y;
+
+        uint offset;
+        if (subgroupElect()) {
+            offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1;
+        }
+        offset = subgroupBroadcastFirst(offset);
+
+        if (offset >= constants.coarse_buffer_len) {
+            return;
+        }
+
+        for (uint y = 0; y < tile_count.y; y++) {
+            for (uint x = 0; x < tile_count.x; x += gl_SubgroupSize) {
+                const uint local_x = x + gl_SubgroupInvocationID;
+                if (local_x < tile_count.x) {
+                    const uint yy = cmds_min_tile.y + y;
+                    const uint xx = cmds_min_tile.x + local_x;
+                    const uint packed = ((yy & 0xff) << 24) | ((xx & 0xff) << 16) | (gl_WorkGroupID.x & 0xffff);
+                    constants.coarse_buffer.values[offset + local_x] = packed;
+                }
+                subgroupBarrier();
+            }
+            offset += tile_count.x;
+        }
+    }
+
+    // if (gl_SubgroupSize == 32 && fullscreen_ballot.x != 0) {
+    //     uint offset;
+    //     if (subgroupElect()) {
+    //         const uint count = constants.tile_resolution.x * constants.tile_resolution.y;
+    //         offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1;
+    //     }
+    //     offset = subgroupBroadcastFirst(offset);
+
+    //     if (offset >= constants.coarse_buffer_len) {
+    //         return;
+    //     }
+
+    //     const uint word_index = gl_WorkGroupID.x;
+
+    //     for (uint y = 0; y < constants.tile_resolution.y; y++) {
+    //         for (uint x = 0; x < constants.tile_resolution.x; x++) {
+    //             const uint tile_index = y * constants.tile_resolution.x + x;
+    //             const uint packed = (tile_index << 16) | word_index;
+    //             if (subgroupElect() && offset + tile_index < constants.coarse_buffer_len) {
+    //                 constants.coarse_buffer.values[offset + tile_index] = packed;
+    //             }
+    //         }
+    //     }
+
+    //     return;
+    // }
+
+    // uint count = 0;
+
+    // for (uint y = cmds_min_tile.y; y <= cmds_max_tile.y; y++) {
+    //     for (uint x = cmds_min_tile.x; x <= cmds_max_tile.x; x++) {
+    //         const vec2 tile_min = uvec2(x, y) * TILE_SIZE;
+    //         const vec2 tile_max = min(tile_min + TILE_SIZE, constants.screen_resolution);
+
+    //         const bool intersects = !(any(lessThan(tile_max, cmd_min)) || any(greaterThan(tile_min, cmd_max)));
+    //         const uvec4 ballot = subgroupBallot(intersects);
+
+    //         if (subgroupElect()) {
+    //             count += uint(ballot.x != 0);
+    //         }
+    //     }
+    // }
+
+    // if (count == 0) {
+    //     return;
+    // }
+
+    // uint offset;
+    // if (subgroupElect()) {
+    //     offset = atomicAdd(constants.coarse_buffer.values[0], count) + 1;
+    // }
+    // offset = subgroupBroadcastFirst(offset);
+
+    // if (offset >= constants.coarse_buffer_len) {
+    //     return;
+    // }
+
+    // for (uint y = cmds_min_tile.y; y <= cmds_max_tile.y; y++) {
+    //     for (uint x = cmds_min_tile.x; x <= cmds_max_tile.x; x++) {
+    //         const vec2 tile_min = uvec2(x, y) * TILE_SIZE;
+    //         const vec2 tile_max = min(tile_min + TILE_SIZE, constants.screen_resolution);
+    //         const uint tile_index = y * constants.tile_resolution.x + x;
+
+    //         const bool intersects = !(any(lessThan(tile_max, cmd_min)) || any(greaterThan(tile_min, cmd_max)));
+    //         const uvec4 ballot = subgroupBallot(intersects);
+
+    //         if (subgroupElect() && ballot.x != 0 && offset < constants.coarse_buffer_len) {
+    //             const uint word_index = gl_WorkGroupID.x;
+    //             const uint packed = (tile_index << 16) | word_index;
+    //             constants.coarse_buffer.values[offset++] = packed;
+    //         }
+    //     }
+    // }
+}
diff --git a/title/shark-shaders/shaders/draw_2d_bin_2_sort.comp b/title/shark-shaders/shaders/draw_2d_bin_2_sort.comp
new file mode 100644 (file)
index 0000000..5a99c22
--- /dev/null
@@ -0,0 +1,49 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_shuffle_relative: enable
+#extension GL_KHR_shader_subgroup_vote : require
+
+#include "compute_bindings.h"
+
+#include "draw_2d.h"
+#include "indirect.h"
+#include "radix_sort.h"
+
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer VkDispatchIndirectCommandRef {
+    VkDispatchIndirectCommand dimensions;
+};
+
+struct Draw2dSortConstants {
+    uint coarse_buffer_len;
+    uint _pad;
+    VkDispatchIndirectCommandRef indirect_dispatch_buffer;
+    CoarseRef coarse_buffer;
+};
+
+layout(std430, push_constant) uniform Draw2dSortConstantsBlock {
+    Draw2dSortConstants constants;
+};
+
+layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    // We shouldn't overflow the coarse buffer in the scatter phase, but we can
+    // still end up with a count that's larger than the buffer size since we
+    // unconditionally atomicAdd. So we need to clamp to the actual size now
+    // before dispatching sort work.
+    const uint count = min(constants.coarse_buffer_len, constants.coarse_buffer.values[0]);
+    constants.coarse_buffer.values[0] = count;
+
+    constants.indirect_dispatch_buffer.dimensions.x = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
+    constants.indirect_dispatch_buffer.dimensions.y = 1;
+    constants.indirect_dispatch_buffer.dimensions.z = 1;
+}
diff --git a/title/shark-shaders/shaders/draw_2d_bin_3_resolve.comp b/title/shark-shaders/shaders/draw_2d_bin_3_resolve.comp
new file mode 100644 (file)
index 0000000..718563d
--- /dev/null
@@ -0,0 +1,22 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_vote : require
+
+#include "compute_bindings.h"
+
+const uint SUBGROUP_SIZE = 64;
+
+// TODO: Spec constant support for different subgroup sizes.
+layout (local_size_x = SUBGROUP_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+}
diff --git a/title/shark-shaders/shaders/draw_2d_rasterize.comp b/title/shark-shaders/shaders/draw_2d_rasterize.comp
new file mode 100644 (file)
index 0000000..dfa9b7d
--- /dev/null
@@ -0,0 +1,145 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_ballot : require
+
+#include "compute_bindings.h"
+
+layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
+
+/// x = (((index >> 2) & 0x0007) & 0xFFFE) | index & 0x0001
+/// y = ((index >> 1) & 0x0003) | (((index >> 3) & 0x0007) & 0xFFFC)
+
+#define DEBUG_SHOW_TILES 0
+
+#if DEBUG_SHOW_TILES != 0
+
+vec3 plasma_quintic(float x)
+{
+       x = clamp(x, 0.0, 1.0);
+       vec4 x1 = vec4(1.0, x, x * x, x * x * x); // 1 x x2 x3
+       vec4 x2 = x1 * x1.w * x; // x4 x5 x6 x7
+       return vec3(
+               dot(x1.xyzw, vec4(+0.063861086, +1.992659096, -1.023901152, -0.490832805)) + dot(x2.xy, vec2(+1.308442123, -0.914547012)),
+               dot(x1.xyzw, vec4(+0.049718590, -0.791144343, +2.892305078, +0.811726816)) + dot(x2.xy, vec2(-4.686502417, +2.717794514)),
+               dot(x1.xyzw, vec4(+0.513275779, +1.580255060, -5.164414457, +4.559573646)) + dot(x2.xy, vec2(-1.916810682, +0.570638854))
+    );
+}
+
+#endif
+
+void main() {
+    // const uvec2 tile_coord = gl_WorkGroupID.xy / (TILE_SIZE / gl_WorkGroupSize.xy);
+    // const uint tile_index = tile_coord.y * uniforms.tile_resolution.x + tile_coord.x;
+
+    // TilesRead tiles_read = TilesRead(uniforms.tiles);
+
+    // const uint lo = tiles_read.values[tile_base + TILE_BITMAP_RANGE_LO_OFFSET];
+    // const uint hi = tiles_read.values[tile_base + TILE_BITMAP_RANGE_HI_OFFSET];
+
+    // if (hi < lo) {
+    //     return;
+    // }
+
+// #if DEBUG_SHOW_TILES == 1
+
+//     uint count = 0;
+//     // For each tile, iterate over all words in the L1 bitmap.
+//     for (uint index_l1 = lo; index_l1 <= hi; index_l1++) {
+//         // For each word, iterate all set bits.
+//         uint bitmap_l1 = tiles_read.values[tile_base + TILE_BITMAP_L1_OFFSET + index_l1];
+
+//         while (bitmap_l1 != 0) {
+//             const uint i = findLSB(bitmap_l1);
+//             bitmap_l1 ^= bitmap_l1 & -bitmap_l1;
+
+//             // For each set bit in the L1 bitmap, iterate the set bits in the
+//             // corresponding L0 bitmap.
+//             const uint index_l0 = index_l1 * 32 + i;
+//             uint bitmap_l0 = tiles_read.values[tile_base + TILE_BITMAP_L0_OFFSET + index_l0];
+
+//             count += bitCount(bitmap_l0);
+//         }
+//     }
+
+//     const vec3 color = plasma_quintic(float(count) / 100.0);
+//     imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), vec4(color, 1.0));
+
+// #elif DEBUG_SHOW_TILES == 2
+
+//     uint count = hi - lo;
+//     const vec3 color = plasma_quintic(float(count) / 100.0);
+//     imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), vec4(color, 1.0));
+
+// #else
+
+//     const vec2 sample_center = gl_GlobalInvocationID.xy + vec2(0.5);
+
+//     vec4 accum = vec4(0.0);
+
+//     // For each tile, iterate over all words in the L1 bitmap.
+//     for (uint index_l1 = lo; index_l1 <= hi; index_l1++) {
+//         // For each word, iterate all set bits.
+//         uint bitmap_l1 = tiles_read.values[tile_base + TILE_BITMAP_L1_OFFSET + index_l1];
+
+//         while (bitmap_l1 != 0) {
+//             const uint i = findLSB(bitmap_l1);
+//             bitmap_l1 ^= bitmap_l1 & -bitmap_l1;
+
+//             // For each set bit in the L1 bitmap, iterate the set bits in the
+//             // corresponding L0 bitmap.
+//             const uint index_l0 = index_l1 * 32 + i;
+//             uint bitmap_l0 = tiles_read.values[tile_base + TILE_BITMAP_L0_OFFSET + index_l0];
+//             while (bitmap_l0 != 0) {
+//                 const uint j = findLSB(bitmap_l0);
+//                 bitmap_l0 ^= bitmap_l0 & -bitmap_l0;
+
+//                 // Set bits in the L0 bitmap indicate binned primitives for this tile.
+//                 const uint primitive_index = index_l0 * 32 + j;
+//                 const Primitive2d primitive = uniforms.primitives.values[primitive_index];
+//                 const uint type = primitive.type;
+
+//                 vec4 primitive_color = vec4(0.0);
+
+//                 switch (type) {
+//                     case PRIMITIVE_TYPE_RECT: {
+//                         // const Rect rect = uniforms.rects.values[offset];
+//                         // const vec2 rect_min = primitive_instance.position - rect.half_extent - rect.border_width;
+//                         // const vec2 rect_max = primitive_instance.position + rect.half_extent + rect.border_width;
+//                         if (all(greaterThanEqual(sample_center, rect_min)) && all(lessThanEqual(sample_center, rect_max))) {
+//                             primitive_color = unpackUnorm4x8(primitive_instance.color).bgra;
+//                         }
+//                         break;
+//                     }
+//                     case PRIMITIVE_TYPE_GLYPH: {
+//                         const Glyph glyph = uniforms.glyphs.values[offset];
+//                         const vec2 glyph_min = primitive_instance.position + glyph.offset_min;
+//                         const vec2 glyph_max = primitive_instance.position + glyph.offset_max;
+//                         if (all(greaterThanEqual(sample_center, glyph_min)) && all(lessThanEqual(sample_center, glyph_max))) {
+//                             const vec2 glyph_size = glyph.offset_max - glyph.offset_min;
+//                             const vec2 uv = mix(glyph.atlas_min, glyph.atlas_max, (sample_center - glyph_min) / glyph_size) / uniforms.atlas_resolution;
+//                             const vec4 color = unpackUnorm4x8(primitive_instance.color).bgra;
+//                             const float coverage = textureLod(sampler2D(glyph_atlas, bilinear_sampler), uv, 0.0).r * color.a;
+//                             primitive_color = color * coverage;
+//                         }
+//                         break;
+//                     }
+//                 }
+
+//                 // does it blend?
+//                 accum.rgba = primitive_color.rgba + accum.rgba * (1.0 - primitive_color.a);
+//             }
+//         }
+//     }
+
+//     imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), accum);
+
+// #endif
+}
diff --git a/title/shark-shaders/shaders/indirect.h b/title/shark-shaders/shaders/indirect.h
new file mode 100644 (file)
index 0000000..d409cf2
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef INDIRECT_H
+#define INDIRECT_H
+
+struct VkDispatchIndirectCommand {
+    uint x;
+    uint y;
+    uint z;
+};
+
+#endif
\ No newline at end of file
diff --git a/title/shark-shaders/shaders/primitive_2d.h b/title/shark-shaders/shaders/primitive_2d.h
deleted file mode 100644 (file)
index ae6a2ec..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef PRIMITIVE_2D_INCLUDE
-#define PRIMITIVE_2D_INCLUDE
-
-const uint TILE_SIZE = 32;
-
-const uint MAX_PRIMS = 1 << 18;
-const uint TILE_BITMAP_L1_WORDS = (MAX_PRIMS / 32 / 32);
-const uint TILE_BITMAP_L0_WORDS = (MAX_PRIMS / 32);
-const uint TILE_STRIDE = (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS + 2);
-const uint TILE_BITMAP_RANGE_LO_OFFSET = 0;
-const uint TILE_BITMAP_RANGE_HI_OFFSET = (TILE_BITMAP_RANGE_LO_OFFSET + 1);
-const uint TILE_BITMAP_L1_OFFSET = (TILE_BITMAP_RANGE_HI_OFFSET + 1);
-const uint TILE_BITMAP_L0_OFFSET = (TILE_BITMAP_L1_OFFSET + TILE_BITMAP_L1_WORDS);
-
-const uint PRIMITIVE_TYPE_RECT = 0;
-const uint PRIMITIVE_TYPE_GLYPH = 1;
-
-struct PrimitiveInstance {
-    uint packed;
-    uint color;
-    vec2 position;
-};
-
-struct Rect {
-    vec2 half_extent;
-    float border_width;
-    float border_radius;
-};
-
-struct Glyph {
-    ivec2 atlas_min;
-    ivec2 atlas_max;
-
-    vec2 offset_min;
-    vec2 offset_max;
-};
-
-#endif
\ No newline at end of file
diff --git a/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl
deleted file mode 100644 (file)
index 6398615..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-#version 460
-
-#extension GL_GOOGLE_include_directive : require
-
-#extension GL_EXT_buffer_reference : require
-#extension GL_EXT_buffer_reference2 : require
-#extension GL_EXT_scalar_block_layout : require
-#extension GL_EXT_control_flow_attributes : require
-
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_KHR_shader_subgroup_ballot : require
-#extension GL_KHR_shader_subgroup_vote : require
-
-#include "compute_bindings.h"
-
-const uint SUBGROUP_SIZE = 64;
-const uint NUM_SUBGROUPS = 16;
-const uint NUM_PRIMITIVES_WG = (SUBGROUP_SIZE * NUM_SUBGROUPS);
-
-// TODO: Spec constant support for different subgroup sizes.
-layout (local_size_x = SUBGROUP_SIZE, local_size_y = 1, local_size_z = 1) in;
-
-void main() {
-    uint word_index = 0;
-
-    for (uint i = 0; i < NUM_PRIMITIVES_WG; i += gl_SubgroupSize.x) {
-        const uint primitive_index = gl_WorkGroupID.x * NUM_PRIMITIVES_WG + i + gl_SubgroupInvocationID;
-
-        // Bounds for this primitive, any tiles which intersect this AABB will be written.
-        vec2 primitive_min = vec2(99999.9);
-        vec2 primitive_max = vec2(-99999.9);
-
-        if (primitive_index < uniforms.num_primitives) {
-            const PrimitiveInstance primitive_instance = uniforms.primitive_instances.values[primitive_index];
-            const uint type = bitfieldExtract(primitive_instance.packed, 30, 2);
-            const uint offset = bitfieldExtract(primitive_instance.packed, 0, 20);
-
-            for (;;) {
-                const uint scalar_type = subgroupBroadcastFirst(type);
-                [[branch]]
-                if (scalar_type == type) {
-                    switch (type) {
-                        case PRIMITIVE_TYPE_RECT:
-                            const Rect rect = uniforms.rects.values[offset];
-                            primitive_min = primitive_instance.position - rect.half_extent;
-                            primitive_max = primitive_instance.position + rect.half_extent;
-                            break;
-                        case PRIMITIVE_TYPE_GLYPH:
-                            const Glyph glyph = uniforms.glyphs.values[offset];
-                            primitive_min = primitive_instance.position + glyph.offset_min;
-                            primitive_max = primitive_instance.position + glyph.offset_max;
-                            break;
-                    }
-                    break;
-                }
-            }
-        }
-
-        const vec2 primitives_min = subgroupMin(primitive_min);
-        const vec2 primitives_max = subgroupMax(primitive_max);
-
-        if (any(greaterThan(primitives_min, uniforms.screen_resolution)) || any(lessThan(primitives_max, vec2(0.0)))) {
-            word_index += 2;
-            continue;
-        }
-
-        const ivec2 tile_start = ivec2(floor(max(min(primitives_min, uniforms.screen_resolution), 0.0) / TILE_SIZE));
-        const ivec2 tile_end = ivec2(floor((max(min(primitives_max, uniforms.screen_resolution), 0.0) + (TILE_SIZE - 1)) / TILE_SIZE));
-
-        for (int y = tile_start.y; y < tile_end.y; y++) {
-            for (int x = tile_start.x; x < tile_end.x; x++) {
-                const uvec2 tile_coord = uvec2(x, y);
-                const vec2 tile_min = tile_coord * TILE_SIZE;
-                const vec2 tile_max = min(tile_min + TILE_SIZE, uniforms.screen_resolution);
-
-                const bool intersects = !(any(lessThan(tile_max, primitive_min)) || any(greaterThan(tile_min, primitive_max)));
-                const uvec4 ballot = subgroupBallot(intersects);
-
-                if (ballot.x == 0 && ballot.y == 0) {
-                    continue;
-                }
-
-                const uint tile_index = tile_coord.y * uniforms.tile_stride + tile_coord.x;
-
-                if (ballot.x != 0) {
-                    uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_L0_OFFSET + gl_WorkGroupID.x * 32 + word_index + 0] = ballot.x;
-                }
-
-                if (ballot.y != 0) {
-                    uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_L0_OFFSET + gl_WorkGroupID.x * 32 + word_index + 1] = ballot.y;
-                }
-
-                if (subgroupElect()) {
-                    uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_L1_OFFSET + gl_WorkGroupID.x] |=
-                        (uint(ballot.x != 0) << (word_index + 0)) |
-                        (uint(ballot.y != 0) << (word_index + 1));
-
-                    atomicMin(uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_LO_OFFSET], gl_WorkGroupID.x);
-                    atomicMax(uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_HI_OFFSET], gl_WorkGroupID.x);
-                }
-            }
-        }
-
-        word_index += 2;
-    }
-}
diff --git a/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl
deleted file mode 100644 (file)
index 8d443a2..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-#version 460
-
-#extension GL_GOOGLE_include_directive : require
-
-#extension GL_EXT_buffer_reference : require
-#extension GL_EXT_buffer_reference2 : require
-#extension GL_EXT_scalar_block_layout : require
-#extension GL_EXT_control_flow_attributes : require
-
-#extension GL_KHR_shader_subgroup_vote : require
-#extension GL_KHR_shader_subgroup_ballot : require
-
-#include "compute_bindings.h"
-
-// TODO: Spec constant support for different subgroup sizes.
-layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
-
-void main() {
-    const uint tile_index = gl_GlobalInvocationID.z * uniforms.tile_stride + gl_GlobalInvocationID.y;
-
-    uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_LO_OFFSET] = 0xffffffff;
-    uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_HI_OFFSET] = 0;
-
-    if (gl_GlobalInvocationID.x < TILE_BITMAP_L1_WORDS) {
-        uniforms.tiles.values[tile_index * TILE_STRIDE + TILE_BITMAP_L1_OFFSET + gl_GlobalInvocationID.x] = 0;
-    }
-}
diff --git a/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl b/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl
deleted file mode 100644 (file)
index c61ce61..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-#version 460
-
-#extension GL_GOOGLE_include_directive : require
-
-#extension GL_EXT_buffer_reference : require
-#extension GL_EXT_buffer_reference2 : require
-#extension GL_EXT_scalar_block_layout : require
-#extension GL_EXT_control_flow_attributes : require
-
-#extension GL_KHR_shader_subgroup_vote : require
-#extension GL_KHR_shader_subgroup_ballot : require
-
-#include "compute_bindings.h"
-#include "primitive_2d.h"
-
-layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
-
-#define DEBUG_SHOW_TILES 0
-
-#if DEBUG_SHOW_TILES != 0
-
-vec3 plasma_quintic(float x)
-{
-       x = clamp(x, 0.0, 1.0);
-       vec4 x1 = vec4(1.0, x, x * x, x * x * x); // 1 x x2 x3
-       vec4 x2 = x1 * x1.w * x; // x4 x5 x6 x7
-       return vec3(
-               dot(x1.xyzw, vec4(+0.063861086, +1.992659096, -1.023901152, -0.490832805)) + dot(x2.xy, vec2(+1.308442123, -0.914547012)),
-               dot(x1.xyzw, vec4(+0.049718590, -0.791144343, +2.892305078, +0.811726816)) + dot(x2.xy, vec2(-4.686502417, +2.717794514)),
-               dot(x1.xyzw, vec4(+0.513275779, +1.580255060, -5.164414457, +4.559573646)) + dot(x2.xy, vec2(-1.916810682, +0.570638854))
-    );
-}
-
-#endif
-
-void main() {
-    const uvec2 tile_coord = gl_WorkGroupID.xy / (TILE_SIZE / gl_WorkGroupSize.xy);
-    const uint tile_index = tile_coord.y * uniforms.tile_stride + tile_coord.x;
-    const uint tile_base = tile_index * TILE_STRIDE;
-
-    TilesRead tiles_read = TilesRead(uniforms.tiles);
-
-    const uint lo = tiles_read.values[tile_base + TILE_BITMAP_RANGE_LO_OFFSET];
-    const uint hi = tiles_read.values[tile_base + TILE_BITMAP_RANGE_HI_OFFSET];
-
-    if (hi < lo) {
-        return;
-    }
-
-#if DEBUG_SHOW_TILES == 1
-
-    uint count = 0;
-    // For each tile, iterate over all words in the L1 bitmap.
-    for (uint index_l1 = lo; index_l1 <= hi; index_l1++) {
-        // For each word, iterate all set bits.
-        uint bitmap_l1 = tiles_read.values[tile_base + TILE_BITMAP_L1_OFFSET + index_l1];
-
-        while (bitmap_l1 != 0) {
-            const uint i = findLSB(bitmap_l1);
-            bitmap_l1 ^= bitmap_l1 & -bitmap_l1;
-
-            // For each set bit in the L1 bitmap, iterate the set bits in the
-            // corresponding L0 bitmap.
-            const uint index_l0 = index_l1 * 32 + i;
-            uint bitmap_l0 = tiles_read.values[tile_base + TILE_BITMAP_L0_OFFSET + index_l0];
-
-            count += bitCount(bitmap_l0);
-        }
-    }
-
-    const vec3 color = plasma_quintic(float(count) / 100.0);
-    imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), vec4(color, 1.0));
-
-#elif DEBUG_SHOW_TILES == 2
-
-    uint count = hi - lo;
-    const vec3 color = plasma_quintic(float(count) / 100.0);
-    imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), vec4(color, 1.0));
-
-#else
-
-    const vec2 sample_center = gl_GlobalInvocationID.xy + vec2(0.5);
-
-    vec4 accum = vec4(0.0);
-
-    // For each tile, iterate over all words in the L1 bitmap.
-    for (uint index_l1 = lo; index_l1 <= hi; index_l1++) {
-        // For each word, iterate all set bits.
-        uint bitmap_l1 = tiles_read.values[tile_base + TILE_BITMAP_L1_OFFSET + index_l1];
-
-        while (bitmap_l1 != 0) {
-            const uint i = findLSB(bitmap_l1);
-            bitmap_l1 ^= bitmap_l1 & -bitmap_l1;
-
-            // For each set bit in the L1 bitmap, iterate the set bits in the
-            // corresponding L0 bitmap.
-            const uint index_l0 = index_l1 * 32 + i;
-            uint bitmap_l0 = tiles_read.values[tile_base + TILE_BITMAP_L0_OFFSET + index_l0];
-            while (bitmap_l0 != 0) {
-                const uint j = findLSB(bitmap_l0);
-                bitmap_l0 ^= bitmap_l0 & -bitmap_l0;
-
-                // Set bits in the L0 bitmap indicate binned primitives for this tile.
-                const uint primitive_index = index_l0 * 32 + j;
-                const PrimitiveInstance primitive_instance = uniforms.primitive_instances.values[primitive_index];
-                const uint type = bitfieldExtract(primitive_instance.packed, 30, 2);
-                const uint offset = bitfieldExtract(primitive_instance.packed, 0, 20);
-
-                switch (type) {
-                    case PRIMITIVE_TYPE_RECT: {
-                        const Rect rect = uniforms.rects.values[offset];
-                        const vec2 rect_min = primitive_instance.position - rect.half_extent;
-                        const vec2 rect_max = primitive_instance.position + rect.half_extent;
-                        if (all(greaterThanEqual(sample_center, rect_min)) && all(lessThanEqual(sample_center, rect_max))) {
-                            const vec4 color = unpackUnorm4x8(primitive_instance.color).bgra;
-                            accum.rgb = color.rgb * color.a + accum.rgb * (1.0 - color.a);
-                            accum.a = color.a + accum.a * (1.0 - color.a);
-                        }
-                        break;
-                    }
-                    case PRIMITIVE_TYPE_GLYPH: {
-                        const Glyph glyph = uniforms.glyphs.values[offset];
-                        const vec2 glyph_min = primitive_instance.position + glyph.offset_min;
-                        const vec2 glyph_max = primitive_instance.position + glyph.offset_max;
-                        if (all(greaterThanEqual(sample_center, glyph_min)) && all(lessThanEqual(sample_center, glyph_max))) {
-                            const vec2 glyph_size = glyph.offset_max - glyph.offset_min;
-                            const vec2 uv = mix(glyph.atlas_min, glyph.atlas_max, (sample_center - glyph_min) / glyph_size) / uniforms.atlas_resolution;
-                            const vec4 color = unpackUnorm4x8(primitive_instance.color).bgra;
-                            const float coverage = textureLod(sampler2D(glyph_atlas, bilinear_sampler), uv, 0.0).r * color.a;
-                            accum.rgb = color.rgb * coverage + accum.rgb * (1.0 - coverage);
-                            accum.a = coverage + accum.a * (1.0 - coverage);
-                        }
-                        break;
-                    }
-                }
-            }
-        }
-    }
-
-    imageStore(ui_layer_write, ivec2(gl_GlobalInvocationID.xy), accum);
-
-#endif
-}
diff --git a/title/shark-shaders/shaders/radix_sort.h b/title/shark-shaders/shaders/radix_sort.h
new file mode 100644 (file)
index 0000000..52ce630
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef RADIX_SORT_H
+#define RADIX_SORT_H
+
+const uint RADIX_BITS = 8;
+const uint RADIX_DIGITS = 1 << RADIX_BITS;
+const uint RADIX_MASK = RADIX_DIGITS - 1;
+
+const uint RADIX_WGP_SIZE = 256;
+const uint RADIX_ITEMS_PER_INVOCATION = 16;
+const uint RADIX_ITEMS_PER_WGP = RADIX_WGP_SIZE * RADIX_ITEMS_PER_INVOCATION;
+
+const uint RADIX_SPINE_WGP_SIZE = 256;
+
+#endif
diff --git a/title/shark-shaders/shaders/radix_sort_0_upsweep.comp b/title/shark-shaders/shaders/radix_sort_0_upsweep.comp
new file mode 100644 (file)
index 0000000..69f4904
--- /dev/null
@@ -0,0 +1,84 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_control_flow_attributes : require
+
+#include "compute_bindings.h"
+
+#include "radix_sort.h"
+
+#include "draw_2d.h"
+#include "indirect.h"
+
+layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer CountRef {
+    uint value;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer ValuesRef {
+    uint values[];
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 4) writeonly buffer SpineRef {
+    uint values[];
+};
+
+struct RadixSortUpsweepConstants {
+    uint shift;
+    uint _pad;
+    CountRef count_buffer;
+    ValuesRef src_buffer;
+    SpineRef spine_buffer;
+};
+
+layout(std430, push_constant) uniform RadixSortUpsweepConstantsBlock {
+    RadixSortUpsweepConstants constants;
+};
+
+shared uint histogram[RADIX_DIGITS];
+
+layout (local_size_x = RADIX_DIGITS, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint count = constants.count_buffer.value;
+    const uint shift = constants.shift;
+
+    // Clear local histogram
+    histogram[gl_LocalInvocationID.x] = 0;
+
+    barrier();
+
+    const uint start = gl_WorkGroupID.x * RADIX_ITEMS_PER_WGP;
+    const uint end = start + RADIX_ITEMS_PER_WGP;
+
+    const bool skip_bounds_check = end <= count;
+
+    if (skip_bounds_check) {
+        for (uint i = start; i < end; i += RADIX_DIGITS) {
+            const uint index = i + gl_LocalInvocationID.x;
+            const uint value = constants.src_buffer.values[index];
+            const uint digit = (value >> shift) & RADIX_MASK;
+            atomicAdd(histogram[digit], 1);
+        }
+    } else {
+        for (uint i = start; i < end; i += RADIX_DIGITS) {
+            const uint index = i + gl_LocalInvocationID.x;
+            if (index < count) {
+                const uint value = constants.src_buffer.values[index];
+                const uint digit = (value >> shift) & RADIX_MASK;
+                atomicAdd(histogram[digit], 1);
+            }
+        }
+    }
+
+    barrier();
+
+    // Scatter to the spine, this is a striped layout so we can efficiently
+    // calculate the prefix sum. Re-calculate how many workgroups we dispatched
+    // to determine the stride we need to write at.
+    const uint wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
+    constants.spine_buffer.values[(gl_LocalInvocationID.x * wgp_count) + gl_WorkGroupID.x] = histogram[gl_LocalInvocationID.x];
+}
\ No newline at end of file
diff --git a/title/shark-shaders/shaders/radix_sort_1_spine.comp b/title/shark-shaders/shaders/radix_sort_1_spine.comp
new file mode 100644 (file)
index 0000000..0b1dfc4
--- /dev/null
@@ -0,0 +1,91 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_shuffle_relative: enable
+#extension GL_KHR_shader_subgroup_vote : require
+
+//#extension GL_EXT_debug_printf : enable
+
+#include "compute_bindings.h"
+
+#include "radix_sort.h"
+
+#include "draw_2d.h"
+#include "indirect.h"
+
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer CountRef {
+    uint value;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer SpineRef {
+    uint values[];
+};
+
+struct RadixSortSpineConstants {
+    CountRef count_buffer;
+    SpineRef spine_buffer;
+};
+
+layout(std430, push_constant) uniform RadixSortSpineConstantsBlock {
+    RadixSortSpineConstants constants;
+};
+
+layout (constant_id = 0) const uint SUBGROUP_SIZE = 64;
+
+const uint NUM_SUBGROUPS = RADIX_SPINE_WGP_SIZE / SUBGROUP_SIZE;
+
+shared uint sums[NUM_SUBGROUPS];
+shared uint carry_in;
+
+layout (local_size_x = RADIX_SPINE_WGP_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
+
+    const uint count = constants.count_buffer.value;
+
+    // Re-calculate how many workgroups pushed data into the spine
+    const uint upsweep_wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
+
+    carry_in = 0;
+    for (uint i = 0; i < upsweep_wgp_count; i++) {
+        const uint spine_index = i * RADIX_DIGITS + local_id;
+
+        // Load values and calculate partial sums
+        const uint value = constants.spine_buffer.values[spine_index];
+        const uint sum = subgroupAdd(value);
+        const uint scan = subgroupExclusiveAdd(value);
+
+        if (subgroupElect()) {
+            sums[gl_SubgroupID] = sum;
+        }
+
+        barrier();
+
+        // Scan partials
+        if (local_id < NUM_SUBGROUPS) {
+            sums[local_id] = subgroupExclusiveAdd(sums[local_id]);
+        }
+
+        const uint carry = carry_in;
+
+        barrier();
+
+        // Write out the final prefix sum, combining the carry-in, subgroup sums, and local scan
+        constants.spine_buffer.values[spine_index] = carry + sums[gl_SubgroupID] + scan;
+
+        if (gl_SubgroupID == gl_NumSubgroups - 1 && subgroupElect()) {
+            carry_in += sums[gl_SubgroupID] + sum;
+        }
+
+        memoryBarrierShared();
+    }
+}
diff --git a/title/shark-shaders/shaders/radix_sort_2_downsweep.comp b/title/shark-shaders/shaders/radix_sort_2_downsweep.comp
new file mode 100644 (file)
index 0000000..4a1f1c8
--- /dev/null
@@ -0,0 +1,115 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : require
+
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_control_flow_attributes : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_shuffle_relative: enable
+#extension GL_KHR_shader_subgroup_vote : require
+
+//#extension GL_EXT_debug_printf : enable
+
+#include "compute_bindings.h"
+
+#include "radix_sort.h"
+
+#include "draw_2d.h"
+#include "indirect.h"
+
+layout (constant_id = 0) const uint SUBGROUP_SIZE = 64;
+
+const uint NUM_SUBGROUPS = RADIX_WGP_SIZE / SUBGROUP_SIZE;
+
+layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer CountRef {
+    uint value;
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 4) readonly buffer SpineRef {
+    uint values[];
+};
+
+layout(buffer_reference, std430, buffer_reference_align = 4) buffer ValuesRef {
+    uint values[];
+};
+
+struct RadixSortDownsweepConstants {
+    uint shift;
+    uint _pad;
+    CountRef count_buffer;
+    SpineRef spine_buffer;
+    ValuesRef src_buffer;
+    ValuesRef dst_buffer;
+};
+
+layout(std430, push_constant) uniform RadixSortDownsweepConstantsBlock {
+    RadixSortDownsweepConstants constants;
+};
+
+shared uint values[RADIX_WGP_SIZE];
+shared uint spine[RADIX_DIGITS];
+shared uint match_masks[NUM_SUBGROUPS][RADIX_DIGITS];
+
+layout (local_size_x = RADIX_WGP_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    const uint shift = constants.shift;
+    const uint count = constants.count_buffer.value;
+    const uint wgp_count = (count + (RADIX_ITEMS_PER_WGP - 1)) / RADIX_ITEMS_PER_WGP;
+
+    const uint start = gl_WorkGroupID.x * RADIX_ITEMS_PER_WGP;
+    const uint end = min(start + RADIX_ITEMS_PER_WGP, count);
+
+    const uint local_id = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
+
+    // Gather from spine.
+    spine[local_id] = constants.spine_buffer.values[(local_id * wgp_count) + gl_WorkGroupID.x];
+
+    for (uint value_base = start; value_base < end; value_base += RADIX_WGP_SIZE) {
+        // Clear shared memory and load values from src buffer.
+        for (uint i = 0; i < NUM_SUBGROUPS; i++) {
+            match_masks[i][local_id] = 0;
+        }
+
+        barrier();
+
+        const uint global_offset = value_base + local_id;
+        uint value = 0xffffffff;
+        uint digit = 0xff;
+
+        if (global_offset < end) {
+            value = constants.src_buffer.values[global_offset];
+            digit = (value >> shift) & RADIX_MASK;
+            atomicOr(match_masks[gl_SubgroupID][digit], 1 << gl_SubgroupInvocationID);
+        }
+
+        barrier();
+
+        if (global_offset < end) {
+            const uint peer_mask = match_masks[gl_SubgroupID][digit];
+
+            uint peer_scan = bitCount(peer_mask & gl_SubgroupLtMask.x);
+            for (uint i = 0; i < gl_NumSubgroups; i++) {
+                if (i < gl_SubgroupID) {
+                    peer_scan += bitCount(match_masks[i][digit]);
+                }
+            }
+
+            const uint dst_index = spine[digit] + peer_scan;
+            constants.dst_buffer.values[dst_index] = value;
+        }
+
+        barrier();
+
+        // Increment the spine with the counts for the workgroup we just wrote out.
+        for (uint i = 0; i < NUM_SUBGROUPS; i++) {
+            atomicAdd(spine[local_id], bitCount(match_masks[i][local_id]));
+        }
+    }
+}
index d211c6f4ea271bc75ae9317fba879e831013ed8b..e45e0ba8f64843a56a5280043baf450bbebc84e2 100644 (file)
@@ -3,7 +3,7 @@ use std::path::Path;
 use narcissus_core::{obj, Widen};
 use narcissus_maths::{vec2, vec3, vec4, Vec2, Vec3};
 
-use crate::pipelines::basic::Vertex;
+use crate::pipelines::Vertex;
 
 pub fn load_obj<P: AsRef<Path>>(path: P) -> (Vec<Vertex>, Vec<u16>) {
     #[derive(Default)]
index 1adfc5fc008c0834e42f007f83a778638e4686ca..89864823c9bbc2bef6288fc1af026bbde71b0dfb 100644 (file)
@@ -3,9 +3,14 @@ use std::ops::Index;
 use std::path::Path;
 use std::time::{Duration, Instant};
 
-use narcissus_core::{dds, Widen as _};
-use pipelines::basic::BasicPipeline;
-use pipelines::{PrimitiveInstance, PrimitiveUniforms, Rect, TILE_SIZE, TILE_STRIDE};
+use narcissus_core::dds;
+
+use pipelines::{
+    BasicConstants, ComputeBinds, Draw2dClearConstants, Draw2dCmd, Draw2dScatterConstants,
+    Draw2dSortConstants, GraphicsBinds, Pipelines, RadixSortDownsweepConstants,
+    RadixSortSpineConstants, RadixSortUpsweepConstants, DRAW_2D_TILE_SIZE,
+};
+
 use renderdoc_sys as rdoc;
 
 use fonts::{FontFamily, Fonts};
@@ -14,14 +19,13 @@ use narcissus_app::{create_app, Event, Key, PressedState, WindowDesc};
 use narcissus_core::{box_assume_init, default, rand::Pcg64, zeroed_box, BitIter};
 use narcissus_font::{FontCollection, GlyphCache, HorizontalMetrics};
 use narcissus_gpu::{
-    create_device, Access, Bind, BindDesc, BindGroupLayout, BindingType, Buffer, BufferDesc,
-    BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder, ColorSpace, ComputePipelineDesc,
-    Device, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Image, ImageAspectFlags,
-    ImageBarrier, ImageDesc, ImageDimension, ImageFormat, ImageLayout, ImageSubresourceRange,
-    ImageTiling, ImageUsageFlags, IndexType, LoadOp, MemoryLocation, Offset2d, PersistentBuffer,
-    Pipeline, PipelineLayout, PresentMode, PushConstantRange, RenderingAttachment, RenderingDesc,
-    Sampler, SamplerAddressMode, SamplerDesc, SamplerFilter, Scissor, ShaderDesc, ShaderStageFlags,
-    StoreOp, SwapchainConfigurator, SwapchainImage, ThreadToken, TypedBind, Viewport,
+    create_device, Access, Bind, BufferImageCopy, BufferUsageFlags, ClearValue, CmdEncoder,
+    ColorSpace, Device, DeviceExt, Extent2d, Extent3d, Frame, GlobalBarrier, Image,
+    ImageAspectFlags, ImageBarrier, ImageDesc, ImageDimension, ImageFormat, ImageLayout,
+    ImageSubresourceRange, ImageTiling, ImageUsageFlags, IndexType, LoadOp, MemoryLocation,
+    Offset2d, PersistentBuffer, PresentMode, RenderingAttachment, RenderingDesc, Scissor,
+    ShaderStageFlags, StoreOp, SwapchainConfigurator, SwapchainImage, ThreadToken, TypedBind,
+    Viewport,
 };
 use narcissus_image as image;
 use narcissus_maths::{
@@ -30,8 +34,6 @@ use narcissus_maths::{
 };
 use spring::simple_spring_damper_exact;
 
-use crate::pipelines::basic::BasicUniforms;
-
 mod fonts;
 mod helpers;
 pub mod microshades;
@@ -73,9 +75,9 @@ static GAME_VARIABLES: GameVariables = GameVariables {
 
     player_speed: 10.0,
 
-    weapon_cooldown: 0.0,
+    weapon_cooldown: 0.2,
     weapon_projectile_speed: 20.0,
-    weapon_projectile_lifetime: 6.0,
+    weapon_projectile_lifetime: 3.0,
 };
 
 #[derive(Clone, Copy, Debug)]
@@ -452,8 +454,7 @@ struct UiState<'a> {
 
     tmp_string: String,
 
-    primitive_instances: Vec<PrimitiveInstance>,
-    rects: Vec<Rect>,
+    draw_cmds: Vec<Draw2dCmd>,
 }
 
 impl<'a> UiState<'a> {
@@ -465,8 +466,7 @@ impl<'a> UiState<'a> {
             fonts,
             glyph_cache,
             tmp_string: default(),
-            primitive_instances: vec![],
-            rects: vec![],
+            draw_cmds: vec![],
         }
     }
 
@@ -476,17 +476,14 @@ impl<'a> UiState<'a> {
         let center_x = x + half_extent_x;
         let center_y = y + half_extent_y;
 
-        let rect_index = self.rects.len() as u32;
-
-        self.rects.push(Rect {
+        self.draw_cmds.push(Draw2dCmd::rect(
+            center_x,
+            center_y,
             half_extent_x,
             half_extent_y,
-            border_width: 0.0,
-            border_radius: 0.0,
-        });
-
-        self.primitive_instances.push(PrimitiveInstance::rect(
-            rect_index, 0x4400ff00, center_x, center_y,
+            1.0,
+            0x4400ff00,
+            0xffff0000,
         ))
     }
 
@@ -530,50 +527,14 @@ impl<'a> UiState<'a> {
 
             x += advance * scale;
 
-            self.primitive_instances.push(PrimitiveInstance::glyph(
-                touched_glyph_index,
-                0x880000ff,
-                x,
-                y,
-            ));
+            self.draw_cmds
+                .push(Draw2dCmd::glyph(touched_glyph_index, 0x880000ff, x, y));
 
             x += advance_width * scale;
         }
     }
 }
 
-enum SamplerRes {
-    Bilinear,
-}
-
-pub struct Samplers {
-    bilinear: Sampler,
-}
-
-impl Index<SamplerRes> for Samplers {
-    type Output = Sampler;
-
-    fn index(&self, index: SamplerRes) -> &Self::Output {
-        match index {
-            SamplerRes::Bilinear => &self.bilinear,
-        }
-    }
-}
-
-impl Samplers {
-    fn load(gpu: &Gpu) -> Samplers {
-        let bilinear = gpu.create_sampler(&SamplerDesc {
-            filter: SamplerFilter::Bilinear,
-            address_mode: SamplerAddressMode::Clamp,
-            compare_op: None,
-            mip_lod_bias: 0.0,
-            min_lod: 0.0,
-            max_lod: 0.0,
-        });
-        Samplers { bilinear }
-    }
-}
-
 struct Model<'a> {
     indices: u32,
     vertex_buffer: PersistentBuffer<'a>,
@@ -880,14 +841,6 @@ type Gpu = dyn Device + 'static;
 struct DrawState<'gpu> {
     gpu: &'gpu Gpu,
 
-    basic_pipeline: BasicPipeline,
-
-    compute_bind_group_layout: BindGroupLayout,
-    bin_clear_pipeline: Pipeline,
-    bin_pipeline: Pipeline,
-    rasterize_pipeline: Pipeline,
-    display_transform_pipeline: Pipeline,
-
     width: u32,
     height: u32,
 
@@ -898,11 +851,10 @@ struct DrawState<'gpu> {
     color_image: Image,
     ui_image: Image,
 
-    tiles_buffer: Buffer,
-
     glyph_atlas_image: Image,
 
-    _samplers: Samplers,
+    pipelines: Pipelines,
+
     models: Models<'gpu>,
     images: Images,
 
@@ -911,78 +863,12 @@ struct DrawState<'gpu> {
 
 impl<'gpu> DrawState<'gpu> {
     fn new(gpu: &'gpu Gpu, thread_token: &ThreadToken) -> Self {
-        let samplers = Samplers::load(gpu);
-        let immutable_samplers = &[samplers[SamplerRes::Bilinear]];
-
-        let compute_bind_group_layout = gpu.create_bind_group_layout(&[
-            // Samplers
-            BindDesc::with_immutable_samplers(ShaderStageFlags::COMPUTE, immutable_samplers),
-            // Tony mc mapface LUT
-            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage),
-            // Glyph Atlas
-            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage),
-            // UI Render Target
-            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage),
-            // Color Render Target
-            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage),
-            // Composited output
-            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage),
-        ]);
-
-        let compute_pipeline_layout = PipelineLayout {
-            bind_group_layouts: &[compute_bind_group_layout],
-            push_constant_ranges: &[PushConstantRange {
-                stage_flags: ShaderStageFlags::COMPUTE,
-                offset: 0,
-                size: std::mem::size_of::<PrimitiveUniforms>() as u32,
-            }],
-        };
-
-        let bin_clear_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc {
-            shader: ShaderDesc {
-                entry: c"main",
-                code: shark_shaders::PRIMITIVE_2D_BIN_CLEAR_COMP_SPV,
-            },
-            layout: &compute_pipeline_layout,
-        });
-
-        let bin_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc {
-            shader: ShaderDesc {
-                entry: c"main",
-                code: shark_shaders::PRIMITIVE_2D_BIN_COMP_SPV,
-            },
-            layout: &compute_pipeline_layout,
-        });
-
-        let rasterize_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc {
-            shader: ShaderDesc {
-                entry: c"main",
-                code: shark_shaders::PRIMITIVE_2D_RASTERIZE_COMP_SPV,
-            },
-            layout: &compute_pipeline_layout,
-        });
-
-        let display_transform_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc {
-            shader: ShaderDesc {
-                entry: c"main",
-                code: shark_shaders::DISPLAY_TRANSFORM_COMP_SPV,
-            },
-            layout: &compute_pipeline_layout,
-        });
-
-        let basic_pipeline = BasicPipeline::new(gpu, immutable_samplers);
-
+        let pipelines = Pipelines::load(gpu);
         let models = Models::load(gpu);
         let images = Images::load(gpu, thread_token);
 
         Self {
             gpu,
-            basic_pipeline,
-            compute_bind_group_layout,
-            bin_clear_pipeline,
-            bin_pipeline,
-            rasterize_pipeline,
-            display_transform_pipeline,
             width: 0,
             height: 0,
             tile_resolution_x: 0,
@@ -990,9 +876,8 @@ impl<'gpu> DrawState<'gpu> {
             depth_image: default(),
             color_image: default(),
             ui_image: default(),
-            tiles_buffer: default(),
             glyph_atlas_image: default(),
-            _samplers: samplers,
+            pipelines,
             models,
             images,
             transforms: vec![],
@@ -1012,7 +897,7 @@ impl<'gpu> DrawState<'gpu> {
         let gpu = self.gpu;
 
         let half_turn_y = Mat3::from_axis_rotation(Vec3::Y, HalfTurn::new(0.5));
-        let scale = Mat3::from_scale(Vec3::splat(0.125));
+        let scale = Mat3::from_scale(Vec3::splat(0.4));
 
         fn rotate_dir(dir: Vec3, up: Vec3) -> Mat3 {
             let f = dir.normalized();
@@ -1103,33 +988,8 @@ impl<'gpu> DrawState<'gpu> {
                 gpu.destroy_image(frame, self.color_image);
                 gpu.destroy_image(frame, self.ui_image);
 
-                let tile_resolution_x = (width + (TILE_SIZE - 1)) / TILE_SIZE;
-                let tile_resolution_y = (height + (TILE_SIZE - 1)) / TILE_SIZE;
-
-                if tile_resolution_x != self.tile_resolution_x
-                    || tile_resolution_y != self.tile_resolution_y
-                {
-                    gpu.destroy_buffer(frame, self.tiles_buffer);
-
-                    let bitmap_buffer_size = tile_resolution_x
-                        * tile_resolution_y
-                        * TILE_STRIDE
-                        * std::mem::size_of::<u32>() as u32;
-
-                    self.tiles_buffer = gpu.create_buffer(&BufferDesc {
-                        memory_location: MemoryLocation::Device,
-                        host_mapped: false,
-                        usage: BufferUsageFlags::STORAGE,
-                        size: bitmap_buffer_size.widen(),
-                    });
-
-                    gpu.debug_name_buffer(self.tiles_buffer.to_arg(), "tile bitmap");
-
-                    println!("tile_resolution: ({tile_resolution_x},{tile_resolution_y})");
-
-                    self.tile_resolution_x = tile_resolution_x;
-                    self.tile_resolution_y = tile_resolution_y;
-                }
+                self.tile_resolution_x = (width + (DRAW_2D_TILE_SIZE - 1)) / DRAW_2D_TILE_SIZE;
+                self.tile_resolution_y = (height + (DRAW_2D_TILE_SIZE - 1)) / DRAW_2D_TILE_SIZE;
 
                 self.depth_image = gpu.create_image(&ImageDesc {
                     memory_location: MemoryLocation::Device,
@@ -1320,89 +1180,98 @@ impl<'gpu> DrawState<'gpu> {
 
             // Render basic stuff.
             {
-                gpu.cmd_set_pipeline(cmd_encoder, self.basic_pipeline.pipeline);
+                let model = &self.models[ModelRes::Shark];
+                let image = self.images[ImageRes::Shark];
 
-                let basic_uniforms = BasicUniforms { clip_from_model };
+                let instance_count = self.transforms.len() as u32;
 
-                let uniform_buffer = gpu.request_transient_buffer_with_data(
+                let transform_buffer = gpu.request_transient_buffer_with_data(
                     frame,
                     thread_token,
-                    BufferUsageFlags::UNIFORM,
-                    &basic_uniforms,
+                    BufferUsageFlags::STORAGE,
+                    self.transforms.as_slice(),
                 );
 
-                gpu.cmd_set_bind_group(
+                // We're done with you now!
+                self.transforms.clear();
+
+                let graphics_bind_group = gpu.request_transient_bind_group(
                     frame,
-                    cmd_encoder,
-                    self.basic_pipeline.uniforms_bind_group_layout,
-                    0,
+                    thread_token,
+                    self.pipelines.graphics_bind_group_layout,
                     &[Bind {
-                        binding: 0,
+                        binding: GraphicsBinds::Albedo as u32,
                         array_element: 0,
-                        typed: TypedBind::UniformBuffer(&[uniform_buffer.to_arg()]),
+                        typed: TypedBind::SampledImage(&[(ImageLayout::Optimal, image)]),
                     }],
                 );
 
-                {
-                    let model = &self.models[ModelRes::Shark];
-                    let image = self.images[ImageRes::Shark];
-
-                    let transform_buffer = gpu.request_transient_buffer_with_data(
-                        frame,
-                        thread_token,
-                        BufferUsageFlags::STORAGE,
-                        self.transforms.as_slice(),
-                    );
-
-                    gpu.cmd_set_bind_group(
-                        frame,
-                        cmd_encoder,
-                        self.basic_pipeline.storage_bind_group_layout,
-                        1,
-                        &[
-                            Bind {
-                                binding: 0,
-                                array_element: 0,
-                                typed: TypedBind::StorageBuffer(&[model.vertex_buffer.to_arg()]),
-                            },
-                            Bind {
-                                binding: 1,
-                                array_element: 0,
-                                typed: TypedBind::StorageBuffer(&[transform_buffer.to_arg()]),
-                            },
-                            Bind {
-                                binding: 2,
-                                array_element: 0,
-                                typed: TypedBind::SampledImage(&[(ImageLayout::Optimal, image)]),
-                            },
-                        ],
-                    );
-
-                    gpu.cmd_set_index_buffer(
-                        cmd_encoder,
-                        model.index_buffer.to_arg(),
-                        0,
-                        IndexType::U16,
-                    );
+                gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.basic_pipeline);
+                gpu.cmd_set_bind_group(cmd_encoder, 0, &graphics_bind_group);
+                gpu.cmd_push_constants(
+                    cmd_encoder,
+                    ShaderStageFlags::VERTEX,
+                    0,
+                    &BasicConstants {
+                        clip_from_model,
+                        vertex_buffer_address: gpu.get_buffer_address(model.vertex_buffer.to_arg()),
+                        transform_buffer_address: gpu.get_buffer_address(transform_buffer.to_arg()),
+                    },
+                );
 
-                    gpu.cmd_draw_indexed(
-                        cmd_encoder,
-                        model.indices,
-                        self.transforms.len() as u32,
-                        0,
-                        0,
-                        0,
-                    );
-                }
+                gpu.cmd_set_index_buffer(
+                    cmd_encoder,
+                    model.index_buffer.to_arg(),
+                    0,
+                    IndexType::U16,
+                );
 
-                // We're done with you now!
-                self.transforms.clear();
+                gpu.cmd_draw_indexed(cmd_encoder, model.indices, instance_count, 0, 0, 0);
             }
 
             gpu.cmd_end_rendering(cmd_encoder);
 
             gpu.cmd_end_debug_marker(cmd_encoder);
 
+            let compute_bind_group = gpu.request_transient_bind_group(
+                frame,
+                thread_token,
+                self.pipelines.compute_bind_group_layout,
+                &[
+                    Bind {
+                        binding: ComputeBinds::TonyMcMapfaceLut as u32,
+                        array_element: 0,
+                        typed: TypedBind::SampledImage(&[(
+                            ImageLayout::Optimal,
+                            self.images[ImageRes::TonyMcMapfaceLut],
+                        )]),
+                    },
+                    Bind {
+                        binding: ComputeBinds::GlyphAtlas as u32,
+                        array_element: 0,
+                        typed: TypedBind::SampledImage(&[(
+                            ImageLayout::Optimal,
+                            self.glyph_atlas_image,
+                        )]),
+                    },
+                    Bind {
+                        binding: ComputeBinds::UiRenderTarget as u32,
+                        array_element: 0,
+                        typed: TypedBind::StorageImage(&[(ImageLayout::General, self.ui_image)]),
+                    },
+                    Bind {
+                        binding: ComputeBinds::ColorRenderTarget as u32,
+                        array_element: 0,
+                        typed: TypedBind::StorageImage(&[(ImageLayout::General, self.color_image)]),
+                    },
+                    Bind {
+                        binding: ComputeBinds::CompositedRenderTarget as u32,
+                        array_element: 0,
+                        typed: TypedBind::StorageImage(&[(ImageLayout::General, swapchain_image)]),
+                    },
+                ],
+            );
+
             // Render UI
             {
                 gpu.cmd_begin_debug_marker(
@@ -1411,124 +1280,267 @@ impl<'gpu> DrawState<'gpu> {
                     microshades::PURPLE_RGBA_F32[3],
                 );
 
-                let primitive_instance_buffer = gpu.request_transient_buffer_with_data(
+                let draw_buffer = gpu.request_transient_buffer_with_data(
                     frame,
                     thread_token,
                     BufferUsageFlags::STORAGE,
-                    ui_state.primitive_instances.as_slice(),
+                    ui_state.draw_cmds.as_slice(),
                 );
+
+                let draw_buffer_len = ui_state.draw_cmds.len() as u32;
+                ui_state.draw_cmds.clear();
+
                 let glyph_buffer = gpu.request_transient_buffer_with_data(
                     frame,
                     thread_token,
                     BufferUsageFlags::STORAGE,
                     touched_glyphs,
                 );
-                let rect_buffer = gpu.request_transient_buffer_with_data(
+
+                const COARSE_BUFFER_LEN: usize = 1 << 18;
+                let coarse_buffer = gpu.request_transient_buffer(
                     frame,
                     thread_token,
                     BufferUsageFlags::STORAGE,
-                    ui_state.rects.as_slice(),
+                    COARSE_BUFFER_LEN * std::mem::size_of::<u32>(),
                 );
 
-                let num_primitives = ui_state.primitive_instances.len() as u32;
-                let num_primitives_32 = (num_primitives + 31) / 32;
-                let num_primitives_1024 = (num_primitives_32 + 31) / 32;
-
-                ui_state.primitive_instances.clear();
+                let indirect_dispatch_buffer = gpu.request_transient_buffer(
+                    frame,
+                    thread_token,
+                    BufferUsageFlags::INDIRECT,
+                    3 * std::mem::size_of::<u32>(),
+                );
 
-                gpu.cmd_set_pipeline(cmd_encoder, self.bin_clear_pipeline);
+                let sort_tmp_buffer = gpu.request_transient_buffer(
+                    frame,
+                    thread_token,
+                    BufferUsageFlags::STORAGE,
+                    COARSE_BUFFER_LEN * std::mem::size_of::<u32>(),
+                );
 
-                gpu.cmd_set_bind_group(
+                let spine_buffer = gpu.request_transient_buffer(
                     frame,
+                    thread_token,
+                    BufferUsageFlags::STORAGE,
+                    (COARSE_BUFFER_LEN / (32 * 16)) * 256 * std::mem::size_of::<u32>(), // TODO: Fix size
+                );
+
+                let draw_buffer_address = gpu.get_buffer_address(draw_buffer.to_arg());
+                let glyph_buffer_address = gpu.get_buffer_address(glyph_buffer.to_arg());
+                let coarse_buffer_address = gpu.get_buffer_address(coarse_buffer.to_arg());
+                let indirect_dispatch_buffer_address =
+                    gpu.get_buffer_address(indirect_dispatch_buffer.to_arg());
+                let sort_tmp_buffer_address = gpu.get_buffer_address(sort_tmp_buffer.to_arg());
+                let spine_buffer_address = gpu.get_buffer_address(spine_buffer.to_arg());
+
+                gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_0_clear_pipeline);
+                gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
+                gpu.cmd_push_constants(
                     cmd_encoder,
-                    self.compute_bind_group_layout,
+                    ShaderStageFlags::COMPUTE,
                     0,
-                    &[
-                        Bind {
-                            binding: 1,
-                            array_element: 0,
-                            typed: TypedBind::SampledImage(&[(
-                                ImageLayout::Optimal,
-                                self.images[ImageRes::TonyMcMapfaceLut],
-                            )]),
-                        },
-                        Bind {
-                            binding: 2,
-                            array_element: 0,
-                            typed: TypedBind::SampledImage(&[(
-                                ImageLayout::Optimal,
-                                self.glyph_atlas_image,
-                            )]),
-                        },
-                        Bind {
-                            binding: 3,
-                            array_element: 0,
-                            typed: TypedBind::StorageImage(&[(
-                                ImageLayout::General,
-                                self.ui_image,
-                            )]),
-                        },
-                        Bind {
-                            binding: 4,
-                            array_element: 0,
-                            typed: TypedBind::StorageImage(&[(
-                                ImageLayout::General,
-                                self.color_image,
-                            )]),
-                        },
-                        Bind {
-                            binding: 5,
-                            array_element: 0,
-                            typed: TypedBind::StorageImage(&[(
-                                ImageLayout::General,
-                                swapchain_image,
-                            )]),
-                        },
-                    ],
+                    &Draw2dClearConstants {
+                        coarse_buffer_address,
+                    },
+                );
+                gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
+
+                gpu.cmd_barrier(
+                    cmd_encoder,
+                    Some(&GlobalBarrier {
+                        prev_access: &[Access::ComputeWrite],
+                        next_access: &[Access::ComputeOtherRead],
+                    }),
+                    &[],
                 );
 
+                gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_1_scatter_pipeline);
+                gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
                 gpu.cmd_push_constants(
                     cmd_encoder,
                     ShaderStageFlags::COMPUTE,
                     0,
-                    &PrimitiveUniforms {
+                    &Draw2dScatterConstants {
                         screen_resolution_x: self.width,
                         screen_resolution_y: self.height,
-                        atlas_resolution_x: atlas_width,
-                        atlas_resolution_y: atlas_height,
-                        num_primitives,
-                        num_primitives_32,
-                        num_primitives_1024,
                         tile_resolution_x: self.tile_resolution_x,
                         tile_resolution_y: self.tile_resolution_y,
-                        tile_stride: self.tile_resolution_x,
-                        primitives_instances_buffer: gpu
-                            .get_buffer_address(primitive_instance_buffer.to_arg()),
-                        glyphs_buffer: gpu.get_buffer_address(glyph_buffer.to_arg()),
-                        rects_buffer: gpu.get_buffer_address(rect_buffer.to_arg()),
-                        tiles_buffer: gpu.get_buffer_address(self.tiles_buffer.to_arg()),
+                        draw_buffer_len,
+                        coarse_buffer_len: COARSE_BUFFER_LEN as u32,
+                        draw_buffer_address,
+                        glyph_buffer_address,
+                        coarse_buffer_address,
                     },
                 );
 
-                gpu.cmd_dispatch(
+                for _ in 0..4 {
+                    gpu.cmd_dispatch(
+                        cmd_encoder,
+                        (draw_buffer_len
+                            + (self.pipelines.draw_2d_bin_1_scatter_pipeline_workgroup_size - 1))
+                            / self.pipelines.draw_2d_bin_1_scatter_pipeline_workgroup_size,
+                        1,
+                        1,
+                    );
+                }
+
+                gpu.cmd_barrier(
                     cmd_encoder,
-                    (num_primitives_1024 + 63) / 64,
-                    self.tile_resolution_x,
-                    self.tile_resolution_y,
+                    Some(&GlobalBarrier {
+                        prev_access: &[Access::ComputeWrite],
+                        next_access: &[Access::ComputeOtherRead],
+                    }),
+                    &[],
+                );
+
+                // let mut sort_data = Vec::new();
+                // let count = 8192u32;
+                // sort_data.push(count);
+                // for i in 0..count {
+                //     sort_data.push(255 - i / 32);
+                // }
+
+                // let sort_buffer = gpu.request_transient_buffer_with_data(
+                //     frame,
+                //     thread_token,
+                //     BufferUsageFlags::STORAGE,
+                //     sort_data.as_slice(),
+                // );
+                // let sort_buffer_address = gpu.get_buffer_address(sort_buffer.to_arg());
+
+                gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_2_sort_pipeline);
+                gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
+                gpu.cmd_push_constants(
+                    cmd_encoder,
+                    ShaderStageFlags::COMPUTE,
+                    0,
+                    &Draw2dSortConstants {
+                        // -1 due to the count taking up a single slot in the buffer.
+                        coarse_buffer_len: COARSE_BUFFER_LEN as u32 - 1,
+                        _pad: 0,
+                        indirect_dispatch_buffer_address,
+                        coarse_buffer_address,
+                    },
                 );
+                gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
 
                 gpu.cmd_barrier(
                     cmd_encoder,
                     Some(&GlobalBarrier {
                         prev_access: &[Access::ComputeWrite],
-                        next_access: &[Access::ComputeOtherRead],
+                        next_access: &[Access::ComputeOtherRead, Access::IndirectBuffer],
                     }),
                     &[],
                 );
 
-                gpu.cmd_set_pipeline(cmd_encoder, self.bin_pipeline);
+                gpu.cmd_begin_debug_marker(
+                    cmd_encoder,
+                    "radix sort",
+                    microshades::ORANGE_RGBA_F32[2],
+                );
 
-                gpu.cmd_dispatch(cmd_encoder, (num_primitives + 1023) / 1024, 1, 1);
+                // First element in the scratch buffer is the count.
+                let count_buffer_address = coarse_buffer_address;
+                // Then the elements we want to sort follow.
+                let mut src_buffer_address = count_buffer_address.byte_add(4);
+                let mut dst_buffer_address = sort_tmp_buffer_address;
+
+                for pass in 0..4 {
+                    let shift = pass * 8;
+
+                    // Upsweep
+                    gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.radix_sort_0_upsweep_pipeline);
+                    gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
+                    gpu.cmd_push_constants(
+                        cmd_encoder,
+                        ShaderStageFlags::COMPUTE,
+                        0,
+                        &RadixSortUpsweepConstants {
+                            shift,
+                            _pad: 0,
+                            count_buffer_address,
+                            src_buffer_address,
+                            spine_buffer_address,
+                        },
+                    );
+                    gpu.cmd_dispatch_indirect(cmd_encoder, indirect_dispatch_buffer.to_arg(), 0);
+
+                    gpu.cmd_barrier(
+                        cmd_encoder,
+                        Some(&GlobalBarrier {
+                            prev_access: &[Access::ComputeWrite],
+                            next_access: &[Access::ComputeOtherRead],
+                        }),
+                        &[],
+                    );
+
+                    // Exclusive sum of the spine
+                    gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.radix_sort_1_spine_pipeline);
+                    gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
+                    gpu.cmd_push_constants(
+                        cmd_encoder,
+                        ShaderStageFlags::COMPUTE,
+                        0,
+                        &RadixSortSpineConstants {
+                            count_buffer_address,
+                            spine_buffer_address,
+                        },
+                    );
+                    gpu.cmd_dispatch(cmd_encoder, 1, 1, 1);
+
+                    gpu.cmd_barrier(
+                        cmd_encoder,
+                        Some(&GlobalBarrier {
+                            prev_access: &[Access::ComputeWrite],
+                            next_access: &[Access::ComputeOtherRead],
+                        }),
+                        &[],
+                    );
+
+                    // Downsweep
+                    gpu.cmd_set_pipeline(
+                        cmd_encoder,
+                        self.pipelines.radix_sort_2_downsweep_pipeline,
+                    );
+                    gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
+                    gpu.cmd_push_constants(
+                        cmd_encoder,
+                        ShaderStageFlags::COMPUTE,
+                        0,
+                        &RadixSortDownsweepConstants {
+                            shift,
+                            _pad: 0,
+                            count_buffer_address,
+                            src_buffer_address,
+                            dst_buffer_address,
+                            spine_buffer_address,
+                        },
+                    );
+                    gpu.cmd_dispatch_indirect(cmd_encoder, indirect_dispatch_buffer.to_arg(), 0);
+
+                    gpu.cmd_barrier(
+                        cmd_encoder,
+                        Some(&GlobalBarrier {
+                            prev_access: &[Access::ComputeWrite],
+                            next_access: &[Access::ComputeOtherRead],
+                        }),
+                        &[],
+                    );
+
+                    std::mem::swap(&mut src_buffer_address, &mut dst_buffer_address);
+                }
+
+                gpu.cmd_end_debug_marker(cmd_encoder);
+
+                gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_bin_3_resolve_pipeline);
+                gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
+                gpu.cmd_dispatch(
+                    cmd_encoder,
+                    self.tile_resolution_x,
+                    self.tile_resolution_y,
+                    1,
+                );
 
                 gpu.cmd_barrier(
                     cmd_encoder,
@@ -1539,8 +1551,8 @@ impl<'gpu> DrawState<'gpu> {
                     &[],
                 );
 
-                gpu.cmd_set_pipeline(cmd_encoder, self.rasterize_pipeline);
-
+                gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.draw_2d_rasterize_pipeline);
+                gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
                 gpu.cmd_dispatch(cmd_encoder, (self.width + 7) / 8, (self.height + 7) / 8, 1);
 
                 gpu.cmd_end_debug_marker(cmd_encoder);
@@ -1579,7 +1591,8 @@ impl<'gpu> DrawState<'gpu> {
 
                 gpu.cmd_compute_touch_swapchain(cmd_encoder, swapchain_image);
 
-                gpu.cmd_set_pipeline(cmd_encoder, self.display_transform_pipeline);
+                gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.composite_pipeline);
+                gpu.cmd_set_bind_group(cmd_encoder, 0, &compute_bind_group);
 
                 gpu.cmd_dispatch(cmd_encoder, (self.width + 7) / 8, (self.height + 7) / 8, 1);
 
diff --git a/title/shark/src/pipelines.rs b/title/shark/src/pipelines.rs
new file mode 100644 (file)
index 0000000..5b893b4
--- /dev/null
@@ -0,0 +1,411 @@
+use narcissus_core::default;
+use narcissus_font::TouchedGlyphIndex;
+use narcissus_gpu::{
+    BindDesc, BindGroupLayout, BindingType, BlendMode, BufferAddress, CompareOp,
+    ComputePipelineDesc, CullingMode, FrontFace, GraphicsPipelineAttachments, GraphicsPipelineDesc,
+    ImageFormat, Pipeline, PipelineLayout, PolygonMode, PushConstantRange, Sampler,
+    SamplerAddressMode, SamplerDesc, SamplerFilter, ShaderDesc, ShaderStageFlags, SpecConstant,
+    Topology,
+};
+use narcissus_maths::Mat4;
+
+use crate::Gpu;
+
+pub const DRAW_2D_TILE_SIZE: u32 = 32;
+
+#[allow(unused)]
+#[repr(C)]
+pub struct Vertex {
+    pub position: [f32; 4],
+    pub normal: [f32; 4],
+    pub texcoord: [f32; 4],
+}
+
+#[repr(u32)]
+enum Draw2dCmdType {
+    Rect,
+    Glyph,
+}
+
+#[allow(unused)]
+#[repr(C)]
+pub union Draw2dCmd {
+    rect: CmdRect,
+    glyph: CmdGlyph,
+}
+
+#[repr(C)]
+#[derive(Clone, Copy)]
+struct CmdGlyph {
+    r#type: u32,
+    index: u32,
+    x: f32,
+    y: f32,
+    color: u32,
+    _padding: [u8; 12],
+}
+
+const _: () = assert!(std::mem::size_of::<CmdGlyph>() == std::mem::size_of::<Draw2dCmd>());
+
+#[repr(C)]
+#[derive(Clone, Copy)]
+struct CmdRect {
+    r#type: u32,
+    border_width: f32,
+    x: f32,
+    y: f32,
+    half_extent_x: f32,
+    half_extent_y: f32,
+    background_color: u32,
+    border_color: u32,
+}
+
+const _: () = assert!(std::mem::size_of::<CmdRect>() == std::mem::size_of::<Draw2dCmd>());
+
+impl Draw2dCmd {
+    #[inline(always)]
+    pub fn glyph(glyph_index: TouchedGlyphIndex, color: u32, x: f32, y: f32) -> Self {
+        Self {
+            glyph: CmdGlyph {
+                r#type: Draw2dCmdType::Glyph as u32,
+                index: glyph_index.as_u32(),
+                x,
+                y,
+                color,
+                _padding: default(),
+            },
+        }
+    }
+
+    #[inline(always)]
+    pub fn rect(
+        x: f32,
+        y: f32,
+        half_extent_x: f32,
+        half_extent_y: f32,
+        border_width: f32,
+        background_color: u32,
+        border_color: u32,
+    ) -> Self {
+        Self {
+            rect: CmdRect {
+                r#type: Draw2dCmdType::Rect as u32,
+                border_width,
+                x,
+                y,
+                half_extent_x,
+                half_extent_y,
+                background_color,
+                border_color,
+            },
+        }
+    }
+}
+
+pub struct Samplers {
+    pub bilinear: Sampler,
+}
+
+impl Samplers {
+    fn load(gpu: &Gpu) -> Samplers {
+        let bilinear = gpu.create_sampler(&SamplerDesc {
+            filter: SamplerFilter::Bilinear,
+            address_mode: SamplerAddressMode::Clamp,
+            compare_op: None,
+            mip_lod_bias: 0.0,
+            min_lod: 0.0,
+            max_lod: 0.0,
+        });
+        Samplers { bilinear }
+    }
+}
+
+pub enum GraphicsBinds {
+    #[allow(unused)]
+    ImmutableSamplers,
+    Albedo,
+}
+
+pub enum ComputeBinds {
+    #[allow(unused)]
+    ImmutableSamplers,
+    TonyMcMapfaceLut,
+    GlyphAtlas,
+    UiRenderTarget,
+    ColorRenderTarget,
+    CompositedRenderTarget,
+}
+
+#[repr(C)]
+pub struct BasicConstants<'a> {
+    pub clip_from_model: Mat4,
+    pub vertex_buffer_address: BufferAddress<'a>,
+    pub transform_buffer_address: BufferAddress<'a>,
+}
+
+#[repr(C)]
+pub struct Draw2dClearConstants<'a> {
+    pub coarse_buffer_address: BufferAddress<'a>,
+}
+
+#[repr(C)]
+pub struct Draw2dScatterConstants<'a> {
+    pub screen_resolution_x: u32,
+    pub screen_resolution_y: u32,
+    pub tile_resolution_x: u32,
+    pub tile_resolution_y: u32,
+
+    pub draw_buffer_len: u32,
+    pub coarse_buffer_len: u32,
+
+    pub draw_buffer_address: BufferAddress<'a>,
+    pub glyph_buffer_address: BufferAddress<'a>,
+    pub coarse_buffer_address: BufferAddress<'a>,
+}
+
+#[repr(C)]
+pub struct Draw2dSortConstants<'a> {
+    pub coarse_buffer_len: u32,
+    pub _pad: u32,
+    pub indirect_dispatch_buffer_address: BufferAddress<'a>,
+    pub coarse_buffer_address: BufferAddress<'a>,
+}
+
+#[repr(C)]
+pub struct RadixSortUpsweepConstants<'a> {
+    pub shift: u32,
+    pub _pad: u32,
+    pub count_buffer_address: BufferAddress<'a>,
+    pub src_buffer_address: BufferAddress<'a>,
+    pub spine_buffer_address: BufferAddress<'a>,
+}
+
+#[repr(C)]
+pub struct RadixSortSpineConstants<'a> {
+    pub count_buffer_address: BufferAddress<'a>,
+    pub spine_buffer_address: BufferAddress<'a>,
+}
+
+#[repr(C)]
+pub struct RadixSortDownsweepConstants<'a> {
+    pub shift: u32,
+    pub _pad: u32,
+    pub count_buffer_address: BufferAddress<'a>,
+    pub spine_buffer_address: BufferAddress<'a>,
+    pub src_buffer_address: BufferAddress<'a>,
+    pub dst_buffer_address: BufferAddress<'a>,
+}
+
+pub struct Pipelines {
+    _samplers: Samplers,
+
+    pub graphics_bind_group_layout: BindGroupLayout,
+    pub compute_bind_group_layout: BindGroupLayout,
+
+    pub basic_pipeline: Pipeline,
+
+    pub draw_2d_bin_0_clear_pipeline: Pipeline,
+    pub draw_2d_bin_1_scatter_pipeline_workgroup_size: u32,
+    pub draw_2d_bin_1_scatter_pipeline: Pipeline,
+    pub draw_2d_bin_2_sort_pipeline: Pipeline,
+    pub draw_2d_bin_3_resolve_pipeline: Pipeline,
+    pub draw_2d_rasterize_pipeline: Pipeline,
+
+    pub radix_sort_0_upsweep_pipeline: Pipeline,
+    pub radix_sort_1_spine_pipeline: Pipeline,
+    pub radix_sort_2_downsweep_pipeline: Pipeline,
+
+    pub composite_pipeline: Pipeline,
+}
+
+impl Pipelines {
+    pub fn load(gpu: &Gpu) -> Self {
+        let samplers = Samplers::load(gpu);
+        let immutable_samplers = &[samplers.bilinear];
+
+        let graphics_bind_group_layout = gpu.create_bind_group_layout(&[
+            // Samplers
+            BindDesc::with_immutable_samplers(ShaderStageFlags::FRAGMENT, immutable_samplers),
+            // Albedo
+            BindDesc::new(ShaderStageFlags::FRAGMENT, BindingType::SampledImage),
+        ]);
+
+        gpu.debug_name_bind_group_layout(graphics_bind_group_layout, "graphics");
+
+        let compute_bind_group_layout = gpu.create_bind_group_layout(&[
+            // Samplers
+            BindDesc::with_immutable_samplers(ShaderStageFlags::COMPUTE, immutable_samplers),
+            // Tony mc mapface LUT
+            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage),
+            // Glyph Atlas
+            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage),
+            // UI Render Target
+            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage),
+            // Color Render Target
+            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage),
+            // Composited Render Target
+            BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage),
+        ]);
+
+        gpu.debug_name_bind_group_layout(compute_bind_group_layout, "compute");
+
+        let basic_pipeline = gpu.create_graphics_pipeline(&GraphicsPipelineDesc {
+            vertex_shader: ShaderDesc {
+                code: shark_shaders::BASIC_VERT_SPV,
+                ..default()
+            },
+            fragment_shader: ShaderDesc {
+                code: shark_shaders::BASIC_FRAG_SPV,
+                ..default()
+            },
+            layout: PipelineLayout {
+                bind_group_layouts: &[graphics_bind_group_layout],
+                push_constant_ranges: &[PushConstantRange {
+                    stage_flags: ShaderStageFlags::VERTEX,
+                    offset: 0,
+                    size: std::mem::size_of::<BasicConstants>() as u32,
+                }],
+            },
+            attachments: GraphicsPipelineAttachments {
+                color_attachment_formats: &[ImageFormat::RGBA16_FLOAT],
+                depth_attachment_format: Some(ImageFormat::DEPTH_F32),
+                stencil_attachment_format: None,
+            },
+            topology: Topology::Triangles,
+            primitive_restart: false,
+            polygon_mode: PolygonMode::Fill,
+            culling_mode: CullingMode::Back,
+            front_face: FrontFace::CounterClockwise,
+            blend_mode: BlendMode::Opaque,
+            depth_bias: None,
+            depth_compare_op: CompareOp::GreaterOrEqual,
+            depth_test_enable: true,
+            depth_write_enable: true,
+            stencil_test_enable: false,
+            stencil_back: default(),
+            stencil_front: default(),
+        });
+
+        gpu.debug_name_pipeline(basic_pipeline, "basic");
+
+        let create_compute_pipeline = |code, name, workgroup_size, push_constant_size| {
+            let push_constant_range = PushConstantRange {
+                stage_flags: ShaderStageFlags::COMPUTE,
+                offset: 0,
+                size: push_constant_size as u32,
+            };
+
+            let pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc {
+                shader: ShaderDesc {
+                    code,
+                    require_full_subgroups: workgroup_size != 0,
+                    required_subgroup_size: if workgroup_size != 0 {
+                        Some(workgroup_size)
+                    } else {
+                        None
+                    },
+                    spec_constants: &[SpecConstant::U32 {
+                        id: 0,
+                        value: workgroup_size,
+                    }],
+                    ..default()
+                },
+                layout: PipelineLayout {
+                    bind_group_layouts: &[compute_bind_group_layout],
+                    // Validation cries about push constant ranges with zero size.
+                    push_constant_ranges: if push_constant_range.size != 0 {
+                        std::slice::from_ref(&push_constant_range)
+                    } else {
+                        &[]
+                    },
+                },
+            });
+
+            gpu.debug_name_pipeline(pipeline, name);
+
+            pipeline
+        };
+
+        let draw_2d_bin_0_clear_pipeline = create_compute_pipeline(
+            shark_shaders::DRAW_2D_BIN_0_CLEAR_COMP_SPV,
+            "draw2d_bin_clear",
+            0,
+            std::mem::size_of::<Draw2dClearConstants>(),
+        );
+
+        let draw_2d_bin_1_scatter_pipeline_workgroup_size = 32;
+        let draw_2d_bin_1_scatter_pipeline = create_compute_pipeline(
+            shark_shaders::DRAW_2D_BIN_1_SCATTER_COMP_SPV,
+            "draw2d_bin_scatter",
+            draw_2d_bin_1_scatter_pipeline_workgroup_size,
+            std::mem::size_of::<Draw2dScatterConstants>(),
+        );
+
+        let draw_2d_bin_2_sort_pipeline = create_compute_pipeline(
+            shark_shaders::DRAW_2D_BIN_2_SORT_COMP_SPV,
+            "draw2d_bin_sort",
+            0,
+            std::mem::size_of::<Draw2dSortConstants>(),
+        );
+
+        let draw_2d_bin_3_resolve_pipeline = create_compute_pipeline(
+            shark_shaders::DRAW_2D_BIN_3_RESOLVE_COMP_SPV,
+            "draw2d_bin_resolve",
+            0,
+            0,
+        );
+
+        let draw_2d_rasterize_pipeline = create_compute_pipeline(
+            shark_shaders::DRAW_2D_RASTERIZE_COMP_SPV,
+            "draw2d_rasterize",
+            0,
+            0,
+        );
+
+        let radix_sort_0_upsweep_pipeline = create_compute_pipeline(
+            shark_shaders::RADIX_SORT_0_UPSWEEP_COMP_SPV,
+            "radix_sort_upsweep",
+            32,
+            std::mem::size_of::<RadixSortUpsweepConstants>(),
+        );
+
+        let radix_sort_1_spine_pipeline = create_compute_pipeline(
+            shark_shaders::RADIX_SORT_1_SPINE_COMP_SPV,
+            "radix_sort_spine",
+            32,
+            std::mem::size_of::<RadixSortSpineConstants>(),
+        );
+
+        let radix_sort_2_downsweep_pipeline = create_compute_pipeline(
+            shark_shaders::RADIX_SORT_2_DOWNSWEEP_COMP_SPV,
+            "radix_sort_downsweep",
+            32,
+            std::mem::size_of::<RadixSortDownsweepConstants>(),
+        );
+
+        let composite_pipeline =
+            create_compute_pipeline(shark_shaders::COMPOSITE_COMP_SPV, "composite", 0, 0);
+
+        Self {
+            _samplers: samplers,
+
+            graphics_bind_group_layout,
+            compute_bind_group_layout,
+
+            basic_pipeline,
+
+            draw_2d_bin_0_clear_pipeline,
+            draw_2d_bin_1_scatter_pipeline_workgroup_size,
+            draw_2d_bin_1_scatter_pipeline,
+            draw_2d_bin_2_sort_pipeline,
+            draw_2d_bin_3_resolve_pipeline,
+            draw_2d_rasterize_pipeline,
+
+            radix_sort_0_upsweep_pipeline,
+            radix_sort_1_spine_pipeline,
+            radix_sort_2_downsweep_pipeline,
+
+            composite_pipeline,
+        }
+    }
+}
diff --git a/title/shark/src/pipelines/basic.rs b/title/shark/src/pipelines/basic.rs
deleted file mode 100644 (file)
index fc4d88a..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-use narcissus_core::default;
-use narcissus_gpu::{
-    BindDesc, BindGroupLayout, BindingType, BlendMode, CompareOp, CullingMode, FrontFace,
-    GraphicsPipelineAttachments, GraphicsPipelineDesc, ImageFormat, Pipeline, PipelineLayout,
-    PolygonMode, Sampler, ShaderDesc, ShaderStageFlags, Topology,
-};
-use narcissus_maths::Mat4;
-
-use crate::Gpu;
-
-#[allow(unused)]
-#[repr(C)]
-pub struct BasicUniforms {
-    pub clip_from_model: Mat4,
-}
-
-#[allow(unused)]
-#[repr(C)]
-pub struct Vertex {
-    pub position: [f32; 4],
-    pub normal: [f32; 4],
-    pub texcoord: [f32; 4],
-}
-
-pub struct BasicPipeline {
-    pub uniforms_bind_group_layout: BindGroupLayout,
-    pub storage_bind_group_layout: BindGroupLayout,
-    pub pipeline: Pipeline,
-}
-
-impl BasicPipeline {
-    pub fn new(gpu: &Gpu, immutable_samplers: &[Sampler]) -> Self {
-        let uniforms_bind_group_layout = gpu.create_bind_group_layout(&[
-            // Uniforms
-            BindDesc::new(ShaderStageFlags::ALL, BindingType::UniformBuffer),
-            // Samplers
-            BindDesc::with_immutable_samplers(ShaderStageFlags::ALL, immutable_samplers),
-        ]);
-
-        let storage_bind_group_layout = gpu.create_bind_group_layout(&[
-            // Vertex Buffer
-            BindDesc::new(ShaderStageFlags::ALL, BindingType::StorageBuffer),
-            // Transform Buffer
-            BindDesc::new(ShaderStageFlags::ALL, BindingType::StorageBuffer),
-            // Albedo
-            BindDesc::new(ShaderStageFlags::ALL, BindingType::SampledImage),
-        ]);
-
-        let layout = &PipelineLayout {
-            bind_group_layouts: &[uniforms_bind_group_layout, storage_bind_group_layout],
-            push_constant_ranges: &[],
-        };
-
-        let pipeline = gpu.create_graphics_pipeline(&GraphicsPipelineDesc {
-            vertex_shader: ShaderDesc {
-                entry: c"main",
-                code: shark_shaders::BASIC_VERT_SPV,
-            },
-            fragment_shader: ShaderDesc {
-                entry: c"main",
-                code: shark_shaders::BASIC_FRAG_SPV,
-            },
-            layout,
-            attachments: GraphicsPipelineAttachments {
-                color_attachment_formats: &[ImageFormat::RGBA16_FLOAT],
-                depth_attachment_format: Some(ImageFormat::DEPTH_F32),
-                stencil_attachment_format: None,
-            },
-            topology: Topology::Triangles,
-            primitive_restart: false,
-            polygon_mode: PolygonMode::Fill,
-            culling_mode: CullingMode::Back,
-            front_face: FrontFace::CounterClockwise,
-            blend_mode: BlendMode::Opaque,
-            depth_bias: None,
-            depth_compare_op: CompareOp::GreaterOrEqual,
-            depth_test_enable: true,
-            depth_write_enable: true,
-            stencil_test_enable: false,
-            stencil_back: default(),
-            stencil_front: default(),
-        });
-
-        Self {
-            uniforms_bind_group_layout,
-            storage_bind_group_layout,
-            pipeline,
-        }
-    }
-}
diff --git a/title/shark/src/pipelines/mod.rs b/title/shark/src/pipelines/mod.rs
deleted file mode 100644 (file)
index d544f07..0000000
+++ /dev/null
@@ -1,79 +0,0 @@
-use narcissus_font::TouchedGlyphIndex;
-
-pub mod basic;
-
-pub const TILE_SIZE: u32 = 32;
-pub const MAX_PRIMS: u32 = 1 << 18;
-pub const TILE_BITMAP_WORDS_L1: u32 = MAX_PRIMS / 32 / 32;
-pub const TILE_BITMAP_WORDS_L0: u32 = MAX_PRIMS / 32;
-pub const TILE_STRIDE: u32 = TILE_BITMAP_WORDS_L0 + TILE_BITMAP_WORDS_L1 + 2;
-
-#[allow(unused)]
-#[repr(C)]
-pub struct PrimitiveUniforms {
-    pub screen_resolution_x: u32,
-    pub screen_resolution_y: u32,
-
-    pub atlas_resolution_x: u32,
-    pub atlas_resolution_y: u32,
-
-    pub tile_resolution_x: u32,
-    pub tile_resolution_y: u32,
-
-    pub num_primitives: u32,
-    pub num_primitives_32: u32,
-    pub num_primitives_1024: u32,
-    pub tile_stride: u32,
-
-    pub primitives_instances_buffer: u64,
-    pub rects_buffer: u64,
-    pub glyphs_buffer: u64,
-    pub tiles_buffer: u64,
-}
-
-#[repr(u32)]
-pub enum PrimitiveType {
-    Rect,
-    Glyph,
-}
-
-#[allow(unused)]
-#[repr(C)]
-pub struct PrimitiveInstance {
-    pub packed: u32,
-    pub color: u32,
-    pub x: f32,
-    pub y: f32,
-}
-
-#[repr(C)]
-pub struct Rect {
-    pub half_extent_x: f32,
-    pub half_extent_y: f32,
-    pub border_width: f32,
-    pub border_radius: f32,
-}
-
-impl PrimitiveInstance {
-    #[inline(always)]
-    pub fn glyph(glyph_index: TouchedGlyphIndex, color: u32, x: f32, y: f32) -> Self {
-        let packed = glyph_index.as_u32() | ((PrimitiveType::Glyph as u32) << 30);
-        Self {
-            packed,
-            color,
-            x,
-            y,
-        }
-    }
-
-    #[inline(always)]
-    pub fn rect(rect_index: u32, color: u32, x: f32, y: f32) -> Self {
-        let packed = rect_index | ((PrimitiveType::Rect as u32) << 30);
-        Self {
-            packed,
-            color,
-            x,
-            y,
-        }
-    }
-}