From: Joshua Simmons Date: Mon, 29 May 2023 07:10:03 +0000 (+0200) Subject: narcissus-gpu: Add TLSF Allocator X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=db3ac721514a214ecfebd7b705aea303b513d2df;p=josh%2Fnarcissus narcissus-gpu: Add TLSF Allocator --- diff --git a/bins/narcissus/src/helpers.rs b/bins/narcissus/src/helpers.rs index b0612cc..16c86de 100644 --- a/bins/narcissus/src/helpers.rs +++ b/bins/narcissus/src/helpers.rs @@ -101,21 +101,19 @@ pub fn create_buffer_with_data( where T: Blittable, { - let len = data.len() * std::mem::size_of::(); - let buffer = device.create_buffer(&BufferDesc { - location: MemoryLocation::HostMapped, - usage, - size: len, - }); - // SAFETY: T: Blittable which implies it's freely convertable to a byte - // slice. + // SAFETY: T: Blittable which implies it's freely convertable to a byte slice. unsafe { - let dst = std::slice::from_raw_parts_mut(device.map_buffer(buffer), len); - let src = std::slice::from_raw_parts(data.as_ptr() as *const u8, len); - dst.copy_from_slice(src); - device.unmap_buffer(buffer); + let len = data.len() * std::mem::size_of::(); + let initial_data = std::slice::from_raw_parts(data.as_ptr() as *const u8, len); + device.create_buffer_with_data( + &BufferDesc { + location: MemoryLocation::HostMapped, + usage, + size: len, + }, + initial_data, + ) } - buffer } pub fn create_image_with_data( diff --git a/libs/narcissus-gpu/src/backend/vulkan/mod.rs b/libs/narcissus-gpu/src/backend/vulkan/mod.rs index 8984bae..78360f6 100644 --- a/libs/narcissus-gpu/src/backend/vulkan/mod.rs +++ b/libs/narcissus-gpu/src/backend/vulkan/mod.rs @@ -1,8 +1,8 @@ use std::{ cell::{Cell, RefCell, UnsafeCell}, - collections::{hash_map::Entry, HashMap, VecDeque}, + collections::{hash_map::Entry, HashMap, HashSet, VecDeque}, marker::PhantomData, - os::raw::{c_char, c_void}, + os::raw::c_char, ptr::NonNull, sync::atomic::{AtomicU64, Ordering}, }; @@ -17,15 +17,18 @@ use narcissus_core::{ use vulkan_sys as vk; use crate::{ - delay_queue::DelayQueue, frame_counter::FrameCounter, Access, Bind, BindGroupLayout, - BindGroupLayoutDesc, BindingType, BlendMode, Buffer, BufferDesc, BufferImageCopy, - BufferUsageFlags, ClearValue, CmdBuffer, CompareOp, ComputePipelineDesc, CullingMode, Device, - Extent2d, Extent3d, Frame, FrontFace, GlobalBarrier, GpuConcurrent, GraphicsPipelineDesc, - Image, ImageAspectFlags, ImageBarrier, ImageBlit, ImageDesc, ImageDimension, ImageFormat, - ImageLayout, ImageSubresourceLayers, ImageSubresourceRange, ImageUsageFlags, ImageViewDesc, - IndexType, LoadOp, MemoryLocation, Offset2d, Offset3d, Pipeline, PolygonMode, Sampler, - SamplerAddressMode, SamplerCompareOp, SamplerDesc, SamplerFilter, ShaderStageFlags, StencilOp, - StencilOpState, StoreOp, SwapchainOutOfDateError, ThreadToken, Topology, TypedBind, + delay_queue::DelayQueue, + frame_counter::FrameCounter, + tlsf::{self, Tlsf}, + Access, Bind, BindGroupLayout, BindGroupLayoutDesc, BindingType, BlendMode, Buffer, BufferDesc, + BufferImageCopy, BufferUsageFlags, ClearValue, CmdBuffer, CompareOp, ComputePipelineDesc, + CullingMode, Device, Extent2d, Extent3d, Frame, FrontFace, GlobalBarrier, GpuConcurrent, + GraphicsPipelineDesc, Image, ImageAspectFlags, ImageBarrier, ImageBlit, ImageDesc, + ImageDimension, ImageFormat, ImageLayout, ImageSubresourceLayers, ImageSubresourceRange, + ImageUsageFlags, ImageViewDesc, IndexType, LoadOp, MemoryLocation, Offset2d, Offset3d, + Pipeline, PolygonMode, Sampler, SamplerAddressMode, SamplerCompareOp, SamplerDesc, + SamplerFilter, 
ShaderStageFlags, StencilOp, StencilOpState, StoreOp, SwapchainOutOfDateError, + ThreadToken, Topology, TypedBind, }; const NUM_FRAMES: usize = 2; @@ -727,6 +730,7 @@ fn vulkan_image_memory_barrier( struct VulkanBuffer { memory: VulkanMemory, buffer: vk::Buffer, + map_count: u64, } #[derive(Clone)] @@ -814,6 +818,12 @@ struct VulkanPresentInfo { image_index: u32, } +#[derive(Clone, Copy)] +struct VulkanAllocationInfo { + memory: vk::DeviceMemory, + mapped_ptr: *mut u8, +} + enum VulkanMemoryDedicatedDesc { Image(vk::Image), Buffer(vk::Buffer), @@ -822,15 +832,71 @@ enum VulkanMemoryDedicatedDesc { struct VulkanMemoryDesc { requirements: vk::MemoryRequirements, memory_location: MemoryLocation, - dedicated: Option, _linear: bool, } #[derive(Clone)] -struct VulkanMemory { +struct VulkanMemoryDedicated { memory: vk::DeviceMemory, - offset: u64, + mapped_ptr: *mut u8, size: u64, + memory_type_index: u32, +} + +#[derive(Clone)] +struct VulkanMemorySubAlloc { + allocation: tlsf::Allocation, + size: u64, + memory_type_index: u32, +} + +#[derive(Clone)] +enum VulkanMemory { + Dedicated(VulkanMemoryDedicated), + SubAlloc(VulkanMemorySubAlloc), +} + +impl VulkanMemory { + #[inline(always)] + fn device_memory(&self) -> vk::DeviceMemory { + match self { + VulkanMemory::Dedicated(dedicated) => dedicated.memory, + VulkanMemory::SubAlloc(sub_alloc) => sub_alloc.allocation.user_data().memory, + } + } + + #[inline(always)] + fn offset(&self) -> u64 { + match self { + VulkanMemory::Dedicated(_) => 0, + VulkanMemory::SubAlloc(sub_alloc) => sub_alloc.allocation.offset(), + } + } + + #[inline(always)] + fn size(&self) -> u64 { + match self { + VulkanMemory::Dedicated(dedicated) => dedicated.size, + VulkanMemory::SubAlloc(sub_alloc) => sub_alloc.size, + } + } + + #[inline(always)] + fn mapped_ptr(&self) -> *mut u8 { + match self { + VulkanMemory::Dedicated(dedicated) => dedicated.mapped_ptr, + VulkanMemory::SubAlloc(sub_alloc) => { + let user_data = sub_alloc.allocation.user_data(); + if user_data.mapped_ptr.is_null() { + std::ptr::null_mut() + } else { + user_data + .mapped_ptr + .wrapping_add(sub_alloc.allocation.offset() as usize) + } + } + } + } } #[derive(Clone)] @@ -890,6 +956,12 @@ impl VulkanFrame { } } +#[derive(Default)] +struct VulkanAllocator { + tlsf: Mutex>, + dedicated: Mutex>, +} + type SwapchainDestroyQueue = DelayQueue<(vk::SwapchainKHR, vk::SurfaceKHR, Box<[vk::ImageView]>)>; pub(crate) struct VulkanDevice { @@ -920,6 +992,8 @@ pub(crate) struct VulkanDevice { recycled_semaphores: Mutex>, recycled_descriptor_pools: Mutex>, + allocators: [Option>; vk::MAX_MEMORY_TYPES as usize], + _global_fn: vk::GlobalFunctions, instance_fn: vk::InstanceFunctions, xcb_surface_fn: Option, @@ -1239,6 +1313,14 @@ impl VulkanDevice { }) })); + let allocators = std::array::from_fn(|i| { + if i < physical_device_memory_properties.memory_type_count as usize { + Some(default()) + } else { + None + } + }); + Self { instance, physical_device, @@ -1266,6 +1348,8 @@ impl VulkanDevice { recycled_semaphores: default(), recycled_descriptor_pools: default(), + allocators, + _global_fn: global_fn, instance_fn, xcb_surface_fn, @@ -1320,7 +1404,11 @@ impl VulkanDevice { .0 } - fn allocate_memory(&self, desc: &VulkanMemoryDesc) -> VulkanMemory { + fn allocate_memory_dedicated( + &self, + desc: &VulkanMemoryDesc, + dedicated_desc: &VulkanMemoryDedicatedDesc, + ) -> VulkanMemory { let memory_property_flags = match desc.memory_location { MemoryLocation::HostMapped => vk::MemoryPropertyFlags::HOST_VISIBLE, MemoryLocation::Device => 
vk::MemoryPropertyFlags::DEVICE_LOCAL, @@ -1329,7 +1417,9 @@ impl VulkanDevice { let memory_type_index = self.find_memory_type_index(desc.requirements.memory_type_bits, memory_property_flags); - let mut dedicated_allocate_info = vk::MemoryDedicatedAllocateInfo::default(); + let allocator = self.allocators[memory_type_index as usize] + .as_ref() + .expect("returned a memory type index that has no associated allocator"); let mut allocate_info = vk::MemoryAllocateInfo { allocation_size: desc.requirements.size, @@ -1337,29 +1427,119 @@ impl VulkanDevice { ..default() }; - if let Some(dedicated) = &desc.dedicated { - match dedicated { - &VulkanMemoryDedicatedDesc::Image(image) => { - dedicated_allocate_info.image = image; - } - &VulkanMemoryDedicatedDesc::Buffer(buffer) => { - dedicated_allocate_info.buffer = buffer - } + let mut dedicated_allocate_info = vk::MemoryDedicatedAllocateInfo::default(); + + match *dedicated_desc { + VulkanMemoryDedicatedDesc::Image(image) => { + dedicated_allocate_info.image = image; } - allocate_info._next = - &dedicated_allocate_info as *const vk::MemoryDedicatedAllocateInfo as *const _ + VulkanMemoryDedicatedDesc::Buffer(buffer) => dedicated_allocate_info.buffer = buffer, } + allocate_info._next = + &dedicated_allocate_info as *const vk::MemoryDedicatedAllocateInfo as *const _; let mut memory = vk::DeviceMemory::null(); vk_check!(self .device_fn .allocate_memory(self.device, &allocate_info, None, &mut memory)); - VulkanMemory { + allocator.dedicated.lock().insert(memory); + + let mapped_ptr = if self.physical_device_memory_properties.memory_types + [memory_type_index as usize] + .property_flags + .contains(vk::MemoryPropertyFlags::HOST_VISIBLE) + { + let mut data = std::ptr::null_mut(); + vk_check!(self.device_fn.map_memory( + self.device, + memory, + 0, + vk::WHOLE_SIZE, + vk::MemoryMapFlags::default(), + &mut data + )); + data as *mut u8 + } else { + std::ptr::null_mut() + }; + + VulkanMemory::Dedicated(VulkanMemoryDedicated { memory, - offset: 0, + mapped_ptr, size: desc.requirements.size, - } + memory_type_index, + }) + } + + fn allocate_memory(&self, desc: &VulkanMemoryDesc) -> VulkanMemory { + let memory_property_flags = match desc.memory_location { + MemoryLocation::HostMapped => vk::MemoryPropertyFlags::HOST_VISIBLE, + MemoryLocation::Device => vk::MemoryPropertyFlags::DEVICE_LOCAL, + }; + + let memory_type_index = + self.find_memory_type_index(desc.requirements.memory_type_bits, memory_property_flags); + + let allocator = self.allocators[memory_type_index as usize] + .as_ref() + .expect("returned a memory type index that has no associated allocator"); + + let mut tlsf = allocator.tlsf.lock(); + + let allocation = { + if let Some(allocation) = + tlsf.alloc(desc.requirements.size, desc.requirements.alignment) + { + allocation + } else { + const BLOCK_SIZE: u64 = 128 * 1024 * 1024; + + let allocate_info = vk::MemoryAllocateInfo { + allocation_size: BLOCK_SIZE, + memory_type_index, + ..default() + }; + + let mut memory = vk::DeviceMemory::null(); + vk_check!(self.device_fn.allocate_memory( + self.device, + &allocate_info, + None, + &mut memory + )); + + let mapped_ptr = if self.physical_device_memory_properties.memory_types + [memory_type_index as usize] + .property_flags + .contains(vk::MemoryPropertyFlags::HOST_VISIBLE) + { + let mut data = std::ptr::null_mut(); + vk_check!(self.device_fn.map_memory( + self.device, + memory, + 0, + vk::WHOLE_SIZE, + vk::MemoryMapFlags::default(), + &mut data + )); + data as *mut u8 + } else { + std::ptr::null_mut() + }; + 
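+                // Register the new 128 MiB device memory block (and its
+                // persistent mapping, if the memory type is host-visible) with
+                // the TLSF allocator as a fresh super-block, then retry the
+                // allocation, which is expected to succeed for any request
+                // smaller than BLOCK_SIZE.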
+ tlsf.insert_super_block(BLOCK_SIZE, VulkanAllocationInfo { memory, mapped_ptr }); + + tlsf.alloc(desc.requirements.size, desc.requirements.alignment) + .expect("failed to allocate") + } + }; + + VulkanMemory::SubAlloc(VulkanMemorySubAlloc { + allocation, + size: desc.requirements.size, + memory_type_index, + }) } fn request_descriptor_pool(&self) -> vk::DescriptorPool { @@ -1416,68 +1596,7 @@ impl VulkanDevice { semaphore } - fn destroy_deferred( - device_fn: &vk::DeviceFunctions, - device: vk::Device, - frame: &mut VulkanFrame, - ) { - for pipeline_layout in frame.destroyed_pipeline_layouts.get_mut().drain(..) { - unsafe { device_fn.destroy_pipeline_layout(device, pipeline_layout, None) } - } - for pipeline in frame.destroyed_pipelines.get_mut().drain(..) { - unsafe { device_fn.destroy_pipeline(device, pipeline, None) } - } - for descriptor_set_layout in frame.destroyed_descriptor_set_layouts.get_mut().drain(..) { - unsafe { device_fn.destroy_descriptor_set_layout(device, descriptor_set_layout, None) } - } - for sampler in frame.destroyed_samplers.get_mut().drain(..) { - unsafe { device_fn.destroy_sampler(device, sampler, None) } - } - for image_view in frame.destroyed_image_views.get_mut().drain(..) { - unsafe { device_fn.destroy_image_view(device, image_view, None) } - } - for image in frame.destroyed_images.get_mut().drain(..) { - unsafe { device_fn.destroy_image(device, image, None) } - } - for buffer_view in frame.destroyed_buffer_views.get_mut().drain(..) { - unsafe { device_fn.destroy_buffer_view(device, buffer_view, None) } - } - for buffer in frame.destroyed_buffers.get_mut().drain(..) { - unsafe { device_fn.destroy_buffer(device, buffer, None) } - } - for memory in frame.destroyed_allocations.get_mut().drain(..) { - unsafe { device_fn.free_memory(device, memory.memory, None) }; - } - } - - fn destroy_swapchain_deferred( - &self, - surface: vk::SurfaceKHR, - swapchain: vk::SwapchainKHR, - image_views: &[vk::ImageView], - ) { - let device_fn = &self.device_fn; - let swapchain_fn = &self.swapchain_fn; - let surface_fn = &self.surface_fn; - let instance = self.instance; - let device = self.device; - - if !image_views.is_empty() { - for &image_view in image_views { - unsafe { device_fn.destroy_image_view(device, image_view, None) } - } - } - if !swapchain.is_null() { - unsafe { swapchain_fn.destroy_swapchain(device, swapchain, None) } - } - if !surface.is_null() { - unsafe { surface_fn.destroy_surface(instance, surface, None) } - } - } -} - -impl Device for VulkanDevice { - fn create_buffer(&self, desc: &BufferDesc) -> Buffer { + fn create_buffer(&self, desc: &BufferDesc, initial_data: Option<&[u8]>) -> Buffer { let mut usage = vk::BufferUsageFlags::default(); if desc.usage.contains(BufferUsageFlags::UNIFORM) { usage |= vk::BufferUsageFlags::UNIFORM_BUFFER; @@ -1525,54 +1644,122 @@ impl Device for VulkanDevice { &mut memory_requirements, ); - let dedicated = - if memory_dedicated_requirements.prefers_dedicated_allocation == vk::Bool32::True { - Some(VulkanMemoryDedicatedDesc::Buffer(buffer)) - } else { - None - }; + let memory = if memory_dedicated_requirements.prefers_dedicated_allocation + == vk::Bool32::True + || memory_dedicated_requirements.requires_dedicated_allocation == vk::Bool32::True + { + self.allocate_memory_dedicated( + &VulkanMemoryDesc { + requirements: memory_requirements.memory_requirements, + memory_location: desc.location, + _linear: true, + }, + &VulkanMemoryDedicatedDesc::Buffer(buffer), + ) + } else { + self.allocate_memory(&VulkanMemoryDesc { + requirements: 
memory_requirements.memory_requirements, + memory_location: desc.location, + _linear: true, + }) + }; - let memory = self.allocate_memory(&VulkanMemoryDesc { - requirements: memory_requirements.memory_requirements, - memory_location: desc.location, - dedicated, - _linear: true, - }); + if let Some(initial_data) = initial_data { + assert!(!memory.mapped_ptr().is_null()); + // SAFETY: The memory has just been allocated, so as long as the pointer is + // non-null, then we can create a slice for it. + unsafe { + let dst = + std::slice::from_raw_parts_mut(memory.mapped_ptr(), memory.size() as usize); + dst.copy_from_slice(initial_data); + } + } unsafe { self.device_fn.bind_buffer_memory2( self.device, &[vk::BindBufferMemoryInfo { buffer, - memory: memory.memory, - offset: memory.offset, + memory: memory.device_memory(), + offset: memory.offset(), ..default() }], ) }; - let handle = self - .buffer_pool - .lock() - .insert(VulkanBuffer { memory, buffer }); + let handle = self.buffer_pool.lock().insert(VulkanBuffer { + memory, + buffer, + map_count: 0, + }); Buffer(handle) } - fn create_buffer_with_data(&self, desc: &BufferDesc, initial_data: &[u8]) -> Buffer { - let len = initial_data.len(); + fn destroy_deferred( + device_fn: &vk::DeviceFunctions, + device: vk::Device, + frame: &mut VulkanFrame, + ) { + for pipeline_layout in frame.destroyed_pipeline_layouts.get_mut().drain(..) { + unsafe { device_fn.destroy_pipeline_layout(device, pipeline_layout, None) } + } + for pipeline in frame.destroyed_pipelines.get_mut().drain(..) { + unsafe { device_fn.destroy_pipeline(device, pipeline, None) } + } + for descriptor_set_layout in frame.destroyed_descriptor_set_layouts.get_mut().drain(..) { + unsafe { device_fn.destroy_descriptor_set_layout(device, descriptor_set_layout, None) } + } + for sampler in frame.destroyed_samplers.get_mut().drain(..) { + unsafe { device_fn.destroy_sampler(device, sampler, None) } + } + for image_view in frame.destroyed_image_views.get_mut().drain(..) { + unsafe { device_fn.destroy_image_view(device, image_view, None) } + } + for image in frame.destroyed_images.get_mut().drain(..) { + unsafe { device_fn.destroy_image(device, image, None) } + } + for buffer_view in frame.destroyed_buffer_views.get_mut().drain(..) { + unsafe { device_fn.destroy_buffer_view(device, buffer_view, None) } + } + for buffer in frame.destroyed_buffers.get_mut().drain(..) 
{ + unsafe { device_fn.destroy_buffer(device, buffer, None) } + } + } - assert!(len <= desc.size, "initial data larger than buffer"); - assert!(desc.location == MemoryLocation::HostMapped); - let buffer = self.create_buffer(desc); + fn destroy_swapchain_deferred( + &self, + surface: vk::SurfaceKHR, + swapchain: vk::SwapchainKHR, + image_views: &[vk::ImageView], + ) { + let device_fn = &self.device_fn; + let swapchain_fn = &self.swapchain_fn; + let surface_fn = &self.surface_fn; + let instance = self.instance; + let device = self.device; - unsafe { - let dst = std::slice::from_raw_parts_mut(self.map_buffer(buffer), len); - dst.copy_from_slice(initial_data); - self.unmap_buffer(buffer); + if !image_views.is_empty() { + for &image_view in image_views { + unsafe { device_fn.destroy_image_view(device, image_view, None) } + } + } + if !swapchain.is_null() { + unsafe { swapchain_fn.destroy_swapchain(device, swapchain, None) } + } + if !surface.is_null() { + unsafe { surface_fn.destroy_surface(instance, surface, None) } } + } +} + +impl Device for VulkanDevice { + fn create_buffer(&self, desc: &BufferDesc) -> Buffer { + self.create_buffer(desc, None) + } - buffer + fn create_buffer_with_data(&self, desc: &BufferDesc, initial_data: &[u8]) -> Buffer { + self.create_buffer(desc, Some(initial_data)) } fn create_image(&self, desc: &ImageDesc) -> Image { @@ -1669,27 +1856,33 @@ impl Device for VulkanDevice { &mut memory_requirements, ); - let dedicated = - if memory_dedicated_requirements.prefers_dedicated_allocation == vk::Bool32::True { - Some(VulkanMemoryDedicatedDesc::Image(image)) - } else { - None - }; - - let memory = self.allocate_memory(&VulkanMemoryDesc { - requirements: memory_requirements.memory_requirements, - memory_location: desc.location, - dedicated, - _linear: true, - }); + let memory = if memory_dedicated_requirements.prefers_dedicated_allocation + == vk::Bool32::True + || memory_dedicated_requirements.requires_dedicated_allocation == vk::Bool32::True + { + self.allocate_memory_dedicated( + &VulkanMemoryDesc { + requirements: memory_requirements.memory_requirements, + memory_location: desc.location, + _linear: true, + }, + &VulkanMemoryDedicatedDesc::Image(image), + ) + } else { + self.allocate_memory(&VulkanMemoryDesc { + requirements: memory_requirements.memory_requirements, + memory_location: desc.location, + _linear: true, + }) + }; unsafe { self.device_fn.bind_image_memory2( self.device, &[vk::BindImageMemoryInfo { image, - memory: memory.memory, - offset: memory.offset, + memory: memory.device_memory(), + offset: memory.offset(), ..default() }], ) @@ -2113,6 +2306,10 @@ impl Device for VulkanDevice { fn destroy_buffer(&self, frame: &Frame, buffer: Buffer) { if let Some(buffer) = self.buffer_pool.lock().remove(buffer.0) { + assert_eq!( + buffer.map_count, 0, + "destroying a buffer that is still mapped" + ); let frame = self.frame(frame); frame.destroyed_buffers.lock().push_back(buffer.buffer); frame.destroyed_allocations.lock().push_back(buffer.memory); @@ -2931,6 +3128,24 @@ impl Device for VulkanDevice { Self::destroy_deferred(device_fn, device, frame); + for allocation in frame.destroyed_allocations.get_mut().drain(..) 
{ + match allocation { + VulkanMemory::Dedicated(dedicated) => { + let allocator = self.allocators[dedicated.memory_type_index as usize] + .as_ref() + .unwrap(); + allocator.dedicated.lock().remove(&dedicated.memory); + unsafe { device_fn.free_memory(device, dedicated.memory, None) } + } + VulkanMemory::SubAlloc(sub_alloc) => { + let allocator = self.allocators[sub_alloc.memory_type_index as usize] + .as_ref() + .unwrap(); + allocator.tlsf.lock().free(sub_alloc.allocation) + } + } + } + self.destroyed_swapchains .lock() .expire(|(swapchain, surface, image_views)| { @@ -3010,25 +3225,17 @@ impl Device for VulkanDevice { } unsafe fn map_buffer(&self, buffer: Buffer) -> *mut u8 { - let mut ptr = std::ptr::null_mut(); - if let Some(buffer) = self.buffer_pool.lock().get(buffer.0) { - vk_check!(self.device_fn.map_memory( - self.device, - buffer.memory.memory, - buffer.memory.offset, - buffer.memory.size, - vk::MemoryMapFlags::default(), - &mut ptr - )) - } - std::mem::transmute::<*mut c_void, *mut u8>(ptr) + let mut buffer_pool = self.buffer_pool.lock(); + let buffer = buffer_pool.get_mut(buffer.0).unwrap(); + buffer.map_count += 1; + buffer.memory.mapped_ptr() } unsafe fn unmap_buffer(&self, buffer: Buffer) { - if let Some(buffer) = self.buffer_pool.lock().get(buffer.0) { - self.device_fn - .unmap_memory(self.device, buffer.memory.memory) - } + let mut buffer_pool = self.buffer_pool.lock(); + let buffer = buffer_pool.get_mut(buffer.0).unwrap(); + assert!(buffer.map_count > 0); + buffer.map_count -= 1; } fn acquire_swapchain( @@ -3449,19 +3656,16 @@ impl Drop for VulkanDevice { for buffer in self.buffer_pool.get_mut().values() { unsafe { device_fn.destroy_buffer(device, buffer.buffer, None) } - unsafe { device_fn.free_memory(device, buffer.memory.memory, None) } } { let mut image_views = Vec::new(); let mut images = Vec::new(); - let mut memories = Vec::new(); for image in self.image_pool.get_mut().values() { match image { VulkanImageHolder::Unique(image) => { image_views.push(image.view); images.push(image.image.image); - memories.push(image.image.memory.memory); } VulkanImageHolder::Shared(image) => { image_views.push(image.view); @@ -3479,10 +3683,6 @@ impl Drop for VulkanDevice { for image in images { unsafe { device_fn.destroy_image(device, image, None) } } - - for memory in memories { - unsafe { device_fn.free_memory(device, memory, None) } - } } for sampler in self.sampler_pool.get_mut().values() { @@ -3541,6 +3741,23 @@ impl Drop for VulkanDevice { unsafe { self.surface_fn.destroy_surface(instance, surface, None) } } + for allocator in self.allocators.iter_mut().flatten() { + // Clear out all memory blocks held by the TLSF allocators. + let tlsf = allocator.tlsf.get_mut(); + for super_block in tlsf.super_blocks() { + unsafe { + self.device_fn + .free_memory(device, super_block.user_data.memory, None) + } + } + + // Clear out all dedicated allocations. 
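+            // Any dedicated allocations still registered with an allocator at
+            // this point belong to resources that were never explicitly
+            // destroyed, so free their device memory directly.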
+ let dedicated = allocator.dedicated.get_mut(); + for memory in dedicated.iter().copied() { + unsafe { self.device_fn.free_memory(device, memory, None) } + } + } + unsafe { device_fn.destroy_device(device, None) } unsafe { self.instance_fn.destroy_instance(self.instance, None) }; } diff --git a/libs/narcissus-gpu/src/lib.rs b/libs/narcissus-gpu/src/lib.rs index 1e64970..7105184 100644 --- a/libs/narcissus-gpu/src/lib.rs +++ b/libs/narcissus-gpu/src/lib.rs @@ -8,6 +8,7 @@ use narcissus_core::{ mod backend; mod delay_queue; mod frame_counter; +pub mod tlsf; pub enum DeviceBackend { Vulkan, diff --git a/libs/narcissus-gpu/src/tlsf.rs b/libs/narcissus-gpu/src/tlsf.rs new file mode 100644 index 0000000..6508150 --- /dev/null +++ b/libs/narcissus-gpu/src/tlsf.rs @@ -0,0 +1,724 @@ +//! Two Level Seggregated Fit Allocator +//! === +//! +//! [TLSF][tlsf] is a constant time, low fragmentation good-fit allocator based +//! on seggregated free-lists with a two-level bitmap acceleration structure. +//! +//! Memory is allocated by the underlying allocator into super-blocks, +//! representing large chunks of contiguous memory. The allocation routines +//! then work on blocks, which subdivide those regions. +//! +//! In order to quickly find a large-enough block, free blocks are stored in a +//! set of seggregated free-lists by their size. The requirements for a binning +//! strategy are as follows; +//! +//! 1) Must have a bounded number of bins. +//! +//! 2) Must be fast to find the bin for a given size. +//! +//! 3) Bin sizes must closely match allocation sizes to minimise fragmentation. +//! +//! For these purposes we use a [linear-log][linearlog] strategy for binning. An +//! initial 'linear' bin is divided into N sub-bins, then power-of-two sized +//! bins follow, also divided into N sub-bins. With some simple bit arithmetic +//! we can calculate the bucket for a given size. +//! +//! For example, if the initial linear region was 16, and the number of sub-bins +//! was 4, we would end up with a layout something like the following. +//! +//! ```text +//! 1..=4 5..=8 9..=12 13..=16 +//! +------------+------------+------------+------------+ +//! Linear Region | 0x01 | 0x00 | 0x00 | 0x00 | +//! +------------+------------+------------+------------+ +//! +//! 17..=20 21..=24 25..=28 29..=32 +//! +------------+------------+------------+------------+ +//! 2^4 | 0x00 | 0x00 | 0x00 | 0x00 | +//! +------------+------------+------------+------------+ +//! +//! 31..=40 41..=48 49..=56 57..=64 +//! +------------+------------+------------+------------+ +//! 2^5 | 0x00 | 0x00 | 0x00 | 0x00 | +//! +------------+------------+------------+------------+ +//! +//! 65..=80 81..=96 97..=112 113..=128 +//! +------------+------------+------------+------------+ +//! 2^6 | 0x01 | 0x00 | 0x04 | 0x00 | +//! +------------+------------+------------+------------+ +//! +//! ``` +//! +//! In order to avoid linearly scanning the free-lists to find suitable empty +//! blocks, we maintain a two-level bitmap acceleration structure. The first +//! level has a bit set for each non-empty bin, then the second level likewise +//! has a bit set for each non-empty sub-bin. From there it's possible to scan +//! with bit arithmetic to find the first suitable non-empty block without +//! traversing the entire free-lists structure. +//! +//! ```text +//! +//! +---+---+---+---+ +//! Level 0: | 1 | 0 | 0 | 1 | 0x9 +//! +-+-+-+-+-+-+-+-+ +//! | | | | +//! | | | | +//! | | | | +//! | | | | +---+---+---+---+ +//! 
Level 1: | | | +----->| 0 | 0 | 0 | 1 | Linear Region 0x1 +//! | | | +---+---+---+---+ +//! | | | +//! | | | +---+---+---+---+ +//! | | +--------->| 0 | 0 | 0 | 0 | 2^4 0x0 +//! | | +---+---+---+---+ +//! | | +//! | | +---+---+---+---+ +//! | +------------->| 0 | 0 | 0 | 0 | 2^5 0x0 +//! | +---+---+---+---+ +//! | +//! | +---+---+---+---+ +//! +----------------->| 0 | 1 | 0 | 0 | 2^6 0x4 +//! +---+---+---+---+ +//! +//! ``` +//! +//! [tlsf]: http://www.gii.upv.es/tlsf/files/spe_2008.pdf +//! [linearlog]: https://pvk.ca/Blog/2015/06/27/linear-log-bucketing-fast-versatile-simple/ + +use std::{ + num::NonZeroU32, + ops::{Index, IndexMut}, +}; + +use narcissus_core::{linear_log_binning, static_assert}; + +// The log2 of the size of the 'linear' bin. +pub const LINEAR_LOG2: u32 = 7; // 2^7 = 128 + +// The log2 of the number of sub-bins in each bin. +pub const SUB_BINS_LOG2: u32 = 5; // 2^5 = 32 + +type Bin = linear_log_binning::Bin; + +pub const BIN_COUNT: usize = 24; +pub const SUB_BIN_COUNT: usize = 1 << SUB_BINS_LOG2; + +static_assert!(SUB_BIN_COUNT <= u32::BITS as usize); +static_assert!(BIN_COUNT <= u32::BITS as usize); + +pub const MIN_BLOCK_SIZE: u32 = 16; + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +struct SuperBlockIndex(u32); + +pub struct SuperBlock +where + T: Copy, +{ + _first_block_index: BlockIndex, + pub user_data: T, +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +struct BlockIndex(NonZeroU32); + +const INVALID_BLOCK_INDEX: BlockIndex = BlockIndex(match NonZeroU32::new(0xffff_ffff) { + Some(x) => x, + None => panic!(), +}); + +struct BlockLink { + prev: BlockIndex, + next: BlockIndex, +} + +impl BlockLink { + /// Create a new unlinked BlockLink for the given `block_index`. + const fn new(block_index: BlockIndex) -> Self { + Self { + prev: block_index, + next: block_index, + } + } + + /// Returns true if the given link is not inserted into any list. + fn is_unlinked(&self) -> bool { + self.prev == self.next + } +} + +/// Insert the node at index `$insert` before the node at index `$x` for the +/// list given by `$storage` and `$link_name`. +macro_rules! list_insert_before { + ($storage:expr, $link_name:ident, $x:expr, $insert:expr) => { + $storage[$insert].$link_name.prev = $storage[$x].$link_name.prev; + $storage[$insert].$link_name.next = $x; + let prev_index = $storage[$insert].$link_name.prev; + $storage[prev_index].$link_name.next = $insert; + let next_index = $storage[$insert].$link_name.next; + $storage[next_index].$link_name.prev = $insert; + }; +} + +/// Insert the node at index `$insert` after the node at index `$x` for the +/// list given by `$storage` and `$link_name`. +macro_rules! list_insert_after { + ($storage:expr, $link_name:ident, $x:expr, $insert:expr) => { + $storage[$insert].$link_name.prev = $x; + $storage[$insert].$link_name.next = $storage[$x].$link_name.next; + let prev_index = $storage[$insert].$link_name.prev; + $storage[prev_index].$link_name.next = $insert; + let next_index = $storage[$insert].$link_name.next; + $storage[next_index].$link_name.prev = $insert; + }; +} + +/// Unlink the node`$x` for the list given by `$storage` and `$link_name`. +macro_rules! 
list_unlink { + ($storage:expr, $link_name:ident, $x:expr) => { + let prev_index = $storage[$x].$link_name.prev; + $storage[prev_index].$link_name.next = $storage[$x].$link_name.next; + let next_index = $storage[$x].$link_name.next; + $storage[next_index].$link_name.prev = $storage[$x].$link_name.prev; + $storage[$x].$link_name.prev = $x; + $storage[$x].$link_name.next = $x; + }; +} + +struct Block { + size: u32, + offset: u32, + generation: u32, + super_block_index: SuperBlockIndex, + + free_link: BlockLink, + phys_link: BlockLink, +} + +const DUMMY_BLOCK: Block = Block { + generation: 0xffff_ffff, + size: 0xffff_ffff, + offset: 0xffff_ffff, + free_link: BlockLink::new(INVALID_BLOCK_INDEX), + phys_link: BlockLink::new(INVALID_BLOCK_INDEX), + super_block_index: SuperBlockIndex(0xffff_ffff), +}; + +impl Block { + fn is_used(&self) -> bool { + self.generation & 1 == 1 + } + + fn is_free(&self) -> bool { + self.generation & 1 == 0 + } +} + +impl Index for Vec { + type Output = Block; + + #[inline(always)] + fn index(&self, index: BlockIndex) -> &Self::Output { + &self[index.0.get() as usize] + } +} + +impl IndexMut for Vec { + #[inline(always)] + fn index_mut(&mut self, index: BlockIndex) -> &mut Self::Output { + &mut self[index.0.get() as usize] + } +} + +impl Index for Vec> +where + T: Copy, +{ + type Output = SuperBlock; + + #[inline(always)] + fn index(&self, index: SuperBlockIndex) -> &Self::Output { + &self[index.0 as usize] + } +} + +impl IndexMut for Vec> +where + T: Copy, +{ + #[inline(always)] + fn index_mut(&mut self, index: SuperBlockIndex) -> &mut Self::Output { + &mut self[index.0 as usize] + } +} + +#[derive(Clone)] +pub struct Allocation { + block_index: BlockIndex, + generation: u32, + offset: u64, + user_data: T, +} + +impl Allocation { + pub fn user_data(&self) -> &T { + &self.user_data + } + + /// Returns the offset into the super-block where this allocation starts. + pub fn offset(&self) -> u64 { + self.offset + } +} + +pub struct Tlsf +where + T: Copy, +{ + bitmap_0: u32, + bitmap_1: [u32; BIN_COUNT], + empty_block_heads: [Option; SUB_BIN_COUNT * BIN_COUNT], + + free_block_head: Option, + blocks: Vec, + + super_blocks: Vec>, +} + +impl Default for Tlsf +where + T: Copy, +{ + fn default() -> Self { + Self::new() + } +} + +impl Tlsf +where + T: Copy, +{ + pub fn new() -> Self { + Self { + bitmap_0: 0, + bitmap_1: [0; BIN_COUNT], + empty_block_heads: [None; SUB_BIN_COUNT * BIN_COUNT], + free_block_head: None, + blocks: vec![DUMMY_BLOCK], + super_blocks: vec![], + } + } + + /// Returns a slice containing all the super_blocks added to the allocator. + /// Only the `user_data` field is accessible. + pub fn super_blocks(&self) -> &[SuperBlock] { + &self.super_blocks + } + + /// Clear the allocator state. + /// + /// Make sure to clean up any super blocks before calling this. + pub fn clear(&mut self) { + self.bitmap_0 = 0; + self.bitmap_1.fill(0); + self.empty_block_heads.fill(None); + self.free_block_head = None; + self.blocks.clear(); + self.blocks.push(DUMMY_BLOCK); + self.super_blocks.clear() + } + + /// Search the acceleration structure for a non-empty list from the given + /// bin. + /// + /// Returns the bin index if a non-empty list is found, or None. + fn search_non_empty_bin(&self, starting_bin: Bin) -> Option { + let mut bin = starting_bin.bin(); + let sub_bin = starting_bin.sub_bin(); + + // First we scan the second-level bitmap from sub_bin, masking out the earlier + // sub-bins so we don't end up returning a bin that's too small for the + // allocation. 
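+        //
+        // For example, with sub_bin = 3 the mask `!0 << sub_bin` is
+        // 0xffff_fff8, which keeps sub-bins 3 and above and discards
+        // sub-bins 0, 1 and 2.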
+ let mut second_level = self.bitmap_1[bin as usize] & (!0 << sub_bin); + + // If that search failed, then we must scan the first-level bitmap from the next + // bin forward. If we find anything here it cannot possibly be smaller than the + // requested allocation. + if second_level == 0 { + let first_level = self.bitmap_0 & (!0 << (bin + 1)); + + // If that search also failed, there's no suitable blocks. + if first_level == 0 { + return None; + } + + // Recalculate the bin from the first level bitmap. + bin = first_level.trailing_zeros(); + second_level = self.bitmap_1[bin as usize]; + } + + // Find the sub-bin from the second level bitmap. + let sub_bin = second_level.trailing_zeros(); + Some(Bin::new(bin, sub_bin)) + } + + /// Marks a given bin as containing empty blocks in the bitmap acceleration + /// structure. + fn set_metadata_bit(&mut self, bin: Bin) { + let sub_bin = bin.sub_bin(); + let bin = bin.bin() as usize; + self.bitmap_0 |= 1 << bin; + self.bitmap_1[bin] |= 1 << sub_bin; + } + + /// Marks a given bin as containing no empty blocks in the bitmap acceleration + /// structure. + fn clear_metadata_bit(&mut self, bin: Bin) { + let sub_bin = bin.sub_bin(); + let bin = bin.bin() as usize; + self.bitmap_1[bin] &= !(1 << sub_bin); + if self.bitmap_1[bin] == 0 { + self.bitmap_0 &= !(1 << bin); + } + } + + /// Inserts a block into the empty blocks lists. + fn insert_block(&mut self, block_index: BlockIndex) { + debug_assert!(self.blocks[block_index].is_free()); + debug_assert!(self.blocks[block_index].free_link.is_unlinked()); + + let (_, bin) = Bin::from_size_round_down(self.blocks[block_index].size); + let bin_index = bin.index(); + + if let Some(empty_block_index) = self.empty_block_heads[bin_index] { + list_insert_before!(self.blocks, free_link, empty_block_index, block_index); + } else { + self.set_metadata_bit(bin); + } + + self.empty_block_heads[bin_index] = Some(block_index); + } + + /// Removes a block from the empty blocks lists. + fn extract_block(&mut self, block_index: BlockIndex) { + debug_assert!(self.blocks[block_index].is_free()); + + let (_, bin) = Bin::from_size_round_down(self.blocks[block_index].size); + + let bin_index = bin.index(); + + debug_assert!(self.empty_block_heads[bin_index].is_some()); + + if self.empty_block_heads[bin_index] == Some(block_index) { + let next_index = self.blocks[block_index].free_link.next; + if next_index != block_index { + self.empty_block_heads[bin_index] = Some(next_index); + } else { + self.empty_block_heads[bin_index] = None; + self.clear_metadata_bit(bin); + } + } + + list_unlink!(self.blocks, free_link, block_index); + } + + /// Returns true if we should merge `from_block_index` into `into_block_index`. + fn can_merge_block_left( + &self, + into_block_index: BlockIndex, + from_block_index: BlockIndex, + ) -> bool { + // Cannot merge into ourselves. + if into_block_index == from_block_index { + return false; + } + + // Cannot merge the first block in a physical range into the last block. + // This check is necessary because the linked lists are cyclic. + if self.blocks[from_block_index].offset == 0 { + return false; + } + + // Cannot merge blocks that are in-use. + if self.blocks[into_block_index].is_used() || self.blocks[from_block_index].is_used() { + return false; + } + + true + } + + /// Requests a new block, and returns its `BlockIndex`. 
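+    ///
+    /// A previously recycled block slot is reused from the internal free list
+    /// when one is available; otherwise a new slot is pushed onto `blocks`.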
+ fn request_block( + &mut self, + offset: u32, + size: u32, + super_block_index: SuperBlockIndex, + ) -> BlockIndex { + let block_index = if let Some(free_block_index) = self.free_block_head { + let next_index = self.blocks[free_block_index].free_link.next; + self.free_block_head = if next_index != free_block_index { + Some(next_index) + } else { + None + }; + list_unlink!(self.blocks, free_link, free_block_index); + free_block_index + } else { + assert!(self.blocks.len() < i32::MAX as usize); + let block_index = BlockIndex(NonZeroU32::new(self.blocks.len() as u32).unwrap()); + self.blocks.push(Block { + generation: 0, + size, + offset, + free_link: BlockLink::new(block_index), + phys_link: BlockLink::new(block_index), + super_block_index, + }); + block_index + }; + + let block = &mut self.blocks[block_index]; + block.offset = offset; + block.size = size; + block.super_block_index = super_block_index; + + block_index + } + + /// Recycles the block indicated by `block_index` for re-use. + fn recycle_block(&mut self, block_index: BlockIndex) { + let block = &mut self.blocks[block_index]; + debug_assert!(block.free_link.is_unlinked()); + debug_assert!(block.phys_link.is_unlinked()); + + block.size = 0xffff_ffff; + block.offset = 0xffff_ffff; + + if let Some(free_block_index) = self.free_block_head { + list_insert_before!(self.blocks, free_link, free_block_index, block_index); + } + + self.free_block_head = Some(block_index); + } + + pub fn insert_super_block(&mut self, size: u64, user_data: T) { + assert!(size != 0 && size < i32::MAX as u64); + assert!(self.super_blocks.len() < i32::MAX as usize); + + // Ranges checked in asserts above. + let size = size as u32; + let len = self.super_blocks.len() as u32; + + let super_block_index = SuperBlockIndex(len); + let block_index = self.request_block(0, size, super_block_index); + + self.super_blocks.push(SuperBlock { + // The block at offset 0 in a SuperBlock will never be merged away, so the index + // is stable and we can store it in the SuperBlock itself. + _first_block_index: block_index, + user_data, + }); + + self.insert_block(block_index); + } + + pub fn alloc(&mut self, size: u64, align: u64) -> Option> { + assert!( + size != 0 + && align != 0 + && align < i32::MAX as u64 + && size < (i32::MAX as u64 - align) + && align.is_power_of_two() + ); + let size = size.max(MIN_BLOCK_SIZE as u64); + let size = if align > MIN_BLOCK_SIZE as u64 { + size - 1 + align + } else { + size + } as u32; + + // We need to find the bin which contains only empty-blocks large enough for the + // given size because we unconditionally use the first empty block found. So + // this must round up. + let (rounded_size, starting_bin) = Bin::from_size_round_up(size); + + let Some(bin) = self.search_non_empty_bin(starting_bin) else { + return None; + }; + + let block_index = self.empty_block_heads[bin.index()].unwrap(); + + debug_assert!( + self.blocks[block_index].is_free() && self.blocks[block_index].size >= rounded_size + ); + + self.extract_block(block_index); + + let remainder = self.blocks[block_index].size - rounded_size; + let super_block_index = self.blocks[block_index].super_block_index; + + // Should we should split the block? 
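+        //
+        // For example, a 256 byte free block servicing a 96 byte request
+        // leaves a 160 byte remainder, which is at least MIN_BLOCK_SIZE and so
+        // is split off, linked after the allocated block in the physical list,
+        // and returned to the free lists.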
+ if remainder >= MIN_BLOCK_SIZE { + self.blocks[block_index].size -= remainder; + let offset = self.blocks[block_index].offset + rounded_size; + let new_block_index = self.request_block(offset, remainder, super_block_index); + list_insert_after!(self.blocks, phys_link, block_index, new_block_index); + self.insert_block(new_block_index); + } + + let generation = self.blocks[block_index].generation.wrapping_add(1); + self.blocks[block_index].generation = generation; + + let user_data = self.super_blocks[super_block_index].user_data; + // The mask is a no-op if the alignment is already met, do it unconditionally. + let offset = (self.blocks[block_index].offset as u64 + align - 1) & !(align - 1); + + Some(Allocation { + block_index, + generation, + offset, + user_data, + }) + } + + pub fn free(&mut self, allocation: Allocation) { + let mut block_index = allocation.block_index; + let generation = self.blocks[block_index].generation; + assert_eq!(generation, allocation.generation, "double-free"); + self.blocks[block_index].generation = generation.wrapping_add(1); + + // Merge next block into the current block. + { + let into_block_index = block_index; + let from_block_index = self.blocks[block_index].phys_link.next; + if self.can_merge_block_left(into_block_index, from_block_index) { + let from_size = self.blocks[from_block_index].size; + self.extract_block(from_block_index); + list_unlink!(self.blocks, phys_link, from_block_index); + self.recycle_block(from_block_index); + self.blocks[into_block_index].size += from_size; + } + } + + // Merge current block into the prev block. + { + let into_block_index = self.blocks[block_index].phys_link.prev; + let from_block_index = block_index; + if self.can_merge_block_left(into_block_index, from_block_index) { + let from_size = self.blocks[from_block_index].size; + self.extract_block(into_block_index); + list_unlink!(self.blocks, phys_link, from_block_index); + self.recycle_block(from_block_index); + self.blocks[into_block_index].size += from_size; + block_index = into_block_index; + } + } + + // Insert the merged free block. + self.insert_block(block_index); + } +} + +#[cfg(test)] +mod tests { + use narcissus_core::rand::Pcg64; + + use super::*; + + #[test] + fn split_and_merge() { + let mut tlsf = Tlsf::new(); + + tlsf.insert_super_block(1024, ()); + + let alloc0 = tlsf.alloc(512, 1).unwrap(); + let alloc1 = tlsf.alloc(512, 1).unwrap(); + assert!(tlsf.alloc(512, 1).is_none()); + + // Freeing should merge the blocks. + + tlsf.free(alloc0); + tlsf.free(alloc1); + + // and allow us to allocate the full size again. + let alloc2 = tlsf.alloc(1024, 1).unwrap(); + assert!(tlsf.alloc(512, 1).is_none()); + tlsf.free(alloc2); + + { + let mut allocations = (0..64) + .map(|_| tlsf.alloc(16, 1).unwrap()) + .collect::>(); + + assert!(tlsf.alloc(16, 1).is_none()); + + for allocation in allocations.drain(..).rev() { + tlsf.free(allocation); + } + } + + // and allow us to allocate the full size again. + let alloc2 = tlsf.alloc(1024, 1).unwrap(); + assert!(tlsf.alloc(512, 1).is_none()); + tlsf.free(alloc2); + + { + let mut allocations = (0..64) + .map(|_| tlsf.alloc(16, 1).unwrap()) + .collect::>(); + + assert!(tlsf.alloc(16, 1).is_none()); + + for allocation in allocations.drain(..) { + tlsf.free(allocation); + } + } + + // and allow us to allocate the full size again. 
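+        // Freeing in allocation order merges each block into its already-free
+        // predecessor, complementing the reverse-order drain above which
+        // merges each block with its successor.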
+        let alloc2 = tlsf.alloc(1024, 1).unwrap();
+        assert!(tlsf.alloc(512, 1).is_none());
+        tlsf.free(alloc2);
+    }
+
+    #[test]
+    fn multiple_super_blocks() {
+        let mut tlsf = Tlsf::new();
+
+        const NUM_SUPER_BLOCKS: u64 = 16;
+        const SUPER_BLOCK_SIZE: u64 = 10 * 1024;
+
+        const TOTAL_SIZE: u64 = NUM_SUPER_BLOCKS * SUPER_BLOCK_SIZE;
+        const ALLOCATION_SIZE: u64 = 16;
+
+        for _ in 0..NUM_SUPER_BLOCKS {
+            tlsf.insert_super_block(SUPER_BLOCK_SIZE, ());
+        }
+
+        let mut seed_rng = Pcg64::new();
+
+        for _run in 0..4 {
+            let seed = seed_rng.next_u64() as u128 | (seed_rng.next_u64() as u128) << 64;
+            let mut rng = Pcg64::with_seed(seed);
+
+            let mut allocations = (0..(TOTAL_SIZE / ALLOCATION_SIZE))
+                .map(|_| tlsf.alloc(ALLOCATION_SIZE, 1).unwrap())
+                .collect::<Vec<_>>();
+
+            rng.shuffle(allocations.as_mut_slice());
+
+            for allocation in allocations.drain(..) {
+                tlsf.free(allocation);
+            }
+        }
+    }
+
+    #[test]
+    #[should_panic]
+    fn double_free() {
+        let mut tlsf = Tlsf::new();
+        tlsf.insert_super_block(1024, ());
+        let alloc = tlsf.alloc(512, 1).unwrap();
+        tlsf.free(alloc.clone());
+        tlsf.free(alloc);
+    }
+}
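
For reference, a minimal usage sketch of the new `Tlsf` allocator driven on its own, outside the Vulkan backend. This is illustrative only: the `u32` user-data type and the sizes below are made up for the example, while `insert_super_block`, `alloc`, `offset`, `user_data` and `free` are the API added by this patch.

```rust
use narcissus_gpu::tlsf::Tlsf;

fn main() {
    // User data is anything `Copy` that should travel with each super-block;
    // the Vulkan backend stores a `VulkanAllocationInfo`, here a plain id.
    let mut tlsf: Tlsf<u32> = Tlsf::new();

    // Hand the allocator a 1 MiB super-block to sub-divide.
    tlsf.insert_super_block(1 << 20, 0);

    // Constant-time allocation of 4 KiB with 256 byte alignment.
    let allocation = tlsf.alloc(4096, 256).expect("failed to allocate");
    assert_eq!(allocation.offset() % 256, 0);
    println!(
        "super-block {} at offset {}",
        allocation.user_data(),
        allocation.offset()
    );

    // Freeing merges the block with any free physical neighbours and returns
    // it to the segregated free lists.
    tlsf.free(allocation);
}
```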