From: Joshua Simmons Date: Thu, 27 Jul 2023 09:36:06 +0000 (+0200) Subject: narcissus-gpu: Simplify allocator stats tracking X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=3b8df6b451868b19859c061053b7f49f38206c9b;p=josh%2Fnarcissus narcissus-gpu: Simplify allocator stats tracking --- diff --git a/libs/narcissus-gpu/src/backend/vulkan/allocator.rs b/libs/narcissus-gpu/src/backend/vulkan/allocator.rs index cd1f630..72bdc14 100644 --- a/libs/narcissus-gpu/src/backend/vulkan/allocator.rs +++ b/libs/narcissus-gpu/src/backend/vulkan/allocator.rs @@ -14,36 +14,82 @@ use super::{VulkanDevice, VulkanFrame, VULKAN_CONSTANTS}; type Tlsf = tlsf::Tlsf; #[derive(Default, Debug)] -pub struct VulkanMemoryHeap { - /// The calculated Tlsf super-block size for this memory heap. +pub struct VulkanHeapStats { + num_allocated_bytes: AtomicU64, + num_allocations: AtomicU32, +} + +#[derive(Default, Debug)] +pub struct VulkanAllocatorStats { + heap_stats: [VulkanHeapStats; vk::MAX_MEMORY_HEAPS as usize], + num_allocations: AtomicU32, +} + +impl VulkanAllocatorStats { + /// Returns the total number of allocations made with `vkAllocateMemory` for all + /// memory types. + fn num_allocations(&self) -> u32 { + self.num_allocations.load(Ordering::Relaxed) + } + + /// Returns the total number of bytes allocated from the given heap index. + fn num_allocated_bytes(&self, memory_heap_index: u32) -> u64 { + self.heap_stats[memory_heap_index.widen()] + .num_allocated_bytes + .load(Ordering::Relaxed) + } + + /// Update the stats with an allocation of the given size and heap index. + fn allocate(&self, memory_heap_index: u32, size: u64) { + self.num_allocations.fetch_add(1, Ordering::SeqCst); + let heap_stats = &self.heap_stats[memory_heap_index.widen()]; + heap_stats.num_allocations.fetch_add(1, Ordering::SeqCst); + heap_stats + .num_allocated_bytes + .fetch_add(size, Ordering::SeqCst); + } + + /// Update the stats with a free of the given size and heap index. + fn free(&self, memory_heap_index: u32, size: u64) { + self.num_allocations.fetch_sub(1, Ordering::SeqCst); + let heap_stats = &self.heap_stats[memory_heap_index.widen()]; + heap_stats.num_allocations.fetch_sub(1, Ordering::SeqCst); + heap_stats + .num_allocated_bytes + .fetch_sub(size, Ordering::SeqCst); + } +} + +#[derive(Default)] +pub struct VulkanAllocator { + /// The calculated Tlsf super-block size for each memory heap. /// /// Smaller heaps will require a smaller super-block size to prevent excess /// memory waste. Calculate a suitable super-block size using /// `VULKAN_CONSTANTS.tlsf_default_super_block_size` and /// `VULKAN_CONSTANTS.tlsf_small_super_block_divisor`. - tlsf_super_block_size: u64, + tlsf_super_block_size: [u64; vk::MAX_MEMORY_HEAPS as usize], - /// Total size in bytes we have allocated against this memory heap. - total_allocated_bytes: AtomicU64, -} + /// Tracker for allocation statistics used for both debugging / profiling + /// features and budget decisions. + stats: VulkanAllocatorStats, -#[derive(Default)] -pub struct VulkanMemoryType { - tlsf: Mutex, + /// Tlsf instance for each vulkan memory type. + tlsf: [Mutex; vk::MAX_MEMORY_TYPES as usize], - /// Tlsf instance used exclusively for non-linear images when the - /// `buffer_image_granularity` limit is greater than the minimum alignment - /// guaranteed by the current Tlsf configuration. - tlsf_non_linear: Mutex, -} + /// Tlsf instance for each vulkan memory type used exclusively for non-linear + /// images when `use_segregated_non_linear_allocator` is true. 
+ tlsf_non_linear: [Mutex; vk::MAX_MEMORY_TYPES as usize], -#[derive(Default)] -pub struct VulkanAllocator { - memory_heaps: [VulkanMemoryHeap; vk::MAX_MEMORY_HEAPS as usize], - memory_types: [VulkanMemoryType; vk::MAX_MEMORY_TYPES as usize], + /// Tracks all live dedicated allocations, excluding those which are used as + /// Tlsf super-blocks. dedicated: Mutex>, + + /// When the physical device `buffer_image_granularity` limit is greater than + /// the minimum alignment guaranteed by the current Tlsf configuration this will + /// be true, and `tlsf_non_linear` Tlsf instances will be used for non-linear + /// image allocations. use_segregated_non_linear_allocator: bool, - allocation_count: AtomicU32, } impl VulkanAllocator { @@ -51,32 +97,32 @@ impl VulkanAllocator { buffer_image_granularity: u64, memory_properties: &vk::PhysicalDeviceMemoryProperties, ) -> Self { - let memory_heaps = std::array::from_fn(|memory_heap_index| { + // Try to estimate a suitable Tlsf super-block size. + // Some heaps are very small and their super-block size must be scaled down + // to avoid exhausting the entire heap with one or two block allocations. + // For everything else we just use the constant super block size. + let tlsf_super_block_size = std::array::from_fn(|memory_heap_index| { let memory_heap_properties = &memory_properties.memory_heaps[memory_heap_index]; - let tlsf_super_block_size = if memory_heap_properties.size + if memory_heap_properties.size >= VULKAN_CONSTANTS.tlsf_small_super_block_divisor * VULKAN_CONSTANTS.tlsf_default_super_block_size { VULKAN_CONSTANTS.tlsf_default_super_block_size } else { memory_heap_properties.size / VULKAN_CONSTANTS.tlsf_small_super_block_divisor - }; - VulkanMemoryHeap { - tlsf_super_block_size, - total_allocated_bytes: default(), } }); // buffer_image_granularity is an additional alignment constraint for buffers // and images that are allocated adjacently. Rather than trying to handle this - // restriction cleverly, use a separate Tlsf allocator for images if - // `buffer_image_granularity` is greater than the guaranteed alignment of the - // Tlsf configuration. + // restriction within the Tlsf allocator, use a separate Tlsf instance for + // images if `buffer_image_granularity` is greater than the guaranteed + // alignment of the Tlsf configuration. 
let use_segregated_non_linear_allocator = buffer_image_granularity > tlsf::MIN_ALIGNMENT as u64; Self { - memory_heaps, + tlsf_super_block_size, use_segregated_non_linear_allocator, ..default() } @@ -172,43 +218,7 @@ impl VulkanMemory { } impl VulkanDevice { - fn free_memory(&self, memory: VulkanMemory) { - match memory { - VulkanMemory::Dedicated(dedicated) => { - self.allocator.dedicated.lock().remove(&dedicated.memory); - - let memory_heap = &self.allocator.memory_heaps[self - .physical_device_memory_properties - .memory_types[dedicated.memory_type_index.widen()] - .heap_index - .widen()]; - - memory_heap - .total_allocated_bytes - .fetch_sub(dedicated.size, Ordering::SeqCst); - - self.allocator - .allocation_count - .fetch_sub(1, Ordering::SeqCst); - - unsafe { - self.device_fn - .free_memory(self.device, dedicated.memory, None) - } - } - VulkanMemory::SubAlloc(sub_alloc) => { - let user_data = sub_alloc.allocation.user_data(); - let memory_type = &self.allocator.memory_types[user_data.memory_type_index.widen()]; - let mut tlsf = if user_data.non_linear { - memory_type.tlsf_non_linear.lock() - } else { - memory_type.tlsf.lock() - }; - tlsf.free(sub_alloc.allocation) - } - } - } - + /// Attempt to allocate a block of memory from vulkan. fn try_allocate_device_memory( &self, host_mapped: bool, @@ -216,8 +226,7 @@ impl VulkanDevice { memory_type_index: u32, memory_dedicated_allocate_info: Option<&vk::MemoryDedicatedAllocateInfo>, ) -> Option<(vk::DeviceMemory, *mut u8)> { - // Can't allocate if we would blow the global allocation limit. - if self.allocator.allocation_count.load(Ordering::Relaxed) + if self.allocator.stats.num_allocations() >= self .physical_device_properties .properties @@ -227,17 +236,19 @@ impl VulkanDevice { return None; } - let heap_index = self.physical_device_memory_properties.memory_types + let memory_heap_index = self.physical_device_memory_properties.memory_types [memory_type_index.widen()] .heap_index; let memory_heap_properties = - &self.physical_device_memory_properties.memory_heaps[heap_index.widen()]; - let memory_heap = &self.allocator.memory_heaps[heap_index.widen()]; + &self.physical_device_memory_properties.memory_heaps[memory_heap_index.widen()]; // Can't allocate if we would blow this heap's size. - let current_allocated_bytes = memory_heap.total_allocated_bytes.load(Ordering::Relaxed); - if current_allocated_bytes + size > memory_heap_properties.size { + // TODO: This should calculate a smaller budget than the heap's total + // capacity. + if self.allocator.stats.num_allocated_bytes(memory_heap_index) + size + > memory_heap_properties.size + { return None; } @@ -263,14 +274,7 @@ impl VulkanDevice { _ => panic!(), }; - // Update allocation statistics. 
- self.allocator - .allocation_count - .fetch_add(1, Ordering::AcqRel); - - memory_heap - .total_allocated_bytes - .fetch_add(size, Ordering::SeqCst); + self.allocator.stats.allocate(memory_heap_index, size); let mapped_ptr = if host_mapped { let mut data = std::ptr::null_mut(); @@ -294,41 +298,12 @@ impl VulkanDevice { self.device_fn .free_memory(self.device, user_data.memory, None); - let heap_index = self.physical_device_memory_properties.memory_types - [user_data.memory_type_index.widen()] - .heap_index; - let memory_heap = &self.allocator.memory_heaps[heap_index.widen()]; - - self.allocator - .allocation_count - .fetch_sub(1, Ordering::SeqCst); - - memory_heap - .total_allocated_bytes - .fetch_sub(memory_heap.tlsf_super_block_size, Ordering::SeqCst); - } - - #[cold] - fn emergency_gc(&self) { - for memory_type in &self.allocator.memory_types[..self - .physical_device_memory_properties - .memory_type_count - .widen()] - { - memory_type - .tlsf - .lock() - .remove_empty_super_blocks(|user_data| unsafe { - self.free_super_block(&user_data) - }); + let memory_type_index = user_data.memory_type_index.widen(); + let memory_heap_index = + self.physical_device_memory_properties.memory_types[memory_type_index].heap_index; + let size = self.allocator.tlsf_super_block_size[memory_heap_index.widen()]; - memory_type - .tlsf_non_linear - .lock() - .remove_empty_super_blocks(|user_data| unsafe { - self.free_super_block(&user_data) - }); - } + self.allocator.stats.free(memory_heap_index, size); } pub fn allocate_memory( @@ -420,9 +395,6 @@ impl VulkanDevice { continue; } - let memory_type = &self.allocator.memory_types[memory_type_index]; - let memory_heap = &self.allocator.memory_heaps[memory_heap_index]; - // Does the driver want a dedicated allocation? if memory_dedicated_requirements.requires_dedicated_allocation == vk::Bool32::True || memory_dedicated_requirements.prefers_dedicated_allocation @@ -447,32 +419,38 @@ impl VulkanDevice { // If the allocation is smaller than the Tlsf super-block size for this // allocation type, we should attempt sub-allocation. - if size <= memory_heap.tlsf_super_block_size { + if size <= self.allocator.tlsf_super_block_size[memory_heap_index] { let (non_linear, mut tlsf) = if (VULKAN_CONSTANTS .tlsf_force_segregated_non_linear_allocator || self.allocator.use_segregated_non_linear_allocator) && non_linear { - (true, memory_type.tlsf_non_linear.lock()) + ( + true, + self.allocator.tlsf_non_linear[memory_type_index].lock(), + ) } else { - (false, memory_type.tlsf.lock()) + (false, self.allocator.tlsf[memory_type_index].lock()) }; - if let Some(allocation) = tlsf.alloc(size, align) { + if let Some(allocation) = tlsf.allocate(size, align) { return VulkanMemory::SubAlloc(VulkanMemorySubAlloc { allocation, size }); } else { + let super_block_size = + self.allocator.tlsf_super_block_size[memory_heap_index]; + // When allocating backing storage for Tlsf super-blocks, ensure that all memory // is mapped if the memory type supports host mapping. This ensures we never // have to map a super-block later if an individual allocation desires it. 
if let Some((memory, mapped_ptr)) = self.try_allocate_device_memory( memory_type_property_flags .contains(vk::MemoryPropertyFlags::HOST_VISIBLE), - memory_heap.tlsf_super_block_size, + super_block_size, memory_type_index as u32, None, ) { tlsf.insert_super_block( - memory_heap.tlsf_super_block_size, + super_block_size, VulkanSuperBlockInfo { memory, mapped_ptr, @@ -485,7 +463,7 @@ impl VulkanDevice { // After inserting a new super-block we should always be able to service the // allocation request since the outer condition checks `size` <= `block_size`. - let allocation = tlsf.alloc(size, align).unwrap(); + let allocation = tlsf.allocate(size, align).unwrap(); return VulkanMemory::SubAlloc(VulkanMemorySubAlloc { allocation, @@ -521,47 +499,112 @@ impl VulkanDevice { panic!("allocation failure") } + /// Called once per frame to flush deferred allocations and release any empty + /// super-blocks. pub fn allocator_begin_frame(&self, frame: &mut VulkanFrame) { for allocation in frame.destroyed_allocations.get_mut().drain(..) { - self.free_memory(allocation); + match allocation { + VulkanMemory::Dedicated(dedicated) => { + self.allocator.dedicated.lock().remove(&dedicated.memory); + + let memory_heap_index = self.physical_device_memory_properties.memory_types + [dedicated.memory_type_index.widen()] + .heap_index; + + self.allocator.stats.free(memory_heap_index, dedicated.size); + + unsafe { + self.device_fn + .free_memory(self.device, dedicated.memory, None) + } + } + VulkanMemory::SubAlloc(sub_alloc) => { + let user_data = sub_alloc.allocation.user_data(); + let mut tlsf = if user_data.non_linear { + self.allocator.tlsf_non_linear[user_data.memory_type_index.widen()].lock() + } else { + self.allocator.tlsf[user_data.memory_type_index.widen()].lock() + }; + tlsf.free(sub_alloc.allocation) + } + } } - for memory_type in &self.allocator.memory_types[..self + let memory_type_count = self .physical_device_memory_properties .memory_type_count - .widen()] + .widen(); + + if self.allocator.use_segregated_non_linear_allocator + || VULKAN_CONSTANTS.tlsf_force_segregated_non_linear_allocator { - memory_type - .tlsf - .lock() - .remove_empty_super_blocks(|user_data| unsafe { + for tlsf in &self.allocator.tlsf_non_linear[..memory_type_count] { + tlsf.lock().remove_empty_super_blocks(|user_data| unsafe { self.free_super_block(&user_data) }); + } + } - memory_type - .tlsf_non_linear - .lock() - .remove_empty_super_blocks(|user_data| unsafe { self.free_super_block(&user_data) }) + for tlsf in &self.allocator.tlsf[..memory_type_count] { + tlsf.lock().remove_empty_super_blocks(|user_data| unsafe { + self.free_super_block(&user_data) + }); } } pub fn allocator_drop(&mut self) { - for memory_type in self.allocator.memory_types.iter_mut() { - memory_type.tlsf.get_mut().clear(|user_data| unsafe { - self.device_fn - .free_memory(self.device, user_data.memory, None) - }); - memory_type - .tlsf_non_linear - .get_mut() - .clear(|user_data| unsafe { + let memory_type_count = self + .physical_device_memory_properties + .memory_type_count + .widen(); + + if self.allocator.use_segregated_non_linear_allocator + || VULKAN_CONSTANTS.tlsf_force_segregated_non_linear_allocator + { + for tlsf in &mut self.allocator.tlsf_non_linear[..memory_type_count] { + tlsf.get_mut().clear(|user_data| unsafe { self.device_fn .free_memory(self.device, user_data.memory, None) }); + } + } + + for tlsf in &mut self.allocator.tlsf[..memory_type_count] { + tlsf.get_mut().clear(|user_data| unsafe { + self.device_fn + .free_memory(self.device, 
user_data.memory, None) + }); } for &memory in self.allocator.dedicated.get_mut().iter() { unsafe { self.device_fn.free_memory(self.device, memory, None) } } } + + /// When allocation is about to fail, this function is called to flush any empty + /// Tlsf super-blocks in an attempt to free memory before completely failing to + /// allocate. + #[cold] + fn emergency_gc(&self) { + let memory_type_count = self + .physical_device_memory_properties + .memory_type_count + .widen(); + + if self.allocator.use_segregated_non_linear_allocator + || VULKAN_CONSTANTS.tlsf_force_segregated_non_linear_allocator + { + for tlsf in &self.allocator.tlsf_non_linear[..memory_type_count] { + tlsf.lock().remove_empty_super_blocks(|user_data| unsafe { + self.free_super_block(&user_data) + }); + } + } + + for tlsf in &self.allocator.tlsf[..memory_type_count] { + tlsf.lock().remove_empty_super_blocks(|user_data| unsafe { + self.free_super_block(&user_data) + }); + } + } } diff --git a/libs/narcissus-gpu/src/tlsf.rs b/libs/narcissus-gpu/src/tlsf.rs index b4fbb03..a151bba 100644 --- a/libs/narcissus-gpu/src/tlsf.rs +++ b/libs/narcissus-gpu/src/tlsf.rs @@ -538,6 +538,23 @@ where self.free_block_head = Some(block_index); } + fn recycle_super_block(&mut self, super_block_index: SuperBlockIndex) { + let super_block = &self.super_blocks[super_block_index]; + + let block = &self.blocks[super_block.first_block_index]; + debug_assert!(block.is_free()); + debug_assert!(block.phys_link.is_unlinked()); + let block_index = super_block.first_block_index; + + // Block is free so we always need to extract it first. + self.extract_block(block_index); + self.recycle_block(block_index); + + self.super_blocks[super_block_index] = default(); + + self.free_super_blocks.push(super_block_index); + } + /// Insert a super block into the memory allocator. pub fn insert_super_block(&mut self, size: u64, user_data: T) { assert!(size != 0 && size < i32::MAX as u64); @@ -562,23 +579,6 @@ where super_block.user_data = user_data; } - fn recycle_super_block(&mut self, super_block_index: SuperBlockIndex) { - let super_block = &self.super_blocks[super_block_index]; - - let block = &self.blocks[super_block.first_block_index]; - debug_assert!(block.is_free()); - debug_assert!(block.phys_link.is_unlinked()); - let block_index = super_block.first_block_index; - - // Block is free so we always need to extract it first. - self.extract_block(block_index); - self.recycle_block(block_index); - - self.super_blocks[super_block_index] = default(); - - self.free_super_blocks.push(super_block_index); - } - /// Walk all the super blocks in this Tlsf instance, removing all empty blocks. /// /// The callback `f` will be called for each freed block, passing the user_data @@ -622,7 +622,7 @@ where } } - pub fn alloc(&mut self, size: u64, align: u64) -> Option> { + pub fn allocate(&mut self, size: u64, align: u64) -> Option> { assert!( size != 0 && align != 0 @@ -833,9 +833,9 @@ mod tests { tlsf.insert_super_block(1024, ()); - let alloc0 = tlsf.alloc(512, 1).unwrap(); - let alloc1 = tlsf.alloc(512, 1).unwrap(); - assert!(tlsf.alloc(512, 1).is_none()); + let alloc0 = tlsf.allocate(512, 1).unwrap(); + let alloc1 = tlsf.allocate(512, 1).unwrap(); + assert!(tlsf.allocate(512, 1).is_none()); // Freeing should merge the blocks. @@ -843,16 +843,16 @@ mod tests { tlsf.free(alloc1); // and allow us to allocate the full size again. 
- let alloc2 = tlsf.alloc(1024, 1).unwrap(); - assert!(tlsf.alloc(512, 1).is_none()); + let alloc2 = tlsf.allocate(1024, 1).unwrap(); + assert!(tlsf.allocate(512, 1).is_none()); tlsf.free(alloc2); { let mut allocations = (0..64) - .map(|_| tlsf.alloc(16, 1).unwrap()) + .map(|_| tlsf.allocate(16, 1).unwrap()) .collect::>(); - assert!(tlsf.alloc(16, 1).is_none()); + assert!(tlsf.allocate(16, 1).is_none()); for allocation in allocations.drain(..).rev() { tlsf.free(allocation); @@ -860,16 +860,16 @@ mod tests { } // and allow us to allocate the full size again. - let alloc2 = tlsf.alloc(1024, 1).unwrap(); - assert!(tlsf.alloc(512, 1).is_none()); + let alloc2 = tlsf.allocate(1024, 1).unwrap(); + assert!(tlsf.allocate(512, 1).is_none()); tlsf.free(alloc2); { let mut allocations = (0..64) - .map(|_| tlsf.alloc(16, 1).unwrap()) + .map(|_| tlsf.allocate(16, 1).unwrap()) .collect::>(); - assert!(tlsf.alloc(16, 1).is_none()); + assert!(tlsf.allocate(16, 1).is_none()); for allocation in allocations.drain(..) { tlsf.free(allocation); @@ -877,8 +877,8 @@ mod tests { } // and allow us to allocate the full size again. - let alloc2 = tlsf.alloc(1024, 1).unwrap(); - assert!(tlsf.alloc(512, 1).is_none()); + let alloc2 = tlsf.allocate(1024, 1).unwrap(); + assert!(tlsf.allocate(512, 1).is_none()); tlsf.free(alloc2); } @@ -903,7 +903,7 @@ mod tests { let mut rng = Pcg64::with_seed(seed); let mut allocations = (0..(TOTAL_SIZE / ALLOCATION_SIZE)) - .map(|_| tlsf.alloc(ALLOCATION_SIZE, 1).unwrap()) + .map(|_| tlsf.allocate(ALLOCATION_SIZE, 1).unwrap()) .collect::>(); rng.shuffle(allocations.as_mut_slice()); @@ -923,14 +923,14 @@ mod tests { let small_size = 30; // Make a large allocation that splits the block. - let large = tlsf.alloc(large_size, 1).unwrap(); + let large = tlsf.allocate(large_size, 1).unwrap(); // Make a small allocation to inhibit merging upon free. - tlsf.alloc(small_size, 1).unwrap(); + tlsf.allocate(small_size, 1).unwrap(); // Free the large block, if all goes well this will be added to a bin which is // large enough to service another allocation of the same size. tlsf.free(large); // Allocate another large block, if this fails we've "lost" memory. - tlsf.alloc(large_size, 1).unwrap(); + tlsf.allocate(large_size, 1).unwrap(); } #[test] @@ -938,7 +938,7 @@ mod tests { fn double_free() { let mut tlsf = Tlsf::new(); tlsf.insert_super_block(1024, ()); - let alloc = tlsf.alloc(512, 1).unwrap(); + let alloc = tlsf.allocate(512, 1).unwrap(); tlsf.free(alloc.clone()); tlsf.free(alloc); }
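
Illustrative sketch, not part of the patch: the heart of the change is routing all
allocation bookkeeping through a single `VulkanAllocatorStats` value instead of the
old per-heap `total_allocated_bytes` and global `allocation_count` fields. The
self-contained Rust sketch below mirrors that pattern so it can be compiled and
poked at in isolation. `AllocatorStats`, `HeapStats`, `MAX_MEMORY_HEAPS`, the
`as usize` casts, and `main` are stand-ins for the crate's `VulkanAllocatorStats`,
`VulkanHeapStats`, `vk::MAX_MEMORY_HEAPS`, the `.widen()` helper, and the real call
sites in `try_allocate_device_memory`, `free_super_block`, and
`allocator_begin_frame`.

    use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};

    /// Stand-in for `vk::MAX_MEMORY_HEAPS` (16 in the Vulkan headers).
    const MAX_MEMORY_HEAPS: usize = 16;

    /// Per-heap counters, updated whenever device memory is allocated or freed.
    #[derive(Default, Debug)]
    struct HeapStats {
        num_allocated_bytes: AtomicU64,
        num_allocations: AtomicU32,
    }

    /// Central statistics tracker shared by all memory types.
    #[derive(Default, Debug)]
    struct AllocatorStats {
        heap_stats: [HeapStats; MAX_MEMORY_HEAPS],
        num_allocations: AtomicU32,
    }

    impl AllocatorStats {
        /// Total number of live device memory allocations across all heaps.
        fn num_allocations(&self) -> u32 {
            self.num_allocations.load(Ordering::Relaxed)
        }

        /// Total number of bytes currently allocated from the given heap.
        fn num_allocated_bytes(&self, memory_heap_index: u32) -> u64 {
            self.heap_stats[memory_heap_index as usize]
                .num_allocated_bytes
                .load(Ordering::Relaxed)
        }

        /// Record an allocation of `size` bytes from `memory_heap_index`.
        fn allocate(&self, memory_heap_index: u32, size: u64) {
            self.num_allocations.fetch_add(1, Ordering::SeqCst);
            let heap = &self.heap_stats[memory_heap_index as usize];
            heap.num_allocations.fetch_add(1, Ordering::SeqCst);
            heap.num_allocated_bytes.fetch_add(size, Ordering::SeqCst);
        }

        /// Record that an allocation of `size` bytes was returned to `memory_heap_index`.
        fn free(&self, memory_heap_index: u32, size: u64) {
            self.num_allocations.fetch_sub(1, Ordering::SeqCst);
            let heap = &self.heap_stats[memory_heap_index as usize];
            heap.num_allocations.fetch_sub(1, Ordering::SeqCst);
            heap.num_allocated_bytes.fetch_sub(size, Ordering::SeqCst);
        }
    }

    fn main() {
        let stats = AllocatorStats::default();
        stats.allocate(0, 64 << 20); // e.g. a 64 MiB Tlsf super-block
        stats.allocate(0, 16 << 20); // e.g. a 16 MiB dedicated allocation
        stats.free(0, 16 << 20);
        assert_eq!(stats.num_allocations(), 1);
        assert_eq!(stats.num_allocated_bytes(0), 64 << 20);
    }

As in the patch, loads use `Ordering::Relaxed` while the read-modify-write updates
use `SeqCst`; the counters only feed the `maxMemoryAllocationCount` check and the
(currently whole-heap) budget check in `try_allocate_device_memory`, so slightly
stale reads appear acceptable. For the per-heap super-block sizes computed in
`VulkanAllocator::new`, the rule is: use the constant default when the heap is at
least `tlsf_small_super_block_divisor` times that size, otherwise use
`heap_size / tlsf_small_super_block_divisor`. With hypothetical values of a 64 MiB
default and a divisor of 16, a 256 MiB heap would get 16 MiB super-blocks rather
than spending a quarter of the heap on a single block.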