From: Joshua Simmons Date: Mon, 29 May 2023 07:10:03 +0000 (+0200) Subject: narcissus-gpu: Add TLSF Allocator X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=db3ac721514a214ecfebd7b705aea303b513d2df;p=josh%2Fnarcissus narcissus-gpu: Add TLSF Allocator --- diff --git a/bins/narcissus/src/helpers.rs b/bins/narcissus/src/helpers.rs index b0612cc..16c86de 100644 --- a/bins/narcissus/src/helpers.rs +++ b/bins/narcissus/src/helpers.rs @@ -101,21 +101,19 @@ pub fn create_buffer_with_data( where T: Blittable, { - let len = data.len() * std::mem::size_of::(); - let buffer = device.create_buffer(&BufferDesc { - location: MemoryLocation::HostMapped, - usage, - size: len, - }); - // SAFETY: T: Blittable which implies it's freely convertable to a byte - // slice. + // SAFETY: T: Blittable which implies it's freely convertable to a byte slice. unsafe { - let dst = std::slice::from_raw_parts_mut(device.map_buffer(buffer), len); - let src = std::slice::from_raw_parts(data.as_ptr() as *const u8, len); - dst.copy_from_slice(src); - device.unmap_buffer(buffer); + let len = data.len() * std::mem::size_of::(); + let initial_data = std::slice::from_raw_parts(data.as_ptr() as *const u8, len); + device.create_buffer_with_data( + &BufferDesc { + location: MemoryLocation::HostMapped, + usage, + size: len, + }, + initial_data, + ) } - buffer } pub fn create_image_with_data( diff --git a/libs/narcissus-gpu/src/backend/vulkan/mod.rs b/libs/narcissus-gpu/src/backend/vulkan/mod.rs index 8984bae..78360f6 100644 --- a/libs/narcissus-gpu/src/backend/vulkan/mod.rs +++ b/libs/narcissus-gpu/src/backend/vulkan/mod.rs @@ -1,8 +1,8 @@ use std::{ cell::{Cell, RefCell, UnsafeCell}, - collections::{hash_map::Entry, HashMap, VecDeque}, + collections::{hash_map::Entry, HashMap, HashSet, VecDeque}, marker::PhantomData, - os::raw::{c_char, c_void}, + os::raw::c_char, ptr::NonNull, sync::atomic::{AtomicU64, Ordering}, }; @@ -17,15 +17,18 @@ use narcissus_core::{ use vulkan_sys as vk; use crate::{ - delay_queue::DelayQueue, frame_counter::FrameCounter, Access, Bind, BindGroupLayout, - BindGroupLayoutDesc, BindingType, BlendMode, Buffer, BufferDesc, BufferImageCopy, - BufferUsageFlags, ClearValue, CmdBuffer, CompareOp, ComputePipelineDesc, CullingMode, Device, - Extent2d, Extent3d, Frame, FrontFace, GlobalBarrier, GpuConcurrent, GraphicsPipelineDesc, - Image, ImageAspectFlags, ImageBarrier, ImageBlit, ImageDesc, ImageDimension, ImageFormat, - ImageLayout, ImageSubresourceLayers, ImageSubresourceRange, ImageUsageFlags, ImageViewDesc, - IndexType, LoadOp, MemoryLocation, Offset2d, Offset3d, Pipeline, PolygonMode, Sampler, - SamplerAddressMode, SamplerCompareOp, SamplerDesc, SamplerFilter, ShaderStageFlags, StencilOp, - StencilOpState, StoreOp, SwapchainOutOfDateError, ThreadToken, Topology, TypedBind, + delay_queue::DelayQueue, + frame_counter::FrameCounter, + tlsf::{self, Tlsf}, + Access, Bind, BindGroupLayout, BindGroupLayoutDesc, BindingType, BlendMode, Buffer, BufferDesc, + BufferImageCopy, BufferUsageFlags, ClearValue, CmdBuffer, CompareOp, ComputePipelineDesc, + CullingMode, Device, Extent2d, Extent3d, Frame, FrontFace, GlobalBarrier, GpuConcurrent, + GraphicsPipelineDesc, Image, ImageAspectFlags, ImageBarrier, ImageBlit, ImageDesc, + ImageDimension, ImageFormat, ImageLayout, ImageSubresourceLayers, ImageSubresourceRange, + ImageUsageFlags, ImageViewDesc, IndexType, LoadOp, MemoryLocation, Offset2d, Offset3d, + Pipeline, PolygonMode, Sampler, SamplerAddressMode, SamplerCompareOp, SamplerDesc, + SamplerFilter, 
ShaderStageFlags, StencilOp, StencilOpState, StoreOp, SwapchainOutOfDateError, + ThreadToken, Topology, TypedBind, }; const NUM_FRAMES: usize = 2; @@ -727,6 +730,7 @@ fn vulkan_image_memory_barrier( struct VulkanBuffer { memory: VulkanMemory, buffer: vk::Buffer, + map_count: u64, } #[derive(Clone)] @@ -814,6 +818,12 @@ struct VulkanPresentInfo { image_index: u32, } +#[derive(Clone, Copy)] +struct VulkanAllocationInfo { + memory: vk::DeviceMemory, + mapped_ptr: *mut u8, +} + enum VulkanMemoryDedicatedDesc { Image(vk::Image), Buffer(vk::Buffer), @@ -822,15 +832,71 @@ enum VulkanMemoryDedicatedDesc { struct VulkanMemoryDesc { requirements: vk::MemoryRequirements, memory_location: MemoryLocation, - dedicated: Option, _linear: bool, } #[derive(Clone)] -struct VulkanMemory { +struct VulkanMemoryDedicated { memory: vk::DeviceMemory, - offset: u64, + mapped_ptr: *mut u8, size: u64, + memory_type_index: u32, +} + +#[derive(Clone)] +struct VulkanMemorySubAlloc { + allocation: tlsf::Allocation, + size: u64, + memory_type_index: u32, +} + +#[derive(Clone)] +enum VulkanMemory { + Dedicated(VulkanMemoryDedicated), + SubAlloc(VulkanMemorySubAlloc), +} + +impl VulkanMemory { + #[inline(always)] + fn device_memory(&self) -> vk::DeviceMemory { + match self { + VulkanMemory::Dedicated(dedicated) => dedicated.memory, + VulkanMemory::SubAlloc(sub_alloc) => sub_alloc.allocation.user_data().memory, + } + } + + #[inline(always)] + fn offset(&self) -> u64 { + match self { + VulkanMemory::Dedicated(_) => 0, + VulkanMemory::SubAlloc(sub_alloc) => sub_alloc.allocation.offset(), + } + } + + #[inline(always)] + fn size(&self) -> u64 { + match self { + VulkanMemory::Dedicated(dedicated) => dedicated.size, + VulkanMemory::SubAlloc(sub_alloc) => sub_alloc.size, + } + } + + #[inline(always)] + fn mapped_ptr(&self) -> *mut u8 { + match self { + VulkanMemory::Dedicated(dedicated) => dedicated.mapped_ptr, + VulkanMemory::SubAlloc(sub_alloc) => { + let user_data = sub_alloc.allocation.user_data(); + if user_data.mapped_ptr.is_null() { + std::ptr::null_mut() + } else { + user_data + .mapped_ptr + .wrapping_add(sub_alloc.allocation.offset() as usize) + } + } + } + } } #[derive(Clone)] @@ -890,6 +956,12 @@ impl VulkanFrame { } } +#[derive(Default)] +struct VulkanAllocator { + tlsf: Mutex>, + dedicated: Mutex>, +} + type SwapchainDestroyQueue = DelayQueue<(vk::SwapchainKHR, vk::SurfaceKHR, Box<[vk::ImageView]>)>; pub(crate) struct VulkanDevice { @@ -920,6 +992,8 @@ pub(crate) struct VulkanDevice { recycled_semaphores: Mutex>, recycled_descriptor_pools: Mutex>, + allocators: [Option>; vk::MAX_MEMORY_TYPES as usize], + _global_fn: vk::GlobalFunctions, instance_fn: vk::InstanceFunctions, xcb_surface_fn: Option, @@ -1239,6 +1313,14 @@ impl VulkanDevice { }) })); + let allocators = std::array::from_fn(|i| { + if i < physical_device_memory_properties.memory_type_count as usize { + Some(default()) + } else { + None + } + }); + Self { instance, physical_device, @@ -1266,6 +1348,8 @@ impl VulkanDevice { recycled_semaphores: default(), recycled_descriptor_pools: default(), + allocators, + _global_fn: global_fn, instance_fn, xcb_surface_fn, @@ -1320,7 +1404,11 @@ impl VulkanDevice { .0 } - fn allocate_memory(&self, desc: &VulkanMemoryDesc) -> VulkanMemory { + fn allocate_memory_dedicated( + &self, + desc: &VulkanMemoryDesc, + dedicated_desc: &VulkanMemoryDedicatedDesc, + ) -> VulkanMemory { let memory_property_flags = match desc.memory_location { MemoryLocation::HostMapped => vk::MemoryPropertyFlags::HOST_VISIBLE, MemoryLocation::Device => 
vk::MemoryPropertyFlags::DEVICE_LOCAL, @@ -1329,7 +1417,9 @@ impl VulkanDevice { let memory_type_index = self.find_memory_type_index(desc.requirements.memory_type_bits, memory_property_flags); - let mut dedicated_allocate_info = vk::MemoryDedicatedAllocateInfo::default(); + let allocator = self.allocators[memory_type_index as usize] + .as_ref() + .expect("returned a memory type index that has no associated allocator"); let mut allocate_info = vk::MemoryAllocateInfo { allocation_size: desc.requirements.size, @@ -1337,29 +1427,119 @@ impl VulkanDevice { ..default() }; - if let Some(dedicated) = &desc.dedicated { - match dedicated { - &VulkanMemoryDedicatedDesc::Image(image) => { - dedicated_allocate_info.image = image; - } - &VulkanMemoryDedicatedDesc::Buffer(buffer) => { - dedicated_allocate_info.buffer = buffer - } + let mut dedicated_allocate_info = vk::MemoryDedicatedAllocateInfo::default(); + + match *dedicated_desc { + VulkanMemoryDedicatedDesc::Image(image) => { + dedicated_allocate_info.image = image; } - allocate_info._next = - &dedicated_allocate_info as *const vk::MemoryDedicatedAllocateInfo as *const _ + VulkanMemoryDedicatedDesc::Buffer(buffer) => dedicated_allocate_info.buffer = buffer, } + allocate_info._next = + &dedicated_allocate_info as *const vk::MemoryDedicatedAllocateInfo as *const _; let mut memory = vk::DeviceMemory::null(); vk_check!(self .device_fn .allocate_memory(self.device, &allocate_info, None, &mut memory)); - VulkanMemory { + allocator.dedicated.lock().insert(memory); + + let mapped_ptr = if self.physical_device_memory_properties.memory_types + [memory_type_index as usize] + .property_flags + .contains(vk::MemoryPropertyFlags::HOST_VISIBLE) + { + let mut data = std::ptr::null_mut(); + vk_check!(self.device_fn.map_memory( + self.device, + memory, + 0, + vk::WHOLE_SIZE, + vk::MemoryMapFlags::default(), + &mut data + )); + data as *mut u8 + } else { + std::ptr::null_mut() + }; + + VulkanMemory::Dedicated(VulkanMemoryDedicated { memory, - offset: 0, + mapped_ptr, size: desc.requirements.size, - } + memory_type_index, + }) + } + + fn allocate_memory(&self, desc: &VulkanMemoryDesc) -> VulkanMemory { + let memory_property_flags = match desc.memory_location { + MemoryLocation::HostMapped => vk::MemoryPropertyFlags::HOST_VISIBLE, + MemoryLocation::Device => vk::MemoryPropertyFlags::DEVICE_LOCAL, + }; + + let memory_type_index = + self.find_memory_type_index(desc.requirements.memory_type_bits, memory_property_flags); + + let allocator = self.allocators[memory_type_index as usize] + .as_ref() + .expect("returned a memory type index that has no associated allocator"); + + let mut tlsf = allocator.tlsf.lock(); + + let allocation = { + if let Some(allocation) = + tlsf.alloc(desc.requirements.size, desc.requirements.alignment) + { + allocation + } else { + const BLOCK_SIZE: u64 = 128 * 1024 * 1024; + + let allocate_info = vk::MemoryAllocateInfo { + allocation_size: BLOCK_SIZE, + memory_type_index, + ..default() + }; + + let mut memory = vk::DeviceMemory::null(); + vk_check!(self.device_fn.allocate_memory( + self.device, + &allocate_info, + None, + &mut memory + )); + + let mapped_ptr = if self.physical_device_memory_properties.memory_types + [memory_type_index as usize] + .property_flags + .contains(vk::MemoryPropertyFlags::HOST_VISIBLE) + { + let mut data = std::ptr::null_mut(); + vk_check!(self.device_fn.map_memory( + self.device, + memory, + 0, + vk::WHOLE_SIZE, + vk::MemoryMapFlags::default(), + &mut data + )); + data as *mut u8 + } else { + std::ptr::null_mut() + }; + 
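+                // Register the new 128 MiB device memory block (and its
+                // persistent mapping, if the memory type is host-visible) with
+                // the TLSF allocator as a fresh super-block, then retry the
+                // allocation, which is expected to succeed for any request
+                // smaller than BLOCK_SIZE.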
+ tlsf.insert_super_block(BLOCK_SIZE, VulkanAllocationInfo { memory, mapped_ptr }); + + tlsf.alloc(desc.requirements.size, desc.requirements.alignment) + .expect("failed to allocate") + } + }; + + VulkanMemory::SubAlloc(VulkanMemorySubAlloc { + allocation, + size: desc.requirements.size, + memory_type_index, + }) } fn request_descriptor_pool(&self) -> vk::DescriptorPool { @@ -1416,68 +1596,7 @@ impl VulkanDevice { semaphore } - fn destroy_deferred( - device_fn: &vk::DeviceFunctions, - device: vk::Device, - frame: &mut VulkanFrame, - ) { - for pipeline_layout in frame.destroyed_pipeline_layouts.get_mut().drain(..) { - unsafe { device_fn.destroy_pipeline_layout(device, pipeline_layout, None) } - } - for pipeline in frame.destroyed_pipelines.get_mut().drain(..) { - unsafe { device_fn.destroy_pipeline(device, pipeline, None) } - } - for descriptor_set_layout in frame.destroyed_descriptor_set_layouts.get_mut().drain(..) { - unsafe { device_fn.destroy_descriptor_set_layout(device, descriptor_set_layout, None) } - } - for sampler in frame.destroyed_samplers.get_mut().drain(..) { - unsafe { device_fn.destroy_sampler(device, sampler, None) } - } - for image_view in frame.destroyed_image_views.get_mut().drain(..) { - unsafe { device_fn.destroy_image_view(device, image_view, None) } - } - for image in frame.destroyed_images.get_mut().drain(..) { - unsafe { device_fn.destroy_image(device, image, None) } - } - for buffer_view in frame.destroyed_buffer_views.get_mut().drain(..) { - unsafe { device_fn.destroy_buffer_view(device, buffer_view, None) } - } - for buffer in frame.destroyed_buffers.get_mut().drain(..) { - unsafe { device_fn.destroy_buffer(device, buffer, None) } - } - for memory in frame.destroyed_allocations.get_mut().drain(..) { - unsafe { device_fn.free_memory(device, memory.memory, None) }; - } - } - - fn destroy_swapchain_deferred( - &self, - surface: vk::SurfaceKHR, - swapchain: vk::SwapchainKHR, - image_views: &[vk::ImageView], - ) { - let device_fn = &self.device_fn; - let swapchain_fn = &self.swapchain_fn; - let surface_fn = &self.surface_fn; - let instance = self.instance; - let device = self.device; - - if !image_views.is_empty() { - for &image_view in image_views { - unsafe { device_fn.destroy_image_view(device, image_view, None) } - } - } - if !swapchain.is_null() { - unsafe { swapchain_fn.destroy_swapchain(device, swapchain, None) } - } - if !surface.is_null() { - unsafe { surface_fn.destroy_surface(instance, surface, None) } - } - } -} - -impl Device for VulkanDevice { - fn create_buffer(&self, desc: &BufferDesc) -> Buffer { + fn create_buffer(&self, desc: &BufferDesc, initial_data: Option<&[u8]>) -> Buffer { let mut usage = vk::BufferUsageFlags::default(); if desc.usage.contains(BufferUsageFlags::UNIFORM) { usage |= vk::BufferUsageFlags::UNIFORM_BUFFER; @@ -1525,54 +1644,122 @@ impl Device for VulkanDevice { &mut memory_requirements, ); - let dedicated = - if memory_dedicated_requirements.prefers_dedicated_allocation == vk::Bool32::True { - Some(VulkanMemoryDedicatedDesc::Buffer(buffer)) - } else { - None - }; + let memory = if memory_dedicated_requirements.prefers_dedicated_allocation + == vk::Bool32::True + || memory_dedicated_requirements.requires_dedicated_allocation == vk::Bool32::True + { + self.allocate_memory_dedicated( + &VulkanMemoryDesc { + requirements: memory_requirements.memory_requirements, + memory_location: desc.location, + _linear: true, + }, + &VulkanMemoryDedicatedDesc::Buffer(buffer), + ) + } else { + self.allocate_memory(&VulkanMemoryDesc { + requirements: 
memory_requirements.memory_requirements, + memory_location: desc.location, + _linear: true, + }) + }; - let memory = self.allocate_memory(&VulkanMemoryDesc { - requirements: memory_requirements.memory_requirements, - memory_location: desc.location, - dedicated, - _linear: true, - }); + if let Some(initial_data) = initial_data { + assert!(!memory.mapped_ptr().is_null()); + // SAFETY: The memory has just been allocated, so as long as the pointer is + // non-null, then we can create a slice for it. + unsafe { + let dst = + std::slice::from_raw_parts_mut(memory.mapped_ptr(), memory.size() as usize); + dst.copy_from_slice(initial_data); + } + } unsafe { self.device_fn.bind_buffer_memory2( self.device, &[vk::BindBufferMemoryInfo { buffer, - memory: memory.memory, - offset: memory.offset, + memory: memory.device_memory(), + offset: memory.offset(), ..default() }], ) }; - let handle = self - .buffer_pool - .lock() - .insert(VulkanBuffer { memory, buffer }); + let handle = self.buffer_pool.lock().insert(VulkanBuffer { + memory, + buffer, + map_count: 0, + }); Buffer(handle) } - fn create_buffer_with_data(&self, desc: &BufferDesc, initial_data: &[u8]) -> Buffer { - let len = initial_data.len(); + fn destroy_deferred( + device_fn: &vk::DeviceFunctions, + device: vk::Device, + frame: &mut VulkanFrame, + ) { + for pipeline_layout in frame.destroyed_pipeline_layouts.get_mut().drain(..) { + unsafe { device_fn.destroy_pipeline_layout(device, pipeline_layout, None) } + } + for pipeline in frame.destroyed_pipelines.get_mut().drain(..) { + unsafe { device_fn.destroy_pipeline(device, pipeline, None) } + } + for descriptor_set_layout in frame.destroyed_descriptor_set_layouts.get_mut().drain(..) { + unsafe { device_fn.destroy_descriptor_set_layout(device, descriptor_set_layout, None) } + } + for sampler in frame.destroyed_samplers.get_mut().drain(..) { + unsafe { device_fn.destroy_sampler(device, sampler, None) } + } + for image_view in frame.destroyed_image_views.get_mut().drain(..) { + unsafe { device_fn.destroy_image_view(device, image_view, None) } + } + for image in frame.destroyed_images.get_mut().drain(..) { + unsafe { device_fn.destroy_image(device, image, None) } + } + for buffer_view in frame.destroyed_buffer_views.get_mut().drain(..) { + unsafe { device_fn.destroy_buffer_view(device, buffer_view, None) } + } + for buffer in frame.destroyed_buffers.get_mut().drain(..) 
{ + unsafe { device_fn.destroy_buffer(device, buffer, None) } + } + } - assert!(len <= desc.size, "initial data larger than buffer"); - assert!(desc.location == MemoryLocation::HostMapped); - let buffer = self.create_buffer(desc); + fn destroy_swapchain_deferred( + &self, + surface: vk::SurfaceKHR, + swapchain: vk::SwapchainKHR, + image_views: &[vk::ImageView], + ) { + let device_fn = &self.device_fn; + let swapchain_fn = &self.swapchain_fn; + let surface_fn = &self.surface_fn; + let instance = self.instance; + let device = self.device; - unsafe { - let dst = std::slice::from_raw_parts_mut(self.map_buffer(buffer), len); - dst.copy_from_slice(initial_data); - self.unmap_buffer(buffer); + if !image_views.is_empty() { + for &image_view in image_views { + unsafe { device_fn.destroy_image_view(device, image_view, None) } + } + } + if !swapchain.is_null() { + unsafe { swapchain_fn.destroy_swapchain(device, swapchain, None) } + } + if !surface.is_null() { + unsafe { surface_fn.destroy_surface(instance, surface, None) } } + } +} + +impl Device for VulkanDevice { + fn create_buffer(&self, desc: &BufferDesc) -> Buffer { + self.create_buffer(desc, None) + } - buffer + fn create_buffer_with_data(&self, desc: &BufferDesc, initial_data: &[u8]) -> Buffer { + self.create_buffer(desc, Some(initial_data)) } fn create_image(&self, desc: &ImageDesc) -> Image { @@ -1669,27 +1856,33 @@ impl Device for VulkanDevice { &mut memory_requirements, ); - let dedicated = - if memory_dedicated_requirements.prefers_dedicated_allocation == vk::Bool32::True { - Some(VulkanMemoryDedicatedDesc::Image(image)) - } else { - None - }; - - let memory = self.allocate_memory(&VulkanMemoryDesc { - requirements: memory_requirements.memory_requirements, - memory_location: desc.location, - dedicated, - _linear: true, - }); + let memory = if memory_dedicated_requirements.prefers_dedicated_allocation + == vk::Bool32::True + || memory_dedicated_requirements.requires_dedicated_allocation == vk::Bool32::True + { + self.allocate_memory_dedicated( + &VulkanMemoryDesc { + requirements: memory_requirements.memory_requirements, + memory_location: desc.location, + _linear: true, + }, + &VulkanMemoryDedicatedDesc::Image(image), + ) + } else { + self.allocate_memory(&VulkanMemoryDesc { + requirements: memory_requirements.memory_requirements, + memory_location: desc.location, + _linear: true, + }) + }; unsafe { self.device_fn.bind_image_memory2( self.device, &[vk::BindImageMemoryInfo { image, - memory: memory.memory, - offset: memory.offset, + memory: memory.device_memory(), + offset: memory.offset(), ..default() }], ) @@ -2113,6 +2306,10 @@ impl Device for VulkanDevice { fn destroy_buffer(&self, frame: &Frame, buffer: Buffer) { if let Some(buffer) = self.buffer_pool.lock().remove(buffer.0) { + assert_eq!( + buffer.map_count, 0, + "destroying a buffer that is still mapped" + ); let frame = self.frame(frame); frame.destroyed_buffers.lock().push_back(buffer.buffer); frame.destroyed_allocations.lock().push_back(buffer.memory); @@ -2931,6 +3128,24 @@ impl Device for VulkanDevice { Self::destroy_deferred(device_fn, device, frame); + for allocation in frame.destroyed_allocations.get_mut().drain(..) 
{ + match allocation { + VulkanMemory::Dedicated(dedicated) => { + let allocator = self.allocators[dedicated.memory_type_index as usize] + .as_ref() + .unwrap(); + allocator.dedicated.lock().remove(&dedicated.memory); + unsafe { device_fn.free_memory(device, dedicated.memory, None) } + } + VulkanMemory::SubAlloc(sub_alloc) => { + let allocator = self.allocators[sub_alloc.memory_type_index as usize] + .as_ref() + .unwrap(); + allocator.tlsf.lock().free(sub_alloc.allocation) + } + } + } + self.destroyed_swapchains .lock() .expire(|(swapchain, surface, image_views)| { @@ -3010,25 +3225,17 @@ impl Device for VulkanDevice { } unsafe fn map_buffer(&self, buffer: Buffer) -> *mut u8 { - let mut ptr = std::ptr::null_mut(); - if let Some(buffer) = self.buffer_pool.lock().get(buffer.0) { - vk_check!(self.device_fn.map_memory( - self.device, - buffer.memory.memory, - buffer.memory.offset, - buffer.memory.size, - vk::MemoryMapFlags::default(), - &mut ptr - )) - } - std::mem::transmute::<*mut c_void, *mut u8>(ptr) + let mut buffer_pool = self.buffer_pool.lock(); + let buffer = buffer_pool.get_mut(buffer.0).unwrap(); + buffer.map_count += 1; + buffer.memory.mapped_ptr() } unsafe fn unmap_buffer(&self, buffer: Buffer) { - if let Some(buffer) = self.buffer_pool.lock().get(buffer.0) { - self.device_fn - .unmap_memory(self.device, buffer.memory.memory) - } + let mut buffer_pool = self.buffer_pool.lock(); + let buffer = buffer_pool.get_mut(buffer.0).unwrap(); + assert!(buffer.map_count > 0); + buffer.map_count -= 1; } fn acquire_swapchain( @@ -3449,19 +3656,16 @@ impl Drop for VulkanDevice { for buffer in self.buffer_pool.get_mut().values() { unsafe { device_fn.destroy_buffer(device, buffer.buffer, None) } - unsafe { device_fn.free_memory(device, buffer.memory.memory, None) } } { let mut image_views = Vec::new(); let mut images = Vec::new(); - let mut memories = Vec::new(); for image in self.image_pool.get_mut().values() { match image { VulkanImageHolder::Unique(image) => { image_views.push(image.view); images.push(image.image.image); - memories.push(image.image.memory.memory); } VulkanImageHolder::Shared(image) => { image_views.push(image.view); @@ -3479,10 +3683,6 @@ impl Drop for VulkanDevice { for image in images { unsafe { device_fn.destroy_image(device, image, None) } } - - for memory in memories { - unsafe { device_fn.free_memory(device, memory, None) } - } } for sampler in self.sampler_pool.get_mut().values() { @@ -3541,6 +3741,23 @@ impl Drop for VulkanDevice { unsafe { self.surface_fn.destroy_surface(instance, surface, None) } } + for allocator in self.allocators.iter_mut().flatten() { + // Clear out all memory blocks held by the TLSF allocators. + let tlsf = allocator.tlsf.get_mut(); + for super_block in tlsf.super_blocks() { + unsafe { + self.device_fn + .free_memory(device, super_block.user_data.memory, None) + } + } + + // Clear out all dedicated allocations. 
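+            // Any dedicated allocations still registered with an allocator at
+            // this point belong to resources that were never explicitly
+            // destroyed, so free their device memory directly.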
+ let dedicated = allocator.dedicated.get_mut(); + for memory in dedicated.iter().copied() { + unsafe { self.device_fn.free_memory(device, memory, None) } + } + } + unsafe { device_fn.destroy_device(device, None) } unsafe { self.instance_fn.destroy_instance(self.instance, None) }; } diff --git a/libs/narcissus-gpu/src/lib.rs b/libs/narcissus-gpu/src/lib.rs index 1e64970..7105184 100644 --- a/libs/narcissus-gpu/src/lib.rs +++ b/libs/narcissus-gpu/src/lib.rs @@ -8,6 +8,7 @@ use narcissus_core::{ mod backend; mod delay_queue; mod frame_counter; +pub mod tlsf; pub enum DeviceBackend { Vulkan, diff --git a/libs/narcissus-gpu/src/tlsf.rs b/libs/narcissus-gpu/src/tlsf.rs new file mode 100644 index 0000000..6508150 --- /dev/null +++ b/libs/narcissus-gpu/src/tlsf.rs @@ -0,0 +1,724 @@ +//! Two Level Seggregated Fit Allocator +//! === +//! +//! [TLSF][tlsf] is a constant time, low fragmentation good-fit allocator based +//! on seggregated free-lists with a two-level bitmap acceleration structure. +//! +//! Memory is allocated by the underlying allocator into super-blocks, +//! representing large chunks of contiguous memory. The allocation routines +//! then work on blocks, which subdivide those regions. +//! +//! In order to quickly find a large-enough block, free blocks are stored in a +//! set of seggregated free-lists by their size. The requirements for a binning +//! strategy are as follows; +//! +//! 1) Must have a bounded number of bins. +//! +//! 2) Must be fast to find the bin for a given size. +//! +//! 3) Bin sizes must closely match allocation sizes to minimise fragmentation. +//! +//! For these purposes we use a [linear-log][linearlog] strategy for binning. An +//! initial 'linear' bin is divided into N sub-bins, then power-of-two sized +//! bins follow, also divided into N sub-bins. With some simple bit arithmetic +//! we can calculate the bucket for a given size. +//! +//! For example, if the initial linear region was 16, and the number of sub-bins +//! was 4, we would end up with a layout something like the following. +//! +//! ```text +//! 1..=4 5..=8 9..=12 13..=16 +//! +------------+------------+------------+------------+ +//! Linear Region | 0x01 | 0x00 | 0x00 | 0x00 | +//! +------------+------------+------------+------------+ +//! +//! 17..=20 21..=24 25..=28 29..=32 +//! +------------+------------+------------+------------+ +//! 2^4 | 0x00 | 0x00 | 0x00 | 0x00 | +//! +------------+------------+------------+------------+ +//! +//! 31..=40 41..=48 49..=56 57..=64 +//! +------------+------------+------------+------------+ +//! 2^5 | 0x00 | 0x00 | 0x00 | 0x00 | +//! +------------+------------+------------+------------+ +//! +//! 65..=80 81..=96 97..=112 113..=128 +//! +------------+------------+------------+------------+ +//! 2^6 | 0x01 | 0x00 | 0x04 | 0x00 | +//! +------------+------------+------------+------------+ +//! +//! ``` +//! +//! In order to avoid linearly scanning the free-lists to find suitable empty +//! blocks, we maintain a two-level bitmap acceleration structure. The first +//! level has a bit set for each non-empty bin, then the second level likewise +//! has a bit set for each non-empty sub-bin. From there it's possible to scan +//! with bit arithmetic to find the first suitable non-empty block without +//! traversing the entire free-lists structure. +//! +//! ```text +//! +//! +---+---+---+---+ +//! Level 0: | 1 | 0 | 0 | 1 | 0x9 +//! +-+-+-+-+-+-+-+-+ +//! | | | | +//! | | | | +//! | | | | +//! | | | | +---+---+---+---+ +//! 
Level 1: | | | +----->| 0 | 0 | 0 | 1 | Linear Region 0x1 +//! | | | +---+---+---+---+ +//! | | | +//! | | | +---+---+---+---+ +//! | | +--------->| 0 | 0 | 0 | 0 | 2^4 0x0 +//! | | +---+---+---+---+ +//! | | +//! | | +---+---+---+---+ +//! | +------------->| 0 | 0 | 0 | 0 | 2^5 0x0 +//! | +---+---+---+---+ +//! | +//! | +---+---+---+---+ +//! +----------------->| 0 | 1 | 0 | 0 | 2^6 0x4 +//! +---+---+---+---+ +//! +//! ``` +//! +//! [tlsf]: http://www.gii.upv.es/tlsf/files/spe_2008.pdf +//! [linearlog]: https://pvk.ca/Blog/2015/06/27/linear-log-bucketing-fast-versatile-simple/ + +use std::{ + num::NonZeroU32, + ops::{Index, IndexMut}, +}; + +use narcissus_core::{linear_log_binning, static_assert}; + +// The log2 of the size of the 'linear' bin. +pub const LINEAR_LOG2: u32 = 7; // 2^7 = 128 + +// The log2 of the number of sub-bins in each bin. +pub const SUB_BINS_LOG2: u32 = 5; // 2^5 = 32 + +type Bin = linear_log_binning::Bin; + +pub const BIN_COUNT: usize = 24; +pub const SUB_BIN_COUNT: usize = 1 << SUB_BINS_LOG2; + +static_assert!(SUB_BIN_COUNT <= u32::BITS as usize); +static_assert!(BIN_COUNT <= u32::BITS as usize); + +pub const MIN_BLOCK_SIZE: u32 = 16; + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +struct SuperBlockIndex(u32); + +pub struct SuperBlock +where + T: Copy, +{ + _first_block_index: BlockIndex, + pub user_data: T, +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +struct BlockIndex(NonZeroU32); + +const INVALID_BLOCK_INDEX: BlockIndex = BlockIndex(match NonZeroU32::new(0xffff_ffff) { + Some(x) => x, + None => panic!(), +}); + +struct BlockLink { + prev: BlockIndex, + next: BlockIndex, +} + +impl BlockLink { + /// Create a new unlinked BlockLink for the given `block_index`. + const fn new(block_index: BlockIndex) -> Self { + Self { + prev: block_index, + next: block_index, + } + } + + /// Returns true if the given link is not inserted into any list. + fn is_unlinked(&self) -> bool { + self.prev == self.next + } +} + +/// Insert the node at index `$insert` before the node at index `$x` for the +/// list given by `$storage` and `$link_name`. +macro_rules! list_insert_before { + ($storage:expr, $link_name:ident, $x:expr, $insert:expr) => { + $storage[$insert].$link_name.prev = $storage[$x].$link_name.prev; + $storage[$insert].$link_name.next = $x; + let prev_index = $storage[$insert].$link_name.prev; + $storage[prev_index].$link_name.next = $insert; + let next_index = $storage[$insert].$link_name.next; + $storage[next_index].$link_name.prev = $insert; + }; +} + +/// Insert the node at index `$insert` after the node at index `$x` for the +/// list given by `$storage` and `$link_name`. +macro_rules! list_insert_after { + ($storage:expr, $link_name:ident, $x:expr, $insert:expr) => { + $storage[$insert].$link_name.prev = $x; + $storage[$insert].$link_name.next = $storage[$x].$link_name.next; + let prev_index = $storage[$insert].$link_name.prev; + $storage[prev_index].$link_name.next = $insert; + let next_index = $storage[$insert].$link_name.next; + $storage[next_index].$link_name.prev = $insert; + }; +} + +/// Unlink the node`$x` for the list given by `$storage` and `$link_name`. +macro_rules! 
list_unlink { + ($storage:expr, $link_name:ident, $x:expr) => { + let prev_index = $storage[$x].$link_name.prev; + $storage[prev_index].$link_name.next = $storage[$x].$link_name.next; + let next_index = $storage[$x].$link_name.next; + $storage[next_index].$link_name.prev = $storage[$x].$link_name.prev; + $storage[$x].$link_name.prev = $x; + $storage[$x].$link_name.next = $x; + }; +} + +struct Block { + size: u32, + offset: u32, + generation: u32, + super_block_index: SuperBlockIndex, + + free_link: BlockLink, + phys_link: BlockLink, +} + +const DUMMY_BLOCK: Block = Block { + generation: 0xffff_ffff, + size: 0xffff_ffff, + offset: 0xffff_ffff, + free_link: BlockLink::new(INVALID_BLOCK_INDEX), + phys_link: BlockLink::new(INVALID_BLOCK_INDEX), + super_block_index: SuperBlockIndex(0xffff_ffff), +}; + +impl Block { + fn is_used(&self) -> bool { + self.generation & 1 == 1 + } + + fn is_free(&self) -> bool { + self.generation & 1 == 0 + } +} + +impl Index for Vec { + type Output = Block; + + #[inline(always)] + fn index(&self, index: BlockIndex) -> &Self::Output { + &self[index.0.get() as usize] + } +} + +impl IndexMut for Vec { + #[inline(always)] + fn index_mut(&mut self, index: BlockIndex) -> &mut Self::Output { + &mut self[index.0.get() as usize] + } +} + +impl Index for Vec> +where + T: Copy, +{ + type Output = SuperBlock; + + #[inline(always)] + fn index(&self, index: SuperBlockIndex) -> &Self::Output { + &self[index.0 as usize] + } +} + +impl IndexMut for Vec> +where + T: Copy, +{ + #[inline(always)] + fn index_mut(&mut self, index: SuperBlockIndex) -> &mut Self::Output { + &mut self[index.0 as usize] + } +} + +#[derive(Clone)] +pub struct Allocation { + block_index: BlockIndex, + generation: u32, + offset: u64, + user_data: T, +} + +impl Allocation { + pub fn user_data(&self) -> &T { + &self.user_data + } + + /// Returns the offset into the super-block where this allocation starts. + pub fn offset(&self) -> u64 { + self.offset + } +} + +pub struct Tlsf +where + T: Copy, +{ + bitmap_0: u32, + bitmap_1: [u32; BIN_COUNT], + empty_block_heads: [Option; SUB_BIN_COUNT * BIN_COUNT], + + free_block_head: Option, + blocks: Vec, + + super_blocks: Vec>, +} + +impl Default for Tlsf +where + T: Copy, +{ + fn default() -> Self { + Self::new() + } +} + +impl Tlsf +where + T: Copy, +{ + pub fn new() -> Self { + Self { + bitmap_0: 0, + bitmap_1: [0; BIN_COUNT], + empty_block_heads: [None; SUB_BIN_COUNT * BIN_COUNT], + free_block_head: None, + blocks: vec![DUMMY_BLOCK], + super_blocks: vec![], + } + } + + /// Returns a slice containing all the super_blocks added to the allocator. + /// Only the `user_data` field is accessible. + pub fn super_blocks(&self) -> &[SuperBlock] { + &self.super_blocks + } + + /// Clear the allocator state. + /// + /// Make sure to clean up any super blocks before calling this. + pub fn clear(&mut self) { + self.bitmap_0 = 0; + self.bitmap_1.fill(0); + self.empty_block_heads.fill(None); + self.free_block_head = None; + self.blocks.clear(); + self.blocks.push(DUMMY_BLOCK); + self.super_blocks.clear() + } + + /// Search the acceleration structure for a non-empty list from the given + /// bin. + /// + /// Returns the bin index if a non-empty list is found, or None. + fn search_non_empty_bin(&self, starting_bin: Bin) -> Option { + let mut bin = starting_bin.bin(); + let sub_bin = starting_bin.sub_bin(); + + // First we scan the second-level bitmap from sub_bin, masking out the earlier + // sub-bins so we don't end up returning a bin that's too small for the + // allocation. 
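+        //
+        // For example, with sub_bin = 3 the mask `!0 << sub_bin` is
+        // 0xffff_fff8, which keeps sub-bins 3 and above and discards
+        // sub-bins 0, 1 and 2.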
+ let mut second_level = self.bitmap_1[bin as usize] & (!0 << sub_bin); + + // If that search failed, then we must scan the first-level bitmap from the next + // bin forward. If we find anything here it cannot possibly be smaller than the + // requested allocation. + if second_level == 0 { + let first_level = self.bitmap_0 & (!0 << (bin + 1)); + + // If that search also failed, there's no suitable blocks. + if first_level == 0 { + return None; + } + + // Recalculate the bin from the first level bitmap. + bin = first_level.trailing_zeros(); + second_level = self.bitmap_1[bin as usize]; + } + + // Find the sub-bin from the second level bitmap. + let sub_bin = second_level.trailing_zeros(); + Some(Bin::new(bin, sub_bin)) + } + + /// Marks a given bin as containing empty blocks in the bitmap acceleration + /// structure. + fn set_metadata_bit(&mut self, bin: Bin) { + let sub_bin = bin.sub_bin(); + let bin = bin.bin() as usize; + self.bitmap_0 |= 1 << bin; + self.bitmap_1[bin] |= 1 << sub_bin; + } + + /// Marks a given bin as containing no empty blocks in the bitmap acceleration + /// structure. + fn clear_metadata_bit(&mut self, bin: Bin) { + let sub_bin = bin.sub_bin(); + let bin = bin.bin() as usize; + self.bitmap_1[bin] &= !(1 << sub_bin); + if self.bitmap_1[bin] == 0 { + self.bitmap_0 &= !(1 << bin); + } + } + + /// Inserts a block into the empty blocks lists. + fn insert_block(&mut self, block_index: BlockIndex) { + debug_assert!(self.blocks[block_index].is_free()); + debug_assert!(self.blocks[block_index].free_link.is_unlinked()); + + let (_, bin) = Bin::from_size_round_down(self.blocks[block_index].size); + let bin_index = bin.index(); + + if let Some(empty_block_index) = self.empty_block_heads[bin_index] { + list_insert_before!(self.blocks, free_link, empty_block_index, block_index); + } else { + self.set_metadata_bit(bin); + } + + self.empty_block_heads[bin_index] = Some(block_index); + } + + /// Removes a block from the empty blocks lists. + fn extract_block(&mut self, block_index: BlockIndex) { + debug_assert!(self.blocks[block_index].is_free()); + + let (_, bin) = Bin::from_size_round_down(self.blocks[block_index].size); + + let bin_index = bin.index(); + + debug_assert!(self.empty_block_heads[bin_index].is_some()); + + if self.empty_block_heads[bin_index] == Some(block_index) { + let next_index = self.blocks[block_index].free_link.next; + if next_index != block_index { + self.empty_block_heads[bin_index] = Some(next_index); + } else { + self.empty_block_heads[bin_index] = None; + self.clear_metadata_bit(bin); + } + } + + list_unlink!(self.blocks, free_link, block_index); + } + + /// Returns true if we should merge `from_block_index` into `into_block_index`. + fn can_merge_block_left( + &self, + into_block_index: BlockIndex, + from_block_index: BlockIndex, + ) -> bool { + // Cannot merge into ourselves. + if into_block_index == from_block_index { + return false; + } + + // Cannot merge the first block in a physical range into the last block. + // This check is necessary because the linked lists are cyclic. + if self.blocks[from_block_index].offset == 0 { + return false; + } + + // Cannot merge blocks that are in-use. + if self.blocks[into_block_index].is_used() || self.blocks[from_block_index].is_used() { + return false; + } + + true + } + + /// Requests a new block, and returns its `BlockIndex`. 
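+    ///
+    /// A previously recycled block slot is reused from the internal free list
+    /// when one is available; otherwise a new slot is pushed onto `blocks`.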
+ fn request_block( + &mut self, + offset: u32, + size: u32, + super_block_index: SuperBlockIndex, + ) -> BlockIndex { + let block_index = if let Some(free_block_index) = self.free_block_head { + let next_index = self.blocks[free_block_index].free_link.next; + self.free_block_head = if next_index != free_block_index { + Some(next_index) + } else { + None + }; + list_unlink!(self.blocks, free_link, free_block_index); + free_block_index + } else { + assert!(self.blocks.len() < i32::MAX as usize); + let block_index = BlockIndex(NonZeroU32::new(self.blocks.len() as u32).unwrap()); + self.blocks.push(Block { + generation: 0, + size, + offset, + free_link: BlockLink::new(block_index), + phys_link: BlockLink::new(block_index), + super_block_index, + }); + block_index + }; + + let block = &mut self.blocks[block_index]; + block.offset = offset; + block.size = size; + block.super_block_index = super_block_index; + + block_index + } + + /// Recycles the block indicated by `block_index` for re-use. + fn recycle_block(&mut self, block_index: BlockIndex) { + let block = &mut self.blocks[block_index]; + debug_assert!(block.free_link.is_unlinked()); + debug_assert!(block.phys_link.is_unlinked()); + + block.size = 0xffff_ffff; + block.offset = 0xffff_ffff; + + if let Some(free_block_index) = self.free_block_head { + list_insert_before!(self.blocks, free_link, free_block_index, block_index); + } + + self.free_block_head = Some(block_index); + } + + pub fn insert_super_block(&mut self, size: u64, user_data: T) { + assert!(size != 0 && size < i32::MAX as u64); + assert!(self.super_blocks.len() < i32::MAX as usize); + + // Ranges checked in asserts above. + let size = size as u32; + let len = self.super_blocks.len() as u32; + + let super_block_index = SuperBlockIndex(len); + let block_index = self.request_block(0, size, super_block_index); + + self.super_blocks.push(SuperBlock { + // The block at offset 0 in a SuperBlock will never be merged away, so the index + // is stable and we can store it in the SuperBlock itself. + _first_block_index: block_index, + user_data, + }); + + self.insert_block(block_index); + } + + pub fn alloc(&mut self, size: u64, align: u64) -> Option> { + assert!( + size != 0 + && align != 0 + && align < i32::MAX as u64 + && size < (i32::MAX as u64 - align) + && align.is_power_of_two() + ); + let size = size.max(MIN_BLOCK_SIZE as u64); + let size = if align > MIN_BLOCK_SIZE as u64 { + size - 1 + align + } else { + size + } as u32; + + // We need to find the bin which contains only empty-blocks large enough for the + // given size because we unconditionally use the first empty block found. So + // this must round up. + let (rounded_size, starting_bin) = Bin::from_size_round_up(size); + + let Some(bin) = self.search_non_empty_bin(starting_bin) else { + return None; + }; + + let block_index = self.empty_block_heads[bin.index()].unwrap(); + + debug_assert!( + self.blocks[block_index].is_free() && self.blocks[block_index].size >= rounded_size + ); + + self.extract_block(block_index); + + let remainder = self.blocks[block_index].size - rounded_size; + let super_block_index = self.blocks[block_index].super_block_index; + + // Should we should split the block? 
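+        //
+        // For example, a 256 byte free block servicing a 96 byte request
+        // leaves a 160 byte remainder, which is at least MIN_BLOCK_SIZE and so
+        // is split off, linked after the allocated block in the physical list,
+        // and returned to the free lists.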
+ if remainder >= MIN_BLOCK_SIZE { + self.blocks[block_index].size -= remainder; + let offset = self.blocks[block_index].offset + rounded_size; + let new_block_index = self.request_block(offset, remainder, super_block_index); + list_insert_after!(self.blocks, phys_link, block_index, new_block_index); + self.insert_block(new_block_index); + } + + let generation = self.blocks[block_index].generation.wrapping_add(1); + self.blocks[block_index].generation = generation; + + let user_data = self.super_blocks[super_block_index].user_data; + // The mask is a no-op if the alignment is already met, do it unconditionally. + let offset = (self.blocks[block_index].offset as u64 + align - 1) & !(align - 1); + + Some(Allocation { + block_index, + generation, + offset, + user_data, + }) + } + + pub fn free(&mut self, allocation: Allocation) { + let mut block_index = allocation.block_index; + let generation = self.blocks[block_index].generation; + assert_eq!(generation, allocation.generation, "double-free"); + self.blocks[block_index].generation = generation.wrapping_add(1); + + // Merge next block into the current block. + { + let into_block_index = block_index; + let from_block_index = self.blocks[block_index].phys_link.next; + if self.can_merge_block_left(into_block_index, from_block_index) { + let from_size = self.blocks[from_block_index].size; + self.extract_block(from_block_index); + list_unlink!(self.blocks, phys_link, from_block_index); + self.recycle_block(from_block_index); + self.blocks[into_block_index].size += from_size; + } + } + + // Merge current block into the prev block. + { + let into_block_index = self.blocks[block_index].phys_link.prev; + let from_block_index = block_index; + if self.can_merge_block_left(into_block_index, from_block_index) { + let from_size = self.blocks[from_block_index].size; + self.extract_block(into_block_index); + list_unlink!(self.blocks, phys_link, from_block_index); + self.recycle_block(from_block_index); + self.blocks[into_block_index].size += from_size; + block_index = into_block_index; + } + } + + // Insert the merged free block. + self.insert_block(block_index); + } +} + +#[cfg(test)] +mod tests { + use narcissus_core::rand::Pcg64; + + use super::*; + + #[test] + fn split_and_merge() { + let mut tlsf = Tlsf::new(); + + tlsf.insert_super_block(1024, ()); + + let alloc0 = tlsf.alloc(512, 1).unwrap(); + let alloc1 = tlsf.alloc(512, 1).unwrap(); + assert!(tlsf.alloc(512, 1).is_none()); + + // Freeing should merge the blocks. + + tlsf.free(alloc0); + tlsf.free(alloc1); + + // and allow us to allocate the full size again. + let alloc2 = tlsf.alloc(1024, 1).unwrap(); + assert!(tlsf.alloc(512, 1).is_none()); + tlsf.free(alloc2); + + { + let mut allocations = (0..64) + .map(|_| tlsf.alloc(16, 1).unwrap()) + .collect::>(); + + assert!(tlsf.alloc(16, 1).is_none()); + + for allocation in allocations.drain(..).rev() { + tlsf.free(allocation); + } + } + + // and allow us to allocate the full size again. + let alloc2 = tlsf.alloc(1024, 1).unwrap(); + assert!(tlsf.alloc(512, 1).is_none()); + tlsf.free(alloc2); + + { + let mut allocations = (0..64) + .map(|_| tlsf.alloc(16, 1).unwrap()) + .collect::>(); + + assert!(tlsf.alloc(16, 1).is_none()); + + for allocation in allocations.drain(..) { + tlsf.free(allocation); + } + } + + // and allow us to allocate the full size again. 
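+        // Freeing in allocation order merges each block into its already-free
+        // predecessor, complementing the reverse-order drain above which
+        // merges each block with its successor.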
+        let alloc2 = tlsf.alloc(1024, 1).unwrap();
+        assert!(tlsf.alloc(512, 1).is_none());
+        tlsf.free(alloc2);
+    }
+
+    #[test]
+    fn multiple_super_blocks() {
+        let mut tlsf = Tlsf::new();
+
+        const NUM_SUPER_BLOCKS: u64 = 16;
+        const SUPER_BLOCK_SIZE: u64 = 10 * 1024;
+
+        const TOTAL_SIZE: u64 = NUM_SUPER_BLOCKS * SUPER_BLOCK_SIZE;
+        const ALLOCATION_SIZE: u64 = 16;
+
+        for _ in 0..NUM_SUPER_BLOCKS {
+            tlsf.insert_super_block(SUPER_BLOCK_SIZE, ());
+        }
+
+        let mut seed_rng = Pcg64::new();
+
+        for _run in 0..4 {
+            let seed = seed_rng.next_u64() as u128 | (seed_rng.next_u64() as u128) << 64;
+            let mut rng = Pcg64::with_seed(seed);
+
+            let mut allocations = (0..(TOTAL_SIZE / ALLOCATION_SIZE))
+                .map(|_| tlsf.alloc(ALLOCATION_SIZE, 1).unwrap())
+                .collect::<Vec<_>>();
+
+            rng.shuffle(allocations.as_mut_slice());
+
+            for allocation in allocations.drain(..) {
+                tlsf.free(allocation);
+            }
+        }
+    }
+
+    #[test]
+    #[should_panic]
+    fn double_free() {
+        let mut tlsf = Tlsf::new();
+        tlsf.insert_super_block(1024, ());
+        let alloc = tlsf.alloc(512, 1).unwrap();
+        tlsf.free(alloc.clone());
+        tlsf.free(alloc);
+    }
+}
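
For reference, a minimal usage sketch of the new `Tlsf` allocator driven on its own, outside the Vulkan backend. This is illustrative only: the `u32` user-data type and the sizes below are made up for the example, while `insert_super_block`, `alloc`, `offset`, `user_data` and `free` are the API added by this patch.

```rust
use narcissus_gpu::tlsf::Tlsf;

fn main() {
    // User data is anything `Copy` that should travel with each super-block;
    // the Vulkan backend stores a `VulkanAllocationInfo`, here a plain id.
    let mut tlsf: Tlsf<u32> = Tlsf::new();

    // Hand the allocator a 1 MiB super-block to sub-divide.
    tlsf.insert_super_block(1 << 20, 0);

    // Constant-time allocation of 4 KiB with 256 byte alignment.
    let allocation = tlsf.alloc(4096, 256).expect("failed to allocate");
    assert_eq!(allocation.offset() % 256, 0);
    println!(
        "super-block {} at offset {}",
        allocation.user_data(),
        allocation.offset()
    );

    // Freeing merges the block with any free physical neighbours and returns
    // it to the segregated free lists.
    tlsf.free(allocation);
}
```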