From: Josh Simmons Date: Sun, 26 May 2024 16:40:50 +0000 (+0200) Subject: shark: Multipass primitives X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=6a17b1620c94bef11944bb759cf46f4620682739;p=josh%2Fnarcissus shark: Multipass primitives --- diff --git a/title/shark-shaders/shaders/display_transform.comp.glsl b/title/shark-shaders/shaders/display_transform.comp.glsl index aea5280..08f6a86 100644 --- a/title/shark-shaders/shaders/display_transform.comp.glsl +++ b/title/shark-shaders/shaders/display_transform.comp.glsl @@ -10,8 +10,9 @@ struct PrimitiveUniforms { uint num_primitives_32; uint num_primitives_1024; - uint tile_stride_coarse; uint tile_stride_fine; + + uvec2 tile_offset; }; layout(std430, push_constant) uniform uniformBuffer { diff --git a/title/shark-shaders/shaders/primitive_2d.h b/title/shark-shaders/shaders/primitive_2d.h index 57064a3..d8f2261 100644 --- a/title/shark-shaders/shaders/primitive_2d.h +++ b/title/shark-shaders/shaders/primitive_2d.h @@ -10,6 +10,8 @@ #define TILE_BITMAP_L1_OFFSET_FINE 0 #define TILE_BITMAP_L0_OFFSET_FINE TILE_BITMAP_L1_WORDS +#define TILE_DISPATCH_X 8 + struct PrimitiveUniforms { uvec2 screen_resolution; uvec2 atlas_resolution; @@ -17,9 +19,9 @@ struct PrimitiveUniforms { uint num_primitives; uint num_primitives_32; uint num_primitives_1024; - - uint tile_stride_coarse; uint tile_stride_fine; + + uvec2 tile_offset_coarse; }; struct Glyph { diff --git a/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl index 80e803b..6be407c 100644 --- a/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl +++ b/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl @@ -15,7 +15,9 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; void main() { const uvec2 tile_coord = gl_GlobalInvocationID.yz; - const uvec2 tile_min = tile_coord * TILE_SIZE_COARSE; + const uvec2 tile_coord_global = tile_coord + primitive_uniforms.tile_offset_coarse; + + const uvec2 tile_min = tile_coord_global * TILE_SIZE_COARSE; const uvec2 tile_max = min(tile_min + TILE_SIZE_COARSE, primitive_uniforms.screen_resolution); const uint primitive_index = gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; @@ -27,7 +29,7 @@ void main() { uvec4 ballot_result = subgroupBallot(intersects); if (subgroupElect()) { // managed democracy wins again - const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride_coarse + tile_coord.x; + const uint tile_index = tile_coord.y * TILE_DISPATCH_X + tile_coord.x; const uint tile_offset = tile_index * TILE_STRIDE_COARSE; coarse_bitmap_wo[tile_offset + 2 * gl_WorkGroupID.x + 0] = ballot_result.x; coarse_bitmap_wo[tile_offset + 2 * gl_WorkGroupID.x + 1] = ballot_result.y; diff --git a/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl index 9efb59b..80f0b23 100644 --- a/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl +++ b/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl @@ -15,16 +15,20 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; void main() { const uvec2 tile_coord = gl_GlobalInvocationID.yz; - const uvec2 tile_min = tile_coord * TILE_SIZE_FINE; + const uint tile_index = tile_coord.y * (TILE_DISPATCH_X << TILE_SIZE_SHIFT) + tile_coord.x; + + const uvec2 tile_coord_global = tile_coord + (primitive_uniforms.tile_offset_coarse << TILE_SIZE_SHIFT); + const uint tile_index_global = tile_coord_global.y * primitive_uniforms.tile_stride_fine + tile_coord_global.x; + + const uvec2 tile_min = tile_coord_global * TILE_SIZE_FINE; const uvec2 tile_max = min(tile_min + TILE_SIZE_FINE, primitive_uniforms.screen_resolution); - const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride_fine + tile_coord.x; const uint index = gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; uint bitmap_l0 = 0; if (index < primitive_uniforms.num_primitives_32) { const uvec2 tile_coord_coarse = tile_coord >> TILE_SIZE_SHIFT; - const uint tile_index_coarse = tile_coord_coarse.y * primitive_uniforms.tile_stride_coarse + tile_coord_coarse.x; + const uint tile_index_coarse = tile_coord_coarse.y * TILE_DISPATCH_X + tile_coord_coarse.x; const uint tile_base_coarse = tile_index_coarse * TILE_STRIDE_COARSE; const uint tile_bitmap_base_coarse = tile_base_coarse + TILE_BITMAP_OFFSET_COARSE; @@ -55,7 +59,7 @@ void main() { const uint count = uint(ballot_result.x != 0) + uint(ballot_result.y != 0); if (count != 0) { - atomicAdd(fine_count_wo[tile_index], count); + atomicAdd(fine_count_wo[tile_index_global], count); } } } diff --git a/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl b/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl index 0ee9f47..71e447f 100644 --- a/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl +++ b/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl @@ -11,14 +11,18 @@ layout (local_size_x = TILE_SIZE_FINE, local_size_y = TILE_SIZE_FINE, local_size void main() { const uvec2 tile_coord = gl_WorkGroupID.xy; - const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride_fine + tile_coord.x; + const uint tile_index = tile_coord.y * (TILE_DISPATCH_X << TILE_SIZE_SHIFT) + tile_coord.x; + + const uvec2 tile_coord_global = tile_coord + (primitive_uniforms.tile_offset_coarse << TILE_SIZE_SHIFT); + const uint tile_index_global = tile_coord_global.y * primitive_uniforms.tile_stride_fine + tile_coord_global.x; + const uint tile_base_fine = tile_index * TILE_STRIDE_FINE; const uint tile_bitmap_l1_base_fine = tile_base_fine + TILE_BITMAP_L1_OFFSET_FINE; const uint tile_bitmap_l0_base_fine = tile_base_fine + TILE_BITMAP_L0_OFFSET_FINE; vec4 accum = vec4(0.0); - uint word_count = fine_count_ro[tile_index]; + uint word_count = fine_count_ro[tile_index_global]; // For each tile, iterate over all words in the L1 bitmap. for (int index_l1 = 0; word_count != 0 && index_l1 < primitive_uniforms.num_primitives_1024; index_l1++) { @@ -47,7 +51,7 @@ void main() { const Glyph gl = glyphs[gi.index]; const vec2 glyph_min = gi.position + gl.offset_min; const vec2 glyph_max = gi.position + gl.offset_max; - const vec2 sample_center = gl_GlobalInvocationID.xy + vec2(0.5); + const vec2 sample_center = gl_GlobalInvocationID.xy + primitive_uniforms.tile_offset_coarse * TILE_SIZE_COARSE + vec2(0.5); if (all(greaterThanEqual(sample_center, glyph_min)) && all(lessThanEqual(sample_center, glyph_max))) { const vec2 glyph_size = gl.offset_max - gl.offset_min; const vec4 color = unpackUnorm4x8(gi.color).bgra; @@ -60,5 +64,5 @@ void main() { } } - imageStore(ui_image, ivec2(gl_GlobalInvocationID.xy), accum); + imageStore(ui_image, ivec2(gl_GlobalInvocationID.xy + primitive_uniforms.tile_offset_coarse * TILE_SIZE_COARSE), accum); } diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs index b90401f..9825556 100644 --- a/title/shark/src/main.rs +++ b/title/shark/src/main.rs @@ -32,7 +32,8 @@ use spring::simple_spring_damper_exact; use crate::pipelines::basic::BasicUniforms; use crate::pipelines::primitive_2d::{ - PrimitiveUniforms, TILE_SIZE_COARSE, TILE_SIZE_FINE, TILE_STRIDE_COARSE, TILE_STRIDE_FINE, + PrimitiveUniforms, TILE_DISPATCH_COARSE_X, TILE_DISPATCH_COARSE_Y, TILE_DISPATCH_FINE_X, + TILE_DISPATCH_FINE_Y, TILE_SIZE_COARSE, TILE_SIZE_FINE, TILE_STRIDE_COARSE, TILE_STRIDE_FINE, }; mod fonts; @@ -884,6 +885,30 @@ impl<'gpu> DrawState<'gpu> { let models = Models::load(gpu); let images = Images::load(gpu, thread_token); + let coarse_bitmap_buffer_size = TILE_DISPATCH_COARSE_X + * TILE_DISPATCH_COARSE_Y + * TILE_STRIDE_COARSE + * std::mem::size_of::() as u32; + + let fine_bitmap_buffer_size = TILE_DISPATCH_FINE_X + * TILE_DISPATCH_FINE_Y + * TILE_STRIDE_FINE + * std::mem::size_of::() as u32; + + let coarse_tile_bitmap_buffer = gpu.create_buffer(&BufferDesc { + memory_location: MemoryLocation::Device, + host_mapped: false, + usage: BufferUsageFlags::STORAGE, + size: coarse_bitmap_buffer_size.widen(), + }); + + let fine_tile_bitmap_buffer = gpu.create_buffer(&BufferDesc { + memory_location: MemoryLocation::Device, + host_mapped: false, + usage: BufferUsageFlags::STORAGE, + size: fine_bitmap_buffer_size.widen(), + }); + Self { gpu, basic_pipeline, @@ -898,8 +923,8 @@ impl<'gpu> DrawState<'gpu> { depth_image: default(), rt_image: default(), ui_image: default(), - coarse_tile_bitmap_buffer: default(), - fine_tile_bitmap_buffer: default(), + coarse_tile_bitmap_buffer, + fine_tile_bitmap_buffer, fine_tile_color_buffer: default(), glyph_atlas_image: default(), samplers, @@ -1021,39 +1046,13 @@ impl<'gpu> DrawState<'gpu> { || tile_resolution_fine_x != self.tile_resolution_fine_x || tile_resolution_fine_y != self.tile_resolution_fine_y { - gpu.destroy_buffer(frame, self.coarse_tile_bitmap_buffer); - gpu.destroy_buffer(frame, self.fine_tile_bitmap_buffer); gpu.destroy_buffer(frame, self.fine_tile_color_buffer); - let coarse_bitmap_buffer_size = tile_resolution_coarse_x - * tile_resolution_coarse_y - * TILE_STRIDE_COARSE - * std::mem::size_of::() as u32; - - let fine_bitmap_buffer_size = tile_resolution_fine_x - * tile_resolution_fine_y - * TILE_STRIDE_FINE - * std::mem::size_of::() as u32; - // align to the workgroup size to simplify shader let fine_color_buffer_size = ((tile_resolution_fine_x * tile_resolution_fine_y + 63) & !63) * std::mem::size_of::() as u32; - self.coarse_tile_bitmap_buffer = gpu.create_buffer(&BufferDesc { - memory_location: MemoryLocation::Device, - host_mapped: false, - usage: BufferUsageFlags::STORAGE, - size: coarse_bitmap_buffer_size.widen(), - }); - - self.fine_tile_bitmap_buffer = gpu.create_buffer(&BufferDesc { - memory_location: MemoryLocation::Device, - host_mapped: false, - usage: BufferUsageFlags::STORAGE, - size: fine_bitmap_buffer_size.widen(), - }); - self.fine_tile_color_buffer = gpu.create_buffer(&BufferDesc { memory_location: MemoryLocation::Device, host_mapped: false, @@ -1419,7 +1418,7 @@ impl<'gpu> DrawState<'gpu> { gpu.cmd_dispatch( cmd_encoder, - (self.tile_resolution_coarse_x * self.tile_resolution_coarse_y + 63) / 64, + (self.tile_resolution_fine_x * self.tile_resolution_fine_y + 63) / 64, 1, 1, ); @@ -1430,8 +1429,33 @@ impl<'gpu> DrawState<'gpu> { ui_state.primitive_instances.clear(); - for _pass_y in 0..1 { - for _pass_x in 0..1 { + for tile_offset_y in + (0..self.tile_resolution_coarse_y).step_by(TILE_DISPATCH_COARSE_Y as usize) + { + for tile_offset_x in + (0..self.tile_resolution_coarse_x).step_by(TILE_DISPATCH_COARSE_X as usize) + { + let coarse_dispatch_x = (tile_offset_x + TILE_DISPATCH_COARSE_X) + .min(self.tile_resolution_coarse_x) + - tile_offset_x; + let coarse_dispatch_y = (tile_offset_y + TILE_DISPATCH_COARSE_Y) + .min(self.tile_resolution_coarse_y) + - tile_offset_y; + + let tile_offset_fine_x = + tile_offset_x * (TILE_SIZE_COARSE / TILE_SIZE_FINE); + + let tile_offset_fine_y = + tile_offset_y * (TILE_SIZE_COARSE / TILE_SIZE_FINE); + + let fine_dispatch_x = (tile_offset_fine_x + TILE_DISPATCH_FINE_X) + .min(self.tile_resolution_fine_x) + - tile_offset_fine_x; + + let fine_dispatch_y = (tile_offset_fine_y + TILE_DISPATCH_FINE_Y) + .min(self.tile_resolution_fine_y) + - tile_offset_fine_y; + gpu.cmd_set_pipeline( cmd_encoder, self.primitive_2d_pipeline.coarse_bin_pipeline, @@ -1449,16 +1473,17 @@ impl<'gpu> DrawState<'gpu> { num_primitives, num_primitives_32, num_primitives_1024, - tile_stride_coarse: self.tile_resolution_coarse_x, tile_stride_fine: self.tile_resolution_fine_x, + tile_offset_x, + tile_offset_y, }, ); gpu.cmd_dispatch( cmd_encoder, (num_primitives + 63) / 64, - self.tile_resolution_coarse_x, - self.tile_resolution_coarse_y, + coarse_dispatch_x, + coarse_dispatch_y, ); gpu.cmd_barrier( @@ -1478,8 +1503,8 @@ impl<'gpu> DrawState<'gpu> { gpu.cmd_dispatch( cmd_encoder, (num_primitives_32 + 63) / 64, - self.tile_resolution_fine_x, - self.tile_resolution_fine_y, + fine_dispatch_x, + fine_dispatch_y, ); gpu.cmd_barrier( @@ -1496,12 +1521,7 @@ impl<'gpu> DrawState<'gpu> { self.primitive_2d_pipeline.rasterize_pipeline, ); - gpu.cmd_dispatch( - cmd_encoder, - self.tile_resolution_fine_x, - self.tile_resolution_fine_y, - 1, - ); + gpu.cmd_dispatch(cmd_encoder, fine_dispatch_x, fine_dispatch_y, 1); } } } diff --git a/title/shark/src/pipelines/primitive_2d.rs b/title/shark/src/pipelines/primitive_2d.rs index c65e791..56d9538 100644 --- a/title/shark/src/pipelines/primitive_2d.rs +++ b/title/shark/src/pipelines/primitive_2d.rs @@ -14,6 +14,11 @@ pub const TILE_BITMAP_WORDS_L0: u32 = MAX_PRIMS / 32; pub const TILE_STRIDE_COARSE: u32 = TILE_BITMAP_WORDS_L0; pub const TILE_STRIDE_FINE: u32 = TILE_BITMAP_WORDS_L0 + TILE_BITMAP_WORDS_L1; +pub const TILE_DISPATCH_COARSE_X: u32 = 8; +pub const TILE_DISPATCH_COARSE_Y: u32 = 5; +pub const TILE_DISPATCH_FINE_X: u32 = TILE_DISPATCH_COARSE_X * (TILE_SIZE_COARSE / TILE_SIZE_FINE); +pub const TILE_DISPATCH_FINE_Y: u32 = TILE_DISPATCH_COARSE_Y * (TILE_SIZE_COARSE / TILE_SIZE_FINE); + #[allow(unused)] #[repr(C)] pub struct PrimitiveUniforms { @@ -26,8 +31,10 @@ pub struct PrimitiveUniforms { pub num_primitives_32: u32, pub num_primitives_1024: u32, - pub tile_stride_coarse: u32, pub tile_stride_fine: u32, + + pub tile_offset_x: u32, + pub tile_offset_y: u32, } #[allow(unused)]