From 2fd8493a11367fedd042a416adc1a27b988b0002 Mon Sep 17 00:00:00 2001 From: Josh Simmons Date: Sun, 2 Jun 2024 14:52:54 +0200 Subject: [PATCH] shark: Track range of non-zero L1 buckets --- title/shark-shaders/build.rs | 4 +++ .../shaders/display_transform.comp.glsl | 28 +++++++++++++--- title/shark-shaders/shaders/primitive_2d.h | 7 ++-- .../shaders/primitive_2d_bin.comp.glsl | 5 +++ .../shaders/primitive_2d_bin_clear.comp.glsl | 19 +++++++++++ .../shaders/primitive_2d_rasterize.comp.glsl | 22 ++++++++----- title/shark/src/main.rs | 33 ++++++++++++++++--- .../shark/src/pipelines/display_transform.rs | 2 ++ title/shark/src/pipelines/primitive_2d.rs | 12 ++++++- 9 files changed, 110 insertions(+), 22 deletions(-) create mode 100644 title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl diff --git a/title/shark-shaders/build.rs b/title/shark-shaders/build.rs index af40540..623cd4d 100644 --- a/title/shark-shaders/build.rs +++ b/title/shark-shaders/build.rs @@ -22,6 +22,10 @@ const SHADERS: &[Shader] = &[ stage: "comp", name: "primitive_2d_bin", }, + Shader { + stage: "comp", + name: "primitive_2d_bin_clear", + }, Shader { stage: "comp", name: "primitive_2d_rasterize", diff --git a/title/shark-shaders/shaders/display_transform.comp.glsl b/title/shark-shaders/shaders/display_transform.comp.glsl index f927713..5f1bf32 100644 --- a/title/shark-shaders/shaders/display_transform.comp.glsl +++ b/title/shark-shaders/shaders/display_transform.comp.glsl @@ -2,6 +2,12 @@ #extension GL_EXT_control_flow_attributes : require +const uint MAX_PRIMS = 1 << 18; +const uint TILE_BITMAP_L1_WORDS = (MAX_PRIMS / 32 / 32); +const uint TILE_BITMAP_L0_WORDS = (MAX_PRIMS / 32); +const uint TILE_STRIDE = (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS + 2); +const uint TILE_BITMAP_RANGE_OFFSET = 0; + struct PrimitiveUniforms { uvec2 screen_resolution; uvec2 atlas_resolution; @@ -20,10 +26,14 @@ layout (set = 0, binding = 0) uniform sampler bilinear_sampler; layout (set = 0, binding = 1) uniform texture3D tony_mc_mapface_lut; -layout (set = 0, binding = 2, rgba16f) uniform readonly image2D layer_rt; -layout (set = 0, binding = 3, rgba16f) uniform readonly image2D layer_ui; +layout(std430, set = 0, binding = 2) readonly buffer tileBufferRead { + uint tile_bitmap_ro[]; +}; + +layout (set = 0, binding = 3, rgba16f) uniform readonly image2D layer_rt; +layout (set = 0, binding = 4, rgba16f) uniform readonly image2D layer_ui; -layout (set = 0, binding = 4, rgba16f) uniform writeonly image2D composited_output; +layout (set = 0, binding = 5, rgba16f) uniform writeonly image2D composited_output; float srgb_oetf(float a) { return (.0031308f >= a) ? 12.92f * a : 1.055f * pow(a, .4166666666666667f) - .055f; @@ -47,8 +57,16 @@ void main() { const vec3 transformed = tony_mc_mapface(stimulus); vec3 composited = srgb_oetf(transformed); - const vec4 ui = imageLoad(layer_ui, ivec2(gl_GlobalInvocationID.xy)).rgba; - composited = ui.rgb + (composited * (1.0 - ui.a)); + const uvec2 tile_coord = gl_WorkGroupID.xy / 4; + const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride + tile_coord.x; + const uint tile_base = tile_index * TILE_STRIDE; + + const uint first = tile_bitmap_ro[tile_base + TILE_BITMAP_RANGE_OFFSET + 0]; + const uint last = tile_bitmap_ro[tile_base + TILE_BITMAP_RANGE_OFFSET + 1]; + if (first <= last) { + const vec4 ui = imageLoad(layer_ui, ivec2(gl_GlobalInvocationID.xy)).rgba; + composited = ui.rgb + (composited * (1.0 - ui.a)); + } imageStore(composited_output, ivec2(gl_GlobalInvocationID.xy), vec4(composited, 1.0)); } diff --git a/title/shark-shaders/shaders/primitive_2d.h b/title/shark-shaders/shaders/primitive_2d.h index 46df2fd..231cc6f 100644 --- a/title/shark-shaders/shaders/primitive_2d.h +++ b/title/shark-shaders/shaders/primitive_2d.h @@ -3,9 +3,10 @@ const uint TILE_SIZE = 32; const uint MAX_PRIMS = 1 << 18; const uint TILE_BITMAP_L1_WORDS = (MAX_PRIMS / 32 / 32); const uint TILE_BITMAP_L0_WORDS = (MAX_PRIMS / 32); -const uint TILE_STRIDE = (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS); -const uint TILE_BITMAP_L1_OFFSET = 0; -const uint TILE_BITMAP_L0_OFFSET = TILE_BITMAP_L1_WORDS; +const uint TILE_STRIDE = (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS + 2); +const uint TILE_BITMAP_RANGE_OFFSET = 0; +const uint TILE_BITMAP_L1_OFFSET = 2; +const uint TILE_BITMAP_L0_OFFSET = TILE_BITMAP_L1_OFFSET + TILE_BITMAP_L1_WORDS; struct PrimitiveUniforms { uvec2 screen_resolution; diff --git a/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl index 12ddab4..b2e3e9f 100644 --- a/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl +++ b/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl @@ -73,6 +73,11 @@ void main() { } tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_L1_OFFSET + gl_WorkGroupID.x * 2 + i] = out_1; + + if (out_1 != 0) { + atomicMin(tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_OFFSET + 0], gl_WorkGroupID.x * 2 + i); + atomicMax(tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_RANGE_OFFSET + 1], gl_WorkGroupID.x * 2 + i); + } } } } diff --git a/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl new file mode 100644 index 0000000..9913ec0 --- /dev/null +++ b/title/shark-shaders/shaders/primitive_2d_bin_clear.comp.glsl @@ -0,0 +1,19 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_ballot : require + +#include "primitive_2d.h" + +// TODO: Spec constant support for different subgroup sizes. +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +void main() { + tile_bitmap_wo[gl_GlobalInvocationID.x * TILE_STRIDE + TILE_BITMAP_RANGE_OFFSET + 0] = 0xffffffff; + tile_bitmap_wo[gl_GlobalInvocationID.x * TILE_STRIDE + TILE_BITMAP_RANGE_OFFSET + 1] = 0; +} diff --git a/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl b/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl index 01bb882..2bd3cd5 100644 --- a/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl +++ b/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl @@ -34,16 +34,22 @@ void main() { const uvec2 tile_coord = gl_WorkGroupID.xy / 4; const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride + tile_coord.x; const uint tile_base = tile_index * TILE_STRIDE; - const uint tile_bitmap_l1_base_fine = tile_base + TILE_BITMAP_L1_OFFSET; - const uint tile_bitmap_l0_base_fine = tile_base + TILE_BITMAP_L0_OFFSET; + + const uint first = tile_bitmap_ro[tile_base + TILE_BITMAP_RANGE_OFFSET + 0]; + const uint last = tile_bitmap_ro[tile_base + TILE_BITMAP_RANGE_OFFSET + 1]; + + [[branch]] + if (last < first) { + return; + } #if DEBUG_SHOW_TILES == 1 int count = 0; // For each tile, iterate over all words in the L1 bitmap. - for (int index_l1 = 0; index_l1 < primitive_uniforms.num_primitives_1024; index_l1++) { + for (uint index_l1 = first; index_l1 <= last; index_l1++) { // For each word, iterate all set bits. - uint bitmap_l1 = tile_bitmap_ro[tile_bitmap_l1_base_fine + index_l1]; + uint bitmap_l1 = tile_bitmap_ro[tile_base + TILE_BITMAP_L1_OFFSET + index_l1]; while (bitmap_l1 != 0) { const uint i = findLSB(bitmap_l1); @@ -52,7 +58,7 @@ void main() { // For each set bit in the L1 bitmap, iterate the set bits in the // corresponding L0 bitmap. const uint index_l0 = index_l1 * 32 + i; - uint bitmap_l0 = tile_bitmap_ro[tile_bitmap_l0_base_fine + index_l0]; + uint bitmap_l0 = tile_bitmap_ro[tile_base + TILE_BITMAP_L0_OFFSET + index_l0]; count += bitCount(bitmap_l0); } @@ -66,9 +72,9 @@ void main() { vec4 accum = vec4(0.0); // For each tile, iterate over all words in the L1 bitmap. - for (int index_l1 = 0; index_l1 < primitive_uniforms.num_primitives_1024; index_l1++) { + for (uint index_l1 = first; index_l1 <= last; index_l1++) { // For each word, iterate all set bits. - uint bitmap_l1 = tile_bitmap_ro[tile_bitmap_l1_base_fine + index_l1]; + uint bitmap_l1 = tile_bitmap_ro[tile_base + TILE_BITMAP_L1_OFFSET + index_l1]; while (bitmap_l1 != 0) { const uint i = findLSB(bitmap_l1); @@ -77,7 +83,7 @@ void main() { // For each set bit in the L1 bitmap, iterate the set bits in the // corresponding L0 bitmap. const uint index_l0 = index_l1 * 32 + i; - uint bitmap_l0 = tile_bitmap_ro[tile_bitmap_l0_base_fine + index_l0]; + uint bitmap_l0 = tile_bitmap_ro[tile_base + TILE_BITMAP_L0_OFFSET + index_l0]; while (bitmap_l0 != 0) { const uint j = findLSB(bitmap_l0); bitmap_l0 ^= bitmap_l0 & -bitmap_l0; diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs index fd621b6..0742a0c 100644 --- a/title/shark/src/main.rs +++ b/title/shark/src/main.rs @@ -1309,7 +1309,7 @@ impl<'gpu> DrawState<'gpu> { ui_state.primitive_instances.clear(); - gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.bin_pipeline); + gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.bin_clear_pipeline); gpu.cmd_set_bind_group( frame, @@ -1372,6 +1372,24 @@ impl<'gpu> DrawState<'gpu> { }, ); + gpu.cmd_dispatch( + cmd_encoder, + (self.tile_resolution_y * self.tile_resolution_x + 63) / 64, + 1, + 1, + ); + + gpu.cmd_barrier( + cmd_encoder, + Some(&GlobalBarrier { + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], + }), + &[], + ); + + gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.bin_pipeline); + gpu.cmd_dispatch( cmd_encoder, (num_primitives + 2047) / 2048, @@ -1382,8 +1400,8 @@ impl<'gpu> DrawState<'gpu> { gpu.cmd_barrier( cmd_encoder, Some(&GlobalBarrier { - prev_access: &[Access::ShaderWrite], - next_access: &[Access::ShaderOtherRead], + prev_access: &[Access::ComputeWrite], + next_access: &[Access::ComputeOtherRead], }), &[], ); @@ -1444,13 +1462,18 @@ impl<'gpu> DrawState<'gpu> { Bind { binding: 2, array_element: 0, + typed: TypedBind::StorageBuffer(&[self.tile_bitmap_buffer.to_arg()]), + }, + Bind { + binding: 3, + array_element: 0, typed: TypedBind::StorageImage(&[( ImageLayout::General, self.rt_image, )]), }, Bind { - binding: 3, + binding: 4, array_element: 0, typed: TypedBind::StorageImage(&[( ImageLayout::General, @@ -1458,7 +1481,7 @@ impl<'gpu> DrawState<'gpu> { )]), }, Bind { - binding: 4, + binding: 5, array_element: 0, typed: TypedBind::StorageImage(&[( ImageLayout::General, diff --git a/title/shark/src/pipelines/display_transform.rs b/title/shark/src/pipelines/display_transform.rs index 7a602c2..4405b4e 100644 --- a/title/shark/src/pipelines/display_transform.rs +++ b/title/shark/src/pipelines/display_transform.rs @@ -19,6 +19,8 @@ impl DisplayTransformPipeline { BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::Sampler), // Tony Mc'mapface LUT BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::SampledImage), + // Tiles + BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageBuffer), // Layer RT BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), // Layer UI diff --git a/title/shark/src/pipelines/primitive_2d.rs b/title/shark/src/pipelines/primitive_2d.rs index fcf828b..2259d77 100644 --- a/title/shark/src/pipelines/primitive_2d.rs +++ b/title/shark/src/pipelines/primitive_2d.rs @@ -10,7 +10,7 @@ pub const TILE_SIZE: u32 = 32; pub const MAX_PRIMS: u32 = 1 << 18; pub const TILE_BITMAP_WORDS_L1: u32 = MAX_PRIMS / 32 / 32; pub const TILE_BITMAP_WORDS_L0: u32 = MAX_PRIMS / 32; -pub const TILE_STRIDE: u32 = TILE_BITMAP_WORDS_L0 + TILE_BITMAP_WORDS_L1; +pub const TILE_STRIDE: u32 = TILE_BITMAP_WORDS_L0 + TILE_BITMAP_WORDS_L1 + 2; #[allow(unused)] #[repr(C)] @@ -38,6 +38,7 @@ pub struct GlyphInstance { pub struct Primitive2dPipeline { pub bind_group_layout: BindGroupLayout, + pub bin_clear_pipeline: Pipeline, pub bin_pipeline: Pipeline, pub rasterize_pipeline: Pipeline, } @@ -68,6 +69,14 @@ impl Primitive2dPipeline { }], }; + let bin_clear_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { + shader: ShaderDesc { + entry: c"main", + code: shark_shaders::PRIMITIVE_2D_BIN_CLEAR_COMP_SPV, + }, + layout, + }); + let bin_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { shader: ShaderDesc { entry: c"main", @@ -86,6 +95,7 @@ impl Primitive2dPipeline { Self { bind_group_layout, + bin_clear_pipeline, bin_pipeline, rasterize_pipeline, } -- 2.49.0