From: Josh Simmons Date: Fri, 31 May 2024 04:41:11 +0000 (+0200) Subject: shark: Switch to single-pass binning for primitive-2d X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=13cd7a3da241f4106288c2fc53f573f68adc1c00;p=josh%2Fnarcissus shark: Switch to single-pass binning for primitive-2d --- diff --git a/title/shark-shaders/build.rs b/title/shark-shaders/build.rs index c6eb9d1..af40540 100644 --- a/title/shark-shaders/build.rs +++ b/title/shark-shaders/build.rs @@ -20,15 +20,7 @@ const SHADERS: &[Shader] = &[ }, Shader { stage: "comp", - name: "primitive_2d_bin_coarse", - }, - Shader { - stage: "comp", - name: "primitive_2d_bin_fine", - }, - Shader { - stage: "comp", - name: "primitive_2d_clear_fine", + name: "primitive_2d_bin", }, Shader { stage: "comp", diff --git a/title/shark-shaders/shaders/display_transform.comp.glsl b/title/shark-shaders/shaders/display_transform.comp.glsl index 08f6a86..f927713 100644 --- a/title/shark-shaders/shaders/display_transform.comp.glsl +++ b/title/shark-shaders/shaders/display_transform.comp.glsl @@ -9,10 +9,7 @@ struct PrimitiveUniforms { uint num_primitives; uint num_primitives_32; uint num_primitives_1024; - - uint tile_stride_fine; - - uvec2 tile_offset; + uint tile_stride; }; layout(std430, push_constant) uniform uniformBuffer { @@ -28,10 +25,6 @@ layout (set = 0, binding = 3, rgba16f) uniform readonly image2D layer_ui; layout (set = 0, binding = 4, rgba16f) uniform writeonly image2D composited_output; -layout(std430, set = 0, binding = 5) readonly buffer fineTileCountBufferRead { - uint fine_count_ro[]; -}; - float srgb_oetf(float a) { return (.0031308f >= a) ? 
12.92f * a : 1.055f * pow(a, .4166666666666667f) - .055f; } @@ -50,18 +43,12 @@ vec3 tony_mc_mapface(vec3 stimulus) { layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in; void main() { - const uvec2 tile_coord = gl_WorkGroupID.xy >> 1; - const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride_fine + tile_coord.x; - const vec3 stimulus = imageLoad(layer_rt, ivec2(gl_GlobalInvocationID.xy)).rgb; const vec3 transformed = tony_mc_mapface(stimulus); vec3 composited = srgb_oetf(transformed); - [[branch]] - if (fine_count_ro[tile_index] != 0) { - const vec4 ui = imageLoad(layer_ui, ivec2(gl_GlobalInvocationID.xy)).rgba; - composited = ui.rgb + (composited * (1.0 - ui.a)); - } + const vec4 ui = imageLoad(layer_ui, ivec2(gl_GlobalInvocationID.xy)).rgba; + composited = ui.rgb + (composited * (1.0 - ui.a)); imageStore(composited_output, ivec2(gl_GlobalInvocationID.xy), vec4(composited, 1.0)); } diff --git a/title/shark-shaders/shaders/primitive_2d.h b/title/shark-shaders/shaders/primitive_2d.h index 16a2c18..b8a3d0c 100644 --- a/title/shark-shaders/shaders/primitive_2d.h +++ b/title/shark-shaders/shaders/primitive_2d.h @@ -1,16 +1,11 @@ -#define MAX_PRIMS 0x20000u -#define TILE_SIZE_COARSE 64 -#define TILE_SIZE_FINE 16 -#define TILE_SIZE_MUL (TILE_SIZE_COARSE / TILE_SIZE_FINE) +#define TILE_SIZE 32 + +#define MAX_PRIMS (1 << 18) #define TILE_BITMAP_L1_WORDS (MAX_PRIMS / 32 / 32) #define TILE_BITMAP_L0_WORDS (MAX_PRIMS / 32) -#define TILE_STRIDE_COARSE TILE_BITMAP_L0_WORDS -#define TILE_STRIDE_FINE (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS) -#define TILE_BITMAP_OFFSET_COARSE 0 -#define TILE_BITMAP_L1_OFFSET_FINE 0 -#define TILE_BITMAP_L0_OFFSET_FINE TILE_BITMAP_L1_WORDS - -#define TILE_DISPATCH_X 15 +#define TILE_STRIDE (TILE_BITMAP_L0_WORDS + TILE_BITMAP_L1_WORDS) +#define TILE_BITMAP_L1_OFFSET 0 +#define TILE_BITMAP_L0_OFFSET TILE_BITMAP_L1_WORDS struct PrimitiveUniforms { uvec2 screen_resolution; @@ -19,9 +14,7 @@ struct PrimitiveUniforms { 
uint num_primitives; uint num_primitives_32; uint num_primitives_1024; - uint tile_stride_fine; - - uvec2 tile_offset_coarse; + uint tile_stride; }; struct Glyph { diff --git a/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl new file mode 100644 index 0000000..894ebb5 --- /dev/null +++ b/title/shark-shaders/shaders/primitive_2d_bin.comp.glsl @@ -0,0 +1,86 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : require + +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_control_flow_attributes : require + +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_ballot : require + +#include "primitive_2d.h" + +// TODO: Spec constant support for different subgroup sizes. +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +shared uint bitmap_0[64]; + +void main() { + const uvec2 bin_coord = gl_GlobalInvocationID.yz; + const uvec2 bin_min = bin_coord * TILE_SIZE * 8; + const uvec2 bin_max = min(bin_min + TILE_SIZE * 8, primitive_uniforms.screen_resolution); + + for (uint i = 0; i < 2048; i += 64) { + const uint prim_index = gl_WorkGroupID.x * 2048 + i + gl_SubgroupInvocationID; + bool intersects = false; + if (prim_index < primitive_uniforms.num_primitives) { + intersects = test_glyph(prim_index, bin_min, bin_max); + } + const uvec4 ballot = subgroupBallot(intersects); + bitmap_0[i / 32 + 0] = ballot.x; + bitmap_0[i / 32 + 1] = ballot.y; + } + + memoryBarrierShared(); + + uint bitmap_1[2]; + { + const uvec4 ballot = subgroupBallot(bitmap_0[gl_SubgroupInvocationID] != 0); + bitmap_1[0] = ballot.x; + bitmap_1[1] = ballot.y; + } + + for (uint y = 0; y < 8; y++) { + for (uint x = 0; x < 8; x++) { + const uvec2 tile_coord = gl_GlobalInvocationID.yz * 8 + uvec2(x, y); + const uvec2 tile_min = tile_coord * TILE_SIZE; + const uvec2 tile_max = min(tile_min + TILE_SIZE, primitive_uniforms.screen_resolution); + [[branch]] + if 
(any(greaterThanEqual(tile_min, tile_max))) { + continue; + } + + const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride + tile_coord.x; + + for (uint i = 0; i < 2; i++) { + uint out_1 = 0; + + uint word_1 = bitmap_1[i]; + while (word_1 != 0) { + const uint bit_1 = findLSB(word_1); + word_1 ^= word_1 & -word_1; + + uint out_0 = 0; + uint index_0 = i * 32 + bit_1; + uint word_0 = bitmap_0[index_0]; + while (word_0 != 0) { + const uint bit_0 = findLSB(word_0); + word_0 ^= word_0 & -word_0; + + const uint prim_index = gl_WorkGroupID.x * 2048 + index_0 * 32 + bit_0; + if (test_glyph(prim_index, tile_min, tile_max)) { + out_0 |= 1 << bit_0; + } + } + + if (out_0 != 0) { + out_1 |= 1 << bit_1; + } + tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_L0_OFFSET + gl_WorkGroupID.x * 64 + index_0] = out_0; + } + + tile_bitmap_wo[tile_index * TILE_STRIDE + TILE_BITMAP_L1_OFFSET + gl_WorkGroupID.x * 2 + i] = out_1; + } + } + } +} diff --git a/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl deleted file mode 100644 index 00fa31c..0000000 --- a/title/shark-shaders/shaders/primitive_2d_bin_coarse.comp.glsl +++ /dev/null @@ -1,37 +0,0 @@ -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_control_flow_attributes : require - -#extension GL_KHR_shader_subgroup_vote : require -#extension GL_KHR_shader_subgroup_ballot : require - -#include "primitive_2d.h" - -// TODO: Spec constant support for different subgroup sizes. 
-layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; - -void main() { - const uvec2 tile_coord = gl_GlobalInvocationID.yz; - const uvec2 tile_coord_global = tile_coord + primitive_uniforms.tile_offset_coarse; - - const uvec2 tile_min = tile_coord_global * TILE_SIZE_COARSE; - const uvec2 tile_max = min(tile_min + TILE_SIZE_COARSE, primitive_uniforms.screen_resolution); - - const uint primitive_index = gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; - - bool intersects = false; - if (primitive_index < primitive_uniforms.num_primitives) { - intersects = test_glyph(primitive_index, tile_min, tile_max); - } - - uvec4 ballot_result = subgroupBallot(intersects); - if (subgroupElect()) { // managed democracy wins again - const uint tile_index = tile_coord.y * (primitive_uniforms.tile_stride_fine / TILE_SIZE_MUL) + tile_coord.x; - const uint tile_offset = tile_index * TILE_STRIDE_COARSE; - coarse_bitmap_wo[tile_offset + 2 * gl_WorkGroupID.x + 0] = ballot_result.x; - coarse_bitmap_wo[tile_offset + 2 * gl_WorkGroupID.x + 1] = ballot_result.y; - } -} diff --git a/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl b/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl deleted file mode 100644 index 06c9a7b..0000000 --- a/title/shark-shaders/shaders/primitive_2d_bin_fine.comp.glsl +++ /dev/null @@ -1,65 +0,0 @@ -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_control_flow_attributes : require - -#extension GL_KHR_shader_subgroup_vote : require -#extension GL_KHR_shader_subgroup_ballot : require - -#include "primitive_2d.h" - -// TODO: Spec constant support for different subgroup sizes. 
-layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; - -void main() { - const uvec2 tile_coord = gl_WorkGroupID.yz; - const uint tile_index = tile_coord.y * TILE_DISPATCH_X * TILE_SIZE_MUL + tile_coord.x; - - const uvec2 tile_coord_global = tile_coord + primitive_uniforms.tile_offset_coarse * TILE_SIZE_MUL; - const uint tile_index_global = tile_coord_global.y * primitive_uniforms.tile_stride_fine + tile_coord_global.x; - - const uvec2 tile_min = tile_coord_global * TILE_SIZE_FINE; - const uvec2 tile_max = min(tile_min + TILE_SIZE_FINE, primitive_uniforms.screen_resolution); - - const uint index = gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; - - uint bitmap_l0 = 0; - if (index < primitive_uniforms.num_primitives_32) { - const uvec2 tile_coord_coarse = (tile_coord / TILE_SIZE_MUL) + primitive_uniforms.tile_offset_coarse; - const uint tile_index_coarse = tile_coord_coarse.y * (primitive_uniforms.tile_stride_fine / TILE_SIZE_MUL) + tile_coord_coarse.x; - const uint tile_base_coarse = tile_index_coarse * TILE_STRIDE_COARSE; - const uint tile_bitmap_base_coarse = tile_base_coarse + TILE_BITMAP_OFFSET_COARSE; - - uint bitmap_coarse = coarse_bitmap_ro[tile_bitmap_base_coarse + index]; - while (bitmap_coarse != 0) { - const uint i = findLSB(bitmap_coarse); - bitmap_coarse ^= bitmap_coarse & -bitmap_coarse; - - const uint primitive_index = index * 32 + i; - if (test_glyph(primitive_index, tile_min, tile_max)) { - bitmap_l0 |= 1 << i; - } - } - } - - const uint tile_base_fine = tile_index * TILE_STRIDE_FINE; - const uint tile_bitmap_l0_base_fine = tile_base_fine + TILE_BITMAP_L0_OFFSET_FINE; - - // Write the L0 per-primitive bitmap. - fine_bitmap_wo[tile_bitmap_l0_base_fine + index] = bitmap_l0; - - uvec4 ballot_result = subgroupBallot(bitmap_l0 != 0); - if (subgroupElect()) { - // Write the L1 per-bitmap-word bitmap. 
- const uint tile_bitmap_l1_base_fine = tile_base_fine + TILE_BITMAP_L1_OFFSET_FINE; - fine_bitmap_wo[tile_bitmap_l1_base_fine + 2 * gl_WorkGroupID.x + 0] = ballot_result.x; - fine_bitmap_wo[tile_bitmap_l1_base_fine + 2 * gl_WorkGroupID.x + 1] = ballot_result.y; - - const uint count = uint(ballot_result.x != 0) + uint(ballot_result.y != 0); - if (count != 0) { - atomicAdd(fine_count_wo[tile_index_global], count); - } - } -} diff --git a/title/shark-shaders/shaders/primitive_2d_bindings.h b/title/shark-shaders/shaders/primitive_2d_bindings.h index e54758b..c08cece 100644 --- a/title/shark-shaders/shaders/primitive_2d_bindings.h +++ b/title/shark-shaders/shaders/primitive_2d_bindings.h @@ -16,28 +16,12 @@ layout(std430, set = 0, binding = 3) readonly buffer glyphInstanceBuffer { GlyphInstance glyph_instances[]; }; -layout(std430, set = 0, binding = 4) readonly buffer coarseTileBufferRead { - uint coarse_bitmap_ro[]; +layout(std430, set = 0, binding = 4) readonly buffer tileBufferRead { + uint tile_bitmap_ro[]; }; -layout(std430, set = 0, binding = 4) writeonly buffer coarseTileBufferWrite { - uint coarse_bitmap_wo[]; +layout(std430, set = 0, binding = 4) writeonly buffer tileBufferWrite { + uint tile_bitmap_wo[]; }; -layout(std430, set = 0, binding = 5) readonly buffer fineTileBitmapBufferRead { - uint fine_bitmap_ro[]; -}; - -layout(std430, set = 0, binding = 5) writeonly buffer fineTileBitmapBufferWrite { - uint fine_bitmap_wo[]; -}; - -layout(std430, set = 0, binding = 6) readonly buffer fineTileCountBufferRead { - uint fine_count_ro[]; -}; - -layout(std430, set = 0, binding = 6) writeonly buffer fineTileCountBufferWrite { - uint fine_count_wo[]; -}; - -layout (set = 0, binding = 7, rgba16f) uniform writeonly image2D ui_image; +layout (set = 0, binding = 5, rgba16f) uniform writeonly image2D ui_image; diff --git a/title/shark-shaders/shaders/primitive_2d_clear_fine.comp.glsl b/title/shark-shaders/shaders/primitive_2d_clear_fine.comp.glsl deleted file mode 100644 
index 3d486b6..0000000 --- a/title/shark-shaders/shaders/primitive_2d_clear_fine.comp.glsl +++ /dev/null @@ -1,14 +0,0 @@ -#version 460 - -#extension GL_GOOGLE_include_directive : require - -#extension GL_EXT_scalar_block_layout : require -#extension GL_EXT_control_flow_attributes : require - -#include "primitive_2d.h" - -layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; - -void main() { - fine_count_wo[gl_GlobalInvocationID.x] = 0; -} diff --git a/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl b/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl index 813eddb..01bb882 100644 --- a/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl +++ b/title/shark-shaders/shaders/primitive_2d_rasterize.comp.glsl @@ -10,7 +10,7 @@ #include "primitive_2d.h" -layout (local_size_x = TILE_SIZE_FINE, local_size_y = TILE_SIZE_FINE, local_size_z = 1) in; +layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in; #define DEBUG_SHOW_TILES 0 @@ -31,23 +31,19 @@ vec3 plasma_quintic(float x) #endif void main() { - const uvec2 tile_coord = gl_WorkGroupID.xy; - const uint tile_index = tile_coord.y * TILE_DISPATCH_X * TILE_SIZE_MUL + tile_coord.x; - - const uvec2 tile_coord_global = tile_coord + primitive_uniforms.tile_offset_coarse * TILE_SIZE_MUL; - const uint tile_index_global = tile_coord_global.y * primitive_uniforms.tile_stride_fine + tile_coord_global.x; - - const uint tile_base_fine = tile_index * TILE_STRIDE_FINE; - const uint tile_bitmap_l1_base_fine = tile_base_fine + TILE_BITMAP_L1_OFFSET_FINE; - const uint tile_bitmap_l0_base_fine = tile_base_fine + TILE_BITMAP_L0_OFFSET_FINE; + const uvec2 tile_coord = gl_WorkGroupID.xy / 4; + const uint tile_index = tile_coord.y * primitive_uniforms.tile_stride + tile_coord.x; + const uint tile_base = tile_index * TILE_STRIDE; + const uint tile_bitmap_l1_base_fine = tile_base + TILE_BITMAP_L1_OFFSET; + const uint tile_bitmap_l0_base_fine = tile_base + TILE_BITMAP_L0_OFFSET; #if 
DEBUG_SHOW_TILES == 1 - uint count = 0; + int count = 0; // For each tile, iterate over all words in the L1 bitmap. for (int index_l1 = 0; index_l1 < primitive_uniforms.num_primitives_1024; index_l1++) { // For each word, iterate all set bits. - uint bitmap_l1 = fine_bitmap_ro[tile_bitmap_l1_base_fine + index_l1]; + uint bitmap_l1 = tile_bitmap_ro[tile_bitmap_l1_base_fine + index_l1]; while (bitmap_l1 != 0) { const uint i = findLSB(bitmap_l1); @@ -56,31 +52,23 @@ void main() { // For each set bit in the L1 bitmap, iterate the set bits in the // corresponding L0 bitmap. const uint index_l0 = index_l1 * 32 + i; - uint bitmap_l0 = fine_bitmap_ro[tile_bitmap_l0_base_fine + index_l0]; + uint bitmap_l0 = tile_bitmap_ro[tile_bitmap_l0_base_fine + index_l0]; count += bitCount(bitmap_l0); } } - const vec3 color = plasma_quintic(float(count) / 50.0); - imageStore(ui_image, ivec2(gl_GlobalInvocationID.xy + primitive_uniforms.tile_offset_coarse * TILE_SIZE_COARSE), vec4(color, 1.0)); + const vec3 color = plasma_quintic(float(count) / 100.0); + imageStore(ui_image, ivec2(gl_GlobalInvocationID.xy), vec4(color, 1.0)); #else vec4 accum = vec4(0.0); - uint word_count = fine_count_ro[tile_index_global]; - if (word_count == 0) { - return; - } - // For each tile, iterate over all words in the L1 bitmap. - for (int index_l1 = 0; word_count != 0 && index_l1 < primitive_uniforms.num_primitives_1024; index_l1++) { + for (int index_l1 = 0; index_l1 < primitive_uniforms.num_primitives_1024; index_l1++) { // For each word, iterate all set bits. - uint bitmap_l1 = fine_bitmap_ro[tile_bitmap_l1_base_fine + index_l1]; - - if (bitmap_l1 != 0) - word_count -= 1; + uint bitmap_l1 = tile_bitmap_ro[tile_bitmap_l1_base_fine + index_l1]; while (bitmap_l1 != 0) { const uint i = findLSB(bitmap_l1); @@ -89,7 +77,7 @@ void main() { // For each set bit in the L1 bitmap, iterate the set bits in the // corresponding L0 bitmap. 
const uint index_l0 = index_l1 * 32 + i; - uint bitmap_l0 = fine_bitmap_ro[tile_bitmap_l0_base_fine + index_l0]; + uint bitmap_l0 = tile_bitmap_ro[tile_bitmap_l0_base_fine + index_l0]; while (bitmap_l0 != 0) { const uint j = findLSB(bitmap_l0); bitmap_l0 ^= bitmap_l0 & -bitmap_l0; @@ -101,7 +89,8 @@ void main() { const Glyph gl = glyphs[gi.index]; const vec2 glyph_min = gi.position + gl.offset_min; const vec2 glyph_max = gi.position + gl.offset_max; - const vec2 sample_center = gl_GlobalInvocationID.xy + primitive_uniforms.tile_offset_coarse * TILE_SIZE_COARSE + vec2(0.5); + const vec2 sample_center = gl_GlobalInvocationID.xy + vec2(0.5); + [[branch]] if (all(greaterThanEqual(sample_center, glyph_min)) && all(lessThanEqual(sample_center, glyph_max))) { const vec2 glyph_size = gl.offset_max - gl.offset_min; const vec4 color = unpackUnorm4x8(gi.color).bgra; @@ -114,7 +103,7 @@ void main() { } } - imageStore(ui_image, ivec2(gl_GlobalInvocationID.xy + primitive_uniforms.tile_offset_coarse * TILE_SIZE_COARSE), accum); + imageStore(ui_image, ivec2(gl_GlobalInvocationID.xy), accum); #endif } diff --git a/title/shark/src/main.rs b/title/shark/src/main.rs index 64227b8..fd621b6 100644 --- a/title/shark/src/main.rs +++ b/title/shark/src/main.rs @@ -31,10 +31,7 @@ use pipelines::primitive_2d::{GlyphInstance, Primitive2dPipeline}; use spring::simple_spring_damper_exact; use crate::pipelines::basic::BasicUniforms; -use crate::pipelines::primitive_2d::{ - PrimitiveUniforms, TILE_DISPATCH_COARSE_X, TILE_DISPATCH_COARSE_Y, TILE_DISPATCH_FINE_X, - TILE_DISPATCH_FINE_Y, TILE_SIZE_COARSE, TILE_SIZE_FINE, TILE_STRIDE_COARSE, TILE_STRIDE_FINE, -}; +use crate::pipelines::primitive_2d::{PrimitiveUniforms, TILE_SIZE, TILE_STRIDE}; mod fonts; mod helpers; @@ -853,18 +850,14 @@ struct DrawState<'gpu> { width: u32, height: u32, - tile_resolution_coarse_x: u32, - tile_resolution_coarse_y: u32, - tile_resolution_fine_x: u32, - tile_resolution_fine_y: u32, + tile_resolution_x: u32, + 
tile_resolution_y: u32, depth_image: Image, rt_image: Image, ui_image: Image, - coarse_tile_bitmap_buffer: Buffer, - fine_tile_bitmap_buffer: Buffer, - fine_tile_color_buffer: Buffer, + tile_bitmap_buffer: Buffer, glyph_atlas_image: Image, @@ -885,18 +878,6 @@ impl<'gpu> DrawState<'gpu> { let models = Models::load(gpu); let images = Images::load(gpu, thread_token); - let fine_bitmap_buffer_size = TILE_DISPATCH_FINE_X - * TILE_DISPATCH_FINE_Y - * TILE_STRIDE_FINE - * std::mem::size_of::<u32>() as u32; - - let fine_tile_bitmap_buffer = gpu.create_buffer(&BufferDesc { - memory_location: MemoryLocation::Device, - host_mapped: false, - usage: BufferUsageFlags::STORAGE, - size: fine_bitmap_buffer_size.widen(), - }); - Self { gpu, basic_pipeline, @@ -904,16 +885,12 @@ impl<'gpu> DrawState<'gpu> { display_transform_pipeline, width: 0, height: 0, - tile_resolution_coarse_x: 0, - tile_resolution_coarse_y: 0, - tile_resolution_fine_x: 0, - tile_resolution_fine_y: 0, + tile_resolution_x: 0, + tile_resolution_y: 0, depth_image: default(), rt_image: default(), ui_image: default(), - coarse_tile_bitmap_buffer: default(), - fine_tile_bitmap_buffer, - fine_tile_color_buffer: default(), + tile_bitmap_buffer: default(), glyph_atlas_image: default(), samplers, models, @@ -1024,47 +1001,30 @@ impl<'gpu> DrawState<'gpu> { gpu.destroy_image(frame, self.rt_image); gpu.destroy_image(frame, self.ui_image); - let tile_resolution_coarse_x = (width + (TILE_SIZE_COARSE - 1)) / TILE_SIZE_COARSE; - let tile_resolution_coarse_y = (height + (TILE_SIZE_COARSE - 1)) / TILE_SIZE_COARSE; - let tile_resolution_fine_x = (width + (TILE_SIZE_FINE - 1)) / TILE_SIZE_FINE; - let tile_resolution_fine_y = (height + (TILE_SIZE_FINE - 1)) / TILE_SIZE_FINE; + let tile_resolution_x = (width + (TILE_SIZE - 1)) / TILE_SIZE; + let tile_resolution_y = (height + (TILE_SIZE - 1)) / TILE_SIZE; - if tile_resolution_coarse_x != self.tile_resolution_coarse_x - || tile_resolution_coarse_y != self.tile_resolution_coarse_y - || 
tile_resolution_fine_x != self.tile_resolution_fine_x - || tile_resolution_fine_y != self.tile_resolution_fine_y + if tile_resolution_x != self.tile_resolution_x + || tile_resolution_y != self.tile_resolution_y { - gpu.destroy_buffer(frame, self.fine_tile_color_buffer); - gpu.destroy_buffer(frame, self.coarse_tile_bitmap_buffer); + gpu.destroy_buffer(frame, self.tile_bitmap_buffer); - let coarse_bitmap_buffer_size = tile_resolution_coarse_x - * tile_resolution_coarse_y - * TILE_STRIDE_COARSE + let bitmap_buffer_size = tile_resolution_x + * tile_resolution_y + * TILE_STRIDE * std::mem::size_of::<u32>() as u32; - self.coarse_tile_bitmap_buffer = gpu.create_buffer(&BufferDesc { + self.tile_bitmap_buffer = gpu.create_buffer(&BufferDesc { memory_location: MemoryLocation::Device, host_mapped: false, usage: BufferUsageFlags::STORAGE, - size: coarse_bitmap_buffer_size.widen(), + size: bitmap_buffer_size.widen(), }); - // align to the workgroup size to simplify shader - let fine_color_buffer_size = - ((tile_resolution_fine_x * tile_resolution_fine_y + 63) & !63) - * std::mem::size_of::<u32>() as u32; + println!("tile_resolution: ({tile_resolution_x},{tile_resolution_y})"); - self.fine_tile_color_buffer = gpu.create_buffer(&BufferDesc { - memory_location: MemoryLocation::Device, - host_mapped: false, - usage: BufferUsageFlags::STORAGE, - size: fine_color_buffer_size.widen(), - }); - - self.tile_resolution_coarse_x = tile_resolution_coarse_x; - self.tile_resolution_coarse_y = tile_resolution_coarse_y; - self.tile_resolution_fine_x = tile_resolution_fine_x; - self.tile_resolution_fine_y = tile_resolution_fine_y; + self.tile_resolution_x = tile_resolution_x; + self.tile_resolution_y = tile_resolution_y; } self.depth_image = gpu.create_image(&ImageDesc { @@ -1330,8 +1290,6 @@ impl<'gpu> DrawState<'gpu> { // Render UI { - gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.fine_clear_pipeline); - let glyph_buffer = gpu.request_transient_buffer_with_data( frame, thread_token, @@ 
-1345,6 +1303,14 @@ impl<'gpu> DrawState<'gpu> { ui_state.primitive_instances.as_slice(), ); + let num_primitives = ui_state.primitive_instances.len() as u32; + let num_primitives_32 = (num_primitives + 31) / 32; + let num_primitives_1024 = (num_primitives_32 + 31) / 32; + + ui_state.primitive_instances.clear(); + + gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.bin_pipeline); + gpu.cmd_set_bind_group( frame, cmd_encoder, @@ -1377,27 +1343,11 @@ impl<'gpu> DrawState<'gpu> { Bind { binding: 4, array_element: 0, - typed: TypedBind::StorageBuffer(&[self - .coarse_tile_bitmap_buffer - .to_arg()]), + typed: TypedBind::StorageBuffer(&[self.tile_bitmap_buffer.to_arg()]), }, Bind { binding: 5, array_element: 0, - typed: TypedBind::StorageBuffer(&[self - .fine_tile_bitmap_buffer - .to_arg()]), - }, - Bind { - binding: 6, - array_element: 0, - typed: TypedBind::StorageBuffer(&[self - .fine_tile_color_buffer - .to_arg()]), - }, - Bind { - binding: 7, - array_element: 0, typed: TypedBind::StorageImage(&[( ImageLayout::General, self.ui_image, @@ -1406,21 +1356,6 @@ impl<'gpu> DrawState<'gpu> { ], ); - gpu.cmd_dispatch( - cmd_encoder, - (self.tile_resolution_fine_x * self.tile_resolution_fine_y + 63) / 64, - 1, - 1, - ); - - let num_primitives = ui_state.primitive_instances.len() as u32; - let num_primitives_32 = (num_primitives + 31) / 32; - let num_primitives_1024 = (num_primitives_32 + 31) / 32; - - ui_state.primitive_instances.clear(); - - gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.coarse_bin_pipeline); - gpu.cmd_push_constants( cmd_encoder, ShaderStageFlags::COMPUTE, @@ -1433,17 +1368,15 @@ impl<'gpu> DrawState<'gpu> { num_primitives, num_primitives_32, num_primitives_1024, - tile_stride_fine: self.tile_resolution_fine_x, - tile_offset_x: 0, - tile_offset_y: 0, + tile_stride: self.tile_resolution_x, }, ); gpu.cmd_dispatch( cmd_encoder, - (num_primitives + 63) / 64, - self.tile_resolution_coarse_x, - self.tile_resolution_coarse_y, + 
(num_primitives + 2047) / 2048, + (self.tile_resolution_x + 3) / 4, + (self.tile_resolution_y + 3) / 4, ); gpu.cmd_barrier( @@ -1455,73 +1388,9 @@ impl<'gpu> DrawState<'gpu> { &[], ); - for tile_offset_y in - (0..self.tile_resolution_coarse_y).step_by(TILE_DISPATCH_COARSE_Y as usize) - { - for tile_offset_x in - (0..self.tile_resolution_coarse_x).step_by(TILE_DISPATCH_COARSE_X as usize) - { - let tile_offset_fine_x = - tile_offset_x * (TILE_SIZE_COARSE / TILE_SIZE_FINE); - - let tile_offset_fine_y = - tile_offset_y * (TILE_SIZE_COARSE / TILE_SIZE_FINE); - - let fine_dispatch_x = (tile_offset_fine_x + TILE_DISPATCH_FINE_X) - .min(self.tile_resolution_fine_x) - - tile_offset_fine_x; - - let fine_dispatch_y = (tile_offset_fine_y + TILE_DISPATCH_FINE_Y) - .min(self.tile_resolution_fine_y) - - tile_offset_fine_y; - - gpu.cmd_push_constants( - cmd_encoder, - ShaderStageFlags::COMPUTE, - 0, - &PrimitiveUniforms { - screen_resolution_x: self.width, - screen_resolution_y: self.height, - atlas_resolution_x: atlas_width, - atlas_resolution_y: atlas_height, - num_primitives, - num_primitives_32, - num_primitives_1024, - tile_stride_fine: self.tile_resolution_fine_x, - tile_offset_x, - tile_offset_y, - }, - ); - - gpu.cmd_set_pipeline( - cmd_encoder, - self.primitive_2d_pipeline.fine_bin_pipeline, - ); - - gpu.cmd_dispatch( - cmd_encoder, - (num_primitives_32 + 63) / 64, - fine_dispatch_x, - fine_dispatch_y, - ); - - gpu.cmd_barrier( - cmd_encoder, - Some(&GlobalBarrier { - prev_access: &[Access::ShaderWrite], - next_access: &[Access::ShaderOtherRead], - }), - &[], - ); - - gpu.cmd_set_pipeline( - cmd_encoder, - self.primitive_2d_pipeline.rasterize_pipeline, - ); - - gpu.cmd_dispatch(cmd_encoder, fine_dispatch_x, fine_dispatch_y, 1); - } - } + gpu.cmd_set_pipeline(cmd_encoder, self.primitive_2d_pipeline.rasterize_pipeline); + + gpu.cmd_dispatch(cmd_encoder, (self.width + 7) / 8, (self.height + 7) / 8, 1); } // Display transform and composite @@ -1596,13 +1465,6 @@ impl<'gpu> 
DrawState<'gpu> { swapchain_image, )]), }, - Bind { - binding: 5, - array_element: 0, - typed: TypedBind::StorageBuffer(&[self - .fine_tile_color_buffer - .to_arg()]), - }, ], ); @@ -1793,7 +1655,7 @@ pub fn main() { ); } - for i in 0..180 { + for i in 0..224 { let i = i as f32; ui_state.text_fmt( 5.0, diff --git a/title/shark/src/pipelines/display_transform.rs b/title/shark/src/pipelines/display_transform.rs index ae01864..7a602c2 100644 --- a/title/shark/src/pipelines/display_transform.rs +++ b/title/shark/src/pipelines/display_transform.rs @@ -25,8 +25,6 @@ impl DisplayTransformPipeline { BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), // Composited Output BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), - // Tile color buffer - BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageBuffer), ]); let layout = &PipelineLayout { diff --git a/title/shark/src/pipelines/primitive_2d.rs b/title/shark/src/pipelines/primitive_2d.rs index c9e85e3..fcf828b 100644 --- a/title/shark/src/pipelines/primitive_2d.rs +++ b/title/shark/src/pipelines/primitive_2d.rs @@ -6,18 +6,11 @@ use narcissus_gpu::{ use crate::Gpu; -pub const MAX_PRIMS: u32 = 0x20000; -pub const TILE_SIZE_COARSE: u32 = 64; -pub const TILE_SIZE_FINE: u32 = 16; +pub const TILE_SIZE: u32 = 32; +pub const MAX_PRIMS: u32 = 1 << 18; pub const TILE_BITMAP_WORDS_L1: u32 = MAX_PRIMS / 32 / 32; pub const TILE_BITMAP_WORDS_L0: u32 = MAX_PRIMS / 32; -pub const TILE_STRIDE_COARSE: u32 = TILE_BITMAP_WORDS_L0; -pub const TILE_STRIDE_FINE: u32 = TILE_BITMAP_WORDS_L0 + TILE_BITMAP_WORDS_L1; - -pub const TILE_DISPATCH_COARSE_X: u32 = 15; -pub const TILE_DISPATCH_COARSE_Y: u32 = 15; -pub const TILE_DISPATCH_FINE_X: u32 = TILE_DISPATCH_COARSE_X * (TILE_SIZE_COARSE / TILE_SIZE_FINE); -pub const TILE_DISPATCH_FINE_Y: u32 = TILE_DISPATCH_COARSE_Y * (TILE_SIZE_COARSE / TILE_SIZE_FINE); +pub const TILE_STRIDE: u32 = TILE_BITMAP_WORDS_L0 + TILE_BITMAP_WORDS_L1; #[allow(unused)] #[repr(C)] 
@@ -31,10 +24,7 @@ pub struct PrimitiveUniforms { pub num_primitives_32: u32, pub num_primitives_1024: u32, - pub tile_stride_fine: u32, - - pub tile_offset_x: u32, - pub tile_offset_y: u32, + pub tile_stride: u32, } #[allow(unused)] @@ -48,9 +38,7 @@ pub struct GlyphInstance { pub struct Primitive2dPipeline { pub bind_group_layout: BindGroupLayout, - pub coarse_bin_pipeline: Pipeline, - pub fine_bin_pipeline: Pipeline, - pub fine_clear_pipeline: Pipeline, + pub bin_pipeline: Pipeline, pub rasterize_pipeline: Pipeline, } @@ -65,11 +53,7 @@ impl Primitive2dPipeline { BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageBuffer), // Glyph Instances BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageBuffer), - // Coarse Tiles - BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageBuffer), - // Fine Tiles - BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageBuffer), - // Fine Color + // Tiles BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageBuffer), // UI Image Output BindDesc::new(ShaderStageFlags::COMPUTE, BindingType::StorageImage), @@ -84,26 +68,10 @@ impl Primitive2dPipeline { }], }; - let coarse_bin_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { - shader: ShaderDesc { - entry: c"main", - code: shark_shaders::PRIMITIVE_2D_BIN_COARSE_COMP_SPV, - }, - layout, - }); - - let fine_bin_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { - shader: ShaderDesc { - entry: c"main", - code: shark_shaders::PRIMITIVE_2D_BIN_FINE_COMP_SPV, - }, - layout, - }); - - let fine_clear_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { + let bin_pipeline = gpu.create_compute_pipeline(&ComputePipelineDesc { shader: ShaderDesc { entry: c"main", - code: shark_shaders::PRIMITIVE_2D_CLEAR_FINE_COMP_SPV, + code: shark_shaders::PRIMITIVE_2D_BIN_COMP_SPV, }, layout, }); @@ -118,9 +86,7 @@ impl Primitive2dPipeline { Self { bind_group_layout, - coarse_bin_pipeline, - fine_bin_pipeline, - fine_clear_pipeline, + 
bin_pipeline, rasterize_pipeline, } }