From 904bee0762e0b1eccbec8c4f6b4c6fa0e63c057b Mon Sep 17 00:00:00 2001 From: Joshua Simmons Date: Sat, 25 Oct 2025 16:12:57 +0200 Subject: [PATCH] shark: Render only front-facing block faces --- title/shark-shaders/shaders/block.slang | 119 ++++++++++++++++++++---- title/shark-shaders/src/pipelines.rs | 9 +- title/shark/src/draw.rs | 71 +++++++++++++- 3 files changed, 175 insertions(+), 24 deletions(-) diff --git a/title/shark-shaders/shaders/block.slang b/title/shark-shaders/shaders/block.slang index 017b705..e19cabd 100644 --- a/title/shark-shaders/shaders/block.slang +++ b/title/shark-shaders/shaders/block.slang @@ -1,5 +1,26 @@ +import color_map; + +// Size of a block. +static const float BLOCK_SIZE = 2.0; + +struct Block { + uint value; + + float3 position() { + return float3(value & 31, (value >> 5) & 31, (value >> 10) & 31) * BLOCK_SIZE; + } +} + +struct BlockBuffer { + float3 position; + uint count; + Block blocks[32 * 32 * 32]; +} + struct BlockConstants { - float4x4 clip_from_camera; + float4x4 clip_from_model; + float4 camera_position; + BlockBuffer *block_buffer; } struct VertexAttributes { @@ -11,30 +32,90 @@ struct PrimitiveAttributes { float3 color; } +// Max blocks per-group. +static const uint MAX_BLOCK = 4; + +// Max vertices per-group. +static const uint V = MAX_BLOCK * 8; + +// Max triangles per-group. 
+static const uint T = MAX_BLOCK * 6; + [shader("mesh")] [outputtopology("triangle")] +[require(spvMeshShadingEXT, spvGroupNonUniformShuffle)] [numthreads(32, 1, 1)] -void mesh(uniform BlockConstants constants, OutputVertices vertices, OutputPrimitives primitives, OutputIndices indices, uint thread_id_in_group: SV_GroupThreadID) { - SetMeshOutputCounts(4, 2); - - static const float4 positions[4] = { - float4(-0.5, 0.0, -0.5, 1.0), - float4(-0.5, 0.0, 0.5, 1.0), - float4(0.5, 0.0, 0.5, 1.0), - float4(0.5, 0.0, -0.5, 1.0), - }; +void mesh(uniform BlockConstants constants, + out OutputVertices vertices, + out OutputPrimitives primitives, + out OutputIndices indices, + uint group_id: SV_GroupID, + uint local_id: SV_GroupThreadID) { + let start = group_id.x * MAX_BLOCK; + let end = min(constants.block_buffer.count, start + MAX_BLOCK); + let count = end - start; - if (thread_id_in_group < 4) { - vertices[thread_id_in_group].position = mul(constants.clip_from_camera, positions[thread_id_in_group]); - } + SetMeshOutputCounts(count * 8, count * 6); - if (thread_id_in_group < 2) { - primitives[thread_id_in_group].color = float3(0x9b / 255.0, 0x61 / 255.0, 0x56 / 255.0); - primitives[thread_id_in_group].normal = float3(0.0, 1.0, 0.0); + bool3 face_flip; + { + let block = constants.block_buffer.blocks[start + local_id / 8]; + let block_position = constants.block_buffer.position + block.position(); + let local_camera_position = constants.camera_position.xyz - block_position; + face_flip = local_camera_position > float3(0.0); + + let vertex = uint3(local_id & 0x1, (local_id & 0x2) >> 1, (local_id & 0x4) >> 2) * BLOCK_SIZE; + vertices[local_id].position = mul(constants.clip_from_model, float4(block_position + vertex, 1.0)); } - indices[0] = uint3(0, 1, 2); - indices[1] = uint3(2, 3, 0); + let primitive_index = local_id % 6; + let face_index = primitive_index / 2; // x = 0, y = 1, z = 2 + + // We calculated the face flip values while generating vertices, so we must + // shuffle 
those values down to the correct lanes. + let primitive_vertex_offset = (local_id / 6) * 8; + let flip = WaveReadLaneAt(face_flip, primitive_vertex_offset)[face_index]; + + // Indices for the face on the opposite side are given by a power-of-two + // offset. That is, on the x-axis if we look at triangle 0-4-2, then we flip + // by adding 1 to each index, producing 1-5-3. Similarly, to flip 0-5-4 + // on the y-axis, we must add 2, leaving us with 2-7-6. This means the offset is + // always a power of two, `1 << face_index`. + // + // Additionally, we need to flip the winding to support backface culling in + // the rasterizer. + // + // 7 *------* 6 +y + // /| /| ^ +z + // 3 / | 2 / | |/ + // *------* | +x <--* + // |5 *---|--* 4 + // | / | / flip x = +1 + // |/ |/ flip y = +2 + // *------* flip z = +4 + // 1 0 + // + // `CUBE_INDICES` contains the indices for -x, -y, and -z faces. + static const uint3 CUBE_INDICES[6] = { + uint3(2, 0, 4), + uint3(2, 4, 6), + uint3(0, 1, 5), + uint3(0, 5, 4), + uint3(0, 2, 3), + uint3(0, 3, 1), + }; + + let flip_offset = uint(flip) << face_index; + let index = primitive_vertex_offset + CUBE_INDICES[primitive_index] + flip_offset; + + let flip_sign = flip ? 1.0 : -1.0; + primitives[local_id].normal = float3(face_index == 0 ? flip_sign : 0, + face_index == 1 ? flip_sign : 0, + face_index == 2 ? flip_sign : 0); + primitives[local_id].color = color_map::plasma(primitive_index / 6.0); + + // Flip winding for backface culling. + indices[local_id] = flip ? 
index.yxz : index; } struct Fragment { @@ -42,7 +123,7 @@ struct Fragment { } [shader("fragment")] -Fragment fragment(PrimitiveAttributes primitive, VertexAttributes vertex) { +Fragment fragment(in perprimitive PrimitiveAttributes primitive, in VertexAttributes vertex) { let n_dot_l = max(dot(primitive.normal, float3(0.0, 1.0, 0.0)), 0.1); Fragment output; diff --git a/title/shark-shaders/src/pipelines.rs b/title/shark-shaders/src/pipelines.rs index ff16084..ffc07a8 100644 --- a/title/shark-shaders/src/pipelines.rs +++ b/title/shark-shaders/src/pipelines.rs @@ -7,7 +7,7 @@ use narcissus_gpu::{ Sampler, SamplerAddressMode, SamplerDesc, SamplerFilter, ShaderDesc, ShaderStageFlags, SpecConstant, Topology, VertexOrMeshShader, }; -use narcissus_maths::{Mat4, Vec2}; +use narcissus_maths::{Mat4, Point3, Vec2}; pub const DRAW_2D_TILE_SIZE: u32 = 32; @@ -175,8 +175,11 @@ pub struct BasicConstants<'a> { } #[repr(C)] -pub struct BlockConstants { +pub struct BlockConstants<'a> { pub clip_from_model: Mat4, + pub camera_position: Point3, + pub _pad: f32, + pub block_buffer_address: BufferAddress<'a>, } #[repr(C)] @@ -376,6 +379,8 @@ impl Pipelines { vertex_or_mesh_shader: VertexOrMeshShader::Mesh(ShaderDesc { code: crate::BLOCK_SPV, entry: c"mesh", + require_full_subgroups: true, + required_subgroup_size: Some(32), ..default() }), fragment_shader: ShaderDesc { diff --git a/title/shark/src/draw.rs b/title/shark/src/draw.rs index bca0e41..3438922 100644 --- a/title/shark/src/draw.rs +++ b/title/shark/src/draw.rs @@ -1,3 +1,4 @@ +use std::mem::MaybeUninit; use std::ops::Index; use std::path::Path; @@ -22,7 +23,7 @@ use narcissus_gpu::{ RenderingDesc, Scissor, ShaderStageFlags, StoreOp, ThreadToken, TypedBind, Viewport, }; use narcissus_image as image; -use narcissus_maths::{Affine3, HalfTurn, Mat3, Mat4, Vec3, vec3}; +use narcissus_maths::{Affine3, HalfTurn, Mat3, Mat4, Vec3, sin_cos_pi_f32, vec3}; pub struct Model<'a> { indices: u32, @@ -420,6 +421,7 @@ impl<'gpu> DrawState<'gpu> 
{ } } + let camera_position = game_state.camera.position(); let camera_from_model = game_state.camera.camera_from_model(); let clip_from_camera = Mat4::perspective_rev_inf_zo( HalfTurn::new(1.0 / 3.0), @@ -684,16 +686,79 @@ impl<'gpu> DrawState<'gpu> { }], ); + #[repr(C)] + struct BlockMeshPacket { + x: f32, + y: f32, + z: f32, + count: u32, + blocks: [u32; 32 * 32 * 32], + } + + impl BlockMeshPacket { + fn push(&mut self, x: usize, y: usize, z: usize) { + let index = self.count as usize; + if index >= 32 * 32 * 32 { + return; + } + + self.blocks[index] = ((z & 31) << 10 | (y & 31) << 5 | x & 31) as u32; + self.count += 1; + } + + fn as_slice(&self) -> &[u8] { + let size = 4 * 4 + self.count as usize * 4; + unsafe { + core::slice::from_raw_parts( + self as *const BlockMeshPacket as *const u8, + size, + ) + } + } + } + + let mut block_packet = BlockMeshPacket { + x: 0.0, + y: -64.0, + z: 0.0, + count: 0, + blocks: unsafe { MaybeUninit::zeroed().assume_init() }, + }; + + for y in (0..32).rev() { + for z in 0..32 { + for x in 0..32 { + let freq = 2.0; + let (s, _) = sin_cos_pi_f32(x as f32 / 32.0 * freq); + if (y as f32) < ((s + 1.0) * 16.0) + 16.0 { + block_packet.push(x, y, z); + } + } + } + } + // Render blocks. 
gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.block_pipeline); gpu.cmd_set_bind_group(cmd_encoder, 0, &graphics_bind_group); + + let block_buffer = gpu.request_transient_buffer_with_data( + frame, + thread_token, + BufferUsageFlags::STORAGE, + block_packet.as_slice(), + ); gpu.cmd_push_constants_with_data( cmd_encoder, ShaderStageFlags::MESH, 0, - &BlockConstants { clip_from_model }, + &BlockConstants { + clip_from_model, + camera_position, + _pad: 0.0, + block_buffer_address: gpu.get_buffer_address(block_buffer.to_arg()), + }, ); - gpu.cmd_draw_mesh_tasks(cmd_encoder, 1, 1, 1); + gpu.cmd_draw_mesh_tasks(cmd_encoder, block_packet.count.div_ceil(4), 1, 1); gpu.cmd_set_pipeline(cmd_encoder, self.pipelines.basic_pipeline); gpu.cmd_set_bind_group(cmd_encoder, 0, &graphics_bind_group); -- 2.51.1