From: Josh Simmons Date: Thu, 17 Apr 2025 06:34:45 +0000 (+0200) Subject: narcissus-math: Add intrinsics version of sincospi X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=7445410550a66d62c73595f51d3b6de719c79b07;p=josh%2Fnarcissus narcissus-math: Add intrinsics version of sincospi --- diff --git a/engine/narcissus-maths/src/lib.rs b/engine/narcissus-maths/src/lib.rs index edde9b1..434fca4 100644 --- a/engine/narcissus-maths/src/lib.rs +++ b/engine/narcissus-maths/src/lib.rs @@ -24,7 +24,7 @@ pub use perlin::{perlin_noise3, perlin_noise3_wrap, perlin_noise3_wrap_seed}; pub use point2::{Point2, point2}; pub use point3::{Point3, point3}; pub use quat::Quat; -pub use sin_cos_pi::{cos_pi_f32, sin_cos_pi_f32, sin_pi_f32}; +pub use sin_cos_pi::{cos_pi_f32, sin_cos_pi_f32, sin_cos_pi_f32x4, sin_pi_f32}; pub use tan_pi::tan_pi_f32; pub use vec2::{Vec2, vec2}; pub use vec3::{Vec3, vec3}; @@ -223,7 +223,7 @@ pub fn lerp(t: f32, a: f32, b: f32) -> f32 { #[inline(always)] pub fn f32_to_i32(x: f32) -> i32 { #[cfg(not(target_arch = "x86_64"))] - const _: () = panic!("unsupported platform"); + compile_error!("unsupported platform"); #[cfg(target_arch = "x86_64")] unsafe { @@ -257,30 +257,6 @@ pub fn f32_to_i64(x: f32) -> i64 { } } -/// Returns either `x`, if `t` is `false` or `y` if `t` is `true`, avoiding branches. -#[must_use] -#[inline(always)] -fn select_f32(x: f32, y: f32, t: bool) -> f32 { - // With avx512 the compiler tends to emit masked moves anyway, so don't bother being clever. - #[cfg(any(target_feature = "avx512f", not(target_feature = "sse4.1")))] - { - if t { y } else { x } - } - - #[cfg(all(target_feature = "sse4.1", not(target_feature = "avx512f")))] - unsafe { - use core::arch::x86_64::{ - __m128, __m128i, _mm_blendv_ps, _mm_cvtsi32_si128, _mm_load_ss, _mm_store_ss, - }; - let x = _mm_load_ss(&x); - let y = _mm_load_ss(&y); - let mask = std::mem::transmute::<__m128i, __m128>(_mm_cvtsi32_si128(-(t as i32))); - let mut res = 0.0_f32; - _mm_store_ss(&mut res, _mm_blendv_ps(x, y, mask)); - res - } -} - #[macro_export] macro_rules! impl_shared { ($name:ty, $t:ty, $n:expr) => { @@ -522,7 +498,7 @@ macro_rules! impl_vector { #[cfg(test)] mod tests { - use crate::{dequantize_unorm_u8, quantize_unorm_u8, select_f32}; + use crate::{dequantize_unorm_u8, quantize_unorm_u8}; #[test] fn quantize_dequantize() { @@ -531,10 +507,4 @@ mod tests { assert_eq!(dequantize_unorm_u8(255), 1.0); assert_eq!(dequantize_unorm_u8(0), 0.0); } - - #[test] - fn select() { - assert_eq!(select_f32(1.0, 2.0, true), 2.0); - assert_eq!(select_f32(1.0, 2.0, false), 1.0); - } } diff --git a/engine/narcissus-maths/src/sin_cos_pi.rs b/engine/narcissus-maths/src/sin_cos_pi.rs index e28e9f8..33919c5 100644 --- a/engine/narcissus-maths/src/sin_cos_pi.rs +++ b/engine/narcissus-maths/src/sin_cos_pi.rs @@ -2,7 +2,7 @@ // // Sollya code for generating these polynomials is in `doc/sincostan.sollya` -use crate::{f32_to_i32, select_f32}; +use crate::f32_to_i32; // constants for sin(pi x), cos(pi x) for x on [-1/4,1/4] const F32_SIN_PI_7_K: [f32; 3] = unsafe { @@ -39,6 +39,28 @@ const F32_COS_PI_8_K: [f32; 4] = unsafe { /// assert_eq!(cos, 1.0); /// ``` pub fn sin_cos_pi_f32(a: f32) -> (f32, f32) { + #[cfg(not(all( + target_arch = "x86_64", + target_feature = "sse4.1", + target_feature = "fma" + )))] + { + sin_cos_pi_f32_base(a) + 1.0 + } + + #[cfg(all( + target_arch = "x86_64", + target_feature = "sse4.1", + target_feature = "fma" + ))] + { + // SAFETY: We checked the required features above. + unsafe { x86_64::sin_cos_pi_f32_sse41_fma(a) } + } +} + +#[allow(unused)] +fn sin_cos_pi_f32_base(a: f32) -> (f32, f32) { const S: [f32; 3] = F32_SIN_PI_7_K; const C: [f32; 4] = F32_COS_PI_8_K; @@ -72,16 +94,45 @@ pub fn sin_cos_pi_f32(a: f32) -> (f32, f32) { let s = s.mul_add(r2, S[0]); let s = r_sign.mul_add(std::f32::consts::PI, r_sign * r2.mul_add(s, -8.742278e-8)); - let t = s; - let s = select_f32(s, c, i & 1 != 0); - let c = select_f32(c, t, i & 1 != 0); + let (s, c) = if i & 1 != 0 { (c, s) } else { (s, c) }; - // IEEE-754: sin_pi(+n) is +0 and sin_pi(-n) is -0 for positive integers n + // sin_pi(+n) is +0 and sin_pi(-n) is -0 for positive integers n let s = if a == a.floor() { a * 0.0 } else { s }; (s, c) } +pub fn sin_cos_pi_f32x4(a: &[f32; 4], sin: &mut [f32; 4], cos: &mut [f32; 4]) { + #[cfg(not(all( + target_arch = "x86_64", + target_feature = "sse4.1", + target_feature = "fma" + )))] + { + let (s0, c0) = sin_cos_pi_f32(a[0]); + let (s1, c1) = sin_cos_pi_f32(a[1]); + let (s2, c2) = sin_cos_pi_f32(a[2]); + let (s3, c3) = sin_cos_pi_f32(a[3]); + sin[0] = s0; + sin[1] = s1; + sin[2] = s2; + sin[3] = s3; + cos[0] = c0; + cos[1] = c1; + cos[2] = c2; + cos[3] = c3; + } + + #[cfg(all( + target_arch = "x86_64", + target_feature = "sse4.1", + target_feature = "fma" + ))] + unsafe { + x86_64::sin_cos_pi_f32x4_sse41_fma(a, sin, cos) + } +} + #[inline(always)] pub fn sin_pi_f32(a: f32) -> f32 { sin_cos_pi_f32(a).0 @@ -92,14 +143,156 @@ pub fn cos_pi_f32(a: f32) -> f32 { sin_cos_pi_f32(a).1 } +#[allow(unused)] +#[cfg(target_arch = "x86_64")] +mod x86_64 { + use std::arch::x86_64::*; + use std::f32::consts::PI; + + use crate::sin_cos_pi::{F32_COS_PI_8_K, F32_SIN_PI_7_K}; + + #[inline(always)] + fn to_m128(x: __m128i) -> __m128 { + // SAFETY: Unconditionally safe. + unsafe { std::mem::transmute::<__m128i, __m128>(x) } + } + + #[inline(always)] + fn to_m128i(x: __m128) -> __m128i { + // SAFETY: Unconditionally safe. + unsafe { std::mem::transmute::<__m128, __m128i>(x) } + } + + #[target_feature(enable = "sse4.1,fma")] + pub unsafe fn sin_cos_pi_f32_sse41_fma(a: f32) -> (f32, f32) { + const S: [f32; 3] = F32_SIN_PI_7_K; + const C: [f32; 4] = F32_COS_PI_8_K; + + let mut sin_out = 0.0; + let mut cos_out = 0.0; + + unsafe { + let a = _mm_load_ss(&a); + let a_zero = _mm_mul_ss(_mm_setzero_ps(), a); + let a_abs = _mm_andnot_ps(_mm_load_ss(&-0.0), a); + let cmp_mask = _mm_cmplt_ss(a_abs, _mm_load_ss(&16777216.0)); + let a = _mm_blendv_ps(a_zero, a, cmp_mask); + let a2 = _mm_add_ss(a, a); + let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a2, a2); + let i = _mm_cvttps_epi32(r); + + let r = _mm_fmadd_ss(r, _mm_load_ss(&-0.5), a); + let r_sq = _mm_mul_ss(r, r); + + let i_lsb = _mm_slli_epi32::<31>(i); + let x_sign = _mm_slli_epi32::<31>(_mm_srli_epi32::<1>(i)); + let y_sign = _mm_xor_si128(i_lsb, x_sign); + let r_sign = to_m128(_mm_xor_si128(to_m128i(r), y_sign)); + let r_sq_sign = to_m128(_mm_xor_si128(to_m128i(r_sq), x_sign)); + let one_sign = to_m128(_mm_xor_si128(to_m128i(_mm_load_ss(&1.0)), x_sign)); + + let c = _mm_load_ss(&C[3]); + let c = _mm_fmadd_ss(c, r_sq, _mm_load_ss(&C[2])); + let c = _mm_fmadd_ss(c, r_sq, _mm_load_ss(&C[1])); + let c = _mm_fmadd_ss(c, r_sq, _mm_load_ss(&C[0])); + let c = _mm_fmadd_ss(c, r_sq_sign, one_sign); + + let s = _mm_load_ss(&S[2]); + let s = _mm_fmadd_ss(s, r_sq, _mm_load_ss(&S[1])); + let s = _mm_fmadd_ss(s, r_sq, _mm_load_ss(&S[0])); + let s = _mm_fmadd_ss(r_sq, s, _mm_load_ss(&-8.742278e-8)); + let s = _mm_fmadd_ss(r_sign, _mm_load_ss(&PI), _mm_mul_ss(r_sign, s)); + + let sin = _mm_blendv_ps(s, c, to_m128(i_lsb)); + let cos = _mm_blendv_ps(c, s, to_m128(i_lsb)); + + let a_floor = _mm_round_ss::<_MM_FROUND_FLOOR>(a, a); + let positive_int_mask = _mm_cmpeq_ss(a, a_floor); + let a_zero = _mm_mul_ss(_mm_setzero_ps(), a); + let sin = _mm_blendv_ps(sin, a_zero, positive_int_mask); + + _mm_store_ss(&mut sin_out, sin); + _mm_store_ss(&mut cos_out, cos); + } + + (sin_out, cos_out) + } + + #[allow(unused)] + #[cfg(target_arch = "x86_64")] + #[target_feature(enable = "sse4.1,fma")] + pub unsafe fn sin_cos_pi_f32x4_sse41_fma( + a: &[f32; 4], + sin_out: &mut [f32; 4], + cos_out: &mut [f32; 4], + ) { + const S: [f32; 3] = F32_SIN_PI_7_K; + const C: [f32; 4] = F32_COS_PI_8_K; + + unsafe { + let a = _mm_loadu_ps(a.as_ptr()); + let a_zero = _mm_mul_ps(_mm_setzero_ps(), a); + let a_abs = _mm_andnot_ps(_mm_set1_ps(-0.0), a); + let cmp_mask = _mm_cmplt_ps(a_abs, _mm_set1_ps(16777216.0)); + let a = _mm_blendv_ps(a_zero, a, cmp_mask); + let r = _mm_round_ps::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps(a, a)); + let i = _mm_cvttps_epi32(r); + + let r = _mm_fmadd_ps(r, _mm_set1_ps(-0.5), a); + let r_sq = _mm_mul_ps(r, r); + + let i_msb = _mm_slli_epi32::<31>(i); + let x_sign = _mm_slli_epi32::<31>(_mm_srli_epi32::<1>(i)); + let y_sign = _mm_xor_si128(i_msb, x_sign); + let r_sign = to_m128(_mm_xor_si128(to_m128i(r), y_sign)); + let r_sq_sign = to_m128(_mm_xor_si128(to_m128i(r_sq), x_sign)); + let one_sign = to_m128(_mm_xor_si128(to_m128i(_mm_set1_ps(1.0)), x_sign)); + + let c = _mm_load1_ps(&C[3]); + let c = _mm_fmadd_ps(c, r_sq, _mm_load1_ps(&C[2])); + let c = _mm_fmadd_ps(c, r_sq, _mm_load1_ps(&C[1])); + let c = _mm_fmadd_ps(c, r_sq, _mm_load1_ps(&C[0])); + let c = _mm_fmadd_ps(c, r_sq_sign, one_sign); + + let s = _mm_load1_ps(&S[2]); + let s = _mm_fmadd_ps(s, r_sq, _mm_load1_ps(&S[1])); + let s = _mm_fmadd_ps(s, r_sq, _mm_load1_ps(&S[0])); + let s = _mm_fmadd_ps(r_sq, s, _mm_set1_ps(-8.742278e-8)); + let s = _mm_fmadd_ps(r_sign, _mm_set1_ps(PI), _mm_mul_ps(r_sign, s)); + + let sin = _mm_blendv_ps(s, c, to_m128(i_msb)); + let cos = _mm_blendv_ps(c, s, to_m128(i_msb)); + + let a_floor = _mm_round_ps::<_MM_FROUND_FLOOR>(a); + let positive_int_mask = _mm_cmpeq_ps(a, a_floor); + let a_zero = _mm_mul_ps(_mm_setzero_ps(), a); + let sin = _mm_blendv_ps(sin, a_zero, positive_int_mask); + + _mm_storeu_ps(sin_out.as_mut_ptr(), sin); + _mm_storeu_ps(cos_out.as_mut_ptr(), cos); + } + } +} + #[cfg(test)] mod tests { + use crate::sin_cos_pi::sin_cos_pi_f32x4; + use super::sin_cos_pi_f32; #[test] fn basics() { assert_eq!(sin_cos_pi_f32(f32::from_bits(0x7f4135c6)), (0.0, 1.0)); + assert_eq!(sin_cos_pi_f32(16777216.0).1, 1.0); + assert_eq!(sin_cos_pi_f32(16777218.0).1, 1.0); + assert_eq!(sin_cos_pi_f32(16777220.0).1, 1.0); + assert_eq!(sin_cos_pi_f32(-16777216.0).1, 1.0); + assert_eq!(sin_cos_pi_f32(-16777218.0).1, 1.0); + assert_eq!(sin_cos_pi_f32(-16777220.0).1, 1.0); + assert!(sin_cos_pi_f32(f32::INFINITY).1.is_nan()); + assert!(sin_cos_pi_f32(-f32::INFINITY).1.is_nan()); + assert_eq!(sin_cos_pi_f32(-1.5), (1.0, 0.0)); assert_eq!(sin_cos_pi_f32(-1.0), (0.0, -1.0)); assert_eq!(sin_cos_pi_f32(-0.5), (-1.0, 0.0)); @@ -110,4 +303,128 @@ mod tests { assert_eq!(sin_cos_pi_f32(1.0), (0.0, -1.0)); assert_eq!(sin_cos_pi_f32(1.5), (-1.0, 0.0)); } + + #[test] + fn basics_f32x4() { + let mut sin = [0.0; 4]; + let mut cos = [0.0; 4]; + + let a = [16777216.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(cos, [1.0; 4]); + + let a = [16777218.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(cos, [1.0; 4]); + + let a = [16777220.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(cos, [1.0; 4]); + + let a = [-16777216.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(cos, [1.0; 4]); + + let a = [-16777218.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(cos, [1.0; 4]); + + let a = [-16777220.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(cos, [1.0; 4]); + + let a = [f32::INFINITY; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert!(cos.iter().all(|&x| f32::is_nan(x)), "cos_pi(inf): {cos:?}"); + + let a = [-f32::INFINITY; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert!(cos.iter().all(|&x| f32::is_nan(x)), "cos_pi(-inf): {cos:?}"); + + let a = [-1.5; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(sin, [1.0; 4]); + assert_eq!(cos, [0.0; 4]); + + let a = [-1.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(sin, [0.0; 4]); + assert_eq!(cos, [-1.0; 4]); + + let a = [-0.5; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(sin, [-1.0; 4]); + assert_eq!(cos, [0.0; 4]); + + let a = [-0.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(sin, [0.0; 4]); + assert_eq!(cos, [1.0; 4]); + + let a = [0.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(sin, [0.0; 4]); + assert_eq!(cos, [1.0; 4]); + + let a = [0.5; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(sin, [1.0; 4]); + assert_eq!(cos, [0.0; 4]); + + let a = [1.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(sin, [0.0; 4]); + assert_eq!(cos, [-1.0; 4]); + + let a = [1.5; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + assert_eq!(sin, [-1.0; 4]); + assert_eq!(cos, [0.0; 4]); + + let mut x = 0.0; + for _ in 0..10_000 { + let a: [f32; 4] = std::array::from_fn(|_| { + let r = x; + x += 0.01; + r + }); + + let mut sin = [0.0; 4]; + let mut cos = [0.0; 4]; + sin_cos_pi_f32x4(&a, &mut sin, &mut cos); + + for (a, (vector_sin, vector_cos)) in a.iter().zip(sin.iter().zip(cos)) { + let (scalar_sin, scalar_cos) = sin_cos_pi_f32(*a); + assert_eq!(scalar_sin.to_bits(), vector_sin.to_bits()); + assert_eq!(scalar_cos.to_bits(), vector_cos.to_bits()); + } + } + } + + #[test] + #[ignore] + fn special_cases() { + // sin_pi(+n) is +0 and sin_pi(-n) is -0 for positive integers n + { + let mut i = 0.0_f32; + while i.is_finite() { + assert_eq!(sin_cos_pi_f32(i).0.to_bits(), 0.0_f32.to_bits()); + assert_eq!(sin_cos_pi_f32(-i).0.to_bits(), (-0.0_f32).to_bits()); + + i = i.next_up().ceil() + } + } + + // cos_pi(a) = 1.0f for |a| > 2^24, but cos_pi(Inf) = NaN + { + let mut i = 16777216.0_f32; + while i.is_finite() { + assert_eq!(sin_cos_pi_f32(i).1.to_bits(), 1.0_f32.to_bits()); + assert_eq!(sin_cos_pi_f32(-i).1.to_bits(), 1.0_f32.to_bits()); + i = i.next_up(); + } + + assert!(sin_cos_pi_f32(i).1.is_nan()); + } + } }