From: Josh Simmons <josh@nega.tv>
Date: Mon, 2 Dec 2024 18:20:02 +0000 (+0100)
Subject: narcissus-maths: Make `sin_cos_pi_f32` branchless
X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=9870f437208403eb8bbeb2f88600d94732931015;p=josh%2Fnarcissus

narcissus-maths: Make `sin_cos_pi_f32` branchless
---

diff --git a/engine/narcissus-maths/src/lib.rs b/engine/narcissus-maths/src/lib.rs
index 97fb55e..8ec1f66 100644
--- a/engine/narcissus-maths/src/lib.rs
+++ b/engine/narcissus-maths/src/lib.rs
@@ -261,6 +261,31 @@ pub fn f32_to_i64(x: f32) -> i64 {
     }
 }
 
+/// Returns `x` if `t` is `false`, or `y` if `t` is `true`, avoiding branches where possible.
+#[must_use]
+#[inline(always)]
+fn select_f32(x: f32, y: f32, t: bool) -> f32 {
+    // The blend needs `core::arch::x86_64`, so gate on the arch as well as SSE4.1. With AVX-512
+    // the compiler tends to emit masked moves anyway, so the plain `if` is used there too.
+    #[cfg(not(all(
+        target_arch = "x86_64",
+        target_feature = "sse4.1",
+        not(target_feature = "avx512f")
+    )))]
+    {
+        if t { y } else { x }
+    }
+
+    #[cfg(all(target_arch = "x86_64", target_feature = "sse4.1", not(target_feature = "avx512f")))]
+    // SAFETY: the `cfg` above guarantees SSE4.1 (and hence SSE/SSE2) is statically enabled.
+    unsafe {
+        use core::arch::x86_64::*;
+        // All-ones mask when `t` is true; `blendv` takes `y` in lanes whose mask sign bit is set.
+        let mask = _mm_castsi128_ps(_mm_cvtsi32_si128(-i32::from(t)));
+        _mm_cvtss_f32(_mm_blendv_ps(_mm_set_ss(x), _mm_set_ss(y), mask))
+    }
+}
+
 #[macro_export]
 macro_rules! impl_shared {
     ($name:ty, $t:ty, $n:expr) => {
@@ -502,7 +527,7 @@ macro_rules! impl_vector {
 
 #[cfg(test)]
 mod tests {
-    use crate::{dequantize_unorm_u8, quantize_unorm_u8};
+    use crate::{dequantize_unorm_u8, quantize_unorm_u8, select_f32};
 
     #[test]
     fn quantize_dequantize() {
@@ -511,4 +536,10 @@ mod tests {
         assert_eq!(dequantize_unorm_u8(255), 1.0);
         assert_eq!(dequantize_unorm_u8(0), 0.0);
     }
+
+    #[test]
+    fn select() {
+        let picked = [false, true].map(|flag| select_f32(1.0, 2.0, flag));
+        assert_eq!(picked, [1.0, 2.0]);
+    }
 }
diff --git a/engine/narcissus-maths/src/sin_cos_pi.rs b/engine/narcissus-maths/src/sin_cos_pi.rs
index 76c400b..b3cc416 100644
--- a/engine/narcissus-maths/src/sin_cos_pi.rs
+++ b/engine/narcissus-maths/src/sin_cos_pi.rs
@@ -2,7 +2,7 @@
 //
 // Sollya code for generating these polynomials is in `doc/sincostan.sollya`
 
-use crate::f32_to_i32;
+use crate::{f32_to_i32, select_f32};
 
 // constants for sin(pi x), cos(pi x) for x on [-1/4,1/4]
 const F32_SIN_PI_7_K: [f32; 3] = unsafe {
@@ -22,11 +22,6 @@ const F32_COS_PI_8_K: [f32; 4] = unsafe {
     ])
 };
 
-#[inline(always)]
-fn mulsign_f32(x: f32, s: u32) -> f32 {
-    f32::from_bits(x.to_bits() ^ s)
-}
-
 /// Simultaneously computes the sine and cosine of `a` expressed in multiples of
 /// *pi* radians, or half-turns.
 ///
@@ -56,26 +51,30 @@ pub fn sin_cos_pi_f32(a: f32) -> (f32, f32) {
     let i = f32_to_i32(r) as u32;
     let r = r.mul_add(-0.5, a);
 
-    let sx = (i >> 1) << 31;
-    let sy = (i << 31) ^ sx;
-
-    // Core approximation.
     let r2 = r * r;
-    let r = mulsign_f32(r, sy);
 
+    // Reconstruct signs early.
+    let sign_x = (i >> 1) << 31;
+    let sign_y = sign_x ^ i << 31;
+    let r_sign = r.copysign(f32::from_bits(r.to_bits() ^ sign_y));
+    let r2_sign = r2.copysign(f32::from_bits(r2.to_bits() ^ sign_x));
+    let one_sign = 1.0_f32.copysign(f32::from_bits(sign_x));
+
+    // Core approximation.
     let c = C[3];
     let c = c.mul_add(r2, C[2]);
     let c = c.mul_add(r2, C[1]);
     let c = c.mul_add(r2, C[0]);
-    let c = c.mul_add(r2, 1.0);
-    let c = mulsign_f32(c, sx);
+    let c = c.mul_add(r2_sign, one_sign);
 
     let s = S[2];
     let s = s.mul_add(r2, S[1]);
     let s = s.mul_add(r2, S[0]);
-    let s = r.mul_add(std::f32::consts::PI, r * r2.mul_add(s, -8.742278e-8));
+    let s = r_sign.mul_add(std::f32::consts::PI, r_sign * r2.mul_add(s, -8.742278e-8));
 
-    let (s, c) = if i & 1 != 0 { (c, s) } else { (s, c) };
+    let t = s;
+    let s = select_f32(s, c, i & 1 != 0);
+    let c = select_f32(c, t, i & 1 != 0);
 
     // IEEE-754: sin_pi(+n) is +0 and sin_pi(-n) is -0 for positive integers n
     let s = if a == a.floor() { a * 0.0 } else { s };
@@ -95,7 +94,7 @@ pub fn cos_pi_f32(a: f32) -> f32 {
 
 #[cfg(test)]
 mod tests {
-    use crate::sin_cos_pi_f32;
+    use super::sin_cos_pi_f32;
 
     #[test]
     fn basics() {