From: Josh Simmons <josh@nega.tv>
Date: Fri, 4 Nov 2022 20:59:59 +0000 (+0100)
Subject: Fix reduction using the wrong rounding mode (#1)
X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=1b87d0565a0b16fc5ab03dd7ef267933dfe0a8fa;p=josh%2Fnarcissus

Fix reduction using the wrong rounding mode (#1)

Previously we were using `f32::round` in the reduction step, however
that isn't correct. We really need IEEE-754 tiesToEven semantics for the
rounding step, so manually implement that.
---

diff --git a/narcissus-maths/src/lib.rs b/narcissus-maths/src/lib.rs
index ab5eda3..2e71ed0 100644
--- a/narcissus-maths/src/lib.rs
+++ b/narcissus-maths/src/lib.rs
@@ -172,6 +172,23 @@ pub fn clamp(x: f32, lo: f32, hi: f32) -> f32 {
     max(min(x, hi), lo)
 }
 
+/// Returns the nearest integer to a given `f32`.
+///
+/// Respects IEEE-754 tiesToEven
+#[inline(always)]
+fn round_ties_to_even(x: f32) -> f32 {
+    #[cfg(target_feature = "sse4.1")]
+    unsafe {
+        use std::arch::x86_64::{
+            _mm_load_ss, _mm_round_ss, _MM_FROUND_NO_EXC, _MM_FROUND_TO_NEAREST_INT,
+        };
+        const ROUNDING: i32 = (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) as i32;
+        let x = _mm_load_ss(&x);
+        let x = _mm_round_ss::<ROUNDING>(x, x);
+        std::arch::x86_64::_mm_cvtss_f32(x)
+    }
+}
+
 #[macro_export]
 macro_rules! impl_shared {
     ($name:ty, $t:ty, $n:expr) => {
diff --git a/narcissus-maths/src/sin_cos_pi.rs b/narcissus-maths/src/sin_cos_pi.rs
index e9912cb..65c9aaa 100644
--- a/narcissus-maths/src/sin_cos_pi.rs
+++ b/narcissus-maths/src/sin_cos_pi.rs
@@ -2,6 +2,8 @@
 //
 // Sollya code for generating these polynomials is in `doc/sincostan.sollya`
 
+use crate::round_ties_to_even;
+
 // constants for sin(pi x), cos(pi x) for x on [-1/4,1/4]
 const F32_SIN_PI_7_K: [f32; 3] = unsafe {
     std::mem::transmute::<[u32; 3], _>([
@@ -48,7 +50,9 @@ pub fn sin_cos_pi_f32(a: f32) -> (f32, f32) {
     let a = if a.abs() < 16777216.0 { a } else { a * 0.0 };
 
     // Range reduction.
-    let r = (a + a).round();
+    let r = round_ties_to_even(a + a);
+
+    // Safety: The clamp above avoids the possibility of overflow here.
     let i = unsafe { r.to_int_unchecked::<i32>() } as u32;
     let r = r.mul_add(-0.5, a);
 
diff --git a/narcissus-maths/src/tan_pi.rs b/narcissus-maths/src/tan_pi.rs
index 16d1a5a..37700bc 100644
--- a/narcissus-maths/src/tan_pi.rs
+++ b/narcissus-maths/src/tan_pi.rs
@@ -4,6 +4,8 @@
 //
 // Sollya code for generating these polynomials is in `doc/sincostan.sollya`
 
+use crate::round_ties_to_even;
+
 const F32_TAN_PI_15_K: [f32; 7] = unsafe {
     std::mem::transmute::<[u32; 7], _>([
         0x41255def, // 0x1.4abbdep3
@@ -31,7 +33,9 @@ pub fn tan_pi_f32(a: f32) -> f32 {
     const T: [f32; 7] = F32_TAN_PI_15_K;
 
     // Range reduction.
-    let r = (a + a).round();
+    let r = round_ties_to_even(a + a);
+
+    // Safety: The clamp above avoids the possibility of overflow here.
     let i = unsafe { r.to_int_unchecked::<i32>() } as u32;
     let r = r.mul_add(-0.5, a);
 
diff --git a/narcissus-maths/tests/exhaustive_f32.rs b/narcissus-maths/tests/exhaustive_f32.rs
index d02f3b0..0fcc12f 100644
--- a/narcissus-maths/tests/exhaustive_f32.rs
+++ b/narcissus-maths/tests/exhaustive_f32.rs
@@ -432,7 +432,7 @@ pub fn exhaustive_sin_pi() {
 pub fn exhaustive_cos_pi() {
     let errors = check_exhaustive_f32(ref_cos_pi_f32, |a| sin_cos_pi_f32(a).1, false);
     println!("COS: {:?}", errors);
-    assert_eq!(errors.num_errors, 62_426_004);
+    assert_eq!(errors.num_errors, 45_896_848);
     assert_eq!(errors.max_error_ulp, 1);
 }