From: Joshua Simmons Date: Sun, 5 May 2024 14:16:18 +0000 (+0200) Subject: narcissus-maths: Add `exp_f32` implementation X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=b60fb0c77e392bacb8f40bde23a3a76aaae26285;p=josh%2Fnarcissus narcissus-maths: Add `exp_f32` implementation Based on Norbert Juffa's implemention for CUDA from https://forums.developer.nvidia.com/t/a-more-accurate-performance-competitive-implementation-of-expf/47528 --- diff --git a/engine/narcissus-maths/src/exp.rs b/engine/narcissus-maths/src/exp.rs new file mode 100644 index 0000000..226891f --- /dev/null +++ b/engine/narcissus-maths/src/exp.rs @@ -0,0 +1,53 @@ +// Norbert Juffa's exp for CUDA, incompletely translated. +// https://forums.developer.nvidia.com/t/a-more-accurate-performance-competitive-implementation-of-expf/47528 + +use crate::f32_to_i32; + +pub fn exp_f32(a: f32) -> f32 { + // exp(a) = 2**i * exp(f); i = rintf (a / log(2)) + let j = a.mul_add(std::f32::consts::LOG2_E, 12582912.0) - 12582912.0; // 0x1.715476p0, 0x1.8p23 + let f = j.mul_add(-6.931_457_5e-1, a); // -0x1.62e400p-1 // log_2_hi + let f = j.mul_add(-1.428_606_8e-6, f); // -0x1.7f7d1cp-20 // log_2_lo + + let i = f32_to_i32(j); + + // approximate r = exp(f) on interval [-log(2)/2, +log(2)/2] + let r = 1.378_059_4e-3_f32; // 0x1.694000p-10 + let r = r.mul_add(f, 8.373_124_5e-3); // 0x1.125edcp-7 + let r = r.mul_add(f, 4.166_953_6e-2); // 0x1.555b5ap-5 + let r = r.mul_add(f, 1.666_647_2e-1); // 0x1.555450p-3 + let r = r.mul_add(f, 4.999_998_5e-1); // 0x1.fffff6p-2 + let r = r.mul_add(f, 1.0); + let r = r.mul_add(f, 1.0); + + // exp(a) = 2**i * r + let ia = if i > 0 { 0_u32 } else { 0x83000000 }; + let s = f32::from_bits(0x7f000000_u32.wrapping_add(ia)); + let t = f32::from_bits(((i as u32) << 23).wrapping_sub(ia) as u32); + let r = r * s; + let r = r * t; + + // handle special cases: severe overflow / underflow + if a.abs() >= 104.0 { + if a > 0.0 { + f32::INFINITY + } else { + 0.0 + } + } else { + r + } +} + +#[cfg(test)] +mod tests { + use crate::exp_f32; + + #[test] + fn basics() { + assert_eq!( + exp_f32(255544082189650076565756907338896244736.0), + f32::INFINITY + ); + } +} diff --git a/engine/narcissus-maths/src/lib.rs b/engine/narcissus-maths/src/lib.rs index 45b0541..0337a8a 100644 --- a/engine/narcissus-maths/src/lib.rs +++ b/engine/narcissus-maths/src/lib.rs @@ -1,5 +1,6 @@ mod affine2; mod affine3; +mod exp; mod mat2; mod mat3; mod mat4; @@ -16,6 +17,7 @@ mod vec4; pub use affine2::Affine2; pub use affine3::Affine3; +pub use exp::exp_f32; pub use mat2::Mat2; pub use mat3::Mat3; pub use mat4::Mat4; diff --git a/engine/narcissus-maths/tests/exhaustive_f32.rs b/engine/narcissus-maths/tests/exhaustive_f32.rs index af505c2..385b0db 100644 --- a/engine/narcissus-maths/tests/exhaustive_f32.rs +++ b/engine/narcissus-maths/tests/exhaustive_f32.rs @@ -5,7 +5,7 @@ use std::{ sync::atomic::{AtomicUsize, Ordering}, }; -use narcissus_maths::{next_after_f32, sin_cos_pi_f32, tan_pi_f32}; +use narcissus_maths::{exp_f32, next_after_f32, sin_cos_pi_f32, tan_pi_f32}; use gmp_mpfr_sys::mpfr; @@ -124,6 +124,10 @@ impl Float { unsafe { mpfr::tan(&mut self.0, &self.0, round.to_raw()) } } + pub fn exp_mut(&mut self, round: Round) -> i32 { + unsafe { mpfr::exp(&mut self.0, &self.0, round.to_raw()) } + } + pub fn abs_mut(&mut self, round: Round) -> i32 { unsafe { mpfr::abs(&mut self.0, &self.0, round.to_raw()) } } @@ -418,6 +422,10 @@ fn ref_tan_pi_f32(x: &mut Float, pi: &Float) -> i32 { x.tan_mut(Round::TiesToEven) } +fn ref_exp_f32(x: &mut Float, _: &Float) -> i32 { + x.exp_mut(Round::TiesToEven) +} + #[test] #[ignore] pub fn exhaustive_sin_pi() { @@ -444,3 +452,11 @@ pub fn exhaustive_tan_pi() { assert_eq!(errors.num_errors, 100_555_422); assert_eq!(errors.max_error_ulp, 2); } + +#[test] +#[ignore] +pub fn exhaustive_exp() { + let errors = check_exhaustive_f32(ref_exp_f32, |a| exp_f32(a), false); + println!("EXP: {errors:?}"); + assert_eq!(errors.max_error_ulp, 1); +}