From: Joshua Simmons <josh@nega.tv>
Date: Sat, 15 Oct 2022 13:32:22 +0000 (+0200)
Subject: Add some new maths functions
X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=ed87042544b4a8e254611409d68b5fa3249edf54;p=josh%2Fnarcissus

Add some new maths functions

Add `{sin,cos,tan}_pi_f32` functions.
Add `next_after_f32`.
Add `HalfTurns` unit type.
Switch `Mat4` to use turns and `sin_cos_pi_f32`.
---

diff --git a/Cargo.lock b/Cargo.lock
index a070e00..2351191 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,12 +2,34 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "az"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973"
+
 [[package]]
 name = "fast-float"
 version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c"
 
+[[package]]
+name = "gmp-mpfr-sys"
+version = "1.4.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea3f42dadb6c75f122e9aa87e757ef11d4282f664c9f2e6476a9c2c8970f9d19"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.135"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c"
+
 [[package]]
 name = "narcissus"
 version = "0.1.0"
@@ -46,6 +68,9 @@ dependencies = [
 [[package]]
 name = "narcissus-maths"
 version = "0.1.0"
+dependencies = [
+ "rug",
+]
 
 [[package]]
 name = "narcissus-world"
@@ -58,6 +83,17 @@ dependencies = [
 name = "renderdoc-sys"
 version = "0.1.0"
 
+[[package]]
+name = "rug"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "203180f444c95eac53586ed04793ecf6454c5d28f9eca8eead815fc19e136c47"
+dependencies = [
+ "az",
+ "gmp-mpfr-sys",
+ "libc",
+]
+
 [[package]]
 name = "sdl2-sys"
 version = "0.1.0"
@@ -69,3 +105,25 @@ version = "0.1.0"
 [[package]]
 name = "vulkan-sys"
 version = "0.1.0"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
diff --git a/Cargo.toml b/Cargo.toml
index 6d034de..0f8c72e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,9 +6,9 @@ members = [
     "ffi/stb_image-sys",
     "ffi/vulkan-sys",
     "narcissus",
+    "narcissus-app",
     "narcissus-core",
     "narcissus-gpu",
-    "narcissus-app",
     "narcissus-maths",
     "narcissus-world",
 ]
diff --git a/narcissus-maths/Cargo.toml b/narcissus-maths/Cargo.toml
index 344b2d1..5065f8e 100644
--- a/narcissus-maths/Cargo.toml
+++ b/narcissus-maths/Cargo.toml
@@ -8,3 +8,6 @@ edition = "2021"
 [features]
 
 [dependencies]
+
+[dev-dependencies]
+rug = "1.17.0"
\ No newline at end of file
diff --git a/narcissus-maths/doc/sincostan.m b/narcissus-maths/doc/sincostan.m
new file mode 100644
index 0000000..2d12e6f
--- /dev/null
+++ b/narcissus-maths/doc/sincostan.m
@@ -0,0 +1,89 @@
+
+x = 0:0.001:1/4
+
+tanpi7 = x .* (3.1414539813995361328125 + x.^2
+           .* (10.40517330169677734375 + x.^2
+           .* (35.490978240966796875 + x.^2
+           .* 284.301177978515625)));
+
+tanpi9 = x    .* (3.14160251617431640625 +
+         x.^2 .* (10.3275852203369140625 +
+         x.^2 .* (41.77574920654296875 +
+         x.^2 .* (122.0411834716796875 +
+         x.^2 .* 1306.0660400390625))));
+
+tanpi13 = x    .* (3.1415927410125732421875 +
+          x.^2 .* (10.3353290557861328125 +
+          x.^2 .* (40.82488250732421875 +
+          x.^2 .* (161.0076446533203125 +
+          x.^2 .* (736.640869140625 +
+          x.^2 .* (809.0543212890625 +
+          x.^2 .* 27793.21875))))));
+
+tanpi15 = x    .* (3.1415927410125732421875 +
+          x.^2 .* (10.33537769317626953125 +
+          x.^2 .* (40.80986785888671875 +
+          x.^2 .* (162.5588531494140625 +
+          x.^2 .* (662.82147216796875 +
+          x.^2 .* (2586.50048828125 +
+          x.^2 .* (6733.18896484375 +
+          x.^2 .* 97579.828125)))))));
+
+tanpic  = x    .* (3.1415927410125732421875 +
+          x.^2 .* (1.03354301e+1 +
+          x.^2 .* (4.08006248e+1 +
+          x.^2 .* (1.63275925e+2 +
+          x.^2 .* (6.34397766e+2 +
+          x.^2 .* (3.18810547e+3 +
+          x.^2 .* (2.59904968e+2 +
+          x.^2 .* (1.25408000e+5))))))));
+
+tanpo   = x    .* (3.1415927410125732421875 +
+          x.^2 .* (10.3353290557861328125 +
+          x.^2 .* (40.82488250732421875 +
+          x.^2 .* (161.0076446533203125 +
+          x.^2 .* (736.640869140625 +
+          x.^2 .* (809.0543212890625 +
+          x.^2 .* 27793.21875))))));
+
+tanpid  = x    .* (3.14159270438032223182744928635656833648681640625 +
+          x.^2 .* (10.335346221923828125 +
+          x.^2 .* (40.822620391845703125 +
+          x.^2 .* (161.1344146728515625 +
+          x.^2 .* (733.203857421875 +
+          x.^2 .* (853.6375732421875 +
+          x.^2 .* 27571.314453125))))));
+
+tanpid2 = x    .* (3.1415926499513684433395610540173947811126708984375 +
+          x.^2 .* (10.33543300628662109375 +
+          x.^2 .* (40.800151824951171875 +
+          x.^2 .* (163.3097381591796875 +
+          x.^2 .* (633.093994140625 +
+          x.^2 .* (3215.677978515625 +
+          x.^2 .* (-38.601871490478515625 +
+          x.^2 .* 1.2669925e5)))))));
+
+tanpi = tan(x .* pi);
+
+plot (x, (tanpi7 - tanpi) ./ tanpi);
+title("tanpi7");
+xlabel("x");
+ylabel("rel error");
+figure
+plot (x, (tanpid2 - tanpi) ./ tanpi)
+title("tanpi double");
+xlabel("x");
+ylabel("rel error");
+
+cospi = cos(x .* pi);
+
+cospi8 = x.^2 .* (-4.93480205535888671875 +
+         x.^2 .* (4.0586986541748046875 +
+         x.^2 .* (-1.33483588695526123046875 +
+         x.^2 .* 0.22974522411823272705078125))) + 1.0;
+
+figure
+plot (x, (cospi - cospi8) ./ cospi)
+title("cospi");
+xlabel("x");
+ylabel("abs error");
diff --git a/narcissus-maths/doc/sincostan.sollya b/narcissus-maths/doc/sincostan.sollya
new file mode 100644
index 0000000..eb3d517
--- /dev/null
+++ b/narcissus-maths/doc/sincostan.sollya
@@ -0,0 +1,149 @@
+// create primary range approximations of sin(pi x) and cos(pi x) for x on [-1/4,1/4]
+// both for single and double precision
+// stolen from https://marc-b-reynolds.github.io/math/2020/03/11/SinCosPi.html
+
+R = [0;1/4];
+T = floating;
+E = relative;
+B = [|SG...|];
+
+// write fp values such that they always have a decimal point
+procedure fp_write(v)
+{
+  if (v != nearestint(v)) then {
+   write(v);
+  } else {
+   write(v,".0");
+  };
+};
+
+// make a copy of list with f applied
+procedure list_apply(l,f)
+{
+  var r,v,i;
+
+  r = [||];
+
+  for i from 0 to length(l)-1 do {
+   v = f(l[i]);
+   r = r :. v;
+  };
+
+  return r;
+};
+
+f32_round = proc(n) { return single(n); };
+f64_round = proc(n) { return double(n); };
+
+// make a copy of list with elements rounded to binary32
+procedure list_to_f32(l) { return list_apply(l, f32_round); };
+procedure list_to_f64(l) { return list_apply(l, f64_round); };
+
+procedure list_of_even_coeff(p)
+{
+  var r,e,i;
+  e = degree(p);
+  r = [||];
+
+  for i from 0 to e by 2 do { r = r :. coeff(p,i); };
+
+  return r;
+};
+
+procedure list_of_odd_coeff(p)
+{
+  var r,e,i;
+  e = degree(p);
+  r = [||];
+
+  for i from 1 to e by 2 do { r = r :. coeff(p,i); };
+
+  return r;
+};
+
+procedure f32_write_list(p,l)
+{
+  var e,i;
+  e = length(l)-1;
+  write("const ", p,"_K: [f32; " @ length(l) @ "] = [\n");
+  display=hexadecimal!;
+  for i from 0 to e do { printsingle(l[i]); print(", //", l[i]); };
+  print("];\n");
+  display=decimal!;
+};
+
+procedure f32_data_write(name,a,f,r)
+{
+  var e,z,d,l,i;
+
+  // dump out the supnorm
+  display=hexadecimal!;
+  e = single(dirtyinfnorm(a-f, r));
+  write("// peak-error: ", e);
+  display=decimal!;
+  print(" (~", e,")");
+
+  // create a debug list of values of 'x' where the approximation
+  // error is max.
+  z = list_to_f32(dirtyfindzeros(diff(a-f), r));
+  l = length(z);
+
+
+  write("const " @ name @ "_EMAX: [f32; " @ l @ "] = [");
+
+  printsingle(z[0]);
+
+  for i from 1 to l-1 do {
+    write(",");
+    printsingle(z[i]);
+  };
+
+  z = list_to_f32(dirtyfindzeros(a-f, r));
+  l = length(z);
+
+  // and where the error is approximately zero
+  write("];\nconst " @ name @ "_EMIN: [f32; " @ l @ "] = [");
+
+  if (l > 0) then {
+    printsingle(z[0]);
+    for i from 1 to l-1 do {
+      write(",");
+      printsingle(z[i]);
+    };
+  };
+
+  // dump out the supnorm
+  e = single(dirtyinfnorm(a-f, r));
+  print("];\n");
+
+  a;
+};
+
+// Actual start of building polynomial
+
+print("sin(pi x) in binary32");
+I = [|1,3,5,7|];
+F = sin(pi*x);
+P = fpminimax(F,I,B,R,T,E);
+f32_data_write("F32_SIN_PI_7", P,F,R);
+f32_write_list("F32_SIN_PI_7", list_of_odd_coeff(P));
+print("\n");
+
+print("cos(pi x) in binary32");
+I = [|0,2,4,6,8|];
+F = cos(pi*x);
+P = fpminimax(F,I,B,R,T,E);
+f32_data_write("F32_COS_PI_8", P,F,R);
+f32_write_list("F32_COS_PI_8", list_of_even_coeff(P));
+print("\n");
+
+B = [|D,SG...|];
+
+print("tan(pi x) in binary32");
+I = [|1,3,5,7,9,11,13,15|];
+F = tan(pi*x);
+
+P = fpminimax(F,I,B,R,T,E);
+
+f32_data_write("F32_TAN_PI", P,F,R);
+f32_write_list("F32_TAN_PI", list_of_odd_coeff(P));
diff --git a/narcissus-maths/src/lib.rs b/narcissus-maths/src/lib.rs
index 0af05bf..ab5eda3 100644
--- a/narcissus-maths/src/lib.rs
+++ b/narcissus-maths/src/lib.rs
@@ -3,9 +3,12 @@ mod affine3;
 mod mat2;
 mod mat3;
 mod mat4;
+mod next_after_f32;
 mod point2;
 mod point3;
 mod quat;
+mod sin_cos_pi;
+mod tan_pi;
 mod vec2;
 mod vec3;
 mod vec4;
@@ -15,9 +18,12 @@ pub use affine3::Affine3;
 pub use mat2::Mat2;
 pub use mat3::Mat3;
 pub use mat4::Mat4;
+pub use next_after_f32::next_after_f32;
 pub use point2::Point2;
 pub use point3::Point3;
 pub use quat::Quat;
+pub use sin_cos_pi::sin_cos_pi_f32;
+pub use tan_pi::tan_pi_f32;
 pub use vec2::Vec2;
 pub use vec3::Vec3;
 pub use vec4::Vec4;
@@ -27,10 +33,12 @@ pub use vec4::Vec4;
 pub struct Rad(f32);
 
 impl Rad {
+    #[inline(always)]
     pub const fn new(x: f32) -> Self {
         Self(x)
     }
 
+    #[inline(always)]
     pub const fn as_f32(self) -> f32 {
         self.0
     }
@@ -41,10 +49,31 @@ impl Rad {
 pub struct Deg(f32);
 
 impl Deg {
+    #[inline(always)]
+    pub const fn new(x: f32) -> Self {
+        Self(x)
+    }
+
+    #[inline(always)]
+    pub const fn as_f32(self) -> f32 {
+        self.0
+    }
+}
+
+/// Unit type for an angle expressed in half-turns.
+///
+/// A turn represents a 360 degree rotation, a half-turn represents a 180 degree rotation. A
+/// half-turn is implicitly scaled by pi.
+#[derive(Clone, Copy, PartialEq, PartialOrd, Debug, Default)]
+pub struct HalfTurn(f32);
+
+impl HalfTurn {
+    #[inline(always)]
     pub const fn new(x: f32) -> Self {
         Self(x)
     }
 
+    #[inline(always)]
     pub const fn as_f32(self) -> f32 {
         self.0
     }
@@ -57,6 +86,12 @@ impl From<Rad> for Deg {
     }
 }
 
+impl From<HalfTurn> for Deg {
+    fn from(x: HalfTurn) -> Self {
+        Self(x.0 * 180.0)
+    }
+}
+
 impl From<Deg> for Rad {
     #[inline(always)]
     fn from(x: Deg) -> Self {
@@ -64,6 +99,27 @@ impl From<Deg> for Rad {
     }
 }
 
+impl From<HalfTurn> for Rad {
+    #[inline(always)]
+    fn from(x: HalfTurn) -> Self {
+        Self(x.0 * std::f32::consts::PI)
+    }
+}
+
+impl From<Rad> for HalfTurn {
+    #[inline(always)]
+    fn from(x: Rad) -> Self {
+        Self(x.0 / std::f32::consts::PI)
+    }
+}
+
+impl From<Deg> for HalfTurn {
+    #[inline(always)]
+    fn from(x: Deg) -> Self {
+        Self(x.0 / 180.0)
+    }
+}
+
 /// Returns the minimum of `x` and `y`.
 ///
 /// This function returns a platform dependent value if either of its inputs are `NaN`.
diff --git a/narcissus-maths/src/mat4.rs b/narcissus-maths/src/mat4.rs
index fe11369..c506275 100644
--- a/narcissus-maths/src/mat4.rs
+++ b/narcissus-maths/src/mat4.rs
@@ -1,4 +1,4 @@
-use crate::{Point2, Point3, Rad, Vec2, Vec3, Vec4};
+use crate::{sin_cos_pi_f32, HalfTurn, Point2, Point3, Rad, Vec2, Vec3, Vec4};
 
 /// 4x4 matrix.
 ///
@@ -111,8 +111,8 @@ impl Mat4 {
     }
 
     /// Constructs a transformation matrix which rotates around the given `axis` by `angle`.
-    pub fn from_axis_angle(axis: Vec3, angle: Rad) -> Mat4 {
-        let (sin, cos) = angle.as_f32().sin_cos();
+    pub fn from_axis_rotation(axis: Vec3, rotation: HalfTurn) -> Mat4 {
+        let (sin, cos) = sin_cos_pi_f32(rotation.as_f32());
         let axis_sin = axis * sin;
         let axis_sq = axis * axis;
         let one_minus_cos = 1.0 - cos;
@@ -489,8 +489,6 @@ impl std::ops::Mul<Point2> for Mat4 {
 
 #[cfg(test)]
 mod tests {
-    use crate::Deg;
-
     use super::*;
 
     const IDENTITY: Mat4 = Mat4::IDENTITY;
@@ -524,14 +522,17 @@ mod tests {
 
     #[test]
     fn axis_angle() {
-        let rot_180_x = Mat4::from_axis_angle(Vec3::X, Deg::new(180.0).into());
+        let rot_180_x = Mat4::from_axis_rotation(Vec3::X, HalfTurn::new(1.0));
         assert_eq!(rot_180_x * Vec3::X, Vec3::X);
-        // TODO: requires approximate equality assert.
-        // assert_eq!(rot_180_x * Vec3::Y, -Vec3::Y);
-        // assert_eq!(rot_180_x * Vec3::Z, -Vec3::Z);
-        let rot_180_y = Mat4::from_axis_angle(Vec3::Y, Deg::new(180.0).into());
+        assert_eq!(rot_180_x * Vec3::Y, -Vec3::Y);
+        assert_eq!(rot_180_x * Vec3::Z, -Vec3::Z);
+        let rot_180_y = Mat4::from_axis_rotation(Vec3::Y, HalfTurn::new(1.0));
+        assert_eq!(rot_180_y * Vec3::X, -Vec3::X);
         assert_eq!(rot_180_y * Vec3::Y, Vec3::Y);
-        let rot_180_z = Mat4::from_axis_angle(Vec3::Z, Deg::new(180.0).into());
+        assert_eq!(rot_180_y * Vec3::Z, -Vec3::Z);
+        let rot_180_z = Mat4::from_axis_rotation(Vec3::Z, HalfTurn::new(1.0));
+        assert_eq!(rot_180_z * Vec3::X, -Vec3::X);
+        assert_eq!(rot_180_z * Vec3::Y, -Vec3::Y);
         assert_eq!(rot_180_z * Vec3::Z, Vec3::Z);
     }
 
diff --git a/narcissus-maths/src/next_after_f32.rs b/narcissus-maths/src/next_after_f32.rs
new file mode 100644
index 0000000..0de54d3
--- /dev/null
+++ b/narcissus-maths/src/next_after_f32.rs
@@ -0,0 +1,57 @@
+/// Calculate the next representable floating-point value following x in the direction of y.
+///
+/// If y is less than x, these functions will return the largest representable number less than x.
+///
+/// # Returns
+///
+/// On success, the function returns the next representable floating-point value after x in the
+/// direction of y.
+///  
+/// * If `x` equals `y`, then `y` is returned.
+/// * If `x` or `y` is a `NaN`, a `NaN` is returned.
+/// * If `x` is finite, and the result would overflow, a range error occurs, and the function
+///   returns `inf` with the correct mathematical sign.
+/// * If `x` is not equal to `y`, and the correct function result would be subnormal, zero, or
+///   underflow, a range error occurs, and either the correct value (if it can be represented),
+///   or `0.0`, is returned.
+/// * If x equals y, the function returns y.
+pub fn next_after_f32(x: f32, y: f32) -> f32 {
+    if x.is_nan() || y.is_nan() {
+        return x + y;
+    }
+
+    let ux = x.to_bits();
+    let uy = y.to_bits();
+
+    if ux == uy {
+        return y;
+    }
+
+    let ax = ux & 0x7fff_ffff;
+    let ay = uy & 0x7fff_ffff;
+
+    let ux = if ax == 0 {
+        if ay == 0 {
+            return y;
+        }
+        (uy & 0x8000_0000) | 1
+    } else if ax > ay || ((ux ^ uy) & 0x8000_0000) != 0 {
+        ux - 1
+    } else {
+        ux + 1
+    };
+
+    let e = ux & 0x7f800000;
+    // Overflow if ux is infinite, and x is finite.
+    if e == 0x7f800000 {
+        return x + x;
+    }
+    // Force underflow if ux is subnormal or zero.
+    if e == 0 {
+        let mut force_eval = 0.0;
+        let val = f32::from_bits(ux);
+        unsafe { std::ptr::write_volatile(&mut force_eval, x * x + val * val) };
+    }
+
+    f32::from_bits(ux)
+}
diff --git a/narcissus-maths/src/sin_cos_pi.rs b/narcissus-maths/src/sin_cos_pi.rs
new file mode 100644
index 0000000..9d6f5fd
--- /dev/null
+++ b/narcissus-maths/src/sin_cos_pi.rs
@@ -0,0 +1,93 @@
+// Based on https://marc-b-reynolds.github.io/math/2020/03/11/SinCosPi.html
+//
+// Sollya code for generating these polynomials is in `doc/sincostan.sollya`
+
+// constants for sin(pi x), cos(pi x) for x on [-1/4,1/4]
+const F32_SIN_PI_7_K: [f32; 4] = unsafe {
+    std::mem::transmute::<[u32; 4], _>([
+        0x40490fdb, // 0x1.921fb6p1
+        0xc0a55df6, // -0x1.4abbecp2
+        0x40233590, // 0x1.466b2p1
+        0xbf17acc9, // -0x1.2f5992p-1
+    ])
+};
+
+const F32_COS_PI_8_K: [f32; 4] = unsafe {
+    std::mem::transmute::<[u32; 4], _>([
+        0xc09de9e6, // -0x1.3bd3ccp2
+        0x4081e0dc, // 0x1.03c1b8p2
+        0xbfaadbe7, // -0x1.55b7cep0
+        0x3e6b4255, // 0x1.d684aap-3
+    ])
+};
+
+#[inline(always)]
+fn mulsign_f32(x: f32, s: u32) -> f32 {
+    f32::from_bits(x.to_bits() ^ s)
+}
+
+/// Simultaneously computes the sine and cosine of `a` expressed in multiples of *pi* radians, or half-turns.
+///
+/// Returns `(sin(a * pi), cos(a * pi))`
+///
+/// # Examples
+///
+/// ```
+/// use narcissus_maths::sin_cos_pi_f32;
+/// let (sin, cos) = sin_cos_pi_f32(0.0);
+/// assert_eq!(sin, 0.0);
+/// assert_eq!(cos, 1.0);
+/// ```
+pub fn sin_cos_pi_f32(a: f32) -> (f32, f32) {
+    const S: [f32; 4] = F32_SIN_PI_7_K;
+    const C: [f32; 4] = F32_COS_PI_8_K;
+
+    // Range reduction.
+    let r = (a + a).round();
+    let i: u32 = unsafe { r.to_int_unchecked() };
+    let r = r.mul_add(-0.5, a);
+
+    let sx = (i >> 1) << 31;
+    let sy = (i << 31) ^ sx;
+
+    // Core approximation.
+    let r2 = r * r;
+    let r = mulsign_f32(r, sy);
+
+    let c = C[3];
+    let c = c.mul_add(r2, C[2]);
+    let c = c.mul_add(r2, C[1]);
+    let c = c.mul_add(r2, C[0]);
+    let c = c.mul_add(r2, 1.0);
+    let c = mulsign_f32(c, sx);
+
+    let r3 = r2 * r;
+    let s = S[3];
+    let s = s.mul_add(r2, S[2]);
+    let s = s.mul_add(r2, S[1]);
+    let s = r.mul_add(S[0], r3 * s);
+
+    if i & 1 != 0 {
+        (c, s)
+    } else {
+        (s, c)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::sin_cos_pi_f32;
+
+    #[test]
+    fn basics() {
+        assert_eq!(sin_cos_pi_f32(-1.5), (1.0, 0.0));
+        assert_eq!(sin_cos_pi_f32(-1.0), (0.0, -1.0));
+        assert_eq!(sin_cos_pi_f32(-0.5), (-1.0, 0.0));
+        assert_eq!(sin_cos_pi_f32(-0.0), (0.0, 1.0));
+
+        assert_eq!(sin_cos_pi_f32(0.0), (0.0, 1.0));
+        assert_eq!(sin_cos_pi_f32(0.5), (1.0, 0.0));
+        assert_eq!(sin_cos_pi_f32(1.0), (0.0, -1.0));
+        assert_eq!(sin_cos_pi_f32(1.5), (-1.0, 0.0));
+    }
+}
diff --git a/narcissus-maths/src/tan_pi.rs b/narcissus-maths/src/tan_pi.rs
new file mode 100644
index 0000000..e220f99
--- /dev/null
+++ b/narcissus-maths/src/tan_pi.rs
@@ -0,0 +1,77 @@
+// Based on Norbert Juffa's tanpi posted to the cuda forums. Using my own polynomial, but that might
+// be worse, todo: check whether polynomial is worse.
+// https://forums.developer.nvidia.com/t/an-implementation-of-single-precision-tanpi-for-cuda/48024
+//
+// Sollya code for generating these polynomials is in `doc/sincostan.sollya`
+
+const F32_TAN_PI_15_K: [f32; 7] = unsafe {
+    std::mem::transmute::<[u32; 7], _>([
+        0x41255def, // 0x1.4abbdep3
+        0x4223335b, // 0x1.4666b6p5
+        0x43234f4b, // 0x1.469e96p7
+        0x441e4604, // 0x1.3c8c08p9
+        0x4548fad9, // 0x1.91f5b2p11
+        0xc21a6851, // -0x1.34d0a2p5
+        0x47f775a0, // 0x1.eeeb4p16
+    ])
+};
+
+/// Computes the tangent of `a` expressed in multiples of *pi* radians, or half-turns.
+///
+/// Returns `tan(a * pi)`
+///
+/// # Examples
+///
+/// ```
+/// use narcissus_maths::tan_pi_f32;
+/// ```
+pub fn tan_pi_f32(a: f32) -> f32 {
+    const T: [f32; 7] = F32_TAN_PI_15_K;
+
+    // Range reduction.
+    let r = (a + a).round();
+    let i: u32 = unsafe { r.to_int_unchecked() };
+    let r = r.mul_add(-0.5, a);
+
+    let e = if i.wrapping_add(1) & 2 != 0 {
+        -0.0
+    } else {
+        0.0
+    };
+
+    // Core approximation.
+    let r2 = r * r;
+    let p = T[6];
+    let p = p.mul_add(r2, T[5]);
+    let p = p.mul_add(r2, T[4]);
+    let p = p.mul_add(r2, T[3]);
+    let p = p.mul_add(r2, T[2]);
+    let p = p.mul_add(r2, T[1]);
+    let p = p.mul_add(r2, T[0]);
+
+    let t = r2 * r;
+    let t = p.mul_add(t, -8.742278e-8 * r);
+    let r = r.mul_add(std::f32::consts::PI, t);
+
+    // Handle half-integer arguments.
+    let r = if r == 0.0 { e } else { r };
+    let r = if i & 1 == 1 { 1.0 / -r } else { r };
+
+    // Handle integer arguments.
+    if a == a.floor() {
+        a * e
+    } else {
+        r
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::tan_pi_f32;
+
+    #[test]
+    fn basics() {
+        assert_eq!(tan_pi_f32(0.0), 0.0);
+        assert_eq!(tan_pi_f32(0.25), 1.0);
+    }
+}
diff --git a/narcissus-maths/tests/exhaustive_f32.rs b/narcissus-maths/tests/exhaustive_f32.rs
new file mode 100644
index 0000000..9d9f85c
--- /dev/null
+++ b/narcissus-maths/tests/exhaustive_f32.rs
@@ -0,0 +1,246 @@
+/// Rough translation of Paul Zimmermann's binary32_exhaustive to Rust.
+/// https://gitlab.inria.fr/zimmerma/math_accuracy/-/blob/master/binary32_exhaustive/check_exhaustive.c
+use std::{
+    ops::MulAssign,
+    sync::atomic::{AtomicUsize, Ordering},
+};
+
+use narcissus_maths::{next_after_f32, sin_cos_pi_f32, tan_pi_f32};
+use rug::{
+    ops::{AssignRound, DivAssignRound},
+    Assign,
+};
+
+fn ulp_error_f32(our_val: f32, ref_val: f32, x: f32, reference: fn(&mut rug::Float)) -> u32 {
+    if our_val == ref_val {
+        return 0;
+    }
+
+    let mut our_val = our_val;
+    let mut ref_val = ref_val;
+
+    if ref_val.is_infinite() {
+        if our_val.is_infinite() {
+            // Then ours and reference have different signs.
+            assert!(our_val * ref_val < 0.0);
+            return u32::MAX;
+        }
+
+        our_val /= 2.0;
+        let mut ref_val_2 = rug::Float::new(24);
+        ref_val_2.assign_round(x, rug::float::Round::Down);
+        reference(&mut ref_val_2);
+        ref_val_2.div_assign_round(2, rug::float::Round::Down);
+        ref_val = ref_val_2.to_f32_round(rug::float::Round::Down);
+        if ref_val.is_infinite() {
+            ref_val = if ref_val > 0.0 {
+                f32::from_bits(0x7f00_0000)
+            } else {
+                f32::from_bits(0xff00_0000)
+            };
+        }
+    }
+
+    if our_val.is_infinite() {
+        assert!(!ref_val.is_infinite());
+        // If ours gives +/-Inf but the correct rounding is in the binary32 range, assume we gave
+        // +/-2^128.
+        ref_val /= 2.0;
+        our_val = if our_val > 0.0 {
+            f32::from_bits(0x7f00_0000)
+        } else {
+            f32::from_bits(0xff00_0000)
+        }
+    }
+
+    let err = our_val - ref_val;
+    let ulp = next_after_f32(ref_val, our_val) - ref_val;
+    let err = (err / ulp).abs();
+    if err >= u32::MAX as f32 {
+        u32::MAX
+    } else {
+        err as u32
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+struct TestRange {
+    base: u32,
+    end: u32,
+}
+
+#[derive(Clone, Copy, Debug, Default)]
+struct FloatErrors {
+    num_errors: u32,
+    max_error_ulp: u32,
+    max_error_val: u32,
+}
+
+const PREC: u32 = 280;
+
+struct FloatCheck {
+    reference: fn(&mut rug::Float),
+    ours: fn(f32) -> f32,
+
+    tan_mode: bool,
+
+    pi: rug::Float,
+    tmp: rug::Float,
+
+    errors: FloatErrors,
+}
+
+impl FloatCheck {
+    fn new(reference: fn(&mut rug::Float), ours: fn(f32) -> f32, tan_mode: bool) -> Self {
+        Self {
+            reference,
+            ours,
+            tan_mode,
+            pi: rug::Float::with_val(PREC, rug::float::Constant::Pi),
+            tmp: rug::Float::new(PREC),
+            errors: Default::default(),
+        }
+    }
+
+    #[inline(always)]
+    fn check(&mut self, u: u32) {
+        let x = f32::from_bits(u);
+        assert!(x.is_finite());
+
+        let our_val = (self.ours)(x);
+
+        if self.tan_mode {
+            let fract = x.fract();
+            if fract == 0.5 || fract == -0.5 {
+                assert!(our_val.is_infinite());
+                return;
+            }
+        }
+
+        self.tmp.assign(x);
+        self.tmp.mul_assign(&self.pi);
+        (self.reference)(&mut self.tmp);
+        self.tmp.subnormalize_ieee();
+        let ref_val = self.tmp.to_f32_round(rug::float::Round::Nearest);
+
+        if our_val != ref_val && !(our_val.is_nan() && ref_val.is_nan()) {
+            println!("u: {u:#08x}, our_val: {our_val}, ref_val: {ref_val}");
+
+            self.errors.num_errors += 1;
+            let e = ulp_error_f32(our_val, ref_val, x, self.reference);
+            if e > self.errors.max_error_ulp {
+                self.errors.max_error_ulp = e;
+                self.errors.max_error_val = u;
+            }
+        }
+    }
+}
+
+fn check_exhaustive_f32(reference: fn(&mut rug::Float), ours: fn(f32) -> f32, tan_mode: bool) {
+    const COUNT: u32 = 2_139_095_040_u32;
+    const SPLIT: u32 = 256;
+
+    // Generate ranges.
+    assert_eq!(COUNT % SPLIT, 0);
+    let per_split = COUNT / SPLIT;
+
+    let work_index = AtomicUsize::new(0);
+    let mut work = (0..SPLIT)
+        .map(|i| {
+            let base = i * per_split;
+            let end = i * per_split + per_split;
+            TestRange { base, end }
+        })
+        .collect::<Vec<_>>();
+
+    // Try and start with big numbers to avoid reallocs?
+    work.reverse();
+
+    let count = AtomicUsize::new(0);
+
+    let mut errors = FloatErrors::default();
+
+    // Spawn threads.
+    std::thread::scope(|s| {
+        let num_threads = std::thread::available_parallelism().unwrap().get();
+
+        let threads = (0..num_threads)
+            .map(|_| {
+                s.spawn(|| {
+                    let mut float_check = FloatCheck::new(reference, ours, tan_mode);
+                    loop {
+                        let index = work_index.fetch_add(1, Ordering::SeqCst);
+                        if let Some(range) = work.get(index) {
+                            let start = std::time::Instant::now();
+                            for i in range.base..range.end {
+                                float_check.check(i);
+                            }
+                            for i in range.base..range.end {
+                                float_check.check(0x8000_0000 + i);
+                            }
+
+                            let i = count.fetch_add(1, Ordering::SeqCst);
+                            println!(
+                                "{:.1}% chunk {index} took {:?}",
+                                ((i + 1) as f32 * (1.0 / SPLIT as f32)) * 100.0,
+                                std::time::Instant::now() - start
+                            );
+
+                            continue;
+                        }
+                        break;
+                    }
+                    float_check.errors
+                })
+            })
+            .collect::<Vec<_>>();
+
+        for thread in threads {
+            let thread_errors = thread.join().unwrap();
+            errors.num_errors += thread_errors.num_errors;
+            if thread_errors.max_error_ulp > errors.max_error_ulp {
+                errors.max_error_ulp = thread_errors.max_error_ulp;
+                errors.max_error_val = thread_errors.max_error_val;
+            }
+        }
+    });
+
+    println!(
+        "errors: {} ({:.2}%)",
+        errors.num_errors,
+        (errors.num_errors as f64 / (COUNT * 2) as f64) * 100.0
+    );
+    println!("max error ulps: {}", errors.max_error_ulp);
+    println!("max error error at: {:#08x}", errors.max_error_val);
+    assert_eq!(errors.num_errors, 0);
+}
+
+fn ref_sin_pi_f32(x: &mut rug::Float) {
+    x.sin_mut();
+}
+
+fn ref_cos_pi_f32(x: &mut rug::Float) {
+    x.cos_mut();
+}
+
+fn ref_tan_pi_f32(x: &mut rug::Float) {
+    x.tan_mut();
+}
+
+#[test]
+#[ignore]
+pub fn exhaustive_sin_pi() {
+    check_exhaustive_f32(ref_sin_pi_f32, |a| sin_cos_pi_f32(a).0, false)
+}
+
+#[test]
+#[ignore]
+pub fn exhaustive_cos_pi() {
+    check_exhaustive_f32(ref_cos_pi_f32, |a| sin_cos_pi_f32(a).1, false)
+}
+
+#[test]
+#[ignore]
+pub fn exhaustive_tan_pi() {
+    check_exhaustive_f32(ref_tan_pi_f32, tan_pi_f32, true)
+}
diff --git a/narcissus-maths/tests/next_after_f32.rs b/narcissus-maths/tests/next_after_f32.rs
new file mode 100644
index 0000000..e03e099
--- /dev/null
+++ b/narcissus-maths/tests/next_after_f32.rs
@@ -0,0 +1,43 @@
+use narcissus_maths::next_after_f32;
+
+mod libc {
+    use std::ffi::c_float;
+    extern "C" {
+        pub fn nextafterf(x: c_float, y: c_float) -> c_float;
+    }
+}
+
+#[inline(always)]
+fn nextafterf(x: f32, y: f32) -> f32 {
+    unsafe { libc::nextafterf(x, y) }
+}
+
+fn test_towards(y: f32) {
+    for u in 0..=0xffff_ffff_u32 {
+        let x = f32::from_bits(u);
+        let ours = next_after_f32(x, y);
+        let reference = nextafterf(x, y);
+        assert!(
+            ours == reference || (ours.is_nan() && reference.is_nan()),
+            "x ({u:X}): ours ({ours}) != reference ({reference})"
+        );
+    }
+}
+
+#[test]
+#[ignore]
+fn next_after_f32_to_zero() {
+    test_towards(0.0)
+}
+
+#[test]
+#[ignore]
+fn next_after_f32_to_inf() {
+    test_towards(f32::INFINITY)
+}
+
+#[test]
+#[ignore]
+fn next_after_f32_to_neg_inf() {
+    test_towards(f32::NEG_INFINITY)
+}