+#[allow(unused)]
+#[cfg(target_arch = "x86_64")]
+mod x86_64 {
+ use std::arch::x86_64::*;
+ use std::f32::consts::PI;
+
+ use crate::sin_cos_pi::{F32_COS_PI_8_K, F32_SIN_PI_7_K};
+
+ #[inline(always)]
+ fn to_m128(x: __m128i) -> __m128 {
+ // SAFETY: Unconditionally safe.
+ unsafe { std::mem::transmute::<__m128i, __m128>(x) }
+ }
+
+ #[inline(always)]
+ fn to_m128i(x: __m128) -> __m128i {
+ // SAFETY: Unconditionally safe.
+ unsafe { std::mem::transmute::<__m128, __m128i>(x) }
+ }
+
+ #[target_feature(enable = "sse4.1,fma")]
+ pub unsafe fn sin_cos_pi_f32_sse41_fma(a: f32) -> (f32, f32) {
+ const S: [f32; 3] = F32_SIN_PI_7_K;
+ const C: [f32; 4] = F32_COS_PI_8_K;
+
+ let mut sin_out = 0.0;
+ let mut cos_out = 0.0;
+
+ unsafe {
+ let a = _mm_load_ss(&a);
+ let a_zero = _mm_mul_ss(_mm_setzero_ps(), a);
+ let a_abs = _mm_andnot_ps(_mm_load_ss(&-0.0), a);
+ let cmp_mask = _mm_cmplt_ss(a_abs, _mm_load_ss(&16777216.0));
+ let a = _mm_blendv_ps(a_zero, a, cmp_mask);
+ let a2 = _mm_add_ss(a, a);
+ let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a2, a2);
+ let i = _mm_cvttps_epi32(r);
+
+ let r = _mm_fmadd_ss(r, _mm_load_ss(&-0.5), a);
+ let r_sq = _mm_mul_ss(r, r);
+
+ let i_lsb = _mm_slli_epi32::<31>(i);
+ let x_sign = _mm_slli_epi32::<31>(_mm_srli_epi32::<1>(i));
+ let y_sign = _mm_xor_si128(i_lsb, x_sign);
+ let r_sign = to_m128(_mm_xor_si128(to_m128i(r), y_sign));
+ let r_sq_sign = to_m128(_mm_xor_si128(to_m128i(r_sq), x_sign));
+ let one_sign = to_m128(_mm_xor_si128(to_m128i(_mm_load_ss(&1.0)), x_sign));
+
+ let c = _mm_load_ss(&C[3]);
+ let c = _mm_fmadd_ss(c, r_sq, _mm_load_ss(&C[2]));
+ let c = _mm_fmadd_ss(c, r_sq, _mm_load_ss(&C[1]));
+ let c = _mm_fmadd_ss(c, r_sq, _mm_load_ss(&C[0]));
+ let c = _mm_fmadd_ss(c, r_sq_sign, one_sign);
+
+ let s = _mm_load_ss(&S[2]);
+ let s = _mm_fmadd_ss(s, r_sq, _mm_load_ss(&S[1]));
+ let s = _mm_fmadd_ss(s, r_sq, _mm_load_ss(&S[0]));
+ let s = _mm_fmadd_ss(r_sq, s, _mm_load_ss(&-8.742278e-8));
+ let s = _mm_fmadd_ss(r_sign, _mm_load_ss(&PI), _mm_mul_ss(r_sign, s));
+
+ let sin = _mm_blendv_ps(s, c, to_m128(i_lsb));
+ let cos = _mm_blendv_ps(c, s, to_m128(i_lsb));
+
+ let a_floor = _mm_round_ss::<_MM_FROUND_FLOOR>(a, a);
+ let positive_int_mask = _mm_cmpeq_ss(a, a_floor);
+ let a_zero = _mm_mul_ss(_mm_setzero_ps(), a);
+ let sin = _mm_blendv_ps(sin, a_zero, positive_int_mask);
+
+ _mm_store_ss(&mut sin_out, sin);
+ _mm_store_ss(&mut cos_out, cos);
+ }
+
+ (sin_out, cos_out)
+ }
+
+ #[allow(unused)]
+ #[cfg(target_arch = "x86_64")]
+ #[target_feature(enable = "sse4.1,fma")]
+ pub unsafe fn sin_cos_pi_f32x4_sse41_fma(
+ a: &[f32; 4],
+ sin_out: &mut [f32; 4],
+ cos_out: &mut [f32; 4],
+ ) {
+ const S: [f32; 3] = F32_SIN_PI_7_K;
+ const C: [f32; 4] = F32_COS_PI_8_K;
+
+ unsafe {
+ let a = _mm_loadu_ps(a.as_ptr());
+ let a_zero = _mm_mul_ps(_mm_setzero_ps(), a);
+ let a_abs = _mm_andnot_ps(_mm_set1_ps(-0.0), a);
+ let cmp_mask = _mm_cmplt_ps(a_abs, _mm_set1_ps(16777216.0));
+ let a = _mm_blendv_ps(a_zero, a, cmp_mask);
+ let r = _mm_round_ps::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps(a, a));
+ let i = _mm_cvttps_epi32(r);
+
+ let r = _mm_fmadd_ps(r, _mm_set1_ps(-0.5), a);
+ let r_sq = _mm_mul_ps(r, r);
+
+ let i_msb = _mm_slli_epi32::<31>(i);
+ let x_sign = _mm_slli_epi32::<31>(_mm_srli_epi32::<1>(i));
+ let y_sign = _mm_xor_si128(i_msb, x_sign);
+ let r_sign = to_m128(_mm_xor_si128(to_m128i(r), y_sign));
+ let r_sq_sign = to_m128(_mm_xor_si128(to_m128i(r_sq), x_sign));
+ let one_sign = to_m128(_mm_xor_si128(to_m128i(_mm_set1_ps(1.0)), x_sign));
+
+ let c = _mm_load1_ps(&C[3]);
+ let c = _mm_fmadd_ps(c, r_sq, _mm_load1_ps(&C[2]));
+ let c = _mm_fmadd_ps(c, r_sq, _mm_load1_ps(&C[1]));
+ let c = _mm_fmadd_ps(c, r_sq, _mm_load1_ps(&C[0]));
+ let c = _mm_fmadd_ps(c, r_sq_sign, one_sign);
+
+ let s = _mm_load1_ps(&S[2]);
+ let s = _mm_fmadd_ps(s, r_sq, _mm_load1_ps(&S[1]));
+ let s = _mm_fmadd_ps(s, r_sq, _mm_load1_ps(&S[0]));
+ let s = _mm_fmadd_ps(r_sq, s, _mm_set1_ps(-8.742278e-8));
+ let s = _mm_fmadd_ps(r_sign, _mm_set1_ps(PI), _mm_mul_ps(r_sign, s));
+
+ let sin = _mm_blendv_ps(s, c, to_m128(i_msb));
+ let cos = _mm_blendv_ps(c, s, to_m128(i_msb));
+
+ let a_floor = _mm_round_ps::<_MM_FROUND_FLOOR>(a);
+ let positive_int_mask = _mm_cmpeq_ps(a, a_floor);
+ let a_zero = _mm_mul_ps(_mm_setzero_ps(), a);
+ let sin = _mm_blendv_ps(sin, a_zero, positive_int_mask);
+
+ _mm_storeu_ps(sin_out.as_mut_ptr(), sin);
+ _mm_storeu_ps(cos_out.as_mut_ptr(), cos);
+ }
+ }
+}
+