bitshifter
diff --git a/codegen/templates/quat.rs.tera b/codegen/templates/quat.rs.tera
+26-103
diff --git a/src/f32/coresimd/quat.rs b/src/f32/coresimd/quat.rs
+15-20
diff --git a/src/f32/neon/quat.rs b/src/f32/neon/quat.rs
+21-34
@@ -750,6 +750,12 @@ impl {{ self_t }} {
         {{ vec4_t }}::from(self).abs_diff_eq({{ vec4_t }}::from(rhs), max_abs_diff)
     }
 
+    /// Blends `self` and `end` with weights `1.0 - s` and `s`, then
+    /// normalizes the sum.
+    ///
+    /// `end` is used exactly as given: no sign correction is applied here.
+    #[inline(always)]
+    #[must_use]
+    fn lerp_impl(self, end: Self, s: {{ scalar_t }}) -> Self {
+        (self * (1.0 - s) + end * s).normalize()
+    }
+
     /// Performs a linear interpolation between `self` and `rhs` based on
     /// the value `s`.
     ///
@@ -767,69 +773,41 @@ impl {{ self_t }} {
         glam_assert!(end.is_normalized());
 
         {% if is_scalar %}
-            let start = self;
-            let dot = start.dot(end);
+            let dot = self.dot(end);
             let bias = if dot >= 0.0 { 1.0 } else { -1.0 };
-            let interpolated = start.add(end.mul(bias).sub(start).mul(s));
-            interpolated.normalize()
+            self.lerp_impl(end * bias, s) 
         {% elif is_sse2 %}
             const NEG_ZERO: __m128 = m128_from_f32x4([-0.0; 4]);
-            let start = self.0;
-            let end = end.0;
             unsafe {
-                let dot = dot4_into_m128(start, end);
+                let dot = dot4_into_m128(self.0, end.0);
                 // Calculate the bias, if the dot product is positive or zero, there is no bias
                 // but if it is negative, we want to flip the 'end' rotation XYZW components
                 let bias = _mm_and_ps(dot, NEG_ZERO);
-                let interpolated = _mm_add_ps(
-                    _mm_mul_ps(_mm_sub_ps(_mm_xor_ps(end, bias), start), _mm_set_ps1(s)),
-                    start,
-                );
-                {{ self_t }}(interpolated).normalize()
+                self.lerp_impl(Self(_mm_xor_ps(end.0, bias)), s)
             }
         {% elif is_wasm32 %}
             const NEG_ZERO: v128 = v128_from_f32x4([-0.0; 4]);
-            let start = self.0;
-            let end = end.0;
-            let dot = dot4_into_v128(start, end);
+            let dot = dot4_into_v128(self.0, end.0);
             // Calculate the bias, if the dot product is positive or zero, there is no bias
             // but if it is negative, we want to flip the 'end' rotation XYZW components
             let bias = v128_and(dot, NEG_ZERO);
-            let interpolated = f32x4_add(
-                f32x4_mul(f32x4_sub(v128_xor(end, bias), start), f32x4_splat(s)),
-                start,
-            );
-            {{ self_t }}(interpolated).normalize()
+            self.lerp_impl(Self(v128_xor(end.0, bias)), s)
         {% elif is_coresimd %}
             const NEG_ZERO: f32x4 = f32x4::from_array([-0.0; 4]);
-            let start = self.0;
-            let end = end.0;
-            let dot = dot4_into_f32x4(start, end);
+            let dot = dot4_into_f32x4(self.0, end.0);
             // Calculate the bias, if the dot product is positive or zero, there is no bias
             // but if it is negative, we want to flip the 'end' rotation XYZW components
             let bias = f32x4_bitand(dot, NEG_ZERO);
-            let interpolated = start + ((f32x4_bitxor(end, bias) - start) * f32x4::splat(s));
-            {{ self_t }}(interpolated).normalize()
+            self.lerp_impl(Self(f32x4_bitxor(end.0, bias)), s)
         {% elif is_neon %}
             const NEG_ZERO: float32x4_t = f32x4_from_array([-0.0; 4]);
-            let start = self.0;
-            let end = end.0;
             unsafe {
-                let dot = dot4_into_f32x4(start, end);
+                let dot = dot4_into_f32x4(self.0, end.0);
                 // Calculate the bias, if the dot product is positive or zero, there is no bias
                 // but if it is negative, we want to flip the 'end' rotation XYZW components
                 let bias = vandq_u32(vreinterpretq_u32_f32(dot), vreinterpretq_u32_f32(NEG_ZERO));
-                let interpolated = vaddq_f32(
-                    vmulq_f32(
-                        vsubq_f32(
-                            vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(end), bias)),
-                            start,
-                        ),
-                        vld1q_dup_f32(&s),
-                    ),
-                    start,
-                );
-                {{ self_t }}(interpolated).normalize()
+                self.lerp_impl(
+                    Self(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(end.0), bias))), s)
             }
         {% else %}
             unimplemented!()
@@ -852,8 +830,6 @@ impl {{ self_t }} {
         glam_assert!(self.is_normalized());
         glam_assert!(end.is_normalized());
 
-        const DOT_THRESHOLD: {{ scalar_t }} = 0.9995;
-
         // Note that a rotation can be represented by two quaternions: `q` and
         // `-q`. The slerp path between `q` and `end` will be different from the
         // path between `-q` and `end`. One path will take the long way around and
@@ -866,20 +842,13 @@ impl {{ self_t }} {
             dot = -dot;
         }
 
+        const DOT_THRESHOLD: {{ scalar_t }} = 1.0 - {{ scalar_t }}::EPSILON;
         if dot > DOT_THRESHOLD {
-            // assumes lerp returns a normalized quaternion
-            self.lerp(end, s)
+            // if above threshold perform linear interpolation to avoid divide by zero
+            self.lerp_impl(end, s)
         } else {
             let theta = math::acos_approx(dot);
-            {% if is_scalar %}
-                let scale1 = math::sin(theta * (1.0 - s));
-                let scale2 = math::sin(theta * s);
-                let theta_sin = math::sin(theta);
-
-                self.mul(scale1)
-                    .add(end.mul(scale2))
-                    .mul(1.0 / theta_sin)
-            {% elif is_sse2 %}
+            {% if is_sse2 %}
                 let x = 1.0 - s;
                 let y = s;
                 let z = 1.0;
@@ -897,57 +866,11 @@ impl {{ self_t }} {
                         theta_sin,
                     ))
                 }
-            {% elif is_wasm32 %}
-                // TODO: v128_sin is broken
-                // let x = 1.0 - s;
-                // let y = s;
-                // let z = 1.0;
-                // let w = 0.0;
-                // let tmp = f32x4_mul(f32x4_splat(theta), f32x4(x, y, z, w));
-                // let tmp = v128_sin(tmp);
-                let x = math::sin(theta * (1.0 - s));
-                let y = math::sin(theta * s);
-                let z = math::sin(theta);
-                let w = 0.0;
-                let tmp = f32x4(x, y, z, w);
-
-                let scale1 = i32x4_shuffle::<0, 0, 4, 4>(tmp, tmp);
-                let scale2 = i32x4_shuffle::<1, 1, 5, 5>(tmp, tmp);
-                let theta_sin = i32x4_shuffle::<2, 2, 6, 6>(tmp, tmp);
-
-                Self(f32x4_div(
-                    f32x4_add(f32x4_mul(self.0, scale1), f32x4_mul(end.0, scale2)),
-                    theta_sin,
-                ))
-            {% elif is_coresimd %}
-                let x = math::sin(theta * (1.0 - s));
-                let y = math::sin(theta * s);
-                let z = math::sin(theta);
-                let w = 0.0;
-                let tmp = f32x4::from_array([x, y, z, w]);
-
-                let scale1 = simd_swizzle!(tmp, [0, 0, 0, 0]);
-                let scale2 = simd_swizzle!(tmp, [1, 1, 1, 1]);
-                let theta_sin = simd_swizzle!(tmp, [2, 2, 2, 2]);
-
-                Self(self.0.mul(scale1).add(end.0.mul(scale2)).div(theta_sin))
-            {% elif is_neon %}
-                let x = math::sin(theta * (1.0 - s));
-                let y = math::sin(theta * s);
-                let z = math::sin(theta);
-                let w = 0.0;
-                unsafe {
-                    let tmp = vld1q_f32([x, y, z, w].as_ptr());
-
-                    let scale1 = vdupq_laneq_f32(tmp, 0);
-                    let scale2 = vdupq_laneq_f32(tmp, 1);
-                    let theta_sin = vdupq_laneq_f32(tmp, 2);
-
-                    Self(vdivq_f32(
-                        vaddq_f32(vmulq_f32(self.0, scale1), vmulq_f32(end.0, scale2)),
-                        theta_sin,
-                    ))
-                }
+            {% else %}
+                let scale1 = math::sin(theta * (1.0 - s));
+                let scale2 = math::sin(theta * s);
+                let theta_sin = math::sin(theta);
+                ((self * scale1) + (end * scale2)) * (1.0 / theta_sin)
             {% endif %}
         }
     }
 
@@ -606,6 +606,12 @@ impl Quat {
         Vec4::from(self).abs_diff_eq(Vec4::from(rhs), max_abs_diff)
     }
 
+    /// Blends `self` and `end` with weights `1.0 - s` and `s`, then
+    /// normalizes the sum.
+    ///
+    /// `end` is used exactly as given: no sign correction is applied here.
+    #[inline(always)]
+    #[must_use]
+    fn lerp_impl(self, end: Self, s: f32) -> Self {
+        (self * (1.0 - s) + end * s).normalize()
+    }
+
     /// Performs a linear interpolation between `self` and `rhs` based on
     /// the value `s`.
     ///
@@ -623,14 +629,11 @@ impl Quat {
         glam_assert!(end.is_normalized());
 
         const NEG_ZERO: f32x4 = f32x4::from_array([-0.0; 4]);
-        let start = self.0;
-        let end = end.0;
-        let dot = dot4_into_f32x4(start, end);
+        let dot = dot4_into_f32x4(self.0, end.0);
         // Calculate the bias, if the dot product is positive or zero, there is no bias
         // but if it is negative, we want to flip the 'end' rotation XYZW components
         let bias = f32x4_bitand(dot, NEG_ZERO);
-        let interpolated = start + ((f32x4_bitxor(end, bias) - start) * f32x4::splat(s));
-        Quat(interpolated).normalize()
+        self.lerp_impl(Self(f32x4_bitxor(end.0, bias)), s)
     }
 
     /// Performs a spherical linear interpolation between `self` and `end`
@@ -649,8 +652,6 @@ impl Quat {
         glam_assert!(self.is_normalized());
         glam_assert!(end.is_normalized());
 
-        const DOT_THRESHOLD: f32 = 0.9995;
-
         // Note that a rotation can be represented by two quaternions: `q` and
         // `-q`. The slerp path between `q` and `end` will be different from the
         // path between `-q` and `end`. One path will take the long way around and
@@ -663,23 +664,17 @@ impl Quat {
             dot = -dot;
         }
 
+        const DOT_THRESHOLD: f32 = 1.0 - f32::EPSILON;
         if dot > DOT_THRESHOLD {
-            // assumes lerp returns a normalized quaternion
-            self.lerp(end, s)
+            // if above threshold perform linear interpolation to avoid divide by zero
+            self.lerp_impl(end, s)
         } else {
             let theta = math::acos_approx(dot);
 
-            let x = math::sin(theta * (1.0 - s));
-            let y = math::sin(theta * s);
-            let z = math::sin(theta);
-            let w = 0.0;
-            let tmp = f32x4::from_array([x, y, z, w]);
-
-            let scale1 = simd_swizzle!(tmp, [0, 0, 0, 0]);
-            let scale2 = simd_swizzle!(tmp, [1, 1, 1, 1]);
-            let theta_sin = simd_swizzle!(tmp, [2, 2, 2, 2]);
-
-            Self(self.0.mul(scale1).add(end.0.mul(scale2)).div(theta_sin))
+            let scale1 = math::sin(theta * (1.0 - s));
+            let scale2 = math::sin(theta * s);
+            let theta_sin = math::sin(theta);
+            ((self * scale1) + (end * scale2)) * (1.0 / theta_sin)
         }
     }
 
 
@@ -611,6 +611,12 @@ impl Quat {
         Vec4::from(self).abs_diff_eq(Vec4::from(rhs), max_abs_diff)
     }
 
+    /// Blends `self` and `end` with weights `1.0 - s` and `s`, then
+    /// normalizes the sum.
+    ///
+    /// `end` is used exactly as given: no sign correction is applied here.
+    #[inline(always)]
+    #[must_use]
+    fn lerp_impl(self, end: Self, s: f32) -> Self {
+        (self * (1.0 - s) + end * s).normalize()
+    }
+
     /// Performs a linear interpolation between `self` and `rhs` based on
     /// the value `s`.
     ///
@@ -628,24 +634,18 @@ impl Quat {
         glam_assert!(end.is_normalized());
 
         const NEG_ZERO: float32x4_t = f32x4_from_array([-0.0; 4]);
-        let start = self.0;
-        let end = end.0;
         unsafe {
-            let dot = dot4_into_f32x4(start, end);
+            let dot = dot4_into_f32x4(self.0, end.0);
             // Calculate the bias, if the dot product is positive or zero, there is no bias
             // but if it is negative, we want to flip the 'end' rotation XYZW components
             let bias = vandq_u32(vreinterpretq_u32_f32(dot), vreinterpretq_u32_f32(NEG_ZERO));
-            let interpolated = vaddq_f32(
-                vmulq_f32(
-                    vsubq_f32(
-                        vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(end), bias)),
-                        start,
-                    ),
-                    vld1q_dup_f32(&s),
-                ),
-                start,
-            );
-            Quat(interpolated).normalize()
+            self.lerp_impl(
+                Self(vreinterpretq_f32_u32(veorq_u32(
+                    vreinterpretq_u32_f32(end.0),
+                    bias,
+                ))),
+                s,
+            )
         }
     }
 
@@ -665,8 +665,6 @@ impl Quat {
         glam_assert!(self.is_normalized());
         glam_assert!(end.is_normalized());
 
-        const DOT_THRESHOLD: f32 = 0.9995;
-
         // Note that a rotation can be represented by two quaternions: `q` and
         // `-q`. The slerp path between `q` and `end` will be different from the
         // path between `-q` and `end`. One path will take the long way around and
@@ -679,28 +677,17 @@ impl Quat {
             dot = -dot;
         }
 
+        const DOT_THRESHOLD: f32 = 1.0 - f32::EPSILON;
         if dot > DOT_THRESHOLD {
-            // assumes lerp returns a normalized quaternion
-            self.lerp(end, s)
+            // if above threshold perform linear interpolation to avoid divide by zero
+            self.lerp_impl(end, s)
         } else {
             let theta = math::acos_approx(dot);
 
-            let x = math::sin(theta * (1.0 - s));
-            let y = math::sin(theta * s);
-            let z = math::sin(theta);
-            let w = 0.0;
-            unsafe {
-                let tmp = vld1q_f32([x, y, z, w].as_ptr());
-
-                let scale1 = vdupq_laneq_f32(tmp, 0);
-                let scale2 = vdupq_laneq_f32(tmp, 1);
-                let theta_sin = vdupq_laneq_f32(tmp, 2);
-
-                Self(vdivq_f32(
-                    vaddq_f32(vmulq_f32(self.0, scale1), vmulq_f32(end.0, scale2)),
-                    theta_sin,
-                ))
-            }
+            let scale1 = math::sin(theta * (1.0 - s));
+            let scale2 = math::sin(theta * s);
+            let theta_sin = math::sin(theta);
+            ((self * scale1) + (end * scale2)) * (1.0 / theta_sin)
         }
     }