From: Joshua Simmons
Date: Sun, 4 Sep 2022 21:01:33 +0000 (+0200)
Subject: Start implementing basic maths functions
X-Git-Url: https://git.nega.tv//gitweb.cgi?a=commitdiff_plain;h=b0181a7ae785b9dc0122ce81d8fd23130e88db47;p=josh%2Fnarcissus

Start implementing basic maths functions
---

diff --git a/Cargo.lock b/Cargo.lock
index d2584da..e21a220 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -16,6 +16,7 @@ dependencies = [
  "narcissus-app",
  "narcissus-core",
  "narcissus-gpu",
+ "narcissus-maths",
 ]
 
 [[package]]
@@ -44,7 +45,7 @@ dependencies = [
 ]
 
 [[package]]
-name = "narcissus-math"
+name = "narcissus-maths"
 version = "0.1.0"
 
 [[package]]
diff --git a/narcissus-maths/Cargo.toml b/narcissus-maths/Cargo.toml
index e993f6c..344b2d1 100644
--- a/narcissus-maths/Cargo.toml
+++ b/narcissus-maths/Cargo.toml
@@ -1,8 +1,10 @@
 [package]
-name = "narcissus-math"
+name = "narcissus-maths"
 version = "0.1.0"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+[features]
+
 [dependencies]
diff --git a/narcissus-maths/src/lib.rs b/narcissus-maths/src/lib.rs
index 77b7ef0..817616f 100644
--- a/narcissus-maths/src/lib.rs
+++ b/narcissus-maths/src/lib.rs
@@ -1,46 +1,1041 @@
-#[derive(Clone, Copy, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, PartialEq, PartialOrd, Default, Debug)]
 #[repr(C)]
 pub struct Vec2 {
-    x: f32,
-    y: f32,
+    pub x: f32,
+    pub y: f32,
 }
 
-#[derive(Clone, Copy, PartialEq, PartialOrd)]
+impl Vec2 {
+    pub const ZERO: Self = Self::splat(0.0);
+
+    pub const X: Self = Self::new(1.0, 0.0);
+    pub const Y: Self = Self::new(0.0, 1.0);
+
+    #[inline(always)]
+    pub const fn new(x: f32, y: f32) -> Self {
+        Self { x, y }
+    }
+
+    #[inline(always)]
+    pub const fn splat(value: f32) -> Self {
+        Self { x: value, y: value }
+    }
+
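+    // Note: the struct is repr(C) with consecutive f32 fields, so it shares
+    // size and layout with the corresponding f32 array; that is what makes
+    // the transmutes in `as_array`/`from_array` below (and the Vec3/Vec4
+    // equivalents) sound.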
+    #[inline(always)]
+    pub fn as_array(self) -> [f32; 2] {
+        unsafe { std::mem::transmute(self) }
+    }
+
+    #[inline(always)]
+    pub fn from_array(values: [f32; 2]) -> Self {
+        unsafe { std::mem::transmute(values) }
+    }
+
+    #[inline]
+    pub fn distance(a: Self, b: Self) -> f32 {
+        (a - b).length()
+    }
+
+    #[inline]
+    pub fn distance_sq(a: Self, b: Self) -> f32 {
+        (a - b).length_sq()
+    }
+
+    #[inline]
+    pub fn dot(a: Self, b: Self) -> f32 {
+        a.x * b.x + a.y * b.y
+    }
+
+    #[inline]
+    pub fn length(self) -> f32 {
+        self.length_sq().sqrt()
+    }
+
+    #[inline]
+    pub fn length_sq(self) -> f32 {
+        Self::dot(self, self)
+    }
+
+    #[inline]
+    pub fn ceil(self) -> Self {
+        Self {
+            x: self.x.ceil(),
+            y: self.y.ceil(),
+        }
+    }
+
+    #[inline]
+    pub fn floor(self) -> Self {
+        Self {
+            x: self.x.floor(),
+            y: self.y.floor(),
+        }
+    }
+
+    #[inline]
+    pub fn round(self) -> Self {
+        Self {
+            x: self.x.round(),
+            y: self.y.round(),
+        }
+    }
+}
+
+impl std::ops::Add for Vec2 {
+    type Output = Vec2;
+    #[inline]
+    fn add(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x + rhs.x,
+            y: self.y + rhs.y,
+        }
+    }
+}
+
+impl std::ops::Sub for Vec2 {
+    type Output = Vec2;
+    #[inline]
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x - rhs.x,
+            y: self.y - rhs.y,
+        }
+    }
+}
+
+impl std::ops::Mul for Vec2 {
+    type Output = Vec2;
+    #[inline]
+    fn mul(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x * rhs.x,
+            y: self.y * rhs.y,
+        }
+    }
+}
+
+impl std::ops::Div for Vec2 {
+    type Output = Vec2;
+    #[inline]
+    fn div(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x / rhs.x,
+            y: self.y / rhs.y,
+        }
+    }
+}
+
+impl std::ops::AddAssign for Vec2 {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        self.x += rhs.x;
+        self.y += rhs.y;
+    }
+}
+
+impl std::ops::SubAssign for Vec2 {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        self.x -= rhs.x;
+        self.y -= rhs.y;
+    }
+}
+
+impl std::ops::MulAssign for Vec2 {
+    #[inline]
+    fn mul_assign(&mut self, rhs: Self) {
+        self.x *= rhs.x;
+        self.y *= rhs.y;
+    }
+}
+
+impl std::ops::DivAssign for Vec2 {
+    #[inline]
+    fn div_assign(&mut self, rhs: Self) {
+        self.x /= rhs.x;
+        self.y /= rhs.y;
+    }
+}
+
+#[derive(Clone, Copy, PartialEq, PartialOrd, Default, Debug)]
 #[repr(C)]
 pub struct Vec3 {
-    x: f32,
-    y: f32,
-    z: f32,
+    pub x: f32,
+    pub y: f32,
+    pub z: f32,
 }
 
-#[derive(Clone, Copy, PartialEq, PartialOrd)]
+impl Vec3 {
+    pub const ZERO: Self = Self::splat(0.0);
+
+    pub const X: Self = Self::new(1.0, 0.0, 0.0);
+    pub const Y: Self = Self::new(0.0, 1.0, 0.0);
+    pub const Z: Self = Self::new(0.0, 0.0, 1.0);
+
+    #[inline(always)]
+    pub const fn new(x: f32, y: f32, z: f32) -> Self {
+        Self { x, y, z }
+    }
+
+    #[inline(always)]
+    pub const fn splat(value: f32) -> Self {
+        Self {
+            x: value,
+            y: value,
+            z: value,
+        }
+    }
+
+    #[inline(always)]
+    pub fn as_array(self) -> [f32; 3] {
+        unsafe { std::mem::transmute(self) }
+    }
+
+    #[inline(always)]
+    pub fn from_array(values: [f32; 3]) -> Self {
+        unsafe { std::mem::transmute(values) }
+    }
+
+    #[inline]
+    pub fn distance(a: Self, b: Self) -> f32 {
+        (a - b).length()
+    }
+
+    #[inline]
+    pub fn distance_sq(a: Self, b: Self) -> f32 {
+        (a - b).length_sq()
+    }
+
+    #[inline]
+    pub fn dot(a: Self, b: Self) -> f32 {
+        a.x * b.x + a.y * b.y + a.z * b.z
+    }
+
+    #[inline]
+    pub fn length(self) -> f32 {
+        self.length_sq().sqrt()
+    }
+
+    #[inline]
+    pub fn length_sq(self) -> f32 {
+        Self::dot(self, self)
+    }
+
+    #[inline]
+    pub fn ceil(self) -> Self {
+        Self {
+            x: self.x.ceil(),
+            y: self.y.ceil(),
+            z: self.z.ceil(),
+        }
+    }
+
+    #[inline]
+    pub fn floor(self) -> Self {
+        Self {
+            x: self.x.floor(),
+            y: self.y.floor(),
+            z: self.z.floor(),
+        }
+    }
+
+    #[inline]
+    pub fn round(self) -> Self {
+        Self {
+            x: self.x.round(),
+            y: self.y.round(),
+            z: self.z.round(),
+        }
+    }
+}
+
+impl std::ops::Add for Vec3 {
+    type Output = Vec3;
+    #[inline]
+    fn add(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x + rhs.x,
+            y: self.y + rhs.y,
+            z: self.z + rhs.z,
+        }
+    }
+}
+
+impl std::ops::Sub for Vec3 {
+    type Output = Vec3;
+    #[inline]
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x - rhs.x,
+            y: self.y - rhs.y,
+            z: self.z - rhs.z,
+        }
+    }
+}
+
+impl std::ops::Mul for Vec3 {
+    type Output = Vec3;
+    #[inline]
+    fn mul(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x * rhs.x,
+            y: self.y * rhs.y,
+            z: self.z * rhs.z,
+        }
+    }
+}
+
+impl std::ops::Div for Vec3 {
+    type Output = Vec3;
+    #[inline]
+    fn div(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x / rhs.x,
+            y: self.y / rhs.y,
+            z: self.z / rhs.z,
+        }
+    }
+}
+
+impl std::ops::AddAssign for Vec3 {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        self.x += rhs.x;
+        self.y += rhs.y;
+        self.z += rhs.z;
+    }
+}
+
+impl std::ops::SubAssign for Vec3 {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        self.x -= rhs.x;
+        self.y -= rhs.y;
+        self.z -= rhs.z;
+    }
+}
+
+impl std::ops::MulAssign for Vec3 {
+    #[inline]
+    fn mul_assign(&mut self, rhs: Self) {
+        self.x *= rhs.x;
+        self.y *= rhs.y;
+        self.z *= rhs.z;
+    }
+}
+
+impl std::ops::DivAssign for Vec3 {
+    #[inline]
+    fn div_assign(&mut self, rhs: Self) {
+        self.x /= rhs.x;
+        self.y /= rhs.y;
+        self.z /= rhs.z;
+    }
+}
+
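+// Vec4 matches the width of the SSE __m128 register, so its arithmetic
+// operators carry two implementations: a scalar fallback and an SSE2 path,
+// selected at compile time via cfg(target_feature = "sse2").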
+#[derive(Clone, Copy, PartialEq, PartialOrd, Default, Debug)]
 #[repr(C)]
 pub struct Vec4 {
-    x: f32,
-    y: f32,
-    z: f32,
-    w: f32,
+    pub x: f32,
+    pub y: f32,
+    pub z: f32,
+    pub w: f32,
 }
 
-#[derive(Clone, Copy, PartialEq)]
+impl Vec4 {
+    pub const ZERO: Self = Self::splat(0.0);
+
+    pub const X: Self = Self::new(1.0, 0.0, 0.0, 0.0);
+    pub const Y: Self = Self::new(0.0, 1.0, 0.0, 0.0);
+    pub const Z: Self = Self::new(0.0, 0.0, 1.0, 0.0);
+    pub const W: Self = Self::new(0.0, 0.0, 0.0, 1.0);
+
+    #[inline(always)]
+    pub const fn new(x: f32, y: f32, z: f32, w: f32) -> Self {
+        Self { x, y, z, w }
+    }
+
+    #[inline(always)]
+    pub const fn splat(value: f32) -> Self {
+        Self {
+            x: value,
+            y: value,
+            z: value,
+            w: value,
+        }
+    }
+
+    #[inline(always)]
+    pub fn as_array(self) -> [f32; 4] {
+        unsafe { std::mem::transmute(self) }
+    }
+
+    #[inline(always)]
+    pub fn from_array(values: [f32; 4]) -> Self {
+        unsafe { std::mem::transmute(values) }
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn as_m128(self) -> std::arch::x86_64::__m128 {
+        unsafe { std::arch::x86_64::_mm_loadu_ps(&self.x) }
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn from_m128(values: std::arch::x86_64::__m128) -> Self {
+        use std::arch::x86_64::_mm_storeu_ps;
+        let mut result = Vec4::ZERO;
+        unsafe { _mm_storeu_ps(&mut result.x, values) }
+        result
+    }
+
+    #[inline]
+    pub fn distance(a: Self, b: Self) -> f32 {
+        (a - b).length()
+    }
+
+    #[inline]
+    pub fn distance_sq(a: Self, b: Self) -> f32 {
+        (a - b).length_sq()
+    }
+
+    #[inline]
+    pub fn dot(a: Self, b: Self) -> f32 {
+        a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w
+    }
+
+    #[inline]
+    pub fn length(self) -> f32 {
+        self.length_sq().sqrt()
+    }
+
+    #[inline]
+    pub fn length_sq(self) -> f32 {
+        Self::dot(self, self)
+    }
+
+    #[inline]
+    pub fn ceil(self) -> Self {
+        Self {
+            x: self.x.ceil(),
+            y: self.y.ceil(),
+            z: self.z.ceil(),
+            w: self.w.ceil(),
+        }
+    }
+
+    #[inline]
+    pub fn floor(self) -> Self {
+        Self {
+            x: self.x.floor(),
+            y: self.y.floor(),
+            z: self.z.floor(),
+            w: self.w.floor(),
+        }
+    }
+
+    #[inline]
+    pub fn round(self) -> Self {
+        Self {
+            x: self.x.round(),
+            y: self.y.round(),
+            z: self.z.round(),
+            w: self.w.round(),
+        }
+    }
+}
+
+impl std::ops::Add for Vec4 {
+    type Output = Vec4;
+
+    #[cfg(not(target_feature = "sse2"))]
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x + rhs.x,
+            y: self.y + rhs.y,
+            z: self.z + rhs.z,
+            w: self.w + rhs.w,
+        }
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        unsafe {
+            use std::arch::x86_64::_mm_add_ps;
+            Vec4::from_m128(_mm_add_ps(self.as_m128(), rhs.as_m128()))
+        }
+    }
+}
+
+impl std::ops::Sub for Vec4 {
+    type Output = Vec4;
+
+    #[cfg(not(target_feature = "sse2"))]
+    #[inline(always)]
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x - rhs.x,
+            y: self.y - rhs.y,
+            z: self.z - rhs.z,
+            w: self.w - rhs.w,
+        }
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn sub(self, rhs: Self) -> Self::Output {
+        unsafe {
+            use std::arch::x86_64::_mm_sub_ps;
+            Vec4::from_m128(_mm_sub_ps(self.as_m128(), rhs.as_m128()))
+        }
+    }
+}
+
+impl std::ops::Mul for Vec4 {
+    type Output = Vec4;
+
+    #[cfg(not(target_feature = "sse2"))]
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x * rhs.x,
+            y: self.y * rhs.y,
+            z: self.z * rhs.z,
+            w: self.w * rhs.w,
+        }
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self::Output {
+        unsafe {
+            use std::arch::x86_64::_mm_mul_ps;
+            Vec4::from_m128(_mm_mul_ps(self.as_m128(), rhs.as_m128()))
+        }
+    }
+}
+
+impl std::ops::Div for Vec4 {
+    type Output = Vec4;
+
+    #[cfg(not(target_feature = "sse2"))]
+    #[inline(always)]
+    fn div(self, rhs: Self) -> Self::Output {
+        Self::Output {
+            x: self.x / rhs.x,
+            y: self.y / rhs.y,
+            z: self.z / rhs.z,
+            w: self.w / rhs.w,
+        }
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn div(self, rhs: Self) -> Self::Output {
+        unsafe {
+            use std::arch::x86_64::_mm_div_ps;
+            Vec4::from_m128(_mm_div_ps(self.as_m128(), rhs.as_m128()))
+        }
+    }
+}
+
+impl std::ops::AddAssign for Vec4 {
+    #[cfg(not(target_feature = "sse2"))]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        self.x += rhs.x;
+        self.y += rhs.y;
+        self.z += rhs.z;
+        self.w += rhs.w;
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        use std::arch::x86_64::_mm_add_ps;
+        unsafe {
+            *self = Vec4::from_m128(_mm_add_ps(self.as_m128(), rhs.as_m128()));
+        }
+    }
+}
+
+impl std::ops::SubAssign for Vec4 {
+    #[cfg(not(target_feature = "sse2"))]
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: Self) {
+        self.x -= rhs.x;
+        self.y -= rhs.y;
+        self.z -= rhs.z;
+        self.w -= rhs.w;
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: Self) {
+        unsafe {
+            *self = Vec4::from_m128(std::arch::x86_64::_mm_sub_ps(self.as_m128(), rhs.as_m128()));
+        }
+    }
+}
+
+impl std::ops::MulAssign for Vec4 {
+    #[cfg(not(target_feature = "sse2"))]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        self.x *= rhs.x;
+        self.y *= rhs.y;
+        self.z *= rhs.z;
+        self.w *= rhs.w;
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        unsafe {
+            *self = Vec4::from_m128(std::arch::x86_64::_mm_mul_ps(self.as_m128(), rhs.as_m128()));
+        }
+    }
+}
+
+impl std::ops::DivAssign for Vec4 {
+    #[cfg(not(target_feature = "sse2"))]
+    #[inline(always)]
+    fn div_assign(&mut self, rhs: Self) {
+        self.x /= rhs.x;
+        self.y /= rhs.y;
+        self.z /= rhs.z;
+        self.w /= rhs.w;
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn div_assign(&mut self, rhs: Self) {
+        unsafe {
+            *self = Vec4::from_m128(std::arch::x86_64::_mm_div_ps(self.as_m128(), rhs.as_m128()));
+        }
+    }
+}
+
+#[derive(Clone, Copy, PartialEq, Debug)]
 #[repr(C)]
 pub struct Quat {
-    a: f32,
-    b: f32,
-    c: f32,
-    d: f32,
+    pub a: f32,
+    pub b: f32,
+    pub c: f32,
+    pub d: f32,
 }
 
 #[derive(Clone, Copy, PartialEq)]
 #[repr(C)]
-pub struct Mat44([f32; 16]);
+pub struct Mat4(pub [f32; 16]);
+
+impl std::fmt::Debug for Mat4 {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str("Mat4 [")?;
+        if f.alternate() {
+            writeln!(f)?;
+            for row in self.as_rows_array() {
+                f.write_str("\t")?;
+                for value in row {
+                    f.write_fmt(format_args!("{value}, "))?;
+                }
+                f.write_str("\n")?;
+            }
+        } else {
+            for value in &self.0[..15] {
+                f.write_fmt(format_args!("{value}, "))?;
+            }
+            f.write_fmt(format_args!("{}", self.0[15]))?;
+        }
+        f.write_str("]")
+    }
+}
+
+impl Mat4 {
+    pub const ZERO: Mat4 = Mat4::from_rows_array([
+        [0.0, 0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0, 0.0],
+    ]);
+
+    pub const IDENTITY: Mat4 = Mat4::from_rows_array([
+        [1.0, 0.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0, 0.0],
+        [0.0, 0.0, 1.0, 0.0],
+        [0.0, 0.0, 0.0, 1.0],
+    ]);
+
+    #[inline(always)]
+    pub fn as_rows_array(&self) -> &[[f32; 4]; 4] {
+        unsafe { std::mem::transmute(&self.0) }
+    }
+
+    #[inline(always)]
+    pub fn as_rows_array_mut(&mut self) -> &mut [[f32; 4]; 4] {
+        unsafe { std::mem::transmute(&mut self.0) }
+    }
+
+    #[inline(always)]
+    pub const fn from_rows_array(rows: [[f32; 4]; 4]) -> Self {
+        unsafe { std::mem::transmute(rows) }
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn as_m128_array(&self) -> [std::arch::x86_64::__m128; 4] {
+        use std::arch::x86_64::_mm_loadu_ps;
+        unsafe {
+            [
+                _mm_loadu_ps(&self.0[0x0]),
+                _mm_loadu_ps(&self.0[0x4]),
+                _mm_loadu_ps(&self.0[0x8]),
+                _mm_loadu_ps(&self.0[0xc]),
+            ]
+        }
+    }
+
+    #[cfg(target_feature = "sse2")]
+    #[inline(always)]
+    fn from_m128_array(values: [std::arch::x86_64::__m128; 4]) -> Self {
+        use std::arch::x86_64::_mm_storeu_ps;
+
+        let mut result = Mat4::IDENTITY;
+        unsafe {
+            _mm_storeu_ps(&mut result.0[0x0], values[0]);
+            _mm_storeu_ps(&mut result.0[0x4], values[1]);
+            _mm_storeu_ps(&mut result.0[0x8], values[2]);
+            _mm_storeu_ps(&mut result.0[0xc], values[3]);
+        }
+        result
+    }
+
+    pub fn from_scale(scale: Vec3) -> Mat4 {
+        Mat4::from_rows_array([
+            [scale.x, 0.0, 0.0, 0.0],
+            [0.0, scale.y, 0.0, 0.0],
+            [0.0, 0.0, scale.z, 0.0],
+            [0.0, 0.0, 0.0, 1.0],
+        ])
+    }
+
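+    // Transpose comes in two flavours: a scalar shuffle of the row-major
+    // array and an SSE2 version built on _MM_TRANSPOSE4_PS; `transpose`
+    // selects between them at compile time.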
+    #[allow(dead_code)]
+    #[inline(always)]
+    fn transpose_base(self) -> Mat4 {
+        let m = &self.0;
+        Mat4::from_rows_array([
+            [m[0x0], m[0x4], m[0x8], m[0xc]],
+            [m[0x1], m[0x5], m[0x9], m[0xd]],
+            [m[0x2], m[0x6], m[0xa], m[0xe]],
+            [m[0x3], m[0x7], m[0xb], m[0xf]],
+        ])
+    }
+
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn transpose_sse2(self) -> Mat4 {
+        use std::arch::x86_64::_MM_TRANSPOSE4_PS;
+        let [mut row0, mut row1, mut row2, mut row3] = self.as_m128_array();
+        _MM_TRANSPOSE4_PS(&mut row0, &mut row1, &mut row2, &mut row3);
+        Mat4::from_m128_array([row0, row1, row2, row3])
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn transpose(self) -> Mat4 {
+        #[cfg(not(target_feature = "sse2"))]
+        {
+            self.transpose_base()
+        }
+        #[cfg(target_feature = "sse2")]
+        unsafe {
+            self.transpose_sse2()
+        }
+    }
+
+    #[must_use]
+    #[inline]
+    pub fn mul_vec3(&self, vec: Vec3) -> Vec3 {
+        let vec = Vec4::new(vec.x, vec.y, vec.z, 1.0);
+        let vec = self.mul_vec4(vec);
+        Vec3::new(vec.x, vec.y, vec.z)
+    }
+
+    #[inline(always)]
+    fn mul_vec4_base(&self, vec: Vec4) -> Vec4 {
+        let rows = self.as_rows_array();
+        Vec4::new(
+            Vec4::dot(Vec4::from_array(rows[0]), vec),
+            Vec4::dot(Vec4::from_array(rows[1]), vec),
+            Vec4::dot(Vec4::from_array(rows[2]), vec),
+            Vec4::dot(Vec4::from_array(rows[3]), vec),
+        )
+    }
+
+    #[allow(dead_code)]
+    #[inline]
+    #[target_feature(enable = "sse4.1")]
+    unsafe fn mul_vec4_sse41(&self, vec: Vec4) -> Vec4 {
+        use std::arch::x86_64::{_mm_hadd_ps, _mm_mul_ps};
+
+        let vec = vec.as_m128();
+        let rows = self.as_m128_array();
+
+        let values = _mm_hadd_ps(
+            _mm_hadd_ps(_mm_mul_ps(rows[0], vec), _mm_mul_ps(rows[1], vec)),
+            _mm_hadd_ps(_mm_mul_ps(rows[2], vec), _mm_mul_ps(rows[3], vec)),
+        );
+
+        Vec4::from_m128(values)
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn mul_vec4(&self, vec: Vec4) -> Vec4 {
+        #[cfg(not(target_feature = "sse4.1"))]
+        {
+            self.mul_vec4_base(vec)
+        }
+
+        #[cfg(target_feature = "sse4.1")]
+        unsafe {
+            self.mul_vec4_sse41(vec)
+        }
+    }
+
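+    // Matrix product: row i of the result is a linear combination of the four
+    // rows of `rhs`, weighted by the elements of row i of `self`. The SIMD
+    // variants below broadcast one lhs element across a register with a
+    // shuffle and accumulate with multiplies and adds; the AVX2 version
+    // computes two result rows at a time.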
+    #[allow(dead_code)]
+    #[inline(always)]
+    fn mul_mat4_base(self: &Mat4, rhs: Mat4) -> Mat4 {
+        let mut result = Mat4::IDENTITY;
+        {
+            let result = result.as_rows_array_mut();
+            let lhs = self.as_rows_array();
+            let rhs = rhs.as_rows_array();
+            for i in 0..4 {
+                for j in 0..4 {
+                    result[i][j] = lhs[i][0] * rhs[0][j]
+                        + lhs[i][1] * rhs[1][j]
+                        + lhs[i][2] * rhs[2][j]
+                        + lhs[i][3] * rhs[3][j];
+                }
+            }
+        }
+        result
+    }
+
+    #[allow(dead_code)]
+    #[inline]
+    #[target_feature(enable = "sse2")]
+    unsafe fn mul_mat4_sse2(&self, rhs: Mat4) -> Mat4 {
+        use std::arch::x86_64::{__m128, _mm_add_ps, _mm_mul_ps, _mm_shuffle_ps};
+
+        #[inline(always)]
+        fn linear_combine(a: __m128, mat: &[__m128; 4]) -> __m128 {
+            unsafe {
+                let r = _mm_mul_ps(_mm_shuffle_ps(a, a, 0x00), mat[0]);
+                let r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, 0x55), mat[1]));
+                let r = _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, 0xaa), mat[2]));
+                _mm_add_ps(r, _mm_mul_ps(_mm_shuffle_ps(a, a, 0xff), mat[3]))
+            }
+        }
+
+        let lhs = self.as_m128_array();
+        let rhs = rhs.as_m128_array();
+
+        let x0 = linear_combine(lhs[0], &rhs);
+        let x1 = linear_combine(lhs[1], &rhs);
+        let x2 = linear_combine(lhs[2], &rhs);
+        let x3 = linear_combine(lhs[3], &rhs);
+
+        Mat4::from_m128_array([x0, x1, x2, x3])
+    }
+
+    #[allow(dead_code)]
+    #[inline]
+    #[target_feature(enable = "avx2")]
+    unsafe fn mul_mat4_avx2(&self, rhs: Mat4) -> Mat4 {
+        use std::arch::x86_64::{
+            __m128, __m256, _mm256_add_ps, _mm256_broadcast_ps, _mm256_loadu_ps, _mm256_mul_ps,
+            _mm256_shuffle_ps, _mm256_storeu_ps, _mm256_zeroupper,
+        };
+
+        #[inline(always)]
+        unsafe fn two_linear_combine(a: __m256, mat: &[__m128; 4]) -> __m256 {
+            let r = _mm256_mul_ps(_mm256_shuffle_ps(a, a, 0x00), _mm256_broadcast_ps(&mat[0]));
+            let r = _mm256_add_ps(
+                r,
+                _mm256_mul_ps(_mm256_shuffle_ps(a, a, 0x55), _mm256_broadcast_ps(&mat[1])),
+            );
+            let r = _mm256_add_ps(
+                r,
+                _mm256_mul_ps(_mm256_shuffle_ps(a, a, 0xaa), _mm256_broadcast_ps(&mat[2])),
+            );
+            _mm256_add_ps(
+                r,
+                _mm256_mul_ps(_mm256_shuffle_ps(a, a, 0xff), _mm256_broadcast_ps(&mat[3])),
+            )
+        }
+
+        _mm256_zeroupper();
+
+        let a0 = _mm256_loadu_ps(&self.0[0]);
+        let a1 = _mm256_loadu_ps(&self.0[8]);
+        let rhs = rhs.as_m128_array();
+
+        let x0 = two_linear_combine(a0, &rhs);
+        let x1 = two_linear_combine(a1, &rhs);
+
+        let mut result = Mat4::IDENTITY;
+        _mm256_storeu_ps(&mut result.0[0], x0);
+        _mm256_storeu_ps(&mut result.0[8], x1);
+        result
+    }
+}
+
+impl std::ops::Mul for Mat4 {
+    type Output = Mat4;
+
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self::Output {
+        #[cfg(not(target_feature = "sse2"))]
+        {
+            self.mul_mat4_base(rhs)
+        }
+        #[cfg(all(target_feature = "sse2", not(target_feature = "avx2")))]
+        unsafe {
+            self.mul_mat4_sse2(rhs)
+        }
+        #[cfg(target_feature = "avx2")]
+        unsafe {
+            self.mul_mat4_avx2(rhs)
+        }
+    }
+}
+
+impl std::ops::MulAssign for Mat4 {
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = *self * rhs
+    }
+}
+
+impl std::ops::Mul<Vec4> for Mat4 {
+    type Output = Vec4;
+
+    #[inline(always)]
+    fn mul(self, rhs: Vec4) -> Self::Output {
+        self.mul_vec4(rhs)
+    }
+}
+
+impl std::ops::Mul<Vec3> for Mat4 {
+    type Output = Vec3;
+
+    #[inline(always)]
+    fn mul(self, rhs: Vec3) -> Self::Output {
+        self.mul_vec3(rhs)
+    }
+}
 
 #[derive(Clone, Copy, PartialEq)]
 #[repr(C)]
-pub struct Mat43([f32; 12]);
+pub struct Mat43(pub [f32; 12]);
 
 #[cfg(test)]
 mod tests {
+    use super::*;
+
     #[test]
-    fn it_works() {}
+    fn basic_vec4() {
+        assert_eq!(
+            Vec4::new(1.0, 2.0, 3.0, 4.0) + Vec4::new(4.0, 3.0, 2.0, 1.0),
+            Vec4::splat(5.0)
+        );
+        assert_eq!(
+            Vec4::new(4.0, 3.0, 2.0, 1.0) - Vec4::new(3.0, 2.0, 1.0, 0.0),
+            Vec4::splat(1.0)
+        );
+        assert_eq!(
+            Vec4::new(1.0, 2.0, 3.0, 4.0) * Vec4::new(4.0, 3.0, 2.0, 1.0),
+            Vec4::new(4.0, 6.0, 6.0, 4.0)
+        );
+        assert_eq!(
+            Vec4::new(1.0, 2.0, 3.0, 4.0) / Vec4::splat(2.0),
+            Vec4::new(0.5, 1.0, 1.5, 2.0)
+        );
+
+        assert_eq!(Vec4::new(2.0, 2.0, 2.0, 2.0).length_sq(), 16.0);
+        assert_eq!(Vec4::new(2.0, 2.0, 2.0, 2.0).length(), 4.0);
+
+        assert_eq!(Vec4::distance_sq(Vec4::splat(-1.0), Vec4::splat(1.0)), 16.0);
+        assert_eq!(Vec4::distance(Vec4::splat(-1.0), Vec4::splat(1.0)), 4.0);
+    }
+
+    #[test]
+    fn mat4_basics() {
+        assert_eq!(Mat4::IDENTITY.transpose(), Mat4::IDENTITY);
+        #[rustfmt::skip]
+        let x = Mat4([
+            1.0, 2.0, 3.0, 4.0,
+            5.0, 6.0, 7.0, 8.0,
+            9.0, 10.0, 11.0, 12.0,
+            13.0, 14.0, 15.0, 16.0,
+        ]);
+        #[rustfmt::skip]
+        let y = Mat4([
+            1.0, 5.0, 9.0, 13.0,
+            2.0, 6.0, 10.0, 14.0,
+            3.0, 7.0, 11.0, 15.0,
+            4.0, 8.0, 12.0, 16.0,
+        ]);
+        assert_eq!(x.transpose(), y);
+        assert_eq!(x.transpose().transpose(), x);
+    }
+
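+    // The #[target_feature] helpers are unsafe to call directly, so the SIMD
+    // paths are only exercised behind runtime is_x86_feature_detected! checks.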
+    #[test]
+    fn mat4_multiply() {
+        assert_eq!(Mat4::IDENTITY.mul_mat4_base(Mat4::IDENTITY), Mat4::IDENTITY);
+
+        if std::is_x86_feature_detected!("sse2") {
+            assert_eq!(
+                unsafe { Mat4::IDENTITY.mul_mat4_sse2(Mat4::IDENTITY) },
+                Mat4::IDENTITY
+            );
+        }
+
+        if std::is_x86_feature_detected!("avx2") {
+            assert_eq!(
+                unsafe { Mat4::IDENTITY.mul_mat4_avx2(Mat4::IDENTITY) },
+                Mat4::IDENTITY
+            );
+        }
+
+        assert_eq!(Mat4::IDENTITY * Mat4::IDENTITY, Mat4::IDENTITY);
+
+        let scale = Mat4::from_scale(Vec3::splat(2.0));
+        assert_eq!(scale * Mat4::IDENTITY, scale);
+    }
+
+    #[test]
+    fn mat4_mul_vec4() {
+        assert_eq!(Mat4::IDENTITY * Vec4::ZERO, Vec4::ZERO);
+
+        assert_eq!(
+            Mat4::IDENTITY.mul_vec4_base(Vec4::new(1.0, 2.0, 3.0, 4.0)),
+            Vec4::new(1.0, 2.0, 3.0, 4.0)
+        );
+
+        if std::is_x86_feature_detected!("sse4.1") {
+            assert_eq!(
+                unsafe { Mat4::IDENTITY.mul_vec4_sse41(Vec4::new(1.0, 2.0, 3.0, 4.0)) },
+                Vec4::new(1.0, 2.0, 3.0, 4.0)
+            );
+        }
+
+        let scale = Mat4::from_scale(Vec3::splat(2.0));
+        assert_eq!(scale.mul_vec3(Vec3::splat(1.0)), Vec3::splat(2.0));
+    }
 }