From 4cd10ddfdb1e5a1c7320162d0a3e341b166b598d Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey@blender.org>
Date: Mon, 3 Jul 2023 18:26:02 +0200
Subject: [PATCH] Fix naive 4x4 matrix multiplication C++ implementation

Is not visible on any of the officially platforms, as everywhere
SSE2 is available (on Apple Silicon via sse2neon).

Only got noticed by some intermittent issue during development
which made BLI_HAVE_SSE2 unaccessible.

Seems that transpose was done a bit wrong. Not sure if worth trying
to fold the equation into C++ types, as that requires extra memory
transfers for transpose. Opted for a more naive folding, which
avoids extra copies.

Added a regression test for it, verified against numpy, the BLI
SSE2 implementation.
---
 source/blender/blenlib/intern/math_matrix.cc  | 24 +++++++++++----
 .../blenlib/tests/BLI_math_matrix_test.cc     | 30 +++++++++++++++++++
 2 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/source/blender/blenlib/intern/math_matrix.cc b/source/blender/blenlib/intern/math_matrix.cc
index f590bbb726b..349b9d70035 100644
--- a/source/blender/blenlib/intern/math_matrix.cc
+++ b/source/blender/blenlib/intern/math_matrix.cc
@@ -43,13 +43,27 @@ template<> float4x4 float4x4::operator*(const float4x4 &b) const
     _mm_store_ps(result[i], sum);
   }
 #else
-  const float4x4 T = transpose(b);
+  result[0][0] = b[0][0] * a[0][0] + b[0][1] * a[1][0] + b[0][2] * a[2][0] + b[0][3] * a[3][0];
+  result[0][1] = b[0][0] * a[0][1] + b[0][1] * a[1][1] + b[0][2] * a[2][1] + b[0][3] * a[3][1];
+  result[0][2] = b[0][0] * a[0][2] + b[0][1] * a[1][2] + b[0][2] * a[2][2] + b[0][3] * a[3][2];
+  result[0][3] = b[0][0] * a[0][3] + b[0][1] * a[1][3] + b[0][2] * a[2][3] + b[0][3] * a[3][3];
 
-  result.x = float4(dot(a.x, T.x), dot(a.x, T.y), dot(a.x, T.z), dot(a.x, T.w));
-  result.y = float4(dot(a.y, T.x), dot(a.y, T.y), dot(a.y, T.z), dot(a.y, T.w));
-  result.z = float4(dot(a.z, T.x), dot(a.z, T.y), dot(a.z, T.z), dot(a.z, T.w));
-  result.w = float4(dot(a.w, T.x), dot(a.w, T.y), dot(a.w, T.z), dot(a.w, T.w));
+  result[1][0] = b[1][0] * a[0][0] + b[1][1] * a[1][0] + b[1][2] * a[2][0] + b[1][3] * a[3][0];
+  result[1][1] = b[1][0] * a[0][1] + b[1][1] * a[1][1] + b[1][2] * a[2][1] + b[1][3] * a[3][1];
+  result[1][2] = b[1][0] * a[0][2] + b[1][1] * a[1][2] + b[1][2] * a[2][2] + b[1][3] * a[3][2];
+  result[1][3] = b[1][0] * a[0][3] + b[1][1] * a[1][3] + b[1][2] * a[2][3] + b[1][3] * a[3][3];
+
+  result[2][0] = b[2][0] * a[0][0] + b[2][1] * a[1][0] + b[2][2] * a[2][0] + b[2][3] * a[3][0];
+  result[2][1] = b[2][0] * a[0][1] + b[2][1] * a[1][1] + b[2][2] * a[2][1] + b[2][3] * a[3][1];
+  result[2][2] = b[2][0] * a[0][2] + b[2][1] * a[1][2] + b[2][2] * a[2][2] + b[2][3] * a[3][2];
+  result[2][3] = b[2][0] * a[0][3] + b[2][1] * a[1][3] + b[2][2] * a[2][3] + b[2][3] * a[3][3];
+
+  result[3][0] = b[3][0] * a[0][0] + b[3][1] * a[1][0] + b[3][2] * a[2][0] + b[3][3] * a[3][0];
+  result[3][1] = b[3][0] * a[0][1] + b[3][1] * a[1][1] + b[3][2] * a[2][1] + b[3][3] * a[3][1];
+  result[3][2] = b[3][0] * a[0][2] + b[3][1] * a[1][2] + b[3][2] * a[2][2] + b[3][3] * a[3][2];
+  result[3][3] = b[3][0] * a[0][3] + b[3][1] * a[1][3] + b[3][2] * a[2][3] + b[3][3] * a[3][3];
 #endif
+
   return result;
 }
 
diff --git a/source/blender/blenlib/tests/BLI_math_matrix_test.cc b/source/blender/blenlib/tests/BLI_math_matrix_test.cc
index cc7e7c00766..4ca51c3c38b 100644
--- a/source/blender/blenlib/tests/BLI_math_matrix_test.cc
+++ b/source/blender/blenlib/tests/BLI_math_matrix_test.cc
@@ -328,6 +328,36 @@ TEST(math_matrix, MatrixCompareTest)
   EXPECT_FALSE(is_negative(m6));
 }
 
+TEST(math_matrix, MatrixMultiply)
+{
+
+  {
+    const float4x4 matrix_a = {
+        {1.0, 2.0, 3.0, 4.0},
+        {5.0, 6.0, 7.0, 8.0},
+        {9.0, 10.0, 11.0, 12.0},
+        {13.0, 14.0, 15.0, 16.0},
+    };
+    const float4x4 matrix_b = {
+        {0.1f, 0.2f, 0.3f, 0.4f},
+        {0.5f, 0.6f, 0.7f, 0.8f},
+        {0.9f, 1.0f, 1.1f, 1.2f},
+        {1.3f, 1.4f, 1.5f, 1.6f},
+    };
+
+    const float4x4 expected = {
+        {9.0f, 10.0f, 11.0f, 12.0f},
+        {20.2f, 22.8f, 25.4f, 28.0f},
+        {31.4f, 35.6f, 39.8f, 44.0f},
+        {42.6f, 48.4f, 54.2f, 60.0f},
+    };
+
+    const float4x4 result = matrix_a * matrix_b;
+
+    EXPECT_M4_NEAR(result, expected, 1e-5f);
+  }
+}
+
 TEST(math_matrix, MatrixToNearestEuler)
 {
   EulerXYZ eul1 = EulerXYZ(225.08542, -1.12485, -121.23738);