Fix naive 4x4 matrix multiplication C++ implementation

Is not visible on any of the officially platforms, as everywhere SSE2 is available (on Apple Silicon via sse2neon). Only got noticed by some intermittent issue during development which made BLI_HAVE_SSE2 unaccessible. Seems that transpose was done a bit wrong. Not sure if worth trying to fold the equation into C++ types, as that requires extra memory transfers for transpose. Opted for a more naive folding, which avoids extra copies. Added a regression test for it, verified against numpy, the BLI SSE2 implementation.
2023-07-03 18:26:02 +02:00
parent baeb314eb7
commit 4cd10ddfdb
2 changed files with 49 additions and 5 deletions
--- a/source/blender/blenlib/intern/math_matrix.cc
+++ b/source/blender/blenlib/intern/math_matrix.cc
@@ -43,13 +43,27 @@ template<> float4x4 float4x4::operator*(const float4x4 &b) const
    _mm_store_ps(result[i], sum);
  }
 #else
-  const float4x4 T = transpose(b);
+  result[0][0] = b[0][0] * a[0][0] + b[0][1] * a[1][0] + b[0][2] * a[2][0] + b[0][3] * a[3][0];
+  result[0][1] = b[0][0] * a[0][1] + b[0][1] * a[1][1] + b[0][2] * a[2][1] + b[0][3] * a[3][1];
+  result[0][2] = b[0][0] * a[0][2] + b[0][1] * a[1][2] + b[0][2] * a[2][2] + b[0][3] * a[3][2];
+  result[0][3] = b[0][0] * a[0][3] + b[0][1] * a[1][3] + b[0][2] * a[2][3] + b[0][3] * a[3][3];

-  result.x = float4(dot(a.x, T.x), dot(a.x, T.y), dot(a.x, T.z), dot(a.x, T.w));
-  result.y = float4(dot(a.y, T.x), dot(a.y, T.y), dot(a.y, T.z), dot(a.y, T.w));
-  result.z = float4(dot(a.z, T.x), dot(a.z, T.y), dot(a.z, T.z), dot(a.z, T.w));
-  result.w = float4(dot(a.w, T.x), dot(a.w, T.y), dot(a.w, T.z), dot(a.w, T.w));
+  result[1][0] = b[1][0] * a[0][0] + b[1][1] * a[1][0] + b[1][2] * a[2][0] + b[1][3] * a[3][0];
+  result[1][1] = b[1][0] * a[0][1] + b[1][1] * a[1][1] + b[1][2] * a[2][1] + b[1][3] * a[3][1];
+  result[1][2] = b[1][0] * a[0][2] + b[1][1] * a[1][2] + b[1][2] * a[2][2] + b[1][3] * a[3][2];
+  result[1][3] = b[1][0] * a[0][3] + b[1][1] * a[1][3] + b[1][2] * a[2][3] + b[1][3] * a[3][3];
+
+  result[2][0] = b[2][0] * a[0][0] + b[2][1] * a[1][0] + b[2][2] * a[2][0] + b[2][3] * a[3][0];
+  result[2][1] = b[2][0] * a[0][1] + b[2][1] * a[1][1] + b[2][2] * a[2][1] + b[2][3] * a[3][1];
+  result[2][2] = b[2][0] * a[0][2] + b[2][1] * a[1][2] + b[2][2] * a[2][2] + b[2][3] * a[3][2];
+  result[2][3] = b[2][0] * a[0][3] + b[2][1] * a[1][3] + b[2][2] * a[2][3] + b[2][3] * a[3][3];
+
+  result[3][0] = b[3][0] * a[0][0] + b[3][1] * a[1][0] + b[3][2] * a[2][0] + b[3][3] * a[3][0];
+  result[3][1] = b[3][0] * a[0][1] + b[3][1] * a[1][1] + b[3][2] * a[2][1] + b[3][3] * a[3][1];
+  result[3][2] = b[3][0] * a[0][2] + b[3][1] * a[1][2] + b[3][2] * a[2][2] + b[3][3] * a[3][2];
+  result[3][3] = b[3][0] * a[0][3] + b[3][1] * a[1][3] + b[3][2] * a[2][3] + b[3][3] * a[3][3];
 #endif
+
  return result;
 }

--- a/source/blender/blenlib/tests/BLI_math_matrix_test.cc
+++ b/source/blender/blenlib/tests/BLI_math_matrix_test.cc
@@ -328,6 +328,36 @@ TEST(math_matrix, MatrixCompareTest)
  EXPECT_FALSE(is_negative(m6));
 }

+TEST(math_matrix, MatrixMultiply)
+{
+
+  {
+    const float4x4 matrix_a = {
+        {1.0, 2.0, 3.0, 4.0},
+        {5.0, 6.0, 7.0, 8.0},
+        {9.0, 10.0, 11.0, 12.0},
+        {13.0, 14.0, 15.0, 16.0},
+    };
+    const float4x4 matrix_b = {
+        {0.1f, 0.2f, 0.3f, 0.4f},
+        {0.5f, 0.6f, 0.7f, 0.8f},
+        {0.9f, 1.0f, 1.1f, 1.2f},
+        {1.3f, 1.4f, 1.5f, 1.6f},
+    };
+
+    const float4x4 expected = {
+        {9.0f, 10.0f, 11.0f, 12.0f},
+        {20.2f, 22.8f, 25.4f, 28.0f},
+        {31.4f, 35.6f, 39.8f, 44.0f},
+        {42.6f, 48.4f, 54.2f, 60.0f},
+    };
+
+    const float4x4 result = matrix_a * matrix_b;
+
+    EXPECT_M4_NEAR(result, expected, 1e-5f);
+  }
+}
+
 TEST(math_matrix, MatrixToNearestEuler)
 {
  EulerXYZ eul1 = EulerXYZ(225.08542, -1.12485, -121.23738);