From 806b0e837940d6a943e826bb804c3d3f615fc0d2 Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius <aras@nesnausk.org>
Date: Mon, 16 Sep 2024 13:06:16 +0200
Subject: [PATCH] BLI: improve 2/3/4d vector codegen for debug or
 asserts-enabled builds

The majority of math operations on VecBase<> were implemented by calling into
the indexing operator, sometimes coupled with the unroll<Size> template.

When compiler optimizations are off (e.g. Debug build), or when asserts are on
(e.g. usual "developer" setup), this resulted in codegen that is very
sub-optimal, which is especially noticeable when these vector types are used
a lot, e.g. when scaling down a screenshot for saving as a thumbnail into the
blend file.

Address that with explicit code paths for 4, 3 and 2 dimensional vectors that
avoid both the unroll<> template and the indexing operator. To avoid repeated,
long, typo-prone code, do that with the C preprocessor :( -- however all of the
preprocessor innards are in a separate file BLI_math_vector_unroll.hh so they
do not get in the way much.

Time to scale down a screenshot to the blend file thumbnail while saving the
blend file. This involves two calls to IMB_scale, which uses float4 for pixel
operations. Measured on my machine (4K screen resolution, Ryzen 5950X, VS2022
build):

- Release with asserts off (what ships to users): no change at 9.4ms
- Release with asserts on ("developer" setup): 38.1ms -> 9.4ms
- Debug: 226ms -> 64ms
- Debug w/ ASAN: 314ms -> 78ms

Pull Request: https://projects.blender.org/blender/blender/pulls/127577
---
 source/blender/blenlib/BLI_math_vector.hh     | 103 ++-------
 .../blender/blenlib/BLI_math_vector_types.hh  | 211 +++++++-----------
 .../blender/blenlib/BLI_math_vector_unroll.hh | 209 +++++++++++++++++
 source/blender/blenlib/CMakeLists.txt         |   1 +
 4 files changed, 307 insertions(+), 217 deletions(-)
 create mode 100644 source/blender/blenlib/BLI_math_vector_unroll.hh
diff --git a/source/blender/blenlib/BLI_math_vector.hh b/source/blender/blenlib/BLI_math_vector.hh
index 24201f3ea36..545f286f3ef 100644
--- a/source/blender/blenlib/BLI_math_vector.hh
+++ b/source/blender/blenlib/BLI_math_vector.hh
@@ -51,31 +51,19 @@ template<typename T, int Size> [[nodiscard]] inline VecBase<T, Size> abs(const V
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> sign(const VecBase<T, Size> &a)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::sign(a[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::sign, a);
 }
 
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> min(const VecBase<T, Size> &a, const VecBase<T, Size> &b)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = a[i] < b[i] ? a[i] : b[i];
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_FUNC_VEC_VEC(math::min, a, b);
 }
 
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> max(const VecBase<T, Size> &a, const VecBase<T, Size> &b)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = a[i] > b[i] ? a[i] : b[i];
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_FUNC_VEC_VEC(math::max, a, b);
 }
 
 template<typename T, int Size>
@@ -104,11 +92,7 @@ template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> step(const VecBase<T, Size> &edge,
                                            const VecBase<T, Size> &value)
 {
-  VecBase<T, Size> result = value;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::step(edge[i], result[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_FUNC_VEC_VEC(math::step, edge, value);
 }
 
 template<typename T, int Size>
@@ -124,12 +108,7 @@ template<typename T, int Size>
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> mod(const VecBase<T, Size> &a, const VecBase<T, Size> &b)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    BLI_assert(b[i] != 0);
-    result[i] = math::mod(a[i], b[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_FUNC_VEC_VEC(math::mod, a, b);
 }
 
 template<typename T, int Size>
@@ -150,11 +129,7 @@ template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> safe_mod(const VecBase<T, Size> &a,
                                                const VecBase<T, Size> &b)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = (b[i] != 0) ? math::mod(a[i], b[i]) : 0;
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_FUNC_VEC_VEC(math::safe_mod, a, b);
 }
 
 /**
@@ -194,32 +169,20 @@ template<typename T, int Size>
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> pow(const VecBase<T, Size> &x, const VecBase<T, Size> &y)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::pow(x[i], y[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_FUNC_VEC_VEC(math::pow, x, y);
 }
 
 /** Per-element square. */
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> square(const VecBase<T, Size> &a)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::square(a[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::square, a);
 }
 
 /* Per-element exponent. */
 template<typename T, int Size> [[nodiscard]] inline VecBase<T, Size> exp(const VecBase<T, Size> &x)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::exp(x[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::exp, x);
 }
 
 /**
@@ -271,11 +234,7 @@ template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> safe_divide(const VecBase<T, Size> &a,
                                                   const VecBase<T, Size> &b)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = (b[i] == 0) ? 0 : a[i] / b[i];
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_FUNC_VEC_VEC(math::safe_divide, a, b);
 }
 
 /**
@@ -290,31 +249,19 @@ template<typename T, int Size>
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> floor(const VecBase<T, Size> &a)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::floor(a[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::floor, a);
 }
 
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> round(const VecBase<T, Size> &a)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::round(a[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::round, a);
 }
 
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> ceil(const VecBase<T, Size> &a)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::ceil(a[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::ceil, a);
 }
 
 /**
@@ -324,11 +271,7 @@ template<typename T, int Size>
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> sqrt(const VecBase<T, Size> &a)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::sqrt(a[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::sqrt, a);
 }
 
 /**
@@ -351,11 +294,7 @@ template<typename T, int Size>
  */
 template<typename T, int Size> [[nodiscard]] inline VecBase<T, Size> rcp(const VecBase<T, Size> &a)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::rcp(a[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::rcp, a);
 }
 
 /**
@@ -365,21 +304,13 @@ template<typename T, int Size> [[nodiscard]] inline VecBase<T, Size> rcp(const V
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> safe_rcp(const VecBase<T, Size> &a)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::safe_rcp(a[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::safe_rcp, a);
 }
 
 template<typename T, int Size>
 [[nodiscard]] inline VecBase<T, Size> fract(const VecBase<T, Size> &a)
 {
-  VecBase<T, Size> result;
-  for (int i = 0; i < Size; i++) {
-    result[i] = math::fract(a[i]);
-  }
-  return result;
+  BLI_UNROLL_MATH_VEC_OP_VEC(math::fract, a);
 }
 
 /**
diff --git a/source/blender/blenlib/BLI_math_vector_types.hh b/source/blender/blenlib/BLI_math_vector_types.hh
index 65b325b1c49..26df467afda 100644
--- a/source/blender/blenlib/BLI_math_vector_types.hh
+++ b/source/blender/blenlib/BLI_math_vector_types.hh
@@ -13,6 +13,7 @@
 #include <ostream>
 #include <type_traits>
 
+#include "BLI_math_vector_unroll.hh"
 #include "BLI_unroll.hh"
 #include "BLI_utildefines.h"
 
@@ -93,28 +94,28 @@ template<typename T, int Size> struct VecBase : public vec_struct_base<T, Size>
 
   template<BLI_ENABLE_IF_VEC(Size, == 1)> VecBase(T _x)
   {
-    (*this)[0] = _x;
+    this->x = _x;
   }
 
   template<BLI_ENABLE_IF_VEC(Size, == 2)> VecBase(T _x, T _y)
   {
-    (*this)[0] = _x;
-    (*this)[1] = _y;
+    this->x = _x;
+    this->y = _y;
   }
 
   template<BLI_ENABLE_IF_VEC(Size, == 3)> VecBase(T _x, T _y, T _z)
   {
-    (*this)[0] = _x;
-    (*this)[1] = _y;
-    (*this)[2] = _z;
+    this->x = _x;
+    this->y = _y;
+    this->z = _z;
   }
 
   template<BLI_ENABLE_IF_VEC(Size, == 4)> VecBase(T _x, T _y, T _z, T _w)
   {
-    (*this)[0] = _x;
-    (*this)[1] = _y;
-    (*this)[2] = _z;
-    (*this)[3] = _w;
+    this->x = _x;
+    this->y = _y;
+    this->z = _z;
+    this->w = _w;
   }
 
   /** Mixed scalar-vector constructors. */
@@ -205,23 +206,35 @@ template<typename T, int Size> struct VecBase : public vec_struct_base<T, Size>
 
   /** Conversion from pointers (from C-style vectors). */
 
+  /* False positive warning with GCC: it sees array access like [3] but
+   * input is only a 3-element array. But it fails to realize that the
+   * [3] access is within "if constexpr (Size == 4)" check already. */
+#ifdef __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
   VecBase(const T *ptr)
   {
-    unroll<Size>([&](auto i) { (*this)[i] = ptr[i]; });
+    BLI_UNROLL_MATH_VEC_OP_INIT_INDEX(ptr);
   }
 
   template<typename U, BLI_ENABLE_IF((std::is_convertible_v<U, T>))> explicit VecBase(const U *ptr)
   {
-    unroll<Size>([&](auto i) { (*this)[i] = ptr[i]; });
+    BLI_UNROLL_MATH_VEC_OP_INIT_INDEX(ptr);
   }
 
   VecBase(const T (*ptr)[Size]) : VecBase(static_cast<const T *>(ptr[0])) {}
 
+#ifdef __GNUC__
+#  pragma GCC diagnostic pop
+#endif
+
   /** Conversion from other vector types. */
 
   template<typename U> explicit VecBase(const VecBase<U, Size> &vec)
   {
-    unroll<Size>([&](auto i) { (*this)[i] = T(vec[i]); });
+    BLI_UNROLL_MATH_VEC_OP_INIT_VECTOR(vec);
   }
 
   /** C-style pointer dereference. */
@@ -260,104 +273,82 @@ template<typename T, int Size> struct VecBase : public vec_struct_base<T, Size>
 
   friend VecBase operator+(const VecBase &a, const VecBase &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] + b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(+, a, b);
   }
 
   friend VecBase operator+(const VecBase &a, const T &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] + b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(+, a, b);
   }
 
   friend VecBase operator+(const T &a, const VecBase &b)
   {
-    return b + a;
+    BLI_UNROLL_MATH_VEC_OP_SCALAR_VEC(+, a, b);
   }
 
   VecBase &operator+=(const VecBase &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] += b[i]; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(+=, b);
   }
 
   VecBase &operator+=(const T &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] += b; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(+=, b);
   }
 
   friend VecBase operator-(const VecBase &a)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = -a[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC(-, a);
   }
 
   friend VecBase operator-(const VecBase &a, const VecBase &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] - b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(-, a, b);
   }
 
   friend VecBase operator-(const VecBase &a, const T &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] - b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(-, a, b);
   }
 
   friend VecBase operator-(const T &a, const VecBase &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a - b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_SCALAR_VEC(-, a, b);
   }
 
   VecBase &operator-=(const VecBase &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] -= b[i]; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(-=, b);
   }
 
   VecBase &operator-=(const T &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] -= b; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(-=, b);
   }
 
   friend VecBase operator*(const VecBase &a, const VecBase &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] * b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(*, a, b);
   }
 
   template<typename FactorT> friend VecBase operator*(const VecBase &a, FactorT b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] * b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(*, a, b);
   }
 
   friend VecBase operator*(T a, const VecBase &b)
   {
-    return b * a;
+    BLI_UNROLL_MATH_VEC_OP_SCALAR_VEC(*, a, b);
   }
 
   VecBase &operator*=(T b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] *= b; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(*=, b);
   }
 
   VecBase &operator*=(const VecBase &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] *= b[i]; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(*=, b);
   }
 
   friend VecBase operator/(const VecBase &a, const VecBase &b)
@@ -365,17 +356,13 @@ template<typename T, int Size> struct VecBase : public vec_struct_base<T, Size>
     for (int i = 0; i < Size; i++) {
       BLI_assert(b[i] != T(0));
     }
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] / b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(/, a, b);
   }
 
   friend VecBase operator/(const VecBase &a, T b)
   {
     BLI_assert(b != T(0));
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] / b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(/, a, b);
   }
 
   friend VecBase operator/(T a, const VecBase &b)
@@ -383,179 +370,145 @@ template<typename T, int Size> struct VecBase : public vec_struct_base<T, Size>
     for (int i = 0; i < Size; i++) {
       BLI_assert(b[i] != T(0));
     }
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a / b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_SCALAR_VEC(/, a, b);
   }
 
   VecBase &operator/=(T b)
   {
     BLI_assert(b != T(0));
-    unroll<Size>([&](auto i) { (*this)[i] /= b; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(/=, b);
   }
 
   VecBase &operator/=(const VecBase &b)
   {
-    BLI_assert(b != T(0));
-    unroll<Size>([&](auto i) { (*this)[i] /= b[i]; });
-    return *this;
+    for (int i = 0; i < Size; i++) {
+      BLI_assert(b[i] != T(0));
+    }
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(/=, b);
   }
 
   /** Binary operators. */
 
   BLI_INT_OP(T) friend VecBase operator&(const VecBase &a, const VecBase &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] & b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(&, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator&(const VecBase &a, T b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] & b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(&, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator&(T a, const VecBase &b)
   {
-    return b & a;
+    BLI_UNROLL_MATH_VEC_OP_SCALAR_VEC(&, a, b);
   }
 
   BLI_INT_OP(T) VecBase &operator&=(T b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] &= b; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(&=, b);
   }
 
   BLI_INT_OP(T) VecBase &operator&=(const VecBase &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] &= b[i]; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(&=, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator|(const VecBase &a, const VecBase &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] | b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(|, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator|(const VecBase &a, T b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] | b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(|, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator|(T a, const VecBase &b)
   {
-    return b | a;
+    BLI_UNROLL_MATH_VEC_OP_SCALAR_VEC(|, a, b);
   }
 
   BLI_INT_OP(T) VecBase &operator|=(T b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] |= b; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(|=, b);
   }
 
   BLI_INT_OP(T) VecBase &operator|=(const VecBase &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] |= b[i]; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(|=, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator^(const VecBase &a, const VecBase &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] ^ b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(^, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator^(const VecBase &a, T b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] ^ b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(^, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator^(T a, const VecBase &b)
   {
-    return b ^ a;
+    BLI_UNROLL_MATH_VEC_OP_SCALAR_VEC(^, a, b);
   }
 
   BLI_INT_OP(T) VecBase &operator^=(T b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] ^= b; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(^=, b);
   }
 
   BLI_INT_OP(T) VecBase &operator^=(const VecBase &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] ^= b[i]; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(^=, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator~(const VecBase &a)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = ~a[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC(~, a);
   }
 
   /** Bit-shift operators. */
 
   BLI_INT_OP(T) friend VecBase operator<<(const VecBase &a, const VecBase &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] << b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(<<, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator<<(const VecBase &a, T b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] << b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(<<, a, b);
   }
 
   BLI_INT_OP(T) VecBase &operator<<=(T b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] <<= b; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(<<=, b);
   }
 
   BLI_INT_OP(T) VecBase &operator<<=(const VecBase &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] <<= b[i]; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(<<=, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator>>(const VecBase &a, const VecBase &b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] >> b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(>>, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator>>(const VecBase &a, T b)
   {
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] >> b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(>>, a, b);
   }
 
   BLI_INT_OP(T) VecBase &operator>>=(T b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] >>= b; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(>>=, b);
   }
 
   BLI_INT_OP(T) VecBase &operator>>=(const VecBase &b)
   {
-    unroll<Size>([&](auto i) { (*this)[i] >>= b[i]; });
-    return *this;
+    BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(>>=, b);
   }
 
   /** Modulo operators. */
@@ -565,25 +518,21 @@ template<typename T, int Size> struct VecBase : public vec_struct_base<T, Size>
     for (int i = 0; i < Size; i++) {
       BLI_assert(b[i] != T(0));
     }
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] % b[i]; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_VEC(%, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator%(const VecBase &a, T b)
   {
     BLI_assert(b != 0);
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a[i] % b; });
-    return result;
+    BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(%, a, b);
   }
 
   BLI_INT_OP(T) friend VecBase operator%(T a, const VecBase &b)
   {
-    BLI_assert(b != T(0));
-    VecBase result;
-    unroll<Size>([&](auto i) { result[i] = a % b[i]; });
-    return result;
+    for (int i = 0; i < Size; i++) {
+      BLI_assert(b[i] != T(0));
+    }
+    BLI_UNROLL_MATH_VEC_OP_SCALAR_VEC(%, a, b);
   }
 
 #undef BLI_INT_OP
diff --git a/source/blender/blenlib/BLI_math_vector_unroll.hh b/source/blender/blenlib/BLI_math_vector_unroll.hh
new file mode 100644
index 00000000000..944caadac6b
--- /dev/null
+++ b/source/blender/blenlib/BLI_math_vector_unroll.hh
@@ -0,0 +1,209 @@
+/* SPDX-FileCopyrightText: 2024 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#pragma once
+
+/** \file
+ * \ingroup bli
+ */
+
+/*
+ * Macros that implement an arithmetic operator or a math function
+ * on vector types. Only for internal BLI vector math library use!
+ * They expect `T` and `Size` to be in scope at the expansion site.
+ *
+ * A per-element loop alone gives the correct result, however in debug /
+ * non-optimized builds (or even release builds with assertions enabled)
+ * it produces very sub-optimal generated code. So besides the loop, the
+ * common vector sizes (4, 3, 2) get explicit code paths.
+ */
+
+/* Returns `a op b` computed per element; `op` is an infix operator, `a` and `b` are vectors. */
+#define BLI_UNROLL_MATH_VEC_OP_VEC_VEC(op, a, b) \
+  if constexpr (Size == 4) { \
+    return VecBase<T, Size>(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \
+  } \
+  else if constexpr (Size == 3) { \
+    return VecBase<T, Size>(a.x op b.x, a.y op b.y, a.z op b.z); \
+  } \
+  else if constexpr (Size == 2) { \
+    return VecBase<T, Size>(a.x op b.x, a.y op b.y); \
+  } \
+  else { \
+    VecBase<T, Size> result; \
+    for (int i = 0; i < Size; i++) { \
+      result[i] = a[i] op b[i]; \
+    } \
+    return result; \
+  }
+
+/* Returns `op(a, b)` computed per element; `op` is a two-argument function on elements. */
+#define BLI_UNROLL_MATH_VEC_FUNC_VEC_VEC(op, a, b) \
+  if constexpr (Size == 4) { \
+    return VecBase<T, Size>(op(a.x, b.x), op(a.y, b.y), op(a.z, b.z), op(a.w, b.w)); \
+  } \
+  else if constexpr (Size == 3) { \
+    return VecBase<T, Size>(op(a.x, b.x), op(a.y, b.y), op(a.z, b.z)); \
+  } \
+  else if constexpr (Size == 2) { \
+    return VecBase<T, Size>(op(a.x, b.x), op(a.y, b.y)); \
+  } \
+  else { \
+    VecBase<T, Size> result; \
+    for (int i = 0; i < Size; i++) { \
+      result[i] = op(a[i], b[i]); \
+    } \
+    return result; \
+  }
+
+/* Returns `op(a)` computed per element; `op` is a prefix operator or one-argument function. */
+#define BLI_UNROLL_MATH_VEC_OP_VEC(op, a) \
+  if constexpr (Size == 4) { \
+    return VecBase<T, Size>(op(a.x), op(a.y), op(a.z), op(a.w)); \
+  } \
+  else if constexpr (Size == 3) { \
+    return VecBase<T, Size>(op(a.x), op(a.y), op(a.z)); \
+  } \
+  else if constexpr (Size == 2) { \
+    return VecBase<T, Size>(op(a.x), op(a.y)); \
+  } \
+  else { \
+    VecBase<T, Size> result; \
+    for (int i = 0; i < Size; i++) { \
+      result[i] = op(a[i]); \
+    } \
+    return result; \
+  }
+
+/* Returns `a op b` per element of `b`; `op` is an infix operator, `a` scalar, `b` vector. */
+#define BLI_UNROLL_MATH_VEC_OP_SCALAR_VEC(op, a, b) \
+  if constexpr (Size == 4) { \
+    return VecBase<T, Size>(a op b.x, a op b.y, a op b.z, a op b.w); \
+  } \
+  else if constexpr (Size == 3) { \
+    return VecBase<T, Size>(a op b.x, a op b.y, a op b.z); \
+  } \
+  else if constexpr (Size == 2) { \
+    return VecBase<T, Size>(a op b.x, a op b.y); \
+  } \
+  else { \
+    VecBase<T, Size> result; \
+    for (int i = 0; i < Size; i++) { \
+      result[i] = a op b[i]; \
+    } \
+    return result; \
+  }
+
+/* Returns `a op b` per element of `a`; `op` is an infix operator, `a` vector, `b` scalar. */
+#define BLI_UNROLL_MATH_VEC_OP_VEC_SCALAR(op, a, b) \
+  if constexpr (Size == 4) { \
+    return VecBase<T, Size>(a.x op b, a.y op b, a.z op b, a.w op b); \
+  } \
+  else if constexpr (Size == 3) { \
+    return VecBase<T, Size>(a.x op b, a.y op b, a.z op b); \
+  } \
+  else if constexpr (Size == 2) { \
+    return VecBase<T, Size>(a.x op b, a.y op b); \
+  } \
+  else { \
+    VecBase<T, Size> result; \
+    for (int i = 0; i < Size; i++) { \
+      result[i] = a[i] op b; \
+    } \
+    return result; \
+  }
+
+/* Per-element `this op b` for assignment-like `op` and vector `b`; ends with `return *this`. */
+#define BLI_UNROLL_MATH_VEC_OP_ASSIGN_VEC(op, b) \
+  if constexpr (Size == 4) { \
+    this->x op b.x; \
+    this->y op b.y; \
+    this->z op b.z; \
+    this->w op b.w; \
+  } \
+  else if constexpr (Size == 3) { \
+    this->x op b.x; \
+    this->y op b.y; \
+    this->z op b.z; \
+  } \
+  else if constexpr (Size == 2) { \
+    this->x op b.x; \
+    this->y op b.y; \
+  } \
+  else { \
+    for (int i = 0; i < Size; i++) { \
+      (*this)[i] op b[i]; \
+    } \
+  } \
+  return *this
+
+/* Per-element `this op b` for assignment-like `op` and scalar `b`; ends with `return *this`. */
+#define BLI_UNROLL_MATH_VEC_OP_ASSIGN_SCALAR(op, b) \
+  if constexpr (Size == 4) { \
+    this->x op b; \
+    this->y op b; \
+    this->z op b; \
+    this->w op b; \
+  } \
+  else if constexpr (Size == 3) { \
+    this->x op b; \
+    this->y op b; \
+    this->z op b; \
+  } \
+  else if constexpr (Size == 2) { \
+    this->x op b; \
+    this->y op b; \
+  } \
+  else { \
+    for (int i = 0; i < Size; i++) { \
+      (*this)[i] op b; \
+    } \
+  } \
+  return *this
+
+/* Initializes the elements of `*this` from `a[0]` .. `a[Size - 1]` (pointer or indexable `a`). */
+#define BLI_UNROLL_MATH_VEC_OP_INIT_INDEX(a) \
+  if constexpr (Size == 4) { \
+    this->x = a[0]; \
+    this->y = a[1]; \
+    this->z = a[2]; \
+    this->w = a[3]; \
+  } \
+  else if constexpr (Size == 3) { \
+    this->x = a[0]; \
+    this->y = a[1]; \
+    this->z = a[2]; \
+  } \
+  else if constexpr (Size == 2) { \
+    this->x = a[0]; \
+    this->y = a[1]; \
+  } \
+  else { \
+    for (int i = 0; i < Size; i++) { \
+      (*this)[i] = a[i]; \
+    } \
+  }
+
+/* Initializes the elements of `*this` from vector `a`, converting each element with `T(...)`. */
+#define BLI_UNROLL_MATH_VEC_OP_INIT_VECTOR(a) \
+  if constexpr (Size == 4) { \
+    this->x = T(a.x); \
+    this->y = T(a.y); \
+    this->z = T(a.z); \
+    this->w = T(a.w); \
+  } \
+  else if constexpr (Size == 3) { \
+    this->x = T(a.x); \
+    this->y = T(a.y); \
+    this->z = T(a.z); \
+  } \
+  else if constexpr (Size == 2) { \
+    this->x = T(a.x); \
+    this->y = T(a.y); \
+  } \
+  else { \
+    for (int i = 0; i < Size; i++) { \
+      (*this)[i] = T(a[i]); \
+    } \
+  }
diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt
index 576c9e03d04..c1918aa8077 100644
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -323,6 +323,7 @@ set(SRC
   BLI_math_vector.hh
   BLI_math_vector_mpq_types.hh
   BLI_math_vector_types.hh
+  BLI_math_vector_unroll.hh
   BLI_memarena.h
   BLI_memblock.h
   BLI_memiter.h