From 30b962b3d8669c64e004091dcb29becba2af14c3 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey@blender.org>
Date: Wed, 9 Apr 2025 13:40:10 +0200
Subject: [PATCH] Cycles: Optimize 3d and 4d noise

The goal is to reduce the performance impact of the fmod() used in the noise
code, which was initially reported in the comment:

    https://projects.blender.org/blender/blender/pulls/119884#issuecomment-1258902

The basic idea is to benefit from SIMD vectorization on the CPU.

Tested on Linux i9-11900K and macOS on M2 Ultra, in both cases performance
after this change is very close to what it could be with the fmod() commented
out (the call itself, `p = p + precision_correction`).

On macOS the penalty of fmod() was about 10%, on Linux it was closer to 30%
when built with GCC-13. With Linux builds from the buildbot it is more like 18%.

The optimization is only done for 3d and 4d noise. It might be possible to
gain some performance improvement for the 1d and 2d cases, but the approach
would need to be different: we'd need to optimize the scalar fmodf(). Maybe
tricks with an integer cast will be faster (since we are a bit optimistic in the
kernel and do not guarantee exact behavior in extreme cases such as NaN inputs).

Pull Request: https://projects.blender.org/blender/blender/pulls/137109
---
 intern/cycles/kernel/svm/noise.h             | 18 ++---
 intern/cycles/test/CMakeLists.txt            |  2 +
 intern/cycles/test/util_math_float3_test.cpp | 67 +++++++++++++++++++
 intern/cycles/test/util_math_float4_test.cpp | 69 ++++++++++++++++++++
 intern/cycles/util/math_float2.h             |  5 ++
 intern/cycles/util/math_float3.h             | 24 +++++++
 intern/cycles/util/math_float4.h             | 15 +++++
 intern/cycles/util/types_float2.h            |  5 ++
 8 files changed, 196 insertions(+), 9 deletions(-)
 create mode 100644 intern/cycles/test/util_math_float3_test.cpp
 create mode 100644 intern/cycles/test/util_math_float4_test.cpp

diff --git a/intern/cycles/kernel/svm/noise.h b/intern/cycles/kernel/svm/noise.h
index 6b772874eda..e38e330bacc 100644
--- a/intern/cycles/kernel/svm/noise.h
+++ b/intern/cycles/kernel/svm/noise.h
@@ -706,8 +706,9 @@ ccl_device_inline float noise_1d(const float p)
 
 ccl_device_inline float snoise_2d(float2 p)
 {
-  const float2 precision_correction = 0.5f * make_float2(float(fabsf(p.x) >= 1000000.0f),
-                                                         float(fabsf(p.y) >= 1000000.0f));
+  const float2 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float2(1000000.0f), one_float2());
+
   /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
    * representation issues. This causes discontinuities every 100000.0f, however at such scales
    * this usually shouldn't be noticeable. */
@@ -723,9 +724,9 @@ ccl_device_inline float noise_2d(const float2 p)
 
 ccl_device_inline float snoise_3d(float3 p)
 {
-  const float3 precision_correction = 0.5f * make_float3(float(fabsf(p.x) >= 1000000.0f),
-                                                         float(fabsf(p.y) >= 1000000.0f),
-                                                         float(fabsf(p.z) >= 1000000.0f));
+  const float3 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float3(1000000.0f), one_float3());
+
   /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
    * representation issues. This causes discontinuities every 100000.0f, however at such scales
    * this usually shouldn't be noticeable. */
@@ -741,10 +742,9 @@ ccl_device_inline float noise_3d(const float3 p)
 
 ccl_device_inline float snoise_4d(float4 p)
 {
-  const float4 precision_correction = 0.5f * make_float4(float(fabsf(p.x) >= 1000000.0f),
-                                                         float(fabsf(p.y) >= 1000000.0f),
-                                                         float(fabsf(p.z) >= 1000000.0f),
-                                                         float(fabsf(p.w) >= 1000000.0f));
+  const float4 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float4(1000000.0f), one_float4());
+
   /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
    * representation issues. This causes discontinuities every 100000.0f, however at such scales
    * this usually shouldn't be noticeable. */
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 0d98480defd..c83a3e9971a 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -37,6 +37,8 @@ set(SRC
   util_ies_test.cpp
   util_math_test.cpp
   util_math_fast_test.cpp
+  util_math_float3_test.cpp
+  util_math_float4_test.cpp
   util_md5_test.cpp
   util_path_test.cpp
   util_string_test.cpp
diff --git a/intern/cycles/test/util_math_float3_test.cpp b/intern/cycles/test/util_math_float3_test.cpp
new file mode 100644
index 00000000000..dabac37dc0b
--- /dev/null
+++ b/intern/cycles/test/util_math_float3_test.cpp
@@ -0,0 +1,67 @@
+/* SPDX-FileCopyrightText: 2011-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+/* Note: These fixtures test the default micro-architecture optimization defined in
+ * util/optimization.h. */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Float3Test : public ::testing::Test {
+  void SetUp() override
+  {
+    /* No micro-architecture check is needed for these tests. The commented-out line below is
+     * kept as a demonstration of how a fixture can skip itself when such a check fails. */
+    // GTEST_SKIP() << "Test skipped due to uarch capability";
+  }
+};
+
+TEST_F(Float3Test, fmod)
+{
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1.0f);
+    EXPECT_NEAR(c.x, 0.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.4f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1.2f);
+    EXPECT_NEAR(c.x, 0.0f, 1e-6f); /* Dividend equal to the divisor. */
+    EXPECT_NEAR(c.y, 1.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.0f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1000000.0f);
+    EXPECT_NEAR(c.x, 1.2f, 1e-6f); /* Dividend smaller than the divisor is unchanged. */
+    EXPECT_NEAR(c.y, 2.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 3.4f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(1999999.2f, 2000000.3f, 2000001.4f), 1000000.0f);
+    EXPECT_NEAR(c.x, 999999.25f, 1e-6f); /* 1999999.2f is stored as 1999999.25f. */
+    EXPECT_NEAR(c.y, 0.25f, 1e-6f);      /* 2000000.3f is stored as 2000000.25f. */
+    EXPECT_NEAR(c.z, 1.375f, 1e-6f);     /* 2000001.4f is stored as 2000001.375f. */
+  }
+
+  {
+    const float3 c = fmod(make_float3(5.1f, -5.1f, 0.0f), 3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1f, 1e-6f); /* The sign follows the dividend. */
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(5.1f, -5.1f, 0.0f), -3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f); /* The sign of the divisor does not affect the result. */
+    EXPECT_NEAR(c.y, -2.1f, 1e-6f);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/util_math_float4_test.cpp b/intern/cycles/test/util_math_float4_test.cpp
new file mode 100644
index 00000000000..6b4aa80657d
--- /dev/null
+++ b/intern/cycles/test/util_math_float4_test.cpp
@@ -0,0 +1,69 @@
+/* SPDX-FileCopyrightText: 2011-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+/* Note: These fixtures test the default micro-architecture optimization defined in
+ * util/optimization.h. */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Float4Test : public ::testing::Test {
+  void SetUp() override
+  {
+    /* No micro-architecture check is needed for these tests. The commented-out line below is
+     * kept as a demonstration of how a fixture can skip itself when such a check fails. */
+    // GTEST_SKIP() << "Test skipped due to uarch capability";
+  }
+};
+
+TEST_F(Float4Test, fmod)
+{
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 4.5f), 1.0f);
+    EXPECT_NEAR(c.x, 0.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.4f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.5f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 0.9f), 1.2f);
+    EXPECT_NEAR(c.x, 0.0f, 1e-6f); /* Dividend equal to the divisor. */
+    EXPECT_NEAR(c.y, 1.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.0f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.9f, 1e-6f); /* Dividend smaller than the divisor is unchanged. */
+  }
+
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 0.0f), 1000000.0f);
+    EXPECT_NEAR(c.x, 1.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 2.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 3.4f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(1999999.2f, 2000000.3f, 2000001.4f, 0.0f), 1000000.0f);
+    EXPECT_NEAR(c.x, 999999.25f, 1e-6f); /* 1999999.2f is stored as 1999999.25f. */
+    EXPECT_NEAR(c.y, 0.25f, 1e-6f);      /* 2000000.3f is stored as 2000000.25f. */
+    EXPECT_NEAR(c.z, 1.375f, 1e-6f);     /* 2000001.4f is stored as 2000001.375f. */
+  }
+
+  {
+    const float4 c = fmod(make_float4(5.1f, -5.1f, 0.0f, 0.0f), 3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1f, 1e-6f); /* The sign follows the dividend. */
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(5.1f, -5.1f, 0.0f, 0.0f), -3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f); /* The sign of the divisor does not affect the result. */
+    EXPECT_NEAR(c.y, -2.1f, 1e-6f);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h
index 04405e3e5c9..3aa876535e6 100644
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -118,6 +118,11 @@ ccl_device_inline bool operator!=(const float2 a, const float2 b)
   return !(a == b);
 }
 
+ccl_device_inline int2 operator>=(const float2 a, const float2 b)
+{
+  return make_int2(int(a.x >= b.x), int(a.y >= b.y)); /* Per component: 1 if a >= b, else 0. */
+}
+
 ccl_device_inline bool is_zero(const float2 a)
 {
   return (a.x == 0.0f && a.y == 0.0f);
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 04b64386ecc..ab75c560ae8 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -207,6 +207,15 @@ ccl_device_inline bool operator!=(const float3 a, const float3 b)
   return !(a == b);
 }
 
+ccl_device_inline int3 operator>=(const float3 a, const float3 b)
+{
+#  ifdef __KERNEL_SSE__
+  return int3(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128))); /* Per lane: all bits set or 0. */
+#  else
+  return make_int3(a.x >= b.x, a.y >= b.y, a.z >= b.z); /* Scalar path: 1 or 0 per component. */
+#  endif
+}
+
 ccl_device_inline float dot(const float3 a, const float3 b)
 {
 #  if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
@@ -318,9 +327,24 @@ ccl_device_inline float3 fabs(const float3 a)
 #  endif
 }
 
+/* Per-component floating-point remainder of a / b, computed as a - iquot * b, where iquot is
+ * a / b with its fractional part truncated. The SIMD paths evaluate this in float arithmetic,
+ * so rounding of the quotient and product can make the result differ from fmodf().
+ * Barring that, the result has the same sign as a and is less than b in magnitude. */
 ccl_device_inline float3 fmod(const float3 a, const float b)
 {
+#  if defined(__KERNEL_NEON__)
+  /* Use native Neon instructions.
+   * The logic is the same as the SSE code below, but on Apple M2 Ultra this seems to be faster.
+   * Possibly due to some runtime checks in _mm_round_ps which do not get properly inlined. */
+  const float32x4_t iquot = vrndq_f32(a / b);
+  return float3(vsubq_f32(a, vmulq_f32(iquot, vdupq_n_f32(b))));
+#  elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
+  const __m128 iquot = _mm_round_ps(a / b, _MM_FROUND_TRUNC);
+  return float3(_mm_sub_ps(a, _mm_mul_ps(iquot, _mm_set1_ps(b))));
+#  else
   return make_float3(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b));
+#  endif
 }
 
 ccl_device_inline float3 sqrt(const float3 a)
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index 0939a620dd5..ab36a16219d 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -463,9 +463,24 @@ ccl_device_inline float4 fabs(const float4 a)
 #  endif
 }
 
+/* Per-component floating-point remainder of a / b, computed as a - iquot * b, where iquot is
+ * a / b with its fractional part truncated. The SIMD paths evaluate this in float arithmetic,
+ * so rounding of the quotient and product can make the result differ from fmodf().
+ * Barring that, the result has the same sign as a and is less than b in magnitude. */
 ccl_device_inline float4 fmod(const float4 a, const float b)
 {
+#  if defined(__KERNEL_NEON__)
+  /* Use native Neon instructions.
+   * The logic is the same as the SSE code below, but on Apple M2 Ultra this seems to be faster.
+   * Possibly due to some runtime checks in _mm_round_ps which do not get properly inlined. */
+  const float32x4_t iquot = vrndq_f32(a / b);
+  return float4(vsubq_f32(a, vmulq_f32(iquot, vdupq_n_f32(b))));
+#  elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
+  const __m128 iquot = _mm_round_ps(a / b, _MM_FROUND_TRUNC);
+  return float4(_mm_sub_ps(a, _mm_mul_ps(iquot, _mm_set1_ps(b))));
+#  else
   return make_float4(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b), fmodf(a.w, b));
+#  endif
 }
 
 ccl_device_inline float4 floor(const float4 a)
diff --git a/intern/cycles/util/types_float2.h b/intern/cycles/util/types_float2.h
index 02fef4e2104..e9912b56e0e 100644
--- a/intern/cycles/util/types_float2.h
+++ b/intern/cycles/util/types_float2.h
@@ -41,6 +41,11 @@ ccl_device_inline float2 make_float2(const float x, const float y)
 }
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
+ccl_device_inline float2 make_float2(const float f)
+{
+  return {f, f}; /* Both components are set to f. */
+}
+
 ccl_device_inline void print_float2(const ccl_private char *label, const float2 a)
 {
 #ifdef __KERNEL_PRINTF__