Cycles: Optimize 3d and 4d noise

The goal is to reduce the effect of the fmod() used in the noise code, which was initially reported in the comment: https://projects.blender.org/blender/blender/pulls/119884#issuecomment-1258902 The basic idea is to benefit from SIMD vectorization on the CPU. Tested on Linux with an i9-11900K and on macOS with an M2 Ultra; in both cases performance after this change is very close to what it would be with the fmod() commented out (only the call itself removed, i.e. `p = p + precision_correction`). On macOS the penalty of fmod() was about 10%, on Linux it was closer to 30% when built with GCC-13. With Linux builds from the buildbot it is more like 18%. The optimization is only done for 3d and 4d noise. It might be possible to gain some performance improvement for the 1d and 2d cases, but the approach would need to be different: we would need to optimize the scalar fmodf(). Maybe tricks with an integer cast would be faster (since we are a bit optimistic in the kernel and do not guarantee exact behavior in extreme cases such as NaN inputs). Pull Request: https://projects.blender.org/blender/blender/pulls/137109
2025-04-09 13:40:10 +02:00
parent 58fa6a50ce
commit 30b962b3d8
8 changed files with 196 additions and 9 deletions
--- a/intern/cycles/kernel/svm/noise.h
+++ b/intern/cycles/kernel/svm/noise.h
@@ -706,8 +706,9 @@ ccl_device_inline float noise_1d(const float p)

 ccl_device_inline float snoise_2d(float2 p)
 {
-  const float2 precision_correction = 0.5f * make_float2(float(fabsf(p.x) >= 1000000.0f),
-                                                         float(fabsf(p.y) >= 1000000.0f));
+  const float2 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float2(1000000.0f), one_float2());
+
  /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
   * representation issues. This causes discontinuities every 100000.0f, however at such scales
   * this usually shouldn't be noticeable. */
@@ -723,9 +724,9 @@ ccl_device_inline float noise_2d(const float2 p)

 ccl_device_inline float snoise_3d(float3 p)
 {
-  const float3 precision_correction = 0.5f * make_float3(float(fabsf(p.x) >= 1000000.0f),
-                                                         float(fabsf(p.y) >= 1000000.0f),
-                                                         float(fabsf(p.z) >= 1000000.0f));
+  const float3 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float3(1000000.0f), one_float3());
+
  /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
   * representation issues. This causes discontinuities every 100000.0f, however at such scales
   * this usually shouldn't be noticeable. */
@@ -741,10 +742,9 @@ ccl_device_inline float noise_3d(const float3 p)

 ccl_device_inline float snoise_4d(float4 p)
 {
-  const float4 precision_correction = 0.5f * make_float4(float(fabsf(p.x) >= 1000000.0f),
-                                                         float(fabsf(p.y) >= 1000000.0f),
-                                                         float(fabsf(p.z) >= 1000000.0f),
-                                                         float(fabsf(p.w) >= 1000000.0f));
+  const float4 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float4(1000000.0f), one_float4());
+
  /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
   * representation issues. This causes discontinuities every 100000.0f, however at such scales
   * this usually shouldn't be noticeable. */
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -37,6 +37,8 @@ set(SRC
  util_ies_test.cpp
  util_math_test.cpp
  util_math_fast_test.cpp
+  util_math_float3_test.cpp
+  util_math_float4_test.cpp
  util_md5_test.cpp
  util_path_test.cpp
  util_string_test.cpp
--- a/intern/cycles/test/util_math_float3_test.cpp
+++ b/intern/cycles/test/util_math_float3_test.cpp
@@ -0,0 +1,67 @@
+/* SPDX-FileCopyrightText: 2011-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+/* Note: These fixtures test default micro-architecture optimization defined in the
+ * util/optimization.h. */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Fixture shared by the float3 math tests. It carries no state; it only exists to provide a
+ * single place for a per-test micro-architecture check. */
+class Float3Test : public ::testing::Test {
+ protected:
+  void SetUp() override
+  {
+    /* The micro-architecture check is not needed here, but the commented-out line below is kept
+     * as a demonstration of how such a check can be implemented in a clear way. */
+    // GTEST_SKIP() << "Test skipped due to uarch capability";
+  }
+};
+
+/* Checks fmod(float3, float) against hand-computed per-component remainders. */
+TEST_F(Float3Test, fmod)
+{
+  /* Unit divisor: only the fractional part of every component remains. */
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1.0f);
+    EXPECT_NEAR(c.x, 0.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.4f, 1e-6f);
+  }
+
+  /* Divisor equal to the first component: that component wraps to zero. */
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1.2f);
+    EXPECT_NEAR(c.x, 0.0f, 1e-6f);
+    EXPECT_NEAR(c.y, 1.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.0f, 1e-6f);
+  }
+
+  /* Divisor far larger than every component: the inputs come back unchanged. */
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1000000.0f);
+    EXPECT_NEAR(c.x, 1.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 2.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 3.4f, 1e-6f);
+  }
+
+  /* Inputs around 2e6 are stored in single precision with a spacing of 0.125, so the expected
+   * remainders are the remainders of the rounded inputs rather than of the decimal literals. */
+  {
+    const float3 c = fmod(make_float3(1999999.2f, 2000000.3f, 2000001.4f), 1000000.0f);
+    EXPECT_NEAR(c.x, 999999.25f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.25f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.375f, 1e-6f);
+  }
+
+  /* The sign of the result follows the sign of the dividend. */
+  {
+    const float3 c = fmod(make_float3(5.1f, -5.1f, 0.0f), 3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+  }
+
+  /* A negative divisor does not change the result: only the dividend decides the sign. */
+  {
+    const float3 c = fmod(make_float3(5.1f, -5.1f, 0.0f), -3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+  }
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/test/util_math_float4_test.cpp
+++ b/intern/cycles/test/util_math_float4_test.cpp
@@ -0,0 +1,69 @@
+/* SPDX-FileCopyrightText: 2011-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+/* Note: These fixtures test default micro-architecture optimization defined in the
+ * util/optimization.h. */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Fixture shared by the float4 math tests. It carries no state; it only exists to provide a
+ * single place for a per-test micro-architecture check. */
+class Float4Test : public ::testing::Test {
+ protected:
+  void SetUp() override
+  {
+    /* The micro-architecture check is not needed here, but the commented-out line below is kept
+     * as a demonstration of how such a check can be implemented in a clear way. */
+    // GTEST_SKIP() << "Test skipped due to uarch capability";
+  }
+};
+
+/* Checks fmod(float4, float) against hand-computed per-component remainders. All four
+ * components are verified in every case so that a broken lane cannot go unnoticed. */
+TEST_F(Float4Test, fmod)
+{
+  /* Unit divisor: only the fractional part of every component remains. */
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 4.5f), 1.0f);
+    EXPECT_NEAR(c.x, 0.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.4f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.5f, 1e-6f);
+  }
+
+  /* Divisor equal to the first component: that component wraps to zero, while a component
+   * smaller than the divisor is returned as is. */
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 0.9f), 1.2f);
+    EXPECT_NEAR(c.x, 0.0f, 1e-6f);
+    EXPECT_NEAR(c.y, 1.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.0f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.9f, 1e-6f);
+  }
+
+  /* Divisor far larger than every component: the inputs come back unchanged. */
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 0.0f), 1000000.0f);
+    EXPECT_NEAR(c.x, 1.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 2.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 3.4f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.0f, 1e-6f);
+  }
+
+  /* Inputs around 2e6 are stored in single precision with a spacing of 0.125, so the expected
+   * remainders are the remainders of the rounded inputs rather than of the decimal literals. */
+  {
+    const float4 c = fmod(make_float4(1999999.2f, 2000000.3f, 2000001.4f, 0.0f), 1000000.0f);
+    EXPECT_NEAR(c.x, 999999.25f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.25f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.375f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.0f, 1e-6f);
+  }
+
+  /* The sign of the result follows the sign of the dividend. */
+  {
+    const float4 c = fmod(make_float4(5.1f, -5.1f, 0.0f, 0.0f), 3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.0f, 1e-6f);
+  }
+
+  /* A negative divisor does not change the result: only the dividend decides the sign. */
+  {
+    const float4 c = fmod(make_float4(5.1f, -5.1f, 0.0f, 0.0f), -3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.0f, 1e-6f);
+  }
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -118,6 +118,11 @@ ccl_device_inline bool operator!=(const float2 a, const float2 b)
  return !(a == b);
 }

+/* Component-wise `a >= b`. Each component of the result is the boolean outcome of the
+ * comparison converted to an integer. */
+ccl_device_inline int2 operator>=(const float2 a, const float2 b)
+{
+  const bool x_ge = a.x >= b.x;
+  const bool y_ge = a.y >= b.y;
+  return make_int2(x_ge, y_ge);
+}
+
 ccl_device_inline bool is_zero(const float2 a)
 {
  return (a.x == 0.0f && a.y == 0.0f);
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -207,6 +207,15 @@ ccl_device_inline bool operator!=(const float3 a, const float3 b)
  return !(a == b);
 }

+/* Component-wise `a >= b`.
+ *
+ * With SSE the result holds the raw lane masks produced by _mm_cmpge_ps (all bits set where the
+ * comparison holds), otherwise each component is the boolean outcome converted to an integer.
+ * Callers should therefore only rely on the components being zero or non-zero. */
+ccl_device_inline int3 operator>=(const float3 a, const float3 b)
+{
+#  ifdef __KERNEL_SSE__
+  const __m128 ge_mask = _mm_cmpge_ps(a.m128, b.m128);
+  return int3(_mm_castps_si128(ge_mask));
+#  else
+  const bool x_ge = a.x >= b.x;
+  const bool y_ge = a.y >= b.y;
+  const bool z_ge = a.z >= b.z;
+  return make_int3(x_ge, y_ge, z_ge);
+#  endif
+}
+
 ccl_device_inline float dot(const float3 a, const float3 b)
 {
 #  if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
@@ -318,9 +327,24 @@ ccl_device_inline float3 fabs(const float3 a)
 #  endif
 }

+/* Component-wise floating-point remainder of the division a / b.
+ *
+ * The SIMD branches evaluate a - iquot * b, where iquot is a / b with its fractional part
+ * truncated; the generic branch calls fmodf() per component.
+ *
+ * The returned value has the same sign as a and is less than b in magnitude. */
 ccl_device_inline float3 fmod(const float3 a, const float b)
 {
+#  if defined(__KERNEL_NEON__)
+  /* Native Neon variant of the SSE branch below. The arithmetic is the same, but on Apple M2
+   * Ultra this seems to be faster, possibly due to some runtime checks in _mm_round_ps which do
+   * not get properly inlined. */
+  const float32x4_t divisor = vdupq_n_f32(b);
+  const float32x4_t whole = vrndq_f32(a / b);
+  return float3(vsubq_f32(a, vmulq_f32(whole, divisor)));
+#  elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
+  const __m128 divisor = _mm_set1_ps(b);
+  const __m128 whole = _mm_round_ps(a / b, _MM_FROUND_TRUNC);
+  return float3(_mm_sub_ps(a, _mm_mul_ps(whole, divisor)));
+#  else
   return make_float3(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b));
+#  endif
 }

 ccl_device_inline float3 sqrt(const float3 a)
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -463,9 +463,24 @@ ccl_device_inline float4 fabs(const float4 a)
 #  endif
 }

+/* Component-wise floating-point remainder of the division a / b.
+ *
+ * The SIMD branches evaluate a - iquot * b, where iquot is a / b with its fractional part
+ * truncated; the generic branch calls fmodf() per component.
+ *
+ * The returned value has the same sign as a and is less than b in magnitude. */
 ccl_device_inline float4 fmod(const float4 a, const float b)
 {
+#  if defined(__KERNEL_NEON__)
+  /* Native Neon variant of the SSE branch below. The arithmetic is the same, but on Apple M2
+   * Ultra this seems to be faster, possibly due to some runtime checks in _mm_round_ps which do
+   * not get properly inlined. */
+  const float32x4_t divisor = vdupq_n_f32(b);
+  const float32x4_t whole = vrndq_f32(a / b);
+  return float4(vsubq_f32(a, vmulq_f32(whole, divisor)));
+#  elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
+  const __m128 divisor = _mm_set1_ps(b);
+  const __m128 whole = _mm_round_ps(a / b, _MM_FROUND_TRUNC);
+  return float4(_mm_sub_ps(a, _mm_mul_ps(whole, divisor)));
+#  else
   return make_float4(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b), fmodf(a.w, b));
+#  endif
 }

 ccl_device_inline float4 floor(const float4 a)
--- a/intern/cycles/util/types_float2.h
+++ b/intern/cycles/util/types_float2.h
@@ -41,6 +41,11 @@ ccl_device_inline float2 make_float2(const float x, const float y)
 }
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

+/* Build a float2 with both components set to the same scalar. */
+ccl_device_inline float2 make_float2(const float f)
+{
+  const float2 splat = {f, f};
+  return splat;
+}
+
 ccl_device_inline void print_float2(const ccl_private char *label, const float2 a)
 {
 #ifdef __KERNEL_PRINTF__