diff --git a/intern/cycles/kernel/svm/noise.h b/intern/cycles/kernel/svm/noise.h
index 6b772874eda..e38e330bacc 100644
--- a/intern/cycles/kernel/svm/noise.h
+++ b/intern/cycles/kernel/svm/noise.h
@@ -706,8 +706,9 @@ ccl_device_inline float noise_1d(const float p)
 
 ccl_device_inline float snoise_2d(float2 p)
 {
-  const float2 precision_correction = 0.5f * make_float2(float(fabsf(p.x) >= 1000000.0f),
-                                                          float(fabsf(p.y) >= 1000000.0f));
+  const float2 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float2(1000000.0f), one_float2());
+
   /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
    * representation issues. This causes discontinuities every 100000.0f, however at such scales
    * this usually shouldn't be noticeable. */
@@ -723,9 +724,9 @@ ccl_device_inline float noise_2d(const float2 p)
 
 ccl_device_inline float snoise_3d(float3 p)
 {
-  const float3 precision_correction = 0.5f * make_float3(float(fabsf(p.x) >= 1000000.0f),
-                                                          float(fabsf(p.y) >= 1000000.0f),
-                                                          float(fabsf(p.z) >= 1000000.0f));
+  const float3 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float3(1000000.0f), one_float3());
+
   /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
    * representation issues. This causes discontinuities every 100000.0f, however at such scales
    * this usually shouldn't be noticeable. */
@@ -741,10 +742,9 @@ ccl_device_inline float noise_3d(const float3 p)
 
 ccl_device_inline float snoise_4d(float4 p)
 {
-  const float4 precision_correction = 0.5f * make_float4(float(fabsf(p.x) >= 1000000.0f),
-                                                          float(fabsf(p.y) >= 1000000.0f),
-                                                          float(fabsf(p.z) >= 1000000.0f),
-                                                          float(fabsf(p.w) >= 1000000.0f));
+  const float4 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float4(1000000.0f), one_float4());
+
   /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
    * representation issues. This causes discontinuities every 100000.0f, however at such scales
    * this usually shouldn't be noticeable. */
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 0d98480defd..c83a3e9971a 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -37,6 +37,8 @@ set(SRC
   util_ies_test.cpp
   util_math_test.cpp
   util_math_fast_test.cpp
+  util_math_float3_test.cpp
+  util_math_float4_test.cpp
   util_md5_test.cpp
   util_path_test.cpp
   util_string_test.cpp
diff --git a/intern/cycles/test/util_math_float3_test.cpp b/intern/cycles/test/util_math_float3_test.cpp
new file mode 100644
index 00000000000..dabac37dc0b
--- /dev/null
+++ b/intern/cycles/test/util_math_float3_test.cpp
@@ -0,0 +1,67 @@
+/* SPDX-FileCopyrightText: 2011-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+/* Note: These fixtures test default micro-architecture optimization defined in the
+ * util/optimization.h. */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Float3Test : public ::testing::Test {
+  void SetUp() override
+  {
+    /* The micro-architecture check is not needed here, but use it here as a demonstration of how
+     * it can be implemented in a clear way. */
+    // GTEST_SKIP() << "Test skipped due to uarch capability";
+  }
+};
+
+TEST_F(Float3Test, fmod)
+{
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1.0f);
+    EXPECT_NEAR(c.x, 0.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.4f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1.2f);
+    EXPECT_NEAR(c.x, 0.0f, 1e-6f);
+    EXPECT_NEAR(c.y, 1.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.0f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1000000.0f);
+    EXPECT_NEAR(c.x, 1.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 2.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 3.4f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(1999999.2f, 2000000.3f, 2000001.4f), 1000000.0f);
+    EXPECT_NEAR(c.x, 999999.25f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.25f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.375f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(5.1f, -5.1f, 0.0f), 3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1, 1e-6f);
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(5.1f, -5.1f, 0.0f), -3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1, 1e-6f);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/util_math_float4_test.cpp b/intern/cycles/test/util_math_float4_test.cpp
new file mode 100644
index 00000000000..6b4aa80657d
--- /dev/null
+++ b/intern/cycles/test/util_math_float4_test.cpp
@@ -0,0 +1,69 @@
+/* SPDX-FileCopyrightText: 2011-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+/* Note: These fixtures test default micro-architecture optimization defined in the
+ * util/optimization.h. */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Float4Test : public ::testing::Test {
+  void SetUp() override
+  {
+    /* The micro-architecture check is not needed here, but use it here as a demonstration of how
+     * it can be implemented in a clear way. */
+    // GTEST_SKIP() << "Test skipped due to uarch capability";
+  }
+};
+
+TEST_F(Float4Test, fmod)
+{
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 4.5f), 1.0f);
+    EXPECT_NEAR(c.x, 0.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.4f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.5f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 0.9f), 1.2f);
+    EXPECT_NEAR(c.x, 0.0f, 1e-6f);
+    EXPECT_NEAR(c.y, 1.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.0f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.9f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 0.0f), 1000000.0f);
+    EXPECT_NEAR(c.x, 1.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 2.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 3.4f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(1999999.2f, 2000000.3f, 2000001.4f, 0.0f), 1000000.0f);
+    EXPECT_NEAR(c.x, 999999.25f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.25f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.375f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(5.1f, -5.1f, 0.0f, 0.0f), 3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1, 1e-6f);
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(5.1f, -5.1f, 0.0f, 0.0f), -3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1, 1e-6f);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h
index 04405e3e5c9..3aa876535e6 100644
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -118,6 +118,11 @@ ccl_device_inline bool operator!=(const float2 a, const float2 b)
   return !(a == b);
 }
 
+ccl_device_inline int2 operator>=(const float2 a, const float2 b)
+{
+  return make_int2(a.x >= b.x, a.y >= b.y);
+}
+
 ccl_device_inline bool is_zero(const float2 a)
 {
   return (a.x == 0.0f && a.y == 0.0f);
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 04b64386ecc..ab75c560ae8 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -207,6 +207,15 @@ ccl_device_inline bool operator!=(const float3 a, const float3 b)
   return !(a == b);
 }
 
+ccl_device_inline int3 operator>=(const float3 a, const float3 b)
+{
+#  ifdef __KERNEL_SSE__
+  return int3(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
+#  else
+  return make_int3(a.x >= b.x, a.y >= b.y, a.z >= b.z);
+#  endif
+}
+
 ccl_device_inline float dot(const float3 a, const float3 b)
 {
 #  if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
@@ -318,9 +327,24 @@ ccl_device_inline float3 fabs(const float3 a)
 #  endif
 }
 
+/* The floating-point remainder of the division operation a / b calculated by this function is
+ * exactly the value a - iquot * b, where iquot is a / b with its fractional part truncated.
+ *
+ * The returned value has the same sign as a and is less than b in magnitude. */
 ccl_device_inline float3 fmod(const float3 a, const float b)
 {
+#  if defined(__KERNEL_NEON__)
+  /* Use native Neon instructions.
+   * The logic is the same as the SSE code below, but on Apple M2 Ultra this seems to be faster.
+   * Possibly due to some runtime checks in _mm_round_ps which do not get properly inlined. */
+  const float32x4_t iquot = vrndq_f32(a / b);
+  return float3(vsubq_f32(a, vmulq_f32(iquot, vdupq_n_f32(b))));
+#  elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
+  const __m128 iquot = _mm_round_ps(a / b, _MM_FROUND_TRUNC);
+  return float3(_mm_sub_ps(a, _mm_mul_ps(iquot, _mm_set1_ps(b))));
+#  else
   return make_float3(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b));
+#  endif
 }
 
 ccl_device_inline float3 sqrt(const float3 a)
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index 0939a620dd5..ab36a16219d 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -463,9 +463,24 @@ ccl_device_inline float4 fabs(const float4 a)
 #  endif
 }
 
+/* The floating-point remainder of the division operation a / b calculated by this function is
+ * exactly the value a - iquot * b, where iquot is a / b with its fractional part truncated.
+ *
+ * The returned value has the same sign as a and is less than b in magnitude. */
 ccl_device_inline float4 fmod(const float4 a, const float b)
 {
+#  if defined(__KERNEL_NEON__)
+  /* Use native Neon instructions.
+   * The logic is the same as the SSE code below, but on Apple M2 Ultra this seems to be faster.
+   * Possibly due to some runtime checks in _mm_round_ps which do not get properly inlined. */
+  const float32x4_t iquot = vrndq_f32(a / b);
+  return float4(vsubq_f32(a, vmulq_f32(iquot, vdupq_n_f32(b))));
+#  elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
+  const __m128 iquot = _mm_round_ps(a / b, _MM_FROUND_TRUNC);
+  return float4(_mm_sub_ps(a, _mm_mul_ps(iquot, _mm_set1_ps(b))));
+#  else
   return make_float4(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b), fmodf(a.w, b));
+#  endif
 }
 
 ccl_device_inline float4 floor(const float4 a)
diff --git a/intern/cycles/util/types_float2.h b/intern/cycles/util/types_float2.h
index 02fef4e2104..e9912b56e0e 100644
--- a/intern/cycles/util/types_float2.h
+++ b/intern/cycles/util/types_float2.h
@@ -41,6 +41,11 @@ ccl_device_inline float2 make_float2(const float x, const float y)
 }
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
+ccl_device_inline float2 make_float2(const float f)
+{
+  return {f, f};
+}
+
 ccl_device_inline void print_float2(const ccl_private char *label, const float2 a)
 {
 #ifdef __KERNEL_PRINTF__
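
For reference, a minimal standalone sketch of the truncation-based remainder documented above, using plain scalars and the standard library instead of the Cycles float3/float4 SIMD paths. The helper name fmod_by_trunc is hypothetical and not part of the patch; it only illustrates the identity fmod(a, b) == a - trunc(a / b) * b for the sign combinations exercised by the new tests.

/* Sketch only: scalar model of the identity used by the Neon/SSE branches in the patch. */
#include <cassert>
#include <cmath>

static float fmod_by_trunc(const float a, const float b)
{
  const float iquot = std::trunc(a / b); /* a / b with the fractional part truncated. */
  return a - iquot * b;                  /* Same sign as a, magnitude below |b|. */
}

int main()
{
  /* Agrees with std::fmod for positive and negative a and b. */
  assert(std::fabs(fmod_by_trunc(5.1f, 3.0f) - std::fmod(5.1f, 3.0f)) < 1e-6f);
  assert(std::fabs(fmod_by_trunc(-5.1f, 3.0f) - std::fmod(-5.1f, 3.0f)) < 1e-6f);
  assert(std::fabs(fmod_by_trunc(5.1f, -3.0f) - std::fmod(5.1f, -3.0f)) < 1e-6f);
  return 0;
}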