diff --git a/intern/cycles/kernel/svm/noise.h b/intern/cycles/kernel/svm/noise.h
index 6b772874eda..e38e330bacc 100644
--- a/intern/cycles/kernel/svm/noise.h
+++ b/intern/cycles/kernel/svm/noise.h
@@ -706,8 +706,9 @@ ccl_device_inline float noise_1d(const float p)
 
 ccl_device_inline float snoise_2d(float2 p)
 {
-  const float2 precision_correction = 0.5f * make_float2(float(fabsf(p.x) >= 1000000.0f),
-                                                          float(fabsf(p.y) >= 1000000.0f));
+  const float2 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float2(1000000.0f), one_float2());
+
   /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
    * representation issues. This causes discontinuities every 100000.0f, however at such scales
    * this usually shouldn't be noticeable. */
@@ -723,9 +724,9 @@ ccl_device_inline float noise_2d(const float2 p)
 
 ccl_device_inline float snoise_3d(float3 p)
 {
-  const float3 precision_correction = 0.5f * make_float3(float(fabsf(p.x) >= 1000000.0f),
-                                                          float(fabsf(p.y) >= 1000000.0f),
-                                                          float(fabsf(p.z) >= 1000000.0f));
+  const float3 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float3(1000000.0f), one_float3());
+
   /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
    * representation issues. This causes discontinuities every 100000.0f, however at such scales
    * this usually shouldn't be noticeable. */
@@ -741,10 +742,9 @@ ccl_device_inline float noise_3d(const float3 p)
 
 ccl_device_inline float snoise_4d(float4 p)
 {
-  const float4 precision_correction = 0.5f * make_float4(float(fabsf(p.x) >= 1000000.0f),
-                                                          float(fabsf(p.y) >= 1000000.0f),
-                                                          float(fabsf(p.z) >= 1000000.0f),
-                                                          float(fabsf(p.w) >= 1000000.0f));
+  const float4 precision_correction = 0.5f *
+                                      mask(fabs(p) >= make_float4(1000000.0f), one_float4());
+
   /* Repeat Perlin noise texture every 100000.0f on each axis to prevent floating point
    * representation issues. This causes discontinuities every 100000.0f, however at such scales
    * this usually shouldn't be noticeable. */
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 0d98480defd..c83a3e9971a 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -37,6 +37,8 @@ set(SRC
   util_ies_test.cpp
   util_math_test.cpp
   util_math_fast_test.cpp
+  util_math_float3_test.cpp
+  util_math_float4_test.cpp
   util_md5_test.cpp
   util_path_test.cpp
   util_string_test.cpp
diff --git a/intern/cycles/test/util_math_float3_test.cpp b/intern/cycles/test/util_math_float3_test.cpp
new file mode 100644
index 00000000000..dabac37dc0b
--- /dev/null
+++ b/intern/cycles/test/util_math_float3_test.cpp
@@ -0,0 +1,67 @@
+/* SPDX-FileCopyrightText: 2011-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+/* Note: These fixtures test default micro-architecture optimization defined in the
+ * util/optimization.h. */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Float3Test : public ::testing::Test {
+  void SetUp() override
+  {
+    /* The micro-architecture check is not needed here, but use it here as a demonstration of how
+     * it can be implemented in a clear way. */
+    // GTEST_SKIP() << "Test skipped due to uarch capability";
+  }
+};
+
+TEST_F(Float3Test, fmod)
+{
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1.0f);
+    EXPECT_NEAR(c.x, 0.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.4f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1.2f);
+    EXPECT_NEAR(c.x, 0.0f, 1e-6f);
+    EXPECT_NEAR(c.y, 1.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.0f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(1.2f, 2.3f, 3.4f), 1000000.0f);
+    EXPECT_NEAR(c.x, 1.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 2.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 3.4f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(1999999.2f, 2000000.3f, 2000001.4f), 1000000.0f);
+    EXPECT_NEAR(c.x, 999999.25f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.25f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.375f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(5.1f, -5.1f, 0.0f), 3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1, 1e-6f);
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+  }
+
+  {
+    const float3 c = fmod(make_float3(5.1f, -5.1f, 0.0f), -3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1, 1e-6f);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/util_math_float4_test.cpp b/intern/cycles/test/util_math_float4_test.cpp
new file mode 100644
index 00000000000..6b4aa80657d
--- /dev/null
+++ b/intern/cycles/test/util_math_float4_test.cpp
@@ -0,0 +1,69 @@
+/* SPDX-FileCopyrightText: 2011-2025 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+/* Note: These fixtures test default micro-architecture optimization defined in the
+ * util/optimization.h. */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Float4Test : public ::testing::Test {
+  void SetUp() override
+  {
+    /* The micro-architecture check is not needed here, but use it here as a demonstration of how
+     * it can be implemented in a clear way. */
+    // GTEST_SKIP() << "Test skipped due to uarch capability";
+  }
+};
+
+TEST_F(Float4Test, fmod)
+{
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 4.5f), 1.0f);
+    EXPECT_NEAR(c.x, 0.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 0.4f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.5f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 0.9f), 1.2f);
+    EXPECT_NEAR(c.x, 0.0f, 1e-6f);
+    EXPECT_NEAR(c.y, 1.1f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.0f, 1e-6f);
+    EXPECT_NEAR(c.w, 0.9f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(1.2f, 2.3f, 3.4f, 0.0f), 1000000.0f);
+    EXPECT_NEAR(c.x, 1.2f, 1e-6f);
+    EXPECT_NEAR(c.y, 2.3f, 1e-6f);
+    EXPECT_NEAR(c.z, 3.4f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(1999999.2f, 2000000.3f, 2000001.4f, 0.0f), 1000000.0f);
+    EXPECT_NEAR(c.x, 999999.25f, 1e-6f);
+    EXPECT_NEAR(c.y, 0.25f, 1e-6f);
+    EXPECT_NEAR(c.z, 1.375f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(5.1f, -5.1f, 0.0f, 0.0f), 3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1, 1e-6f);
+    EXPECT_NEAR(c.z, 0.0f, 1e-6f);
+  }
+
+  {
+    const float4 c = fmod(make_float4(5.1f, -5.1f, 0.0f, 0.0f), -3.0f);
+    EXPECT_NEAR(c.x, 2.1f, 1e-6f);
+    EXPECT_NEAR(c.y, -2.1, 1e-6f);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h
index 04405e3e5c9..3aa876535e6 100644
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -118,6 +118,11 @@ ccl_device_inline bool operator!=(const float2 a, const float2 b)
   return !(a == b);
 }
 
+ccl_device_inline int2 operator>=(const float2 a, const float2 b)
+{
+  return make_int2(a.x >= b.x, a.y >= b.y);
+}
+
 ccl_device_inline bool is_zero(const float2 a)
 {
   return (a.x == 0.0f && a.y == 0.0f);
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 04b64386ecc..ab75c560ae8 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -207,6 +207,15 @@ ccl_device_inline bool operator!=(const float3 a, const float3 b)
   return !(a == b);
 }
 
+ccl_device_inline int3 operator>=(const float3 a, const float3 b)
+{
+#  ifdef __KERNEL_SSE__
+  return int3(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
+#  else
+  return make_int3(a.x >= b.x, a.y >= b.y, a.z >= b.z);
+#  endif
+}
+
 ccl_device_inline float dot(const float3 a, const float3 b)
 {
 #  if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
@@ -318,9 +327,24 @@ ccl_device_inline float3 fabs(const float3 a)
 #  endif
 }
 
+/* The floating-point remainder of the division operation a / b calculated by this function is
+ * exactly the value a - iquot * b, where iquot is a / b with its fractional part truncated.
+ *
+ * The returned value has the same sign as a and is less than b in magnitude. */
 ccl_device_inline float3 fmod(const float3 a, const float b)
 {
+#  if defined(__KERNEL_NEON__)
+  /* Use native Neon instructions.
+   * The logic is the same as the SSE code below, but on Apple M2 Ultra this seems to be faster.
+   * Possibly due to some runtime checks in _mm_round_ps which do not get properly inlined. */
+  const float32x4_t iquot = vrndq_f32(a / b);
+  return float3(vsubq_f32(a, vmulq_f32(iquot, vdupq_n_f32(b))));
+#  elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
+  const __m128 iquot = _mm_round_ps(a / b, _MM_FROUND_TRUNC);
+  return float3(_mm_sub_ps(a, _mm_mul_ps(iquot, _mm_set1_ps(b))));
+#  else
   return make_float3(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b));
+#  endif
 }
 
 ccl_device_inline float3 sqrt(const float3 a)
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index 0939a620dd5..ab36a16219d 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -463,9 +463,24 @@ ccl_device_inline float4 fabs(const float4 a)
 #  endif
 }
 
+/* The floating-point remainder of the division operation a / b calculated by this function is
+ * exactly the value a - iquot * b, where iquot is a / b with its fractional part truncated.
+ *
+ * The returned value has the same sign as a and is less than b in magnitude. */
 ccl_device_inline float4 fmod(const float4 a, const float b)
 {
+#  if defined(__KERNEL_NEON__)
+  /* Use native Neon instructions.
+   * The logic is the same as the SSE code below, but on Apple M2 Ultra this seems to be faster.
+   * Possibly due to some runtime checks in _mm_round_ps which do not get properly inlined. */
+  const float32x4_t iquot = vrndq_f32(a / b);
+  return float4(vsubq_f32(a, vmulq_f32(iquot, vdupq_n_f32(b))));
+#  elif defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
+  const __m128 iquot = _mm_round_ps(a / b, _MM_FROUND_TRUNC);
+  return float4(_mm_sub_ps(a, _mm_mul_ps(iquot, _mm_set1_ps(b))));
+#  else
   return make_float4(fmodf(a.x, b), fmodf(a.y, b), fmodf(a.z, b), fmodf(a.w, b));
+#  endif
 }
 
 ccl_device_inline float4 floor(const float4 a)
diff --git a/intern/cycles/util/types_float2.h b/intern/cycles/util/types_float2.h
index 02fef4e2104..e9912b56e0e 100644
--- a/intern/cycles/util/types_float2.h
+++ b/intern/cycles/util/types_float2.h
@@ -41,6 +41,11 @@ ccl_device_inline float2 make_float2(const float x, const float y)
 }
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */
 
+ccl_device_inline float2 make_float2(const float f)
+{
+  return {f, f};
+}
+
 ccl_device_inline void print_float2(const ccl_private char *label, const float2 a)
 {
 #ifdef __KERNEL_PRINTF__
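
For reference, a minimal standalone sketch of the truncation-based remainder documented above, using plain scalars and the standard library instead of the Cycles float3/float4 SIMD paths. The helper name fmod_by_trunc is hypothetical and not part of the patch; it only illustrates the identity fmod(a, b) == a - trunc(a / b) * b for the sign combinations exercised by the new tests.

/* Sketch only: scalar model of the identity used by the Neon/SSE branches in the patch. */
#include <cassert>
#include <cmath>

static float fmod_by_trunc(const float a, const float b)
{
  const float iquot = std::trunc(a / b); /* a / b with the fractional part truncated. */
  return a - iquot * b;                  /* Same sign as a, magnitude below |b|. */
}

int main()
{
  /* Agrees with std::fmod for positive and negative a and b. */
  assert(std::fabs(fmod_by_trunc(5.1f, 3.0f) - std::fmod(5.1f, 3.0f)) < 1e-6f);
  assert(std::fabs(fmod_by_trunc(-5.1f, 3.0f) - std::fmod(-5.1f, 3.0f)) < 1e-6f);
  assert(std::fabs(fmod_by_trunc(5.1f, -3.0f) - std::fmod(5.1f, -3.0f)) < 1e-6f);
  return 0;
}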