From 30a22b92caf588e38bbe5fd9656f27a5424c5765 Mon Sep 17 00:00:00 2001
From: Thomas Dinges <thomas@blender.org>
Date: Fri, 9 Feb 2024 17:25:58 +0100
Subject: [PATCH] Cycles: Rename SSE4.1 kernel to SSE4.2

This commit updates all defines, compiler flags and cleans up some code for unused CPU capabilities.

There should be no functional change, unless it's run on a CPU that supports sse41 but not sse42. It will fallback to the SSE2 kernel in this case.

In preparation for the new SSE4.2 minimum in Blender 4.2.

Pull Request: https://projects.blender.org/blender/blender/pulls/118043
---
 intern/cycles/CMakeLists.txt                  | 14 ++++++-------
 intern/cycles/blender/addon/properties.py     |  2 +-
 intern/cycles/blender/addon/ui.py             |  2 +-
 intern/cycles/blender/python.cpp              |  2 +-
 intern/cycles/device/cpu/device.cpp           |  2 +-
 intern/cycles/device/cpu/kernel.cpp           |  2 +-
 intern/cycles/device/cpu/kernel_function.h    | 16 +++++++--------
 intern/cycles/kernel/CMakeLists.txt           |  4 ++--
 intern/cycles/kernel/device/cpu/kernel.cpp    |  4 ++--
 intern/cycles/kernel/device/cpu/kernel.h      |  2 +-
 .../cycles/kernel/device/cpu/kernel_avx2.cpp  |  2 +-
 .../{kernel_sse41.cpp => kernel_sse42.cpp}    |  8 ++++----
 intern/cycles/util/CMakeLists.txt             |  4 ++--
 intern/cycles/util/debug.cpp                  |  2 +-
 intern/cycles/util/debug.h                    |  8 ++++----
 intern/cycles/util/guiding.h                  |  2 +-
 intern/cycles/util/math_fast.h                |  2 +-
 intern/cycles/util/math_float3.h              |  8 ++++----
 intern/cycles/util/math_float4.h              |  4 ++--
 intern/cycles/util/math_int3.h                |  4 ++--
 intern/cycles/util/math_int4.h                |  4 ++--
 intern/cycles/util/math_intersect.h           |  4 ++--
 intern/cycles/util/openimagedenoise.h         |  2 +-
 intern/cycles/util/optimization.h             |  8 ++++----
 intern/cycles/util/simd.h                     | 10 +++++-----
 intern/cycles/util/system.cpp                 | 20 ++++++++-----------
 intern/cycles/util/system.h                   |  2 +-
 intern/cycles/util/transform.h                |  6 +++---
 intern/cycles/util/transform_inverse.h        |  2 +-
 ...ransform_sse41.cpp => transform_sse42.cpp} |  2 +-
 30 files changed, 75 insertions(+), 79 deletions(-)
 rename intern/cycles/kernel/device/cpu/{kernel_sse41.cpp => kernel_sse42.cpp} (81%)
 rename intern/cycles/util/{transform_sse41.cpp => transform_sse42.cpp} (77%)

diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 080d2ef79dc..bee0fe16637 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -96,11 +96,11 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
   # there is no /arch:SSE3, but intrinsics are available anyway
   if(CMAKE_CL_64)
     set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
-    set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+    set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
     set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
   else()
     set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
-    set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+    set(CYCLES_SSE42_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
     set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
   endif()
 
@@ -133,9 +133,9 @@ elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
     endif()
 
     set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2")
-    set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3 -msse4.1")
+    set(CYCLES_SSE42_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3 -msse4.1 -msse4.2")
     if(CXX_HAS_AVX2)
-      set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
+      set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
     endif()
   endif()
 
@@ -147,7 +147,7 @@ elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel")
 
   if(CXX_HAS_SSE)
     set(CYCLES_SSE2_KERNEL_FLAGS "/QxSSE2")
-    set(CYCLES_SSE41_KERNEL_FLAGS "/QxSSE4.1")
+    set(CYCLES_SSE42_KERNEL_FLAGS "/QxSSE4.2")
 
     if(CXX_HAS_AVX2)
       set(CYCLES_AVX2_KERNEL_FLAGS "/QxCORE-AVX2")
@@ -172,7 +172,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
       set(CYCLES_SSE2_KERNEL_FLAGS "-xsse2")
     endif()
 
-    set(CYCLES_SSE41_KERNEL_FLAGS "-xsse4.1")
+    set(CYCLES_SSE42_KERNEL_FLAGS "-xsse4.2")
 
     if(CXX_HAS_AVX2)
       set(CYCLES_AVX2_KERNEL_FLAGS "-xcore-avx2")
@@ -183,7 +183,7 @@ endif()
 if(CXX_HAS_SSE)
   add_definitions(
     -DWITH_KERNEL_SSE2
-    -DWITH_KERNEL_SSE41
+    -DWITH_KERNEL_SSE42
   )
 endif()
 
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 5fd821a9a29..086f30dcf24 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -969,7 +969,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         return _cycles.debug_flags_update(scene)
 
     debug_use_cpu_avx2: BoolProperty(name="AVX2", default=True)
-    debug_use_cpu_sse41: BoolProperty(name="SSE41", default=True)
+    debug_use_cpu_sse42: BoolProperty(name="SSE42", default=True)
     debug_use_cpu_sse2: BoolProperty(name="SSE2", default=True)
     debug_bvh_layout: EnumProperty(
         name="BVH Layout",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index cd0b8ae2bd9..258a463e889 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -2235,7 +2235,7 @@ class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel):
 
         row = col.row(align=True)
         row.prop(cscene, "debug_use_cpu_sse2", toggle=True)
-        row.prop(cscene, "debug_use_cpu_sse41", toggle=True)
+        row.prop(cscene, "debug_use_cpu_sse42", toggle=True)
         row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
         col.prop(cscene, "debug_bvh_layout", text="BVH")
 
diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp
index 6bff3ee7aba..a0441c2e2c8 100644
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -65,7 +65,7 @@ static void debug_flags_sync_from_scene(BL::Scene b_scene)
   PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
   /* Synchronize CPU flags. */
   flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2");
-  flags.cpu.sse41 = get_boolean(cscene, "debug_use_cpu_sse41");
+  flags.cpu.sse42 = get_boolean(cscene, "debug_use_cpu_sse42");
   flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
   flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout");
   /* Synchronize CUDA flags. */
diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp
index 5d0959d5e6d..77d7f6cc2a2 100644
--- a/intern/cycles/device/cpu/device.cpp
+++ b/intern/cycles/device/cpu/device.cpp
@@ -47,7 +47,7 @@ string device_cpu_capabilities()
 {
   string capabilities = "";
   capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
-  capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
+  capabilities += system_cpu_support_sse42() ? "SSE42 " : "";
   capabilities += system_cpu_support_avx2() ? "AVX2" : "";
   if (capabilities[capabilities.size() - 1] == ' ') {
     capabilities.resize(capabilities.size() - 1);
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
index 8718820a0c8..1a597ee3cef 100644
--- a/intern/cycles/device/cpu/kernel.cpp
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN
 
 #define KERNEL_FUNCTIONS(name) \
   KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
-      KERNEL_NAME_EVAL(cpu_sse41, name), KERNEL_NAME_EVAL(cpu_avx2, name)
+      KERNEL_NAME_EVAL(cpu_sse42, name), KERNEL_NAME_EVAL(cpu_avx2, name)
 
 #define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
 #define REGISTER_KERNEL_FILM_CONVERT(name) \
diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h
index 899849674b3..4ba7cfda928 100644
--- a/intern/cycles/device/cpu/kernel_function.h
+++ b/intern/cycles/device/cpu/kernel_function.h
@@ -13,15 +13,15 @@ CCL_NAMESPACE_BEGIN
  *
  * Provides a function-call-like API which gets routed to the most suitable implementation.
  *
- * For example, on a computer which only has SSE4.1 the kernel_sse41 will be used. */
+ * For example, on a computer which only has SSE4.2 the kernel_sse42 will be used. */
 template<typename FunctionType> class CPUKernelFunction {
  public:
   CPUKernelFunction(FunctionType kernel_default,
                     FunctionType kernel_sse2,
-                    FunctionType kernel_sse41,
+                    FunctionType kernel_sse42,
                     FunctionType kernel_avx2)
   {
-    kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse2, kernel_sse41, kernel_avx2);
+    kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse2, kernel_sse42, kernel_avx2);
   }
 
   template<typename... Args> inline auto operator()(Args... args) const
@@ -56,12 +56,12 @@ template<typename FunctionType> class CPUKernelFunction {
 
   KernelInfo get_best_kernel_info(FunctionType kernel_default,
                                   FunctionType kernel_sse2,
-                                  FunctionType kernel_sse41,
+                                  FunctionType kernel_sse42,
                                   FunctionType kernel_avx2)
   {
     /* Silence warnings about unused variables when compiling without some architectures. */
     (void)kernel_sse2;
-    (void)kernel_sse41;
+    (void)kernel_sse42;
     (void)kernel_avx2;
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
@@ -70,9 +70,9 @@ template<typename FunctionType> class CPUKernelFunction {
     }
 #endif
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-    if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
-      return KernelInfo("SSE4.1", kernel_sse41);
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
+    if (DebugFlags().cpu.has_sse42() && system_cpu_support_sse42()) {
+      return KernelInfo("SSE4.2", kernel_sse42);
     }
 #endif
 
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 9c91c8fd9e3..3b53e3b9b0e 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -15,7 +15,7 @@ set(INC_SYS
 set(SRC_KERNEL_DEVICE_CPU
   device/cpu/kernel.cpp
   device/cpu/kernel_sse2.cpp
-  device/cpu/kernel_sse41.cpp
+  device/cpu/kernel_sse42.cpp
   device/cpu/kernel_avx2.cpp
 )
 
@@ -1165,7 +1165,7 @@ set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CY
 
 if(CXX_HAS_SSE)
   set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-  set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+  set_source_files_properties(device/cpu/kernel_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX2)
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index a90d351c7c9..cf42b8d6cda 100644
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -27,8 +27,8 @@
 #  ifdef __SSSE3__
 #    define __KERNEL_SSSE3__
 #  endif
-#  ifdef __SSE4_1__
-#    define __KERNEL_SSE41__
+#  ifdef __SSE4_2__
+#    define __KERNEL_SSE42__
 #  endif
 #  ifdef __AVX__
 #    ifndef __KERNEL_SSE__
diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
index 63ea39c17bc..902e2d341a9 100644
--- a/intern/cycles/kernel/device/cpu/kernel.h
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -36,7 +36,7 @@ void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem
 #define KERNEL_ARCH cpu_sse2
 #include "kernel/device/cpu/kernel_arch.h"
 
-#define KERNEL_ARCH cpu_sse41
+#define KERNEL_ARCH cpu_sse42
 #include "kernel/device/cpu/kernel_arch.h"
 
 #define KERNEL_ARCH cpu_avx2
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
index 8015fe23968..097601e1950 100644
--- a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -17,7 +17,7 @@
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
 #    define __KERNEL_SSSE3__
-#    define __KERNEL_SSE41__
+#    define __KERNEL_SSE42__
 #    define __KERNEL_AVX__
 #    define __KERNEL_AVX2__
 #  endif
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse42.cpp
similarity index 81%
rename from intern/cycles/kernel/device/cpu/kernel_sse41.cpp
rename to intern/cycles/kernel/device/cpu/kernel_sse42.cpp
index 20df03dd9d2..6215df32f12 100644
--- a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse42.cpp
@@ -8,7 +8,7 @@
 
 #include "util/optimization.h"
 
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
 #  define KERNEL_STUB
 #else
 /* SSE optimization disabled for now on 32 bit, see bug #36316. */
@@ -16,10 +16,10 @@
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
 #    define __KERNEL_SSSE3__
-#    define __KERNEL_SSE41__
+#    define __KERNEL_SSE42__
 #  endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE42 */
 
 #include "kernel/device/cpu/kernel.h"
-#define KERNEL_ARCH cpu_sse41
+#define KERNEL_ARCH cpu_sse42
 #include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index fca9b02b396..c8c8373436b 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -27,7 +27,7 @@ set(SRC
   time.cpp
   transform.cpp
   transform_avx2.cpp
-  transform_sse41.cpp
+  transform_sse42.cpp
   windows.cpp
 )
 
@@ -137,7 +137,7 @@ set(SRC_HEADERS
 )
 
 if(CXX_HAS_SSE)
-  set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+  set_source_files_properties(transform_sse42.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE42_KERNEL_FLAGS}")
 endif()
 if(CXX_HAS_AVX2)
   set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp
index de536690c11..903a1d4cf23 100644
--- a/intern/cycles/util/debug.cpp
+++ b/intern/cycles/util/debug.cpp
@@ -30,7 +30,7 @@ void DebugFlags::CPU::reset()
   } while (0)
 
   CHECK_CPU_FLAGS(avx2, "CYCLES_CPU_NO_AVX2");
-  CHECK_CPU_FLAGS(sse41, "CYCLES_CPU_NO_SSE41");
+  CHECK_CPU_FLAGS(sse42, "CYCLES_CPU_NO_SSE42");
   CHECK_CPU_FLAGS(sse2, "CYCLES_CPU_NO_SSE2");
 
 #undef STRINGIFY
diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h
index 71f7165cd56..6759256c1a4 100644
--- a/intern/cycles/util/debug.h
+++ b/intern/cycles/util/debug.h
@@ -26,7 +26,7 @@ class DebugFlags {
 
     /* Flags describing which instructions sets are allowed for use. */
     bool avx2 = true;
-    bool sse41 = true;
+    bool sse42 = true;
     bool sse2 = true;
 
     /* Check functions to see whether instructions up to the given one
@@ -34,11 +34,11 @@ class DebugFlags {
      */
     bool has_avx2()
     {
-      return has_sse41() && avx2;
+      return has_sse42() && avx2;
     }
-    bool has_sse41()
+    bool has_sse42()
     {
-      return has_sse2() && sse41;
+      return has_sse2() && sse42;
     }
     bool has_sse2()
     {
diff --git a/intern/cycles/util/guiding.h b/intern/cycles/util/guiding.h
index c4bd3992251..1c5dbb24814 100644
--- a/intern/cycles/util/guiding.h
+++ b/intern/cycles/util/guiding.h
@@ -22,7 +22,7 @@ static int guiding_device_type()
   if (system_cpu_support_avx2()) {
     return 8;
   }
-  if (system_cpu_support_sse41()) {
+  if (system_cpu_support_sse42()) {
     return 4;
   }
   return 0;
diff --git a/intern/cycles/util/math_fast.h b/intern/cycles/util/math_fast.h
index 059c876fd8c..cec5e5391fb 100644
--- a/intern/cycles/util/math_fast.h
+++ b/intern/cycles/util/math_fast.h
@@ -62,7 +62,7 @@ ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
 ccl_device_inline int fast_rint(float x)
 {
   /* used by sin/cos/tan range reduction. */
-#ifdef __KERNEL_SSE41__
+#ifdef __KERNEL_SSE42__
   /* Single `roundps` instruction on SSE4.1+ for gcc/clang but not MSVC 19.35:
    * float_to_int(rintf(x)); so we use the equivalent intrinsics. */
   __m128 vec = _mm_set_ss(x);
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index eaf13cf779f..38f86de6054 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -200,7 +200,7 @@ ccl_device_inline bool operator!=(const float3 a, const float3 b)
 
 ccl_device_inline float dot(const float3 a, const float3 b)
 {
-#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
   return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
 #  else
   return a.x * b.x + a.y * b.y + a.z * b.z;
@@ -211,7 +211,7 @@ ccl_device_inline float dot(const float3 a, const float3 b)
 
 ccl_device_inline float dot_xy(const float3 a, const float3 b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
   return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
 #else
   return a.x * b.x + a.y * b.y;
@@ -220,7 +220,7 @@ ccl_device_inline float dot_xy(const float3 a, const float3 b)
 
 ccl_device_inline float len(const float3 a)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
   return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
 #else
   return sqrtf(dot(a, a));
@@ -264,7 +264,7 @@ ccl_device_inline float3 cross(const float3 a, const float3 b)
 
 ccl_device_inline float3 normalize(const float3 a)
 {
-#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
   __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
   return float3(_mm_div_ps(a.m128, norm));
 #  else
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index 235bf3fa373..369b5fef3c2 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -364,7 +364,7 @@ ccl_device_inline float reduce_max(const float4 a)
 #if !defined(__KERNEL_METAL__)
 ccl_device_inline float dot(const float4 a, const float4 b)
 {
-#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
 #    if defined(__KERNEL_NEON__)
   __m128 t = vmulq_f32(a, b);
   return vaddvq_f32(t);
@@ -534,7 +534,7 @@ ccl_device_inline bool isequal(const float4 a, const float4 b)
 ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
-#    ifdef __KERNEL_SSE41__
+#    ifdef __KERNEL_SSE42__
   return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
 #    else
   return float4(
diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h
index ae0b81f88a8..dfc30ead886 100644
--- a/intern/cycles/util/math_int3.h
+++ b/intern/cycles/util/math_int3.h
@@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN
 #if !defined(__KERNEL_METAL__)
 ccl_device_inline int3 min(int3 a, int3 b)
 {
-#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
   return int3(_mm_min_epi32(a.m128, b.m128));
 #  else
   return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
@@ -23,7 +23,7 @@ ccl_device_inline int3 min(int3 a, int3 b)
 
 ccl_device_inline int3 max(int3 a, int3 b)
 {
-#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
   return int3(_mm_max_epi32(a.m128, b.m128));
 #  else
   return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
diff --git a/intern/cycles/util/math_int4.h b/intern/cycles/util/math_int4.h
index ce5eaf40a52..8929290e9a4 100644
--- a/intern/cycles/util/math_int4.h
+++ b/intern/cycles/util/math_int4.h
@@ -203,7 +203,7 @@ ccl_device_forceinline const int4 srl(const int4 a, const int32_t b)
 
 ccl_device_inline int4 min(int4 a, int4 b)
 {
-#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
   return int4(_mm_min_epi32(a.m128, b.m128));
 #  else
   return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
@@ -212,7 +212,7 @@ ccl_device_inline int4 min(int4 a, int4 b)
 
 ccl_device_inline int4 max(int4 a, int4 b)
 {
-#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
   return int4(_mm_max_epi32(a.m128, b.m128));
 #  else
   return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index b5eb209ec82..f662ed4f394 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -136,7 +136,7 @@ ccl_device_forceinline float ray_triangle_rcp(const float x)
 
 ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
   return madd(make_float4(a.x),
               make_float4(b.x),
               madd(make_float4(a.y), make_float4(b.y), make_float4(a.z) * make_float4(b.z)))[0];
@@ -147,7 +147,7 @@ ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
 
 ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE42__) && defined(__KERNEL_SSE__)
   return make_float3(
       msub(make_float4(a.y), make_float4(b.z), make_float4(a.z) * make_float4(b.y))[0],
       msub(make_float4(a.z), make_float4(b.x), make_float4(a.x) * make_float4(b.z))[0],
diff --git a/intern/cycles/util/openimagedenoise.h b/intern/cycles/util/openimagedenoise.h
index be4ca39a3c4..da3952b7257 100644
--- a/intern/cycles/util/openimagedenoise.h
+++ b/intern/cycles/util/openimagedenoise.h
@@ -20,7 +20,7 @@ static inline bool openimagedenoise_supported()
   /* Always supported through Accelerate framework BNNS. */
   return true;
 #  else
-  return system_cpu_support_sse41();
+  return system_cpu_support_sse42();
 #  endif
 #else
   return false;
diff --git a/intern/cycles/util/optimization.h b/intern/cycles/util/optimization.h
index d4cdc4c1d6e..dfa19f11352 100644
--- a/intern/cycles/util/optimization.h
+++ b/intern/cycles/util/optimization.h
@@ -21,15 +21,15 @@
 
 /* x86-64
  *
- * Compile a regular (includes SSE2), SSE3, SSE 4.1, AVX and AVX2 kernel. */
+ * Compile a regular (includes SSE2), SSE 4.2 and AVX2 kernel. */
 
 #  elif defined(__x86_64__) || defined(_M_X64)
 
 /* SSE2 is always available on x86-64 CPUs, so auto enable */
 #    define __KERNEL_SSE2__
 /* no SSE2 kernel on x86-64, part of regular kernel */
-#    ifdef WITH_KERNEL_SSE41
-#      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#    ifdef WITH_KERNEL_SSE42
+#      define WITH_CYCLES_OPTIMIZED_KERNEL_SSE42
 #    endif
 #    ifdef WITH_KERNEL_AVX2
 #      define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
@@ -47,7 +47,7 @@
 #    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
-#    define __KERNEL_SSE41__
+#    define __KERNEL_SSE42__
 
 #  endif
 
diff --git a/intern/cycles/util/simd.h b/intern/cycles/util/simd.h
index e612475d765..8c1c46b89dc 100644
--- a/intern/cycles/util/simd.h
+++ b/intern/cycles/util/simd.h
@@ -463,10 +463,10 @@ __forceinline uint64_t bitscan(uint64_t value)
  * implementations. */
 #ifdef __KERNEL_SSE2__
 
-/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
- * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
+/* Test __KERNEL_SSE42__ for MSVC which does not define __SSE4_2__, and test
+ * __SSE4_1__ and __SSE4_2__ to avoid OpenImageIO conflicts with our emulation macros on other
  * platforms when compiling code outside the kernel. */
-#  if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
+#  if !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__))
 
 /* Emulation of SSE4 functions with SSE2 */
 
@@ -573,7 +573,7 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
   return value;
 }
 
-#  endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
+#  endif /* !(defined(__KERNEL_SSE42__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
 /* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
  * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
@@ -586,7 +586,7 @@ __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
 
 /* quiet unused define warnings */
 #if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
-    defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+    defined(__KERNEL_SSE42__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
 /* do nothing */
 #endif
 
diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp
index 0af0d1fad4e..afd3bef1961 100644
--- a/intern/cycles/util/system.cpp
+++ b/intern/cycles/util/system.cpp
@@ -130,9 +130,7 @@ int system_cpu_bits()
 
 struct CPUCapabilities {
   bool sse2;
-  bool sse3;
-  bool sse41;
-  bool avx;
+  bool sse42;
   bool avx2;
 };
 
@@ -155,7 +153,7 @@ static CPUCapabilities &system_cpu_capabilities()
 
       const bool ssse3 = (result[2] & ((int)1 << 9)) != 0;
       const bool sse41 = (result[2] & ((int)1 << 19)) != 0;
-      // const bool sse42 = (result[2] & ((int)1 << 20)) != 0;
+      const bool sse42 = (result[2] & ((int)1 << 20)) != 0;
 
       const bool fma3 = (result[2] & ((int)1 << 12)) != 0;
       const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
@@ -163,8 +161,7 @@ static CPUCapabilities &system_cpu_capabilities()
 
       /* Simplify to combined capabilities for which we specialize kernels. */
       caps.sse2 = sse && sse2;
-      caps.sse3 = sse && sse2 && sse3 && ssse3;
-      caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41;
+      caps.sse42 = sse && sse2 && sse3 && ssse3 && sse41 && sse42;
 
       if (os_uses_xsave_xrestore && cpu_avx_support) {
         // Check if the OS will save the YMM registers
@@ -187,9 +184,8 @@ static CPUCapabilities &system_cpu_capabilities()
         bool bmi2 = (result[1] & ((int)1 << 8)) != 0;
         bool avx2 = (result[1] & ((int)1 << 5)) != 0;
 
-        caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx;
-        caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 &&
-                    bmi2;
+        caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && sse42 && avx && f16c && avx2 &&
+                    fma3 && bmi1 && bmi2;
       }
     }
 
@@ -205,10 +201,10 @@ bool system_cpu_support_sse2()
   return caps.sse2;
 }
 
-bool system_cpu_support_sse41()
+bool system_cpu_support_sse42()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse41;
+  return caps.sse42;
 }
 
 bool system_cpu_support_avx2()
@@ -223,7 +219,7 @@ bool system_cpu_support_sse2()
   return false;
 }
 
-bool system_cpu_support_sse41()
+bool system_cpu_support_sse42()
 {
   return false;
 }
diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h
index 187ab5cd3fc..f14c0551056 100644
--- a/intern/cycles/util/system.h
+++ b/intern/cycles/util/system.h
@@ -18,7 +18,7 @@ int system_console_width();
 std::string system_cpu_brand_string();
 int system_cpu_bits();
 bool system_cpu_support_sse2();
-bool system_cpu_support_sse41();
+bool system_cpu_support_sse42();
 bool system_cpu_support_avx2();
 
 size_t system_physical_ram();
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index af12ac3d0ea..208c68dc5a1 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -405,7 +405,7 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
 }
 
 #ifndef __KERNEL_GPU__
-void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
+void transform_inverse_cpu_sse42(const Transform &tfm, Transform &itfm);
 void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
 #endif
 
@@ -418,9 +418,9 @@ ccl_device_inline Transform transform_inverse(const Transform tfm)
     transform_inverse_cpu_avx2(tfm, itfm);
     return itfm;
   }
-  else if (system_cpu_support_sse41()) {
+  else if (system_cpu_support_sse42()) {
     Transform itfm;
-    transform_inverse_cpu_sse41(tfm, itfm);
+    transform_inverse_cpu_sse42(tfm, itfm);
     return itfm;
   }
 #endif
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
index 32a62a27620..fe43e88e49a 100644
--- a/intern/cycles/util/transform_inverse.h
+++ b/intern/cycles/util/transform_inverse.h
@@ -30,7 +30,7 @@ ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const flo
 
 ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_)
 {
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
   const __m128 a = (const __m128 &)a_;
   const __m128 b = (const __m128 &)b_;
   return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse42.cpp
similarity index 77%
rename from intern/cycles/util/transform_sse41.cpp
rename to intern/cycles/util/transform_sse42.cpp
index 8e64cc9cd71..b47392e6f34 100644
--- a/intern/cycles/util/transform_sse41.cpp
+++ b/intern/cycles/util/transform_sse42.cpp
@@ -6,7 +6,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
+void transform_inverse_cpu_sse42(const Transform &tfm, Transform &itfm)
 {
   itfm = transform_inverse_impl(tfm);
 }