From a84a8a528da89677dc16551dbf545eb1891c4c40 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 11 Jan 2023 16:16:21 +0100 Subject: [PATCH] Cycles: remove SSE3 and AVX kernel optimization levels While keeping SSE2, SSE4.1 and AVX2. This does not affect hardware support, it only slightly reduces performance for some older CPUs. To reduce maintenance cost and improve compile times. Differential Revision: https://developer.blender.org/D16978 --- intern/cycles/CMakeLists.txt | 25 +----------------- intern/cycles/blender/addon/properties.py | 2 -- intern/cycles/blender/addon/ui.py | 2 -- intern/cycles/blender/python.cpp | 2 -- intern/cycles/device/cpu/device.cpp | 2 -- intern/cycles/device/cpu/kernel.cpp | 3 +-- intern/cycles/device/cpu/kernel_function.h | 21 +-------------- intern/cycles/kernel/CMakeLists.txt | 7 ----- intern/cycles/kernel/device/cpu/kernel.h | 6 ----- .../cycles/kernel/device/cpu/kernel_avx.cpp | 26 ------------------- .../cycles/kernel/device/cpu/kernel_sse3.cpp | 23 ---------------- intern/cycles/test/CMakeLists.txt | 13 ---------- intern/cycles/util/debug.cpp | 2 -- intern/cycles/util/debug.h | 14 ++-------- intern/cycles/util/optimization.h | 9 ------- intern/cycles/util/system.h | 2 -- 16 files changed, 5 insertions(+), 154 deletions(-) delete mode 100644 intern/cycles/kernel/device/cpu/kernel_avx.cpp delete mode 100644 intern/cycles/kernel/device/cpu/kernel_sse3.cpp diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 53e87fc5c3a..366d38cc94c 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -85,15 +85,11 @@ elseif(WIN32 AND MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") # there is no /arch:SSE3, but intrinsics are available anyway if(CMAKE_CL_64) set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") - set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") - set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") else() set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") - set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") - set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") endif() @@ -126,11 +122,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) endif() set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2") - set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3") - set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS} -msse4.1") - if(CXX_HAS_AVX) - set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx") - endif() + set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3 -msse4.1") if(CXX_HAS_AVX2) set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c") endif() @@ -144,13 +136,8 @@ elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel") if(CXX_HAS_SSE) set(CYCLES_SSE2_KERNEL_FLAGS "/QxSSE2") - set(CYCLES_SSE3_KERNEL_FLAGS "/QxSSSE3") set(CYCLES_SSE41_KERNEL_FLAGS "/QxSSE4.1") - if(CXX_HAS_AVX) - set(CYCLES_AVX_KERNEL_FLAGS "/arch:AVX") - endif() - if(CXX_HAS_AVX2) set(CYCLES_AVX2_KERNEL_FLAGS "/QxCORE-AVX2") endif() @@ -174,13 +161,8 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") set(CYCLES_SSE2_KERNEL_FLAGS "-xsse2") endif() - set(CYCLES_SSE3_KERNEL_FLAGS "-xssse3") set(CYCLES_SSE41_KERNEL_FLAGS "-xsse4.1") - if(CXX_HAS_AVX) - set(CYCLES_AVX_KERNEL_FLAGS "-xavx") - endif() - if(CXX_HAS_AVX2) set(CYCLES_AVX2_KERNEL_FLAGS "-xcore-avx2") endif() @@ -190,15 +172,10 @@ endif() if(CXX_HAS_SSE) add_definitions( -DWITH_KERNEL_SSE2 - -DWITH_KERNEL_SSE3 -DWITH_KERNEL_SSE41 ) endif() -if(CXX_HAS_AVX) - add_definitions(-DWITH_KERNEL_AVX) -endif() - if(CXX_HAS_AVX2) add_definitions(-DWITH_KERNEL_AVX2) endif() diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 9ec663eb258..9c1cb0a1b4a 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -951,9 +951,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): return _cycles.debug_flags_update(scene) debug_use_cpu_avx2: BoolProperty(name="AVX2", default=True) - debug_use_cpu_avx: BoolProperty(name="AVX", default=True) debug_use_cpu_sse41: BoolProperty(name="SSE41", default=True) - debug_use_cpu_sse3: BoolProperty(name="SSE3", default=True) debug_use_cpu_sse2: BoolProperty(name="SSE2", default=True) debug_bvh_layout: EnumProperty( name="BVH Layout", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 102e014297f..81f940529d1 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -2112,9 +2112,7 @@ class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel): row = col.row(align=True) row.prop(cscene, "debug_use_cpu_sse2", toggle=True) - row.prop(cscene, "debug_use_cpu_sse3", toggle=True) row.prop(cscene, "debug_use_cpu_sse41", toggle=True) - row.prop(cscene, "debug_use_cpu_avx", toggle=True) row.prop(cscene, "debug_use_cpu_avx2", toggle=True) col.prop(cscene, "debug_bvh_layout", text="BVH") diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp index 96cb204be4b..ebbdc8abf7f 100644 --- a/intern/cycles/blender/python.cpp +++ b/intern/cycles/blender/python.cpp @@ -63,9 +63,7 @@ static void debug_flags_sync_from_scene(BL::Scene b_scene) PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); /* Synchronize CPU flags. */ flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2"); - flags.cpu.avx = get_boolean(cscene, "debug_use_cpu_avx"); flags.cpu.sse41 = get_boolean(cscene, "debug_use_cpu_sse41"); - flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3"); flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2"); flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout"); /* Synchronize CUDA flags. */ diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp index 9b249063aec..580f70b25d7 100644 --- a/intern/cycles/device/cpu/device.cpp +++ b/intern/cycles/device/cpu/device.cpp @@ -45,9 +45,7 @@ string device_cpu_capabilities() { string capabilities = ""; capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; - capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; - capabilities += system_cpu_support_avx() ? "AVX " : ""; capabilities += system_cpu_support_avx2() ? "AVX2" : ""; if (capabilities[capabilities.size() - 1] == ' ') capabilities.resize(capabilities.size() - 1); diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp index 3e078129bca..4ca68e875a3 100644 --- a/intern/cycles/device/cpu/kernel.cpp +++ b/intern/cycles/device/cpu/kernel.cpp @@ -9,8 +9,7 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTIONS(name) \ KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ - KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ - KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) + KERNEL_NAME_EVAL(cpu_sse41, name), KERNEL_NAME_EVAL(cpu_avx2, name) #define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name)) #define REGISTER_KERNEL_FILM_CONVERT(name) \ diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h index 6171f582518..4875f66f8a8 100644 --- a/intern/cycles/device/cpu/kernel_function.h +++ b/intern/cycles/device/cpu/kernel_function.h @@ -17,13 +17,10 @@ template class CPUKernelFunction { public: CPUKernelFunction(FunctionType kernel_default, FunctionType kernel_sse2, - FunctionType kernel_sse3, FunctionType kernel_sse41, - FunctionType kernel_avx, FunctionType kernel_avx2) { - kernel_info_ = get_best_kernel_info( - kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2); + kernel_info_ = get_best_kernel_info(kernel_default, kernel_sse2, kernel_sse41, kernel_avx2); } template inline auto operator()(Args... args) const @@ -60,16 +57,12 @@ template class CPUKernelFunction { KernelInfo get_best_kernel_info(FunctionType kernel_default, FunctionType kernel_sse2, - FunctionType kernel_sse3, FunctionType kernel_sse41, - FunctionType kernel_avx, FunctionType kernel_avx2) { /* Silence warnings about unused variables when compiling without some architectures. */ (void)kernel_sse2; - (void)kernel_sse3; (void)kernel_sse41; - (void)kernel_avx; (void)kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 @@ -78,24 +71,12 @@ template class CPUKernelFunction { } #endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { - return KernelInfo("AVX", kernel_avx); - } -#endif - #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { return KernelInfo("SSE4.1", kernel_sse41); } #endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { - return KernelInfo("SSE3", kernel_sse3); - } -#endif - #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { return KernelInfo("SSE2", kernel_sse2); diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 5ba1b683d6b..3ae468efd1f 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -14,9 +14,7 @@ set(INC_SYS set(SRC_KERNEL_DEVICE_CPU device/cpu/kernel.cpp device/cpu/kernel_sse2.cpp - device/cpu/kernel_sse3.cpp device/cpu/kernel_sse41.cpp - device/cpu/kernel_avx.cpp device/cpu/kernel_avx2.cpp ) @@ -940,14 +938,9 @@ set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CY if(CXX_HAS_SSE) set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - set_source_files_properties(device/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() -if(CXX_HAS_AVX) - set_source_files_properties(device/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") -endif() - if(CXX_HAS_AVX2) set_source_files_properties(device/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h index 647b405140a..e43d7375eea 100644 --- a/intern/cycles/kernel/device/cpu/kernel.h +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -35,15 +35,9 @@ void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem #define KERNEL_ARCH cpu_sse2 #include "kernel/device/cpu/kernel_arch.h" -#define KERNEL_ARCH cpu_sse3 -#include "kernel/device/cpu/kernel_arch.h" - #define KERNEL_ARCH cpu_sse41 #include "kernel/device/cpu/kernel_arch.h" -#define KERNEL_ARCH cpu_avx -#include "kernel/device/cpu/kernel_arch.h" - #define KERNEL_ARCH cpu_avx2 #include "kernel/device/cpu/kernel_arch.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp deleted file mode 100644 index 872ad5ce7e2..00000000000 --- a/intern/cycles/kernel/device/cpu/kernel_avx.cpp +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2022 Blender Foundation */ - -/* Optimized CPU kernel entry points. This file is compiled with AVX - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. */ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#include "kernel/device/cpu/kernel.h" -#define KERNEL_ARCH cpu_avx -#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp deleted file mode 100644 index eb78b61a723..00000000000 --- a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2022 Blender Foundation */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. */ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ - -#include "kernel/device/cpu/kernel.h" -#define KERNEL_ARCH cpu_sse3 -#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt index 34e5a4770ea..cdf8f7db0bb 100644 --- a/intern/cycles/test/CMakeLists.txt +++ b/intern/cycles/test/CMakeLists.txt @@ -45,19 +45,6 @@ set(SRC # Disable AVX tests on macOS. Rosetta has problems running them, and other # platforms should be enough to verify AVX operations are implemented correctly. if(NOT APPLE) - if(CXX_HAS_SSE) - list(APPEND SRC - util_float8_sse2_test.cpp - ) - set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - endif() - - if(CXX_HAS_AVX) - list(APPEND SRC - util_float8_avx_test.cpp - ) - set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") - endif() if(CXX_HAS_AVX2) list(APPEND SRC util_float8_avx2_test.cpp diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp index 8210e21f951..e7cf7b545b8 100644 --- a/intern/cycles/util/debug.cpp +++ b/intern/cycles/util/debug.cpp @@ -29,9 +29,7 @@ void DebugFlags::CPU::reset() } while (0) CHECK_CPU_FLAGS(avx2, "CYCLES_CPU_NO_AVX2"); - CHECK_CPU_FLAGS(avx, "CYCLES_CPU_NO_AVX"); CHECK_CPU_FLAGS(sse41, "CYCLES_CPU_NO_SSE41"); - CHECK_CPU_FLAGS(sse3, "CYCLES_CPU_NO_SSE3"); CHECK_CPU_FLAGS(sse2, "CYCLES_CPU_NO_SSE2"); #undef STRINGIFY diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h index ab200649f59..9ee09f08581 100644 --- a/intern/cycles/util/debug.h +++ b/intern/cycles/util/debug.h @@ -26,9 +26,7 @@ class DebugFlags { /* Flags describing which instructions sets are allowed for use. */ bool avx2 = true; - bool avx = true; bool sse41 = true; - bool sse3 = true; bool sse2 = true; /* Check functions to see whether instructions up to the given one @@ -36,19 +34,11 @@ class DebugFlags { */ bool has_avx2() { - return has_avx() && avx2; - } - bool has_avx() - { - return has_sse41() && avx; + return has_sse41() && avx2; } bool has_sse41() { - return has_sse3() && sse41; - } - bool has_sse3() - { - return has_sse2() && sse3; + return has_sse2() && sse41; } bool has_sse2() { diff --git a/intern/cycles/util/optimization.h b/intern/cycles/util/optimization.h index 19b96510c47..b6194dc0382 100644 --- a/intern/cycles/util/optimization.h +++ b/intern/cycles/util/optimization.h @@ -17,9 +17,6 @@ # ifdef WITH_KERNEL_SSE2 # define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 # endif -# ifdef WITH_KERNEL_SSE3 -# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# endif /* x86-64 * @@ -30,15 +27,9 @@ /* SSE2 is always available on x86-64 CPUs, so auto enable */ # define __KERNEL_SSE2__ /* no SSE2 kernel on x86-64, part of regular kernel */ -# ifdef WITH_KERNEL_SSE3 -# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# endif # ifdef WITH_KERNEL_SSE41 # define WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 # endif -# ifdef WITH_KERNEL_AVX -# define WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# endif # ifdef WITH_KERNEL_AVX2 # define WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 # endif diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h index 2152b89ed24..66140cabf84 100644 --- a/intern/cycles/util/system.h +++ b/intern/cycles/util/system.h @@ -17,9 +17,7 @@ int system_console_width(); std::string system_cpu_brand_string(); int system_cpu_bits(); bool system_cpu_support_sse2(); -bool system_cpu_support_sse3(); bool system_cpu_support_sse41(); -bool system_cpu_support_avx(); bool system_cpu_support_avx2(); size_t system_physical_ram();