diff --git a/build_files/build_environment/cmake/opencolorio.cmake b/build_files/build_environment/cmake/opencolorio.cmake index 649d713e316..e667a989dd5 100644 --- a/build_files/build_environment/cmake/opencolorio.cmake +++ b/build_files/build_environment/cmake/opencolorio.cmake @@ -38,10 +38,18 @@ if(APPLE) endif() if(BLENDER_PLATFORM_ARM) - set(OPENCOLORIO_EXTRA_ARGS - ${OPENCOLORIO_EXTRA_ARGS} - -DOCIO_USE_SSE=OFF - ) + if(WIN32) + set(OCIO_PATCH + ${PATCH_CMD} -p 1 -d + ${BUILD_DIR}/opencolorio/src/external_opencolorio < + ${PATCH_DIR}/ocio_2089.diff + ) + else() + set(OPENCOLORIO_EXTRA_ARGS + ${OPENCOLORIO_EXTRA_ARGS} + -DOCIO_USE_SSE=OFF + ) + endif() endif() if(WIN32) @@ -68,6 +76,7 @@ ExternalProject_Add(external_opencolorio URL_HASH ${OPENCOLORIO_HASH_TYPE}=${OPENCOLORIO_HASH} CMAKE_GENERATOR ${PLATFORM_ALT_GENERATOR} PREFIX ${BUILD_DIR}/opencolorio + PATCH_COMMAND ${OCIO_PATCH} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/opencolorio diff --git a/build_files/build_environment/patches/ocio_2089.diff b/build_files/build_environment/patches/ocio_2089.diff new file mode 100644 index 00000000000..16c49a23406 --- /dev/null +++ b/build_files/build_environment/patches/ocio_2089.diff @@ -0,0 +1,298 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 9b20aa043..8f33d3a3f 100755 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -180,7 +180,12 @@ option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ociocon + + + if (NOT APPLE) +- if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|x86_64|X86|i386|i686)") ++ if("${CMAKE_GENERATOR_PLATFORM}" MATCHES "(ARM64|arm64)" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") ++ set(OCIO_ARCH_X86 0) ++ set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) ++ set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) ++ set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) ++ elseif ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|x86_64|X86|i386|i686)") + # Intel-based architecture (not APPLE) + if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(X86|i386|i686)") + set(OCIO_ARCH_X86_32 1) +@@ -270,7 +275,7 @@ option(OCIO_USE_AVX2 "Specify whether to enable AVX2 CPU performance optimizatio + option(OCIO_USE_AVX512 "Specify whether to enable AVX512 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX}) + option(OCIO_USE_F16C "Specify whether to enable F16C CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C}) + +-if (APPLE) ++if (APPLE OR WIN32) + # TODO: Revisit whether that option is necessary. + option(OCIO_USE_SSE2NEON "Specify whether to enable SSE CPU performance optimizations using SSE2NEON for Apple ARM architecture" ON) + mark_as_advanced(OCIO_USE_SSE2NEON) +@@ -332,8 +337,10 @@ if(OCIO_USE_SIMD AND OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON) + add_library(sse2neon INTERFACE) + # Add the include directories to the target. + target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}") +- # Ignore the warnings coming from sse2neon.h as they are false positives. +- target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) ++ if(NOT MSVC) ++ # Ignore the warnings coming from sse2neon.h as they are false positives. ++ target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) ++ endif() + endif() + endif() + +diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake +index 47877436a..ced4ae139 100644 +--- a/share/cmake/modules/install/Installsse2neon.cmake ++++ b/share/cmake/modules/install/Installsse2neon.cmake +@@ -16,7 +16,7 @@ include(FetchContent) + set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/ext/build/sse2neon") + FetchContent_Declare(sse2neon + GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git +- GIT_TAG v1.6.0 ++ GIT_TAG 227cc413fb2d50b2a10073087be96b59d5364aea + ) + + # FetchContent_MakeAvailable is not available until CMake 3.14+. +@@ -38,6 +38,8 @@ if(NOT sse2neon_POPULATED) + add_library(sse2neon INTERFACE) + # Add the include directories to the target. + target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}") +- # Ignore the warnings coming from sse2neon.h as they are false positives. +- target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) ++ if(NOT MSVC) ++ # Ignore the warnings coming from sse2neon.h as they are false positives. ++ target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) ++ endif() + endif() +\ No newline at end of file +diff --git a/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake +index c47c8be70..d24cda55f 100644 +--- a/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake ++++ b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake +@@ -6,8 +6,13 @@ include(CheckCXXSourceCompiles) + set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") + set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}") + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") ++set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") + +-if(APPLE AND COMPILER_SUPPORTS_ARM_NEON) ++if(MSVC) ++ set(CMAKE_CXX_FLAGS "/Zc:preprocessor") ++endif() ++ ++if((APPLE OR WIN32) AND COMPILER_SUPPORTS_ARM_NEON) + + if("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR + "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +@@ -63,8 +68,9 @@ endif() + set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") + set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}") + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") ++set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") + + unset(_cmake_required_flags_orig) + unset(_cmake_required_includes_orig) + unset(_cmake_osx_architectures_orig) +- ++unset(_cmake_cxx_flags_orig) +diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake +index 536b5eebd..d87e73f2d 100644 +--- a/share/cmake/utils/CompilerFlags.cmake ++++ b/share/cmake/utils/CompilerFlags.cmake +@@ -18,7 +18,12 @@ if(OCIO_USE_SIMD) + + if (OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON) + include(CheckSupportSSEUsingSSE2NEON) +- if(NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) ++ if(COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) ++ if(WIN32 AND MSVC) ++ # Enable the "new" preprocessor, to more closely match Clang/GCC, required for sse2neon ++ set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/Zc:preprocessor") ++ endif() ++ else() + set(OCIO_USE_SSE2NEON OFF) + endif() + endif() +diff --git a/src/OpenColorIO/CPUInfo.cpp b/src/OpenColorIO/CPUInfo.cpp +index dce813a4f..edf341792 100644 +--- a/src/OpenColorIO/CPUInfo.cpp ++++ b/src/OpenColorIO/CPUInfo.cpp +@@ -183,7 +183,7 @@ CPUInfo::CPUInfo() + } + } + +-#elif defined(__aarch64__) // ARM Processor or Apple ARM. ++#elif defined(__aarch64__) || defined(_M_ARM64) // ARM 64-bit processor (multiple platforms) + + CPUInfo::CPUInfo() + { +diff --git a/src/OpenColorIO/CPUInfoConfig.h.in b/src/OpenColorIO/CPUInfoConfig.h.in +index b8f5045d2..c105d4159 100644 +--- a/src/OpenColorIO/CPUInfoConfig.h.in ++++ b/src/OpenColorIO/CPUInfoConfig.h.in +@@ -6,7 +6,7 @@ + #cmakedefine01 OCIO_ARCH_X86_32 + + // Relevant only for arm64 architecture. +-#if defined(__aarch64__) ++#if defined(__aarch64__) || defined(_M_ARM64) + #cmakedefine01 OCIO_USE_SSE2NEON + #else + #define OCIO_USE_SSE2NEON 0 +@@ -23,7 +23,7 @@ + + // Building for x86_64 processor on a non-ARM host architecture + // OR Building on/for an ARM architecture and using SSE2NEON. +-#if (OCIO_ARCH_X86 && !defined(__aarch64__)) || (defined(__aarch64__) && OCIO_USE_SSE2NEON) ++#if (OCIO_ARCH_X86 && !defined(__aarch64__)) || ((defined(__aarch64__) || defined(_M_ARM64)) && OCIO_USE_SSE2NEON) + #cmakedefine01 OCIO_USE_SSE2 + #cmakedefine01 OCIO_USE_SSE3 + #cmakedefine01 OCIO_USE_SSSE3 +diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h +index 2494698c5..6aebc45d4 100644 +--- a/src/OpenColorIO/SSE.h ++++ b/src/OpenColorIO/SSE.h +@@ -9,14 +9,25 @@ + #if OCIO_USE_SSE2 + + // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). +-#if !defined(__aarch64__) ++#if !defined(__aarch64__) && !defined(_M_ARM64) + #if OCIO_USE_SSE2 + #include + #endif +-#elif defined(__aarch64__) ++#elif defined(__aarch64__) || defined(_M_ARM64) + // ARM architecture A64 (ARM64) + #if OCIO_USE_SSE2NEON ++ // MSVC doesn't like the redefinitions below and requires the existing functions to be undef-ed ++ #if defined(_M_ARM64) ++ #define _mm_max_ps _mm_max_ps_orig ++ #define _mm_min_ps _mm_min_ps_orig ++ #endif ++ + #include ++ ++ #if defined(_M_ARM64) ++ #undef _mm_max_ps ++ #undef _mm_min_ps ++ #endif + #endif + #endif + +@@ -30,7 +41,7 @@ namespace OCIO_NAMESPACE + // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since + // it is redefining two of the functions from sse2neon. + +-#if defined(__aarch64__) ++#if defined(__aarch64__) || defined(_M_ARM64) + #if OCIO_USE_SSE2NEON + // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to + // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior. +@@ -77,6 +88,9 @@ static const __m128 EPOS128 = _mm_set1_ps(128.0f); + + static const __m128 EPOSINF = _mm_set1_ps(std::numeric_limits::infinity()); + ++// These funtions won't work when using MSVC + ARM64 unless you specify /Zc:arm64-aliased-neon-types- ++// This comes with it's own issues, so it is easier to just disable them when using MSVC + ARM64 ++#if !defined(_M_ARM64) + // Debug function to print out the contents of a floating-point SSE register + inline void ssePrintRegister(const char* msg, __m128& reg) + { +@@ -91,6 +105,7 @@ inline void ssePrintRegister(const char* msg, __m128i& reg) + int *r = (int*) ® + printf("%s : %d %d %d %d\n", msg, r[0], r[1], r[2], r[3]); + } ++#endif + + // Determine whether a floating-point value is negative based on its sign bit. + // This function will treat special values, like -0, -NaN, -Inf, as they were indeed +@@ -170,7 +185,7 @@ inline __m128 sseLog2(__m128 x) + { + // y = log2( x ) = log2( 2^exponent * mantissa ) + // = exponent + log2( mantissa ) +- ++ + __m128 mantissa + = _mm_or_ps( // OR with EONE + _mm_andnot_ps( // NOT(EMASK) AND x +diff --git a/src/OpenColorIO/SSE2.h b/src/OpenColorIO/SSE2.h +index 918694fc8..e51dad9b5 100644 +--- a/src/OpenColorIO/SSE2.h ++++ b/src/OpenColorIO/SSE2.h +@@ -9,12 +9,35 @@ + #if OCIO_USE_SSE2 + + // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). +-#if !defined(__aarch64__) ++#if !defined(__aarch64__) && !defined(_M_ARM64) + #include +-#elif defined(__aarch64__) ++#elif defined(__aarch64__) || defined(_M_ARM64) + // ARM architecture A64 (ARM64) + #if OCIO_USE_SSE2NEON ++ // MSVC doesn't like the redefinitions below and requires the existing functions to be undef-ed ++ #if defined(_M_ARM64) ++ #define _mm_max_ps _mm_max_ps_orig ++ #define _mm_min_ps _mm_min_ps_orig ++ #endif ++ + #include ++ ++ #if defined(_M_ARM64) ++ #undef _mm_max_ps ++ #undef _mm_min_ps ++ #endif ++ ++ // Current versions of MSVC do not define float16_t, so we do it ourselves using ++ // int16_t as an intermediate type ++ #if defined(_M_ARM64) && !defined(float16_t) ++ #define float16_t int16_t ++ #endif ++ ++ // Current versions of MSVC do not define vst1q_f16, so we do it ourselves using ++ // internal methods from MSVC's arm_neon.h ++ #if defined(_M_ARM64) && !defined(vst1q_f16) ++ #define vst1q_f16(A, B) neon_st1m_q16((A), __float16x8_t_to_n128(B)); ++ #endif + #endif + #endif + +@@ -30,7 +53,7 @@ namespace OCIO_NAMESPACE + // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since + // it is redefining two of the functions from sse2neon. + +-#if defined(__aarch64__) ++#if defined(__aarch64__) || defined(_M_ARM64) + #if OCIO_USE_SSE2NEON + // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to + // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior. +@@ -321,7 +344,7 @@ struct SSE2RGBAPack + sse2RGBATranspose_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); + + #if OCIO_USE_SSE2NEON +- // use neon hardware support for f32 to f16 ++ // use neon hardware support for f32 to f16 (apart from in MSVC, which doesnt support it) + float16x8_t rgba; + float16x4_t rgba00_01 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba0)); + float16x4_t rgba03_03 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba1)); +diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp +index 96adff44b..e9e977433 100644 +--- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp ++++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp +@@ -1844,7 +1844,7 @@ __m128 Renderer_LIN_TO_PQ_SSE::myPower(__m128 x, __m128 exp) + return ssePower(x, exp); + } + +-#ifdef _WIN32 ++#if (_MSC_VER >= 1920) && (OCIO_USE_AVX) + // Only Windows compilers have built-in _mm_pow_ps() SVML intrinsic + // implementation, so non-fast SIMD version is available only on Windows for + // now. +@@ -1853,7 +1853,7 @@ __m128 Renderer_LIN_TO_PQ_SSE::myPower(__m128 x, __m128 exp) + { + return _mm_pow_ps(x, exp); + } +-#endif // _WIN32 ++#endif // (_MSC_VER >= 1920) && (OCIO_USE_AVX) + + template + void Renderer_LIN_TO_PQ_SSE::apply(const void* inImg, void* outImg, long numPixels) const