diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 11d45a9f4b6..0673325cfc5 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -64,6 +64,15 @@ if(WITH_CYCLES_NATIVE_ONLY) endif() set(CYCLES_KERNEL_FLAGS "${MSVC_NATIVE_ARCH_FLAGS}") endif() +elseif(WIN32 AND MSVC AND SUPPORT_NEON_BUILD AND SSE2NEON_FOUND) + set(CXX_HAS_SSE FALSE) + set(CXX_HAS_AVX FALSE) + set(CXX_HAS_AVX2 FALSE) + set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}") + string(APPEND CMAKE_CXX_FLAGS_RELEASE " /Ox") + string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /Ox") + string(APPEND CMAKE_CXX_FLAGS_MINSIZEREL " /Ox") elseif(NOT WITH_CPU_SIMD OR (SUPPORT_NEON_BUILD AND SSE2NEON_FOUND)) set(CXX_HAS_SSE42 FALSE) set(CXX_HAS_AVX FALSE) diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index b50a0c192a1..da8ddc8d290 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -1217,7 +1217,7 @@ typedef enum KernelBVHLayout { } KernelBVHLayout; /* Specialized struct that can become constants in dynamic compilation. */ -#define KERNEL_STRUCT_BEGIN(name, parent) struct name { +#define KERNEL_STRUCT_BEGIN(name, parent) ccl_align(16) struct name { #define KERNEL_STRUCT_END(name) \ } \ ; \ @@ -1259,7 +1259,7 @@ typedef struct KernelLightLinkSet { uint light_tree_root; } KernelLightLinkSet; -typedef struct KernelData { +typedef ccl_align(16) struct KernelData { /* Features and limits. */ uint kernel_features; uint max_closures; diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h index fe59b497ac8..a1d68bd9654 100644 --- a/intern/cycles/util/defines.h +++ b/intern/cycles/util/defines.h @@ -13,7 +13,7 @@ /* Bitness */ #if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \ - defined(_M_X64) || defined(__aarch64__) + defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) # define __KERNEL_64_BIT__ #endif diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index 9db10e89032..7d5cab7e30c 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -1000,7 +1000,7 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x) return __brev(x); #elif defined(__KERNEL_METAL__) return reverse_bits(x); -#elif defined(__aarch64__) || defined(_M_ARM64) +#elif defined(__aarch64__) || (defined(_M_ARM64) && !defined(_MSC_VER)) /* Assume the rbit is always available on 64bit ARM architecture. */ __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x)); return x; diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index 38f86de6054..fd3dc0d71aa 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -455,7 +455,7 @@ ccl_device_inline float reduce_add(const float3 a) { #if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__) __m128 t = a.m128; - t[3] = 0.0f; + t = vsetq_lane_f32(0.0f, t, 3); return vaddvq_f32(t); #else return (a.x + a.y + a.z); diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h index f662ed4f394..b09cf2a4b1b 100644 --- a/intern/cycles/util/math_intersect.h +++ b/intern/cycles/util/math_intersect.h @@ -114,12 +114,12 @@ ccl_device_forceinline float ray_triangle_rcp(const float x) { #ifdef __KERNEL_NEON__ /* Move scalar to vector register and do rcp. */ - __m128 a; - a[0] = x; + __m128 a = {0}; + a = vsetq_lane_f32(x, a, 0); float32x4_t reciprocal = vrecpeq_f32(a); reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); - return reciprocal[0]; + return vgetq_lane_f32(reciprocal, 0); #elif defined(__KERNEL_SSE__) const __m128 a = _mm_set_ss(x); const __m128 r = _mm_rcp_ss(a); diff --git a/intern/cycles/util/openimagedenoise.h b/intern/cycles/util/openimagedenoise.h index da3952b7257..18510fc2208 100644 --- a/intern/cycles/util/openimagedenoise.h +++ b/intern/cycles/util/openimagedenoise.h @@ -16,9 +16,12 @@ CCL_NAMESPACE_BEGIN static inline bool openimagedenoise_supported() { #ifdef WITH_OPENIMAGEDENOISE -# ifdef __APPLE__ +# if defined(__APPLE__) /* Always supported through Accelerate framework BNNS. */ return true; +# elif defined(_M_ARM64) + /* OIDN supports NEON natively, and all Windows ARM64 platforms support NEON */ + return true; # else return system_cpu_support_sse42(); # endif diff --git a/intern/cycles/util/optimization.h b/intern/cycles/util/optimization.h index e9a4ad8e6e6..d4d51760f27 100644 --- a/intern/cycles/util/optimization.h +++ b/intern/cycles/util/optimization.h @@ -38,7 +38,7 @@ * SSE, some specializations for performance and compatibility are made * made testing for __KERNEL_NEON__. */ -# elif defined(__ARM_NEON) && defined(WITH_SSE2NEON) +# elif (defined(__ARM_NEON) || defined(_M_ARM64)) && defined(WITH_SSE2NEON) # define __KERNEL_NEON__ # define __KERNEL_SSE__ diff --git a/intern/cycles/util/simd.h b/intern/cycles/util/simd.h index a832811718c..9dd37a9d819 100644 --- a/intern/cycles/util/simd.h +++ b/intern/cycles/util/simd.h @@ -19,7 +19,7 @@ * Since we can't avoid including , better only include that */ #if defined(FREE_WINDOWS64) # include "util/windows.h" -#elif defined(_MSC_VER) +#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__) # include #elif (defined(__x86_64__) || defined(__i386__)) # include @@ -40,12 +40,18 @@ # define SIMD_SET_FLUSH_TO_ZERO \ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); -# else +# elif !defined(_M_ARM64) # define _MM_FLUSH_ZERO_ON 24 # define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr)) # define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr)) # define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON); # define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON) +# else +# define _MM_FLUSH_ZERO_ON 24 +# define __get_fpcr(__fpcr) _ReadStatusReg(__fpcr) +# define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, __fpcr) +# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON); +# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON) # endif #else # define SIMD_SET_FLUSH_TO_ZERO @@ -207,7 +213,11 @@ type shuffle_neon(const type &a, const type &b) (i3 * 4) + 2 + 16, (i3 * 4) + 3 + 16}; - return type(vqtbl2q_s8((int8x16x2_t){int8x16_t(a), int8x16_t(b)}, *(uint8x16_t *)tbl)); + // Note: This cannot all be put in a single line due to how MSVC ARM64 + // implements the function calls as several layers of macros. + int8x16x2_t t = {int8x16_t(a), int8x16_t(b)}; + uint8x16_t idx = *(uint8x16_t *)tbl; + return type(vqtbl2q_s8(t, idx)); } } #endif /* __KERNEL_NEON */ diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp index e6e33ca1645..1e0cbe8cfa6 100644 --- a/intern/cycles/util/system.cpp +++ b/intern/cycles/util/system.cpp @@ -77,7 +77,7 @@ string system_cpu_brand_string() if (sysctlbyname("machdep.cpu.brand_string", &modelname, &bufferlen, NULL, 0) == 0) { return modelname; } -#elif defined(WIN32) || defined(__x86_64__) || defined(__i386__) +#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && !defined(_M_ARM64) /* Get from intrinsics on Windows and x86. */ char buf[49] = {0}; int result[4] = {0}; @@ -96,6 +96,19 @@ string system_cpu_brand_string() return brand; } +#elif defined(_M_ARM64) + DWORD vendorIdentifierLength = 255; + char vendorIdentifier[255]; + if (RegGetValueA(HKEY_LOCAL_MACHINE, + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", + "VendorIdentifier", + RRF_RT_REG_SZ, + nullptr, + &vendorIdentifier, + &vendorIdentifierLength) == ERROR_SUCCESS) + { + return vendorIdentifier; + } #else /* Get from /proc/cpuinfo on Unix systems. */ FILE *cpuinfo = fopen("/proc/cpuinfo", "r");