Cycles: Add Windows ARM64 support

Ref #119126

Pull Request: https://projects.blender.org/blender/blender/pulls/117036
This commit is contained in:
Anthony Roberts
2024-03-06 15:44:46 +01:00
committed by Brecht Van Lommel
parent 7e3b83b146
commit 3d5fa7698f
10 changed files with 49 additions and 14 deletions

View File

@@ -64,6 +64,15 @@ if(WITH_CYCLES_NATIVE_ONLY)
endif()
set(CYCLES_KERNEL_FLAGS "${MSVC_NATIVE_ARCH_FLAGS}")
endif()
elseif(WIN32 AND MSVC AND SUPPORT_NEON_BUILD AND SSE2NEON_FOUND)
  # MSVC build on Windows with NEON support and sse2neon available
  # (Windows ARM64). None of the x86 SIMD feature flags apply here.
  # CXX_HAS_SSE42 is cleared explicitly, like the generic NEON/no-SIMD branch
  # below does, so this branch does not rely on the variable happening to be
  # unset. CXX_HAS_SSE is still cleared for anything that reads it.
  set(CXX_HAS_SSE FALSE)
  set(CXX_HAS_SSE42 FALSE)
  set(CXX_HAS_AVX FALSE)
  set(CXX_HAS_AVX2 FALSE)
  # /fp:fast selects the fast floating-point model, /GS- turns off buffer
  # security checks, and _CRT_SECURE_NO_WARNINGS silences CRT deprecation
  # warnings. The same flags are used for the kernel and for all C++ sources.
  set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
  string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}")
  # Optimize for speed (/Ox) in every optimized configuration.
  string(APPEND CMAKE_CXX_FLAGS_RELEASE " /Ox")
  string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /Ox")
  string(APPEND CMAKE_CXX_FLAGS_MINSIZEREL " /Ox")
elseif(NOT WITH_CPU_SIMD OR (SUPPORT_NEON_BUILD AND SSE2NEON_FOUND))
set(CXX_HAS_SSE42 FALSE)
set(CXX_HAS_AVX FALSE)

View File

@@ -1217,7 +1217,7 @@ typedef enum KernelBVHLayout {
} KernelBVHLayout;
/* Specialized struct that can become constants in dynamic compilation. */
#define KERNEL_STRUCT_BEGIN(name, parent) struct name {
#define KERNEL_STRUCT_BEGIN(name, parent) ccl_align(16) struct name {
#define KERNEL_STRUCT_END(name) \
} \
; \
@@ -1259,7 +1259,7 @@ typedef struct KernelLightLinkSet {
uint light_tree_root;
} KernelLightLinkSet;
typedef struct KernelData {
typedef ccl_align(16) struct KernelData {
/* Features and limits. */
uint kernel_features;
uint max_closures;

View File

@@ -13,7 +13,7 @@
/* Bitness */
#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \
defined(_M_X64) || defined(__aarch64__)
defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
# define __KERNEL_64_BIT__
#endif

View File

@@ -1000,7 +1000,7 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
return __brev(x);
#elif defined(__KERNEL_METAL__)
return reverse_bits(x);
#elif defined(__aarch64__) || defined(_M_ARM64)
#elif defined(__aarch64__) || (defined(_M_ARM64) && !defined(_MSC_VER))
/* Assume the rbit is always available on 64bit ARM architecture. */
__asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
return x;

View File

@@ -455,7 +455,7 @@ ccl_device_inline float reduce_add(const float3 a)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
__m128 t = a.m128;
t[3] = 0.0f;
t = vsetq_lane_f32(0.0f, t, 3);
return vaddvq_f32(t);
#else
return (a.x + a.y + a.z);

View File

@@ -114,12 +114,12 @@ ccl_device_forceinline float ray_triangle_rcp(const float x)
{
#ifdef __KERNEL_NEON__
/* Move scalar to vector register and do rcp. */
__m128 a;
a[0] = x;
__m128 a = {0};
a = vsetq_lane_f32(x, a, 0);
float32x4_t reciprocal = vrecpeq_f32(a);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
return reciprocal[0];
return vgetq_lane_f32(reciprocal, 0);
#elif defined(__KERNEL_SSE__)
const __m128 a = _mm_set_ss(x);
const __m128 r = _mm_rcp_ss(a);

View File

@@ -16,9 +16,12 @@ CCL_NAMESPACE_BEGIN
static inline bool openimagedenoise_supported()
{
#ifdef WITH_OPENIMAGEDENOISE
# ifdef __APPLE__
# if defined(__APPLE__)
/* Always supported through Accelerate framework BNNS. */
return true;
# elif defined(_M_ARM64)
/* OIDN supports NEON natively, and all Windows ARM64 platforms support NEON. */
return true;
# else
return system_cpu_support_sse42();
# endif

View File

@@ -38,7 +38,7 @@
* SSE, some specializations for performance and compatibility are
* made by testing for __KERNEL_NEON__. */
# elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
# elif (defined(__ARM_NEON) || defined(_M_ARM64)) && defined(WITH_SSE2NEON)
# define __KERNEL_NEON__
# define __KERNEL_SSE__

View File

@@ -19,7 +19,7 @@
* Since we can't avoid including <windows.h>, better only include that */
#if defined(FREE_WINDOWS64)
# include "util/windows.h"
#elif defined(_MSC_VER)
#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__)
# include <intrin.h>
#elif (defined(__x86_64__) || defined(__i386__))
# include <x86intrin.h>
@@ -40,12 +40,18 @@
# define SIMD_SET_FLUSH_TO_ZERO \
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
# else
# elif !defined(_M_ARM64)
# define _MM_FLUSH_ZERO_ON 24
# define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
# define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
# else
/* MSVC targeting ARM64: the FPCR is accessed through the compiler's
 * status-register intrinsics rather than GCC-style inline assembly.
 * 0x5A20 is the system-register ID handed to both intrinsics below.
 * NOTE(review): presumably the encoding of FPCR, ARM64_SYSREG(3, 3, 4, 4, 0),
 * and 24 presumably the bit position of its flush-to-zero flag -- confirm. */
# define _MM_FLUSH_ZERO_ON 24
/* Store the current register value into the lvalue `__fpcr`, matching the
 * inline-assembly variant. `_ReadStatusReg()` takes the register ID and
 * returns the value, so the output variable must not be passed as its
 * argument. */
# define __get_fpcr(__fpcr) ((__fpcr) = _ReadStatusReg(0x5A20))
/* Write `__fpcr` back to the same register. */
# define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, (__fpcr))
# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
# endif
#else
# define SIMD_SET_FLUSH_TO_ZERO
@@ -207,7 +213,11 @@ type shuffle_neon(const type &a, const type &b)
(i3 * 4) + 2 + 16,
(i3 * 4) + 3 + 16};
return type(vqtbl2q_s8((int8x16x2_t){int8x16_t(a), int8x16_t(b)}, *(uint8x16_t *)tbl));
// Note: This cannot all be put in a single line due to how MSVC ARM64
// implements the function calls as several layers of macros.
int8x16x2_t t = {int8x16_t(a), int8x16_t(b)};
uint8x16_t idx = *(uint8x16_t *)tbl;
return type(vqtbl2q_s8(t, idx));
}
}
#endif /* __KERNEL_NEON */

View File

@@ -77,7 +77,7 @@ string system_cpu_brand_string()
if (sysctlbyname("machdep.cpu.brand_string", &modelname, &bufferlen, NULL, 0) == 0) {
return modelname;
}
#elif defined(WIN32) || defined(__x86_64__) || defined(__i386__)
#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && !defined(_M_ARM64)
/* Get from intrinsics on Windows and x86. */
char buf[49] = {0};
int result[4] = {0};
@@ -96,6 +96,19 @@ string system_cpu_brand_string()
return brand;
}
#elif defined(_M_ARM64)
DWORD vendorIdentifierLength = 255;
char vendorIdentifier[255];
if (RegGetValueA(HKEY_LOCAL_MACHINE,
"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
"VendorIdentifier",
RRF_RT_REG_SZ,
nullptr,
&vendorIdentifier,
&vendorIdentifierLength) == ERROR_SUCCESS)
{
return vendorIdentifier;
}
#else
/* Get from /proc/cpuinfo on Unix systems. */
FILE *cpuinfo = fopen("/proc/cpuinfo", "r");