Cycles: Add Windows ARM64 support
Ref #119126. Pull Request: https://projects.blender.org/blender/blender/pulls/117036
This commit is contained in:
committed by
Brecht Van Lommel
parent
7e3b83b146
commit
3d5fa7698f
@@ -64,6 +64,15 @@ if(WITH_CYCLES_NATIVE_ONLY)
|
||||
endif()
|
||||
set(CYCLES_KERNEL_FLAGS "${MSVC_NATIVE_ARCH_FLAGS}")
|
||||
endif()
|
||||
elseif(WIN32 AND MSVC AND SUPPORT_NEON_BUILD AND SSE2NEON_FOUND)
|
||||
set(CXX_HAS_SSE FALSE)
|
||||
set(CXX_HAS_AVX FALSE)
|
||||
set(CXX_HAS_AVX2 FALSE)
|
||||
set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
|
||||
string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}")
|
||||
string(APPEND CMAKE_CXX_FLAGS_RELEASE " /Ox")
|
||||
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /Ox")
|
||||
string(APPEND CMAKE_CXX_FLAGS_MINSIZEREL " /Ox")
|
||||
elseif(NOT WITH_CPU_SIMD OR (SUPPORT_NEON_BUILD AND SSE2NEON_FOUND))
|
||||
set(CXX_HAS_SSE42 FALSE)
|
||||
set(CXX_HAS_AVX FALSE)
|
||||
|
||||
@@ -1217,7 +1217,7 @@ typedef enum KernelBVHLayout {
|
||||
} KernelBVHLayout;
|
||||
|
||||
/* Specialized struct that can become constants in dynamic compilation. */
|
||||
#define KERNEL_STRUCT_BEGIN(name, parent) struct name {
|
||||
#define KERNEL_STRUCT_BEGIN(name, parent) ccl_align(16) struct name {
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
} \
|
||||
; \
|
||||
@@ -1259,7 +1259,7 @@ typedef struct KernelLightLinkSet {
|
||||
uint light_tree_root;
|
||||
} KernelLightLinkSet;
|
||||
|
||||
typedef struct KernelData {
|
||||
typedef ccl_align(16) struct KernelData {
|
||||
/* Features and limits. */
|
||||
uint kernel_features;
|
||||
uint max_closures;
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
/* Bitness */
|
||||
|
||||
#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \
|
||||
defined(_M_X64) || defined(__aarch64__)
|
||||
defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
|
||||
# define __KERNEL_64_BIT__
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1000,7 +1000,7 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
|
||||
return __brev(x);
|
||||
#elif defined(__KERNEL_METAL__)
|
||||
return reverse_bits(x);
|
||||
#elif defined(__aarch64__) || defined(_M_ARM64)
|
||||
#elif defined(__aarch64__) || (defined(_M_ARM64) && !defined(_MSC_VER))
|
||||
/* Assume the rbit is always available on 64bit ARM architecture. */
|
||||
__asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
|
||||
return x;
|
||||
|
||||
@@ -455,7 +455,7 @@ ccl_device_inline float reduce_add(const float3 a)
|
||||
{
|
||||
#if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
|
||||
__m128 t = a.m128;
|
||||
t[3] = 0.0f;
|
||||
t = vsetq_lane_f32(0.0f, t, 3);
|
||||
return vaddvq_f32(t);
|
||||
#else
|
||||
return (a.x + a.y + a.z);
|
||||
|
||||
@@ -114,12 +114,12 @@ ccl_device_forceinline float ray_triangle_rcp(const float x)
|
||||
{
|
||||
#ifdef __KERNEL_NEON__
|
||||
/* Move scalar to vector register and do rcp. */
|
||||
__m128 a;
|
||||
a[0] = x;
|
||||
__m128 a = {0};
|
||||
a = vsetq_lane_f32(x, a, 0);
|
||||
float32x4_t reciprocal = vrecpeq_f32(a);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
|
||||
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
|
||||
return reciprocal[0];
|
||||
return vgetq_lane_f32(reciprocal, 0);
|
||||
#elif defined(__KERNEL_SSE__)
|
||||
const __m128 a = _mm_set_ss(x);
|
||||
const __m128 r = _mm_rcp_ss(a);
|
||||
|
||||
@@ -16,9 +16,12 @@ CCL_NAMESPACE_BEGIN
|
||||
static inline bool openimagedenoise_supported()
|
||||
{
|
||||
#ifdef WITH_OPENIMAGEDENOISE
|
||||
# ifdef __APPLE__
|
||||
# if defined(__APPLE__)
|
||||
/* Always supported through Accelerate framework BNNS. */
|
||||
return true;
|
||||
# elif defined(_M_ARM64)
|
||||
/* OIDN supports NEON natively, and all Windows ARM64 platforms support NEON. */
|
||||
return true;
|
||||
# else
|
||||
return system_cpu_support_sse42();
|
||||
# endif
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
* SSE, some specializations for performance and compatibility are made
|
||||
* by testing for __KERNEL_NEON__. */
|
||||
|
||||
# elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
|
||||
# elif (defined(__ARM_NEON) || defined(_M_ARM64)) && defined(WITH_SSE2NEON)
|
||||
|
||||
# define __KERNEL_NEON__
|
||||
# define __KERNEL_SSE__
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
* Since we can't avoid including <windows.h>, better only include that */
|
||||
#if defined(FREE_WINDOWS64)
|
||||
# include "util/windows.h"
|
||||
#elif defined(_MSC_VER)
|
||||
#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__)
|
||||
# include <intrin.h>
|
||||
#elif (defined(__x86_64__) || defined(__i386__))
|
||||
# include <x86intrin.h>
|
||||
@@ -40,12 +40,18 @@
|
||||
# define SIMD_SET_FLUSH_TO_ZERO \
|
||||
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
|
||||
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
|
||||
# else
|
||||
# elif !defined(_M_ARM64)
|
||||
# define _MM_FLUSH_ZERO_ON 24
|
||||
# define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
|
||||
# define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
|
||||
# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
|
||||
# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
|
||||
# else
|
||||
# define _MM_FLUSH_ZERO_ON 24
|
||||
# define __get_fpcr(__fpcr) _ReadStatusReg(__fpcr)
|
||||
# define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, __fpcr)
|
||||
# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
|
||||
# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
|
||||
# endif
|
||||
#else
|
||||
# define SIMD_SET_FLUSH_TO_ZERO
|
||||
@@ -207,7 +213,11 @@ type shuffle_neon(const type &a, const type &b)
|
||||
(i3 * 4) + 2 + 16,
|
||||
(i3 * 4) + 3 + 16};
|
||||
|
||||
return type(vqtbl2q_s8((int8x16x2_t){int8x16_t(a), int8x16_t(b)}, *(uint8x16_t *)tbl));
|
||||
// Note: This cannot all be put in a single line due to how MSVC ARM64
|
||||
// implements the function calls as several layers of macros.
|
||||
int8x16x2_t t = {int8x16_t(a), int8x16_t(b)};
|
||||
uint8x16_t idx = *(uint8x16_t *)tbl;
|
||||
return type(vqtbl2q_s8(t, idx));
|
||||
}
|
||||
}
|
||||
#endif /* __KERNEL_NEON__ */
|
||||
|
||||
@@ -77,7 +77,7 @@ string system_cpu_brand_string()
|
||||
if (sysctlbyname("machdep.cpu.brand_string", &modelname, &bufferlen, NULL, 0) == 0) {
|
||||
return modelname;
|
||||
}
|
||||
#elif defined(WIN32) || defined(__x86_64__) || defined(__i386__)
|
||||
#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && !defined(_M_ARM64)
|
||||
/* Get from intrinsics on Windows and x86. */
|
||||
char buf[49] = {0};
|
||||
int result[4] = {0};
|
||||
@@ -96,6 +96,19 @@ string system_cpu_brand_string()
|
||||
|
||||
return brand;
|
||||
}
|
||||
#elif defined(_M_ARM64)
|
||||
DWORD vendorIdentifierLength = 255;
|
||||
char vendorIdentifier[255];
|
||||
if (RegGetValueA(HKEY_LOCAL_MACHINE,
|
||||
"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
|
||||
"VendorIdentifier",
|
||||
RRF_RT_REG_SZ,
|
||||
nullptr,
|
||||
&vendorIdentifier,
|
||||
&vendorIdentifierLength) == ERROR_SUCCESS)
|
||||
{
|
||||
return vendorIdentifier;
|
||||
}
|
||||
#else
|
||||
/* Get from /proc/cpuinfo on Unix systems. */
|
||||
FILE *cpuinfo = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
Reference in New Issue
Block a user