diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 11d45a9f4b6..0673325cfc5 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -64,6 +64,17 @@ if(WITH_CYCLES_NATIVE_ONLY)
     endif()
     set(CYCLES_KERNEL_FLAGS "${MSVC_NATIVE_ARCH_FLAGS}")
   endif()
+elseif(WIN32 AND MSVC AND SUPPORT_NEON_BUILD AND SSE2NEON_FOUND)
+  # MSVC NEON build through sse2neon: clear the x86 SIMD capability flags
+  # and apply the MSVC kernel flags to all C++ sources.
+  set(CXX_HAS_SSE42 FALSE)
+  set(CXX_HAS_AVX FALSE)
+  set(CXX_HAS_AVX2 FALSE)
+  set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+  string(APPEND CMAKE_CXX_FLAGS " ${CYCLES_KERNEL_FLAGS}")
+  string(APPEND CMAKE_CXX_FLAGS_RELEASE " /Ox")
+  string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /Ox")
+  string(APPEND CMAKE_CXX_FLAGS_MINSIZEREL " /Ox")
 elseif(NOT WITH_CPU_SIMD OR (SUPPORT_NEON_BUILD AND SSE2NEON_FOUND))
   set(CXX_HAS_SSE42 FALSE)
   set(CXX_HAS_AVX FALSE)
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index b50a0c192a1..da8ddc8d290 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -1217,7 +1217,7 @@ typedef enum KernelBVHLayout {
 } KernelBVHLayout;
 
 /* Specialized struct that can become constants in dynamic compilation. */
-#define KERNEL_STRUCT_BEGIN(name, parent) struct name {
+#define KERNEL_STRUCT_BEGIN(name, parent) struct ccl_align(16) name {
 #define KERNEL_STRUCT_END(name) \
   } \
   ; \
@@ -1259,7 +1259,7 @@ typedef struct KernelLightLinkSet {
   uint light_tree_root;
 } KernelLightLinkSet;
 
-typedef struct KernelData {
+typedef struct ccl_align(16) KernelData {
   /* Features and limits. */
   uint kernel_features;
   uint max_closures;
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index fe59b497ac8..a1d68bd9654 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -13,7 +13,7 @@
 /* Bitness */
 
 #if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \
-    defined(_M_X64) || defined(__aarch64__)
+    defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
 #  define __KERNEL_64_BIT__
 #endif
 
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 9db10e89032..7d5cab7e30c 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -1000,7 +1000,7 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
   return __brev(x);
 #elif defined(__KERNEL_METAL__)
   return reverse_bits(x);
-#elif defined(__aarch64__) || defined(_M_ARM64)
+#elif defined(__aarch64__) || (defined(_M_ARM64) && !defined(_MSC_VER))
   /* Assume the rbit is always available on 64bit ARM architecture. */
   __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
   return x;
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 38f86de6054..fd3dc0d71aa 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -455,7 +455,7 @@ ccl_device_inline float reduce_add(const float3 a)
 {
 #if defined(__KERNEL_SSE__) && defined(__KERNEL_NEON__)
   __m128 t = a.m128;
-  t[3] = 0.0f;
+  t = vsetq_lane_f32(0.0f, t, 3);
   return vaddvq_f32(t);
 #else
   return (a.x + a.y + a.z);
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index f662ed4f394..b09cf2a4b1b 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -114,12 +114,12 @@ ccl_device_forceinline float ray_triangle_rcp(const float x)
 {
 #ifdef __KERNEL_NEON__
   /* Move scalar to vector register and do rcp. */
-  __m128 a;
-  a[0] = x;
+  __m128 a = {0};
+  a = vsetq_lane_f32(x, a, 0);
   float32x4_t reciprocal = vrecpeq_f32(a);
   reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
   reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
-  return reciprocal[0];
+  return vgetq_lane_f32(reciprocal, 0);
 #elif defined(__KERNEL_SSE__)
   const __m128 a = _mm_set_ss(x);
   const __m128 r = _mm_rcp_ss(a);
diff --git a/intern/cycles/util/openimagedenoise.h b/intern/cycles/util/openimagedenoise.h
index da3952b7257..18510fc2208 100644
--- a/intern/cycles/util/openimagedenoise.h
+++ b/intern/cycles/util/openimagedenoise.h
@@ -16,9 +16,12 @@ CCL_NAMESPACE_BEGIN
 static inline bool openimagedenoise_supported()
 {
 #ifdef WITH_OPENIMAGEDENOISE
-#  ifdef __APPLE__
+#  if defined(__APPLE__)
   /* Always supported through Accelerate framework BNNS. */
   return true;
+#  elif defined(_M_ARM64)
+  /* OIDN supports NEON natively, and all Windows ARM64 platforms support NEON. */
+  return true;
 #  else
   return system_cpu_support_sse42();
 #  endif
diff --git a/intern/cycles/util/optimization.h b/intern/cycles/util/optimization.h
index e9a4ad8e6e6..d4d51760f27 100644
--- a/intern/cycles/util/optimization.h
+++ b/intern/cycles/util/optimization.h
@@ -38,7 +38,7 @@
  * SSE, some specializations for performance and compatibility are made
  * made testing for __KERNEL_NEON__. */
 
-#  elif defined(__ARM_NEON) && defined(WITH_SSE2NEON)
+#  elif (defined(__ARM_NEON) || defined(_M_ARM64)) && defined(WITH_SSE2NEON)
 
 #    define __KERNEL_NEON__
 #    define __KERNEL_SSE__
diff --git a/intern/cycles/util/simd.h b/intern/cycles/util/simd.h
index a832811718c..9dd37a9d819 100644
--- a/intern/cycles/util/simd.h
+++ b/intern/cycles/util/simd.h
@@ -19,7 +19,7 @@
  * Since we can't avoid including <windows.h>, better only include that */
 #if defined(FREE_WINDOWS64)
 #  include "util/windows.h"
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && !defined(__KERNEL_NEON__)
 #  include <intrin.h>
 #elif (defined(__x86_64__) || defined(__i386__))
 #  include <x86intrin.h>
@@ -40,12 +40,18 @@
 #    define SIMD_SET_FLUSH_TO_ZERO \
       _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
       _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#  else
+#  elif !defined(_M_ARM64)
 #    define _MM_FLUSH_ZERO_ON 24
 #    define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
 #    define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
 #    define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
 #    define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
+#  else /* _M_ARM64: access FPCR (system register ID 0x5A20) through MSVC intrinsics. */
+#    define _MM_FLUSH_ZERO_ON 24
+#    define __get_fpcr(__fpcr) ((__fpcr) = _ReadStatusReg(0x5A20))
+#    define __set_fpcr(__fpcr) _WriteStatusReg(0x5A20, __fpcr)
+#    define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
+#    define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
 #  endif
 #else
 #  define SIMD_SET_FLUSH_TO_ZERO
@@ -207,7 +213,11 @@ type shuffle_neon(const type &a, const type &b)
                                     (i3 * 4) + 2 + 16,
                                     (i3 * 4) + 3 + 16};
 
-    return type(vqtbl2q_s8((int8x16x2_t){int8x16_t(a), int8x16_t(b)}, *(uint8x16_t *)tbl));
+    /* NOTE: This cannot all be put in a single line due to how MSVC ARM64
+     * implements the function calls as several layers of macros. */
+    int8x16x2_t t = {int8x16_t(a), int8x16_t(b)};
+    uint8x16_t idx = *(uint8x16_t *)tbl;
+    return type(vqtbl2q_s8(t, idx));
   }
 }
 #endif /* __KERNEL_NEON */
diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp
index e6e33ca1645..1e0cbe8cfa6 100644
--- a/intern/cycles/util/system.cpp
+++ b/intern/cycles/util/system.cpp
@@ -77,7 +77,7 @@ string system_cpu_brand_string()
   if (sysctlbyname("machdep.cpu.brand_string", &modelname, &bufferlen, NULL, 0) == 0) {
     return modelname;
   }
-#elif defined(WIN32) || defined(__x86_64__) || defined(__i386__)
+#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && !defined(_M_ARM64)
   /* Get from intrinsics on Windows and x86. */
   char buf[49] = {0};
   int result[4] = {0};
@@ -96,6 +96,19 @@ string system_cpu_brand_string()
 
     return brand;
   }
+#elif defined(_M_ARM64) /* Windows on ARM64: read the vendor identifier from the registry. */
+  char vendor_id[255];
+  DWORD vendor_id_size = sizeof(vendor_id);
+  const LONG status = RegGetValueA(HKEY_LOCAL_MACHINE,
+                                   "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+                                   "VendorIdentifier",
+                                   RRF_RT_REG_SZ,
+                                   nullptr,
+                                   vendor_id,
+                                   &vendor_id_size);
+  if (status == ERROR_SUCCESS) {
+    return vendor_id;
+  }
 #else
   /* Get from /proc/cpuinfo on Unix systems. */
   FILE *cpuinfo = fopen("/proc/cpuinfo", "r");