From 286e535071c8f9a906c6c36b8dac0eda6384c79a Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Mon, 8 Aug 2022 17:45:37 +0200
Subject: [PATCH 1/3] Cleanup: simplify CPU instruction checking

The performance of this will be slightly more important for upcoming changes.
Also removed an unused function and changed includes so these system.h can
be included in more places.
---
 intern/cycles/util/system.cpp | 91 ++++++++++++-----------------------
 intern/cycles/util/system.h   | 11 ++---
 intern/cycles/util/vector.h   |  1 -
 3 files changed, 36 insertions(+), 67 deletions(-)

diff --git a/intern/cycles/util/system.cpp b/intern/cycles/util/system.cpp
index a13ad95b9fe..3183ac06f26 100644
--- a/intern/cycles/util/system.cpp
+++ b/intern/cycles/util/system.cpp
@@ -128,53 +128,42 @@ int system_cpu_bits()
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
 
 struct CPUCapabilities {
-  bool x64;
-  bool mmx;
-  bool sse;
   bool sse2;
   bool sse3;
-  bool ssse3;
   bool sse41;
-  bool sse42;
-  bool sse4a;
   bool avx;
-  bool f16c;
   bool avx2;
-  bool xop;
-  bool fma3;
-  bool fma4;
-  bool bmi1;
-  bool bmi2;
 };
 
 static CPUCapabilities &system_cpu_capabilities()
 {
-  static CPUCapabilities caps;
+  static CPUCapabilities caps = {};
   static bool caps_init = false;
 
   if (!caps_init) {
     int result[4], num;
 
-    memset(&caps, 0, sizeof(caps));
-
     __cpuid(result, 0);
     num = result[0];
 
     if (num >= 1) {
       __cpuid(result, 0x00000001);
-      caps.mmx = (result[3] & ((int)1 << 23)) != 0;
-      caps.sse = (result[3] & ((int)1 << 25)) != 0;
-      caps.sse2 = (result[3] & ((int)1 << 26)) != 0;
-      caps.sse3 = (result[2] & ((int)1 << 0)) != 0;
+      const bool sse = (result[3] & ((int)1 << 25)) != 0;
+      const bool sse2 = (result[3] & ((int)1 << 26)) != 0;
+      const bool sse3 = (result[2] & ((int)1 << 0)) != 0;
 
-      caps.ssse3 = (result[2] & ((int)1 << 9)) != 0;
-      caps.sse41 = (result[2] & ((int)1 << 19)) != 0;
-      caps.sse42 = (result[2] & ((int)1 << 20)) != 0;
+      const bool ssse3 = (result[2] & ((int)1 << 9)) != 0;
+      const bool sse41 = (result[2] & ((int)1 << 19)) != 0;
+      /* const bool sse42 = (result[2] & ((int)1 << 20)) != 0; */
 
-      caps.fma3 = (result[2] & ((int)1 << 12)) != 0;
-      caps.avx = false;
-      bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
-      bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
+      const bool fma3 = (result[2] & ((int)1 << 12)) != 0;
+      const bool os_uses_xsave_xrestore = (result[2] & ((int)1 << 27)) != 0;
+      const bool cpu_avx_support = (result[2] & ((int)1 << 28)) != 0;
+
+      /* Simplify to combined capabilities for which we specialize kernels. */
+      caps.sse2 = sse && sse2;
+      caps.sse3 = sse && sse2 && sse3 && ssse3;
+      caps.sse41 = sse && sse2 && sse3 && ssse3 && sse41;
 
       if (os_uses_xsave_xrestore && cpu_avx_support) {
         // Check if the OS will save the YMM registers
@@ -189,15 +178,18 @@ static CPUCapabilities &system_cpu_capabilities()
 #  else
         xcr_feature_mask = 0;
 #  endif
-        caps.avx = (xcr_feature_mask & 0x6) == 0x6;
+        const bool avx = (xcr_feature_mask & 0x6) == 0x6;
+        const bool f16c = (result[2] & ((int)1 << 29)) != 0;
+
+        __cpuid(result, 0x00000007);
+        bool bmi1 = (result[1] & ((int)1 << 3)) != 0;
+        bool bmi2 = (result[1] & ((int)1 << 8)) != 0;
+        bool avx2 = (result[1] & ((int)1 << 5)) != 0;
+
+        caps.avx = sse && sse2 && sse3 && ssse3 && sse41 && avx;
+        caps.avx2 = sse && sse2 && sse3 && ssse3 && sse41 && avx && f16c && avx2 && fma3 && bmi1 &&
+                    bmi2;
       }
-
-      caps.f16c = (result[2] & ((int)1 << 29)) != 0;
-
-      __cpuid(result, 0x00000007);
-      caps.bmi1 = (result[1] & ((int)1 << 3)) != 0;
-      caps.bmi2 = (result[1] & ((int)1 << 8)) != 0;
-      caps.avx2 = (result[1] & ((int)1 << 5)) != 0;
     }
 
     caps_init = true;
@@ -209,32 +201,31 @@ static CPUCapabilities &system_cpu_capabilities()
 bool system_cpu_support_sse2()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2;
+  return caps.sse2;
 }
 
 bool system_cpu_support_sse3()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3;
+  return caps.sse3;
 }
 
 bool system_cpu_support_sse41()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41;
+  return caps.sse41;
 }
 
 bool system_cpu_support_avx()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx;
+  return caps.avx;
 }
 
 bool system_cpu_support_avx2()
 {
   CPUCapabilities &caps = system_cpu_capabilities();
-  return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c &&
-         caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2;
+  return caps.avx2;
 }
 #else
 
@@ -264,26 +255,6 @@ bool system_cpu_support_avx2()
 
 #endif
 
-bool system_call_self(const vector<string> &args)
-{
-  /* Escape program and arguments in case they contain spaces. */
-  string cmd = "\"" + Sysutil::this_program_path() + "\"";
-
-  for (int i = 0; i < args.size(); i++) {
-    cmd += " \"" + args[i] + "\"";
-  }
-
-#ifdef _WIN32
-  /* Use cmd /S to avoid issues with spaces in arguments. */
-  cmd = "cmd /S /C \"" + cmd + " > nul \"";
-#else
-  /* Quiet output. */
-  cmd += " > /dev/null";
-#endif
-
-  return (system(cmd.c_str()) == 0);
-}
-
 size_t system_physical_ram()
 {
 #ifdef _WIN32
diff --git a/intern/cycles/util/system.h b/intern/cycles/util/system.h
index 23dcfdd303a..2152b89ed24 100644
--- a/intern/cycles/util/system.h
+++ b/intern/cycles/util/system.h
@@ -4,15 +4,17 @@
 #ifndef __UTIL_SYSTEM_H__
 #define __UTIL_SYSTEM_H__
 
-#include "util/string.h"
-#include "util/vector.h"
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <string>
 
 CCL_NAMESPACE_BEGIN
 
 /* Get width in characters of the current console output. */
 int system_console_width();
 
-string system_cpu_brand_string();
+std::string system_cpu_brand_string();
 int system_cpu_bits();
 bool system_cpu_support_sse2();
 bool system_cpu_support_sse3();
@@ -22,9 +24,6 @@ bool system_cpu_support_avx2();
 
 size_t system_physical_ram();
 
-/* Start a new process of the current application with the given arguments. */
-bool system_call_self(const vector<string> &args);
-
 /* Get identifier of the currently running process. */
 uint64_t system_self_process_id();
 
diff --git a/intern/cycles/util/vector.h b/intern/cycles/util/vector.h
index 0056fb269ae..9e27997cf2c 100644
--- a/intern/cycles/util/vector.h
+++ b/intern/cycles/util/vector.h
@@ -10,7 +10,6 @@
 
 #include "util/aligned_malloc.h"
 #include "util/guarded_allocator.h"
-#include "util/types.h"
 
 CCL_NAMESPACE_BEGIN
 

From 230f9ade64844a50ea02461cfa005f364de09aa9 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Tue, 9 Aug 2022 14:28:19 +0200
Subject: [PATCH 2/3] Cycles: make transform inverse match Embree exactly

Helps improve ray-tracing precision. This is a bit complicated as it requires
different implementation depending on the CPU architecture.
---
 intern/cycles/kernel/CMakeLists.txt    |  1 +
 intern/cycles/util/CMakeLists.txt      |  9 +++
 intern/cycles/util/transform.cpp       |  4 +-
 intern/cycles/util/transform.h         | 65 ++++++++++------------
 intern/cycles/util/transform_avx2.cpp  | 13 +++++
 intern/cycles/util/transform_inverse.h | 76 ++++++++++++++++++++++++++
 intern/cycles/util/transform_sse41.cpp | 13 +++++
 7 files changed, 142 insertions(+), 39 deletions(-)
 create mode 100644 intern/cycles/util/transform_avx2.cpp
 create mode 100644 intern/cycles/util/transform_inverse.h
 create mode 100644 intern/cycles/util/transform_sse41.cpp

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 96b5842b77d..b00515eb037 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -326,6 +326,7 @@ set(SRC_UTIL_HEADERS
   ../util/rect.h
   ../util/static_assert.h
   ../util/transform.h
+  ../util/transform_inverse.h
   ../util/texture.h
   ../util/types.h
   ../util/types_float2.h
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 9bc9f00e142..f3fc7739f66 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -26,6 +26,8 @@ set(SRC
   thread.cpp
   time.cpp
   transform.cpp
+  transform_avx2.cpp
+  transform_sse41.cpp
   windows.cpp
 )
 
@@ -138,6 +140,13 @@ set(SRC_HEADERS
   xml.h
 )
 
+if(CXX_HAS_SSE)
+  set_source_files_properties(transform_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+endif()
+if(CXX_HAS_AVX2)
+  set_source_files_properties(transform_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+endif()
+
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
diff --git a/intern/cycles/util/transform.cpp b/intern/cycles/util/transform.cpp
index 0b87e88871d..cb985c65dd8 100644
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Transform Inverse */
 
-static bool transform_matrix4_gj_inverse(float R[][4], float M[][4])
+static bool projection_matrix4_inverse(float R[][4], float M[][4])
 {
   /* SPDX-License-Identifier: BSD-3-Clause
    * Adapted from code:
@@ -98,7 +98,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
   memcpy(R, &tfmR, sizeof(R));
   memcpy(M, &tfm, sizeof(M));
 
-  if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
+  if (UNLIKELY(!projection_matrix4_inverse(R, M))) {
     return projection_identity();
   }
 
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 71164efbac1..24184dc7074 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -11,6 +11,10 @@
 #include "util/math.h"
 #include "util/types.h"
 
+#ifndef __KERNEL_GPU__
+#  include "util/system.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* Affine transformation, stored as 4x3 matrix. */
@@ -38,6 +42,12 @@ typedef struct DecomposedTransform {
   float4 x, y, z, w;
 } DecomposedTransform;
 
+CCL_NAMESPACE_END
+
+#include "util/transform_inverse.h"
+
+CCL_NAMESPACE_BEGIN
+
 /* Functions */
 
 #ifdef __KERNEL_METAL__
@@ -391,47 +401,28 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
 #endif /* defined(__KERNEL_GPU_RAYTRACING__) */
 }
 
+#ifndef __KERNEL_GPU__
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm);
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm);
+#endif
+
 ccl_device_inline Transform transform_inverse(const Transform tfm)
 {
-  /* This implementation matches the one in Embree exactly, to ensure consistent
-   * results with the ray intersection of instances. */
-  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
-  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
-  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
-  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-
-  /* Compute determinant. */
-  float det = dot(x, cross(y, z));
-
-  if (det == 0.0f) {
-    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
-     * never be in this situation, but try to invert it anyway with tweak.
-     *
-     * This logic does not match Embree which would just give an invalid
-     * matrix. A better solution would be to remove this and ensure any object
-     * matrix is valid. */
-    x.x += 1e-8f;
-    y.y += 1e-8f;
-    z.z += 1e-8f;
-
-    det = dot(x, cross(y, z));
-    if (det == 0.0f) {
-      det = FLT_MAX;
-    }
+  /* Optimized transform implementations. */
+#ifndef __KERNEL_GPU__
+  if (system_cpu_support_avx2()) {
+    Transform itfm;
+    transform_inverse_cpu_avx2(tfm, itfm);
+    return itfm;
   }
+  else if (system_cpu_support_sse41()) {
+    Transform itfm;
+    transform_inverse_cpu_sse41(tfm, itfm);
+    return itfm;
+  }
+#endif
 
-  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
-  const float3 inverse_x = cross(y, z) / det;
-  const float3 inverse_y = cross(z, x) / det;
-  const float3 inverse_z = cross(x, y) / det;
-
-  /* Compute translation and fill transform. */
-  Transform itfm;
-  itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
-  itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
-  itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));
-
-  return itfm;
+  return transform_inverse_impl(tfm);
 }
 
 ccl_device_inline void transform_compose(ccl_private Transform *tfm,
diff --git a/intern/cycles/util/transform_avx2.cpp b/intern/cycles/util/transform_avx2.cpp
new file mode 100644
index 00000000000..57c160388e2
--- /dev/null
+++ b/intern/cycles/util/transform_avx2.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_avx2(const Transform &tfm, Transform &itfm)
+{
+  itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_inverse.h b/intern/cycles/util/transform_inverse.h
new file mode 100644
index 00000000000..07fd06c1467
--- /dev/null
+++ b/intern/cycles/util/transform_inverse.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Custom cross and dot implementations that match Embree bit for bit.
+ * Normally we don't use SSE41/AVX outside the kernel, but for this it's
+ * important to match exactly for ray tracing precision. */
+
+ccl_device_forceinline float3 transform_inverse_cross(const float3 a, const float3 b)
+{
+#ifdef __AVX2__
+  const ssef sse_a = (const __m128 &)a;
+  const ssef sse_b = (const __m128 &)b;
+  const ssef r = shuffle<1, 2, 0, 3>(
+      ssef(_mm_fmsub_ps(sse_a, shuffle<1, 2, 0, 3>(sse_b), shuffle<1, 2, 0, 3>(sse_a) * sse_b)));
+  return (const float3 &)r;
+#endif
+
+  return cross(a, b);
+}
+
+ccl_device_forceinline float transform_inverse_dot(const float3 a, const float3 b)
+{
+#ifdef __SSE4_1__
+  return _mm_cvtss_f32(_mm_dp_ps((const __m128 &)a, (const __m128 &)b, 0x7F));
+#endif
+
+  return dot(a, b);
+}
+
+ccl_device_inline Transform transform_inverse_impl(const Transform tfm)
+{
+  /* This implementation matches the one in Embree exactly, to ensure consistent
+   * results with the ray intersection of instances. */
+  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+  /* Compute determinant. */
+  float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
+
+  if (det == 0.0f) {
+    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+     * never be in this situation, but try to invert it anyway with tweak.
+     *
+     * This logic does not match Embree which would just give an invalid
+     * matrix. A better solution would be to remove this and ensure any object
+     * matrix is valid. */
+    x.x += 1e-8f;
+    y.y += 1e-8f;
+    z.z += 1e-8f;
+
+    det = transform_inverse_dot(x, cross(y, z));
+    if (det == 0.0f) {
+      det = FLT_MAX;
+    }
+  }
+
+  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+  const float3 inverse_x = transform_inverse_cross(y, z) / det;
+  const float3 inverse_y = transform_inverse_cross(z, x) / det;
+  const float3 inverse_z = transform_inverse_cross(x, y) / det;
+
+  /* Compute translation and fill transform. */
+  Transform itfm;
+  itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
+  itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
+  itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
+
+  return itfm;
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/util/transform_sse41.cpp b/intern/cycles/util/transform_sse41.cpp
new file mode 100644
index 00000000000..8a698807a9c
--- /dev/null
+++ b/intern/cycles/util/transform_sse41.cpp
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "util/transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+void transform_inverse_cpu_sse41(const Transform &tfm, Transform &itfm)
+{
+  itfm = transform_inverse_impl(tfm);
+}
+
+CCL_NAMESPACE_END

From 79f1cc601cdbcf142e1bf4c1966f64dcf93b030f Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Mon, 18 Jul 2022 21:07:06 +0200
Subject: [PATCH 3/3] Cycles: improve ray tracing precision near triangle edges

Detect cases where a ray-intersection would miss the current triangle, which if
the intersection is strictly watertight, implies that a neighboring triangle would
incorrectly be hit instead.

When that is detected, apply a ray-offset. The idea being that we only want to
introduce potential error from ray offsets if we really need to.

This work for BVH2 and Embree, as we are able to match the ray-interesction
bit-for-bit, though doing so for Embree requires ugly hacks. Tiny differences
like fused-multiply-add or dot product intrinstics in matrix inversion and ray
intersection needed to be matched exactly, so this is fragile.

Unfortunately we're not able to do the same for OptiX or MetalRT, since those
implementations are unknown (and possibly impossible to match as hardware
instructions). Still artifacts are much reduced, though not eliminated.

Ref T97259

Differential Revision: https://developer.blender.org/D15559
---
 intern/cycles/kernel/bvh/util.h               | 24 +++++
 .../cycles/kernel/integrator/shade_surface.h  | 58 +++++++++++-
 intern/cycles/util/math_intersect.h           | 92 +++++++++++++++++--
 3 files changed, 165 insertions(+), 9 deletions(-)

diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index b67c9394bea..a57703a8b8c 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -33,6 +33,30 @@ ccl_device_forceinline float intersection_t_offset(const float t)
   return __uint_as_float(bits);
 }
 
+/* Ray offset to avoid self intersection.
+ *
+ * This function can be used to compute a modified ray start position for rays
+ * leaving from a surface. This is from:
+ * "A Fast and Robust Method for Avoiding Self-Intersection"
+ * Ray Tracing Gems, chapter 6.
+ */
+ccl_device_inline float3 ray_offset(const float3 P, const float3 Ng)
+{
+  const float int_scale = 256.0f;
+  const int3 of_i = make_int3(
+      (int)(int_scale * Ng.x), (int)(int_scale * Ng.y), (int)(int_scale * Ng.z));
+
+  const float3 p_i = make_float3(
+      __int_as_float(__float_as_int(P.x) + ((P.x < 0) ? -of_i.x : of_i.x)),
+      __int_as_float(__float_as_int(P.y) + ((P.y < 0) ? -of_i.y : of_i.y)),
+      __int_as_float(__float_as_int(P.z) + ((P.z < 0) ? -of_i.z : of_i.z)));
+  const float origin = 1.0f / 32.0f;
+  const float float_scale = 1.0f / 65536.0f;
+  return make_float3(fabsf(P.x) < origin ? P.x + float_scale * Ng.x : p_i.x,
+                     fabsf(P.y) < origin ? P.y + float_scale * Ng.y : p_i.y,
+                     fabsf(P.z) < origin ? P.z + float_scale * Ng.z : p_i.z);
+}
+
 #ifndef __KERNEL_GPU__
 ccl_device int intersections_compare(const void *a, const void *b)
 {
diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h
index 70b20a93b6a..19b8946e865 100644
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -31,6 +31,52 @@ ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg,
   shader_setup_from_ray(kg, sd, &ray, &isect);
 }
 
+ccl_device_forceinline float3 integrate_surface_ray_offset(KernelGlobals kg,
+                                                           const ccl_private ShaderData *sd,
+                                                           const float3 ray_P,
+                                                           const float3 ray_D)
+{
+  /* No ray offset needed for other primitive types. */
+  if (!(sd->type & PRIMITIVE_TRIANGLE)) {
+    return ray_P;
+  }
+
+  /* Self intersection tests already account for the case where a ray hits the
+   * same primitive. However precision issues can still cause neighboring
+   * triangles to be hit. Here we test if the ray-triangle intersection with
+   * the same primitive would miss, implying that a neighbouring triangle would
+   * be hit instead.
+   *
+   * This relies on triangle intersection to be watertight, and the object inverse
+   * object transform to match the one used by ray intersection exactly.
+   *
+   * Potential improvements:
+   * - It appears this happens when either barycentric coordinates are small,
+   *   or dot(sd->Ng, ray_D)  is small. Detect such cases and skip test?
+   * - Instead of ray offset, can we tweak P to lie within the triangle?
+   */
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, sd->prim).w;
+  const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
+                      tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
+                      tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+
+  float3 local_ray_P = ray_P;
+  float3 local_ray_D = ray_D;
+
+  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+    const Transform itfm = object_get_inverse_transform(kg, sd);
+    local_ray_P = transform_point(&itfm, local_ray_P);
+    local_ray_D = transform_direction(&itfm, local_ray_D);
+  }
+
+  if (ray_triangle_intersect_self(local_ray_P, local_ray_D, tri_a, tri_b, tri_c)) {
+    return ray_P;
+  }
+  else {
+    return ray_offset(ray_P, sd->Ng);
+  }
+}
+
 #ifdef __HOLDOUT__
 ccl_device_forceinline bool integrate_surface_holdout(KernelGlobals kg,
                                                       ConstIntegratorState state,
@@ -200,6 +246,10 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
 #  endif
   }
 
+  if (ray.self.object != OBJECT_NONE) {
+    ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
+  }
+
   /* Write shadow ray and associated state to global memory. */
   integrator_state_write_shadow_ray(kg, shadow_state, &ray);
   // Save memory by storing the light and object indices in the shadow_isect
@@ -327,8 +377,9 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
   }
   else {
     /* Setup ray with changed origin and direction. */
-    INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
-    INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in);
+    const float3 D = normalize(bsdf_omega_in);
+    INTEGRATOR_STATE_WRITE(state, ray, P) = integrate_surface_ray_offset(kg, sd, sd->P, D);
+    INTEGRATOR_STATE_WRITE(state, ray, D) = D;
     INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
     INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
@@ -422,6 +473,9 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
   Ray ray ccl_optional_struct_init;
   ray.P = shadow_ray_offset(kg, sd, ao_D, &skip_self);
   ray.D = ao_D;
+  if (skip_self) {
+    ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
+  }
   ray.tmin = 0.0f;
   ray.tmax = kernel_data.integrator.ao_bounces_distance;
   ray.time = sd->time;
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index cc07cbe7745..aa28682f8c1 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -105,6 +105,51 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
   return false;
 }
 
+/* Custom rcp, cross and dot implementations that match Embree bit for bit. */
+ccl_device_forceinline float ray_triangle_rcp(const float x)
+{
+#ifdef __KERNEL_NEON__
+  /* Move scalar to vector register and do rcp. */
+  __m128 a;
+  a[0] = x;
+  float32x4_t reciprocal = vrecpeq_f32(a);
+  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+  reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+  return reciprocal[0];
+#elif defined(__KERNEL_SSE__)
+  const __m128 a = _mm_set_ss(x);
+  const __m128 r = _mm_rcp_ss(a);
+
+#  ifdef __KERNEL_AVX2_
+  return _mm_cvtss_f32(_mm_mul_ss(r, _mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
+#  else
+  return _mm_cvtss_f32(_mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+#  endif
+#else
+  return 1.0f / x;
+#endif
+}
+
+ccl_device_inline float ray_triangle_dot(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+  return madd(ssef(a.x), ssef(b.x), madd(ssef(a.y), ssef(b.y), ssef(a.z) * ssef(b.z)))[0];
+#else
+  return a.x * b.x + a.y * b.y + a.z * b.z;
+#endif
+}
+
+ccl_device_inline float3 ray_triangle_cross(const float3 a, const float3 b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+  return make_float3(msub(ssef(a.y), ssef(b.z), ssef(a.z) * ssef(b.y))[0],
+                     msub(ssef(a.z), ssef(b.x), ssef(a.x) * ssef(b.z))[0],
+                     msub(ssef(a.x), ssef(b.y), ssef(a.y) * ssef(b.x))[0]);
+#else
+  return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#endif
+}
+
 ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
                                                    const float3 ray_D,
                                                    const float ray_tmin,
@@ -130,9 +175,9 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
   const float3 e2 = v1 - v2;
 
   /* Perform edge tests. */
-  const float U = dot(cross(e0, v2 + v0), ray_D);
-  const float V = dot(cross(e1, v0 + v1), ray_D);
-  const float W = dot(cross(e2, v1 + v2), ray_D);
+  const float U = ray_triangle_dot(ray_triangle_cross(e0, v2 + v0), ray_D);
+  const float V = ray_triangle_dot(ray_triangle_cross(e1, v0 + v1), ray_D);
+  const float W = ray_triangle_dot(ray_triangle_cross(e2, v1 + v2), ray_D);
 
   const float UVW = U + V + W;
   const float eps = FLT_EPSILON * fabsf(UVW);
@@ -144,7 +189,7 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
   }
 
   /* Calculate geometry normal and denominator. */
-  const float3 Ng1 = cross(e1, e0);
+  const float3 Ng1 = ray_triangle_cross(e1, e0);
   const float3 Ng = Ng1 + Ng1;
   const float den = dot(Ng, ray_D);
   /* Avoid division by 0. */
@@ -159,13 +204,46 @@ ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
     return false;
   }
 
-  const float rcp_UVW = (fabsf(UVW) < 1e-18f) ? 0.0f : 1.0f / UVW;
-  *isect_u = min(U * rcp_UVW, 1.0f);
-  *isect_v = min(V * rcp_UVW, 1.0f);
+  const float rcp_uvw = (fabsf(UVW) < 1e-18f) ? 0.0f : ray_triangle_rcp(UVW);
+  *isect_u = min(U * rcp_uvw, 1.0f);
+  *isect_v = min(V * rcp_uvw, 1.0f);
   *isect_t = t;
   return true;
 }
 
+ccl_device_forceinline bool ray_triangle_intersect_self(const float3 ray_P,
+                                                        const float3 ray_D,
+                                                        const float3 tri_a,
+                                                        const float3 tri_b,
+                                                        const float3 tri_c)
+{
+  /* Matches logic in ray_triangle_intersect, self intersection test to validate
+   * if a ray is going to hit self or might incorrectly hit a neighboring triangle. */
+
+  /* Calculate vertices relative to ray origin. */
+  const float3 v0 = tri_a - ray_P;
+  const float3 v1 = tri_b - ray_P;
+  const float3 v2 = tri_c - ray_P;
+
+  /* Calculate triangle edges. */
+  const float3 e0 = v2 - v0;
+  const float3 e1 = v0 - v1;
+  const float3 e2 = v1 - v2;
+
+  /* Perform edge tests. */
+  const float U = ray_triangle_dot(ray_triangle_cross(v2 + v0, e0), ray_D);
+  const float V = ray_triangle_dot(ray_triangle_cross(v0 + v1, e1), ray_D);
+  const float W = ray_triangle_dot(ray_triangle_cross(v1 + v2, e2), ray_D);
+
+  const float eps = FLT_EPSILON * fabsf(U + V + W);
+  const float minUVW = min(U, min(V, W));
+  const float maxUVW = max(U, max(V, W));
+
+  /* Note the extended epsilon compared to ray_triangle_intersect, to account
+   * for intersections with neighboring triangles that have an epsilon. */
+  return (minUVW >= eps || maxUVW <= -eps);
+}
+
 /* Tests for an intersection between a ray and a quad defined by
  * its midpoint, normal and sides.
  * If ellipse is true, hits outside the ellipse that's enclosed by the