Cycles: add HIP RT device, for AMD hardware ray tracing on Windows

HIP RT enables AMD hardware ray tracing on RDNA2 and above, and falls back to a
shader implementation for older graphics cards. It offers an average 25%
sample rendering rate improvement in Cycles benchmarks, on a W6800 card.

The ray tracing feature functions are accessed through HIP RT SDK, available on
GPUOpen. HIP RT traversal functionality is pre-compiled in bitcode format and
shipped with the SDK.

This is not yet enabled as there are issues to be resolved, but landing the
code now makes testing and further changes easier.

Known limitations:
* Not working yet with current public AMD drivers.
* Visual artifact in motion blur.
* One of the buffers allocated for traversal has a static size. Allocating it
  dynamically would reduce memory usage.
* This is for Windows only currently, no Linux support.

Co-authored-by: Brecht Van Lommel <brecht@blender.org>

Ref #105538
This commit is contained in:
Sahar A. Kashi
2023-04-24 19:05:30 +02:00
committed by Brecht Van Lommel
parent 7026d9ac43
commit 557a245dd5
33 changed files with 2794 additions and 39 deletions

View File

@@ -525,6 +525,12 @@ if(NOT APPLE)
set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for") set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
mark_as_advanced(WITH_CYCLES_DEVICE_HIP) mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
mark_as_advanced(CYCLES_HIP_BINARIES_ARCH) mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)
# HIPRT is only available on Windows for now.
if(WIN32)
option(WITH_CYCLES_DEVICE_HIPRT "Enable Cycles AMD HIPRT support" OFF)
mark_as_advanced(WITH_CYCLES_DEVICE_HIPRT)
endif()
endif() endif()
# Apple Metal # Apple Metal
@@ -1981,10 +1987,13 @@ if(FIRST_RUN)
info_cfg_option(WITH_CYCLES_DEVICE_OPTIX) info_cfg_option(WITH_CYCLES_DEVICE_OPTIX)
info_cfg_option(WITH_CYCLES_DEVICE_CUDA) info_cfg_option(WITH_CYCLES_DEVICE_CUDA)
info_cfg_option(WITH_CYCLES_CUDA_BINARIES) info_cfg_option(WITH_CYCLES_CUDA_BINARIES)
info_cfg_option(WITH_CYCLES_DEVICE_HIP)
info_cfg_option(WITH_CYCLES_HIP_BINARIES)
info_cfg_option(WITH_CYCLES_DEVICE_ONEAPI) info_cfg_option(WITH_CYCLES_DEVICE_ONEAPI)
info_cfg_option(WITH_CYCLES_ONEAPI_BINARIES) info_cfg_option(WITH_CYCLES_ONEAPI_BINARIES)
info_cfg_option(WITH_CYCLES_DEVICE_HIP)
info_cfg_option(WITH_CYCLES_HIP_BINARIES)
endif()
if(WIN32)
info_cfg_option(WITH_CYCLES_DEVICE_HIPRT)
endif() endif()
endif() endif()

View File

@@ -249,6 +249,13 @@ endif()
if(WITH_CYCLES_DEVICE_HIP) if(WITH_CYCLES_DEVICE_HIP)
add_definitions(-DWITH_HIP) add_definitions(-DWITH_HIP)
if(WITH_CYCLES_DEVICE_HIPRT)
include_directories(
${HIPRT_INCLUDE_DIR}
)
add_definitions(-DWITH_HIPRT)
endif()
if(WITH_HIP_DYNLOAD) if(WITH_HIP_DYNLOAD)
include_directories( include_directories(
../../extern/hipew/include ../../extern/hipew/include

View File

@@ -1507,7 +1507,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context): def get_device_types(self, context):
import _cycles import _cycles
has_cuda, has_optix, has_hip, has_metal, has_oneapi = _cycles.get_device_types() has_cuda, has_optix, has_hip, has_metal, has_oneapi, has_hiprt = _cycles.get_device_types()
list = [('NONE', "None", "Don't use compute device", 0)] list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda: if has_cuda:
@@ -1544,6 +1544,13 @@ class CyclesPreferences(bpy.types.AddonPreferences):
default=False, default=False,
) )
use_hiprt: BoolProperty(
name="HIP RT (Experimental)",
description="HIP RT enables AMD hardware ray tracing on RDNA2 and above, with shader fallback on older cards. "
"This feature is experimental and some scenes may render incorrectly",
default=False,
)
use_oneapirt: BoolProperty( use_oneapirt: BoolProperty(
name="Embree on GPU (Experimental)", name="Embree on GPU (Experimental)",
description="Embree GPU execution will allow to use hardware ray tracing on Intel GPUs, which will provide better performance. " description="Embree GPU execution will allow to use hardware ray tracing on Intel GPUs, which will provide better performance. "
@@ -1770,7 +1777,13 @@ class CyclesPreferences(bpy.types.AddonPreferences):
col.prop(self, "kernel_optimization_level") col.prop(self, "kernel_optimization_level")
col.prop(self, "use_metalrt") col.prop(self, "use_metalrt")
if compute_device_type == 'ONEAPI' and _cycles.with_embree_gpu: if compute_device_type == 'HIP':
has_cuda, has_optix, has_hip, has_metal, has_oneapi, has_hiprt = _cycles.get_device_types()
row = layout.row()
row.enabled = has_hiprt
row.prop(self, "use_hiprt")
elif compute_device_type == 'ONEAPI' and _cycles.with_embree_gpu:
row = layout.row() row = layout.row()
row.use_property_split = True row.use_property_split = True
row.prop(self, "use_oneapirt") row.prop(self, "use_oneapirt")

View File

@@ -124,6 +124,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences,
info.use_hardware_raytracing = false; info.use_hardware_raytracing = false;
} }
if (info.type == DEVICE_HIP && !get_boolean(cpreferences, "use_hiprt")) {
info.use_hardware_raytracing = false;
}
/* There is an accumulative logic here, because Multi-devices are support only for /* There is an accumulative logic here, because Multi-devices are support only for
* the same backend + CPU in Blender right now, and both oneAPI and Metal have a * the same backend + CPU in Blender right now, and both oneAPI and Metal have a
* global boolean backend setting (see above) for enabling/disabling HW RT, * global boolean backend setting (see above) for enabling/disabling HW RT,

View File

@@ -876,20 +876,23 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{ {
vector<DeviceType> device_types = Device::available_types(); vector<DeviceType> device_types = Device::available_types();
bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false, has_oneapi = false; bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false, has_oneapi = false,
has_hiprt = false;
foreach (DeviceType device_type, device_types) { foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA); has_cuda |= (device_type == DEVICE_CUDA);
has_optix |= (device_type == DEVICE_OPTIX); has_optix |= (device_type == DEVICE_OPTIX);
has_hip |= (device_type == DEVICE_HIP); has_hip |= (device_type == DEVICE_HIP);
has_metal |= (device_type == DEVICE_METAL); has_metal |= (device_type == DEVICE_METAL);
has_oneapi |= (device_type == DEVICE_ONEAPI); has_oneapi |= (device_type == DEVICE_ONEAPI);
has_hiprt |= (device_type == DEVICE_HIPRT);
} }
PyObject *list = PyTuple_New(5); PyObject *list = PyTuple_New(6);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix)); PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip)); PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal)); PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal));
PyTuple_SET_ITEM(list, 4, PyBool_FromLong(has_oneapi)); PyTuple_SET_ITEM(list, 4, PyBool_FromLong(has_oneapi));
PyTuple_SET_ITEM(list, 5, PyBool_FromLong(has_hiprt));
return list; return list;
} }

View File

@@ -14,6 +14,7 @@ set(SRC
binning.cpp binning.cpp
build.cpp build.cpp
embree.cpp embree.cpp
hiprt.cpp
multi.cpp multi.cpp
node.cpp node.cpp
optix.cpp optix.cpp
@@ -39,6 +40,7 @@ set(SRC_HEADERS
binning.h binning.h
build.h build.h
embree.h embree.h
hiprt.h
multi.h multi.h
node.h node.h
optix.h optix.h

View File

@@ -6,6 +6,7 @@
#include "bvh/bvh2.h" #include "bvh/bvh2.h"
#include "bvh/embree.h" #include "bvh/embree.h"
#include "bvh/hiprt.h"
#include "bvh/metal.h" #include "bvh/metal.h"
#include "bvh/multi.h" #include "bvh/multi.h"
#include "bvh/optix.h" #include "bvh/optix.h"
@@ -30,10 +31,14 @@ const char *bvh_layout_name(BVHLayout layout)
return "OPTIX"; return "OPTIX";
case BVH_LAYOUT_METAL: case BVH_LAYOUT_METAL:
return "METAL"; return "METAL";
case BVH_LAYOUT_HIPRT:
return "HIPRT";
case BVH_LAYOUT_MULTI_OPTIX: case BVH_LAYOUT_MULTI_OPTIX:
case BVH_LAYOUT_MULTI_METAL: case BVH_LAYOUT_MULTI_METAL:
case BVH_LAYOUT_MULTI_HIPRT:
case BVH_LAYOUT_MULTI_OPTIX_EMBREE: case BVH_LAYOUT_MULTI_OPTIX_EMBREE:
case BVH_LAYOUT_MULTI_METAL_EMBREE: case BVH_LAYOUT_MULTI_METAL_EMBREE:
case BVH_LAYOUT_MULTI_HIPRT_EMBREE:
return "MULTI"; return "MULTI";
case BVH_LAYOUT_ALL: case BVH_LAYOUT_ALL:
return "ALL"; return "ALL";
@@ -101,11 +106,20 @@ BVH *BVH::create(const BVHParams &params,
#else #else
(void)device; (void)device;
break; break;
#endif
case BVH_LAYOUT_HIPRT:
#ifdef WITH_HIPRT
return new BVHHIPRT(params, geometry, objects, device);
#else
(void)device;
break;
#endif #endif
case BVH_LAYOUT_MULTI_OPTIX: case BVH_LAYOUT_MULTI_OPTIX:
case BVH_LAYOUT_MULTI_METAL: case BVH_LAYOUT_MULTI_METAL:
case BVH_LAYOUT_MULTI_HIPRT:
case BVH_LAYOUT_MULTI_OPTIX_EMBREE: case BVH_LAYOUT_MULTI_OPTIX_EMBREE:
case BVH_LAYOUT_MULTI_METAL_EMBREE: case BVH_LAYOUT_MULTI_METAL_EMBREE:
case BVH_LAYOUT_MULTI_HIPRT_EMBREE:
return new BVHMulti(params, geometry, objects); return new BVHMulti(params, geometry, objects);
case BVH_LAYOUT_NONE: case BVH_LAYOUT_NONE:
case BVH_LAYOUT_ALL: case BVH_LAYOUT_ALL:

View File

@@ -0,0 +1,45 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2023 Blender Foundation */
#ifdef WITH_HIPRT
# include "bvh/hiprt.h"
# include "scene/mesh.h"
# include "scene/object.h"
# include "util/foreach.h"
# include "util/progress.h"
# include "device/hiprt/device_impl.h"
CCL_NAMESPACE_BEGIN
/* Construct a HIP RT BVH wrapper.
 *
 * Device-side buffers are bound to `in_device` up front; the HIP RT geometry
 * handle starts out null and is created later during the build step. */
BVHHIPRT::BVHHIPRT(const BVHParams &params,
const vector<Geometry *> &geometry,
const vector<Object *> &objects,
Device *in_device)
: BVH(params, geometry, objects),
hiprt_geom(0),
custom_primitive_bound(in_device, "Custom Primitive Bound", MEM_READ_ONLY),
triangle_index(in_device, "HIPRT Triangle Index", MEM_READ_ONLY),
vertex_data(in_device, "vertex_data", MEM_READ_ONLY),
device(in_device)
{
/* Zero-initialize the HIP RT primitive descriptors; they are filled in when
 * the geometry build input is prepared. */
triangle_mesh = {0};
custom_prim_aabb = {0};
}
/* Release device buffers and the HIP RT geometry.
 *
 * The HIP RT context is owned by the device, not by this BVH; it is only
 * needed here to destroy the geometry handle. */
BVHHIPRT::~BVHHIPRT()
{
/* NOTE(review): assumes `device` is always a HIPRTDevice — confirm no other
 * device type can own a BVHHIPRT. */
HIPRTDevice *hiprt_device = static_cast<HIPRTDevice *>(device);
hiprtContext hiprt_context = hiprt_device->get_hiprt_context();
custom_primitive_bound.free();
triangle_index.free();
vertex_data.free();
hiprtDestroyGeometry(hiprt_context, hiprt_geom);
}
CCL_NAMESPACE_END
#endif

58
intern/cycles/bvh/hiprt.h Normal file
View File

@@ -0,0 +1,58 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2023 Blender Foundation */
#ifdef WITH_HIPRT
# pragma once
# include "bvh/bvh.h"
# include "bvh/params.h"
# ifdef WITH_HIP_DYNLOAD
# include "hiprtew.h"
# else
# include <hiprt/hiprt_types.h>
# endif
# include "device/memory.h"
CCL_NAMESPACE_BEGIN
/* BVH implementation backed by AMD HIP RT.
 *
 * Holds the HIP RT geometry handle plus the host/device-side buffers needed
 * to build it, for both native triangle meshes and custom (AABB-based)
 * primitives such as motion-blurred triangles. Instances are created through
 * BVH::create() (hence the protected constructor). */
class BVHHIPRT : public BVH {
public:
friend class HIPDevice;
/* True when this BVH is the top-level acceleration structure. */
bool is_tlas()
{
return params.top_level;
}
/* HIP RT geometry handle and build descriptors. */
hiprtGeometry hiprt_geom;
hiprtTriangleMeshPrimitive triangle_mesh;
hiprtAABBListPrimitive custom_prim_aabb;
hiprtGeometryBuildInput geom_input;
vector<int2> custom_prim_info; /* x: prim_id, y: prim_type */
/* Per-primitive time ranges for motion blur. */
vector<float2> prims_time;
/* Custom primitives. */
device_vector<BoundBox> custom_primitive_bound;
device_vector<int> triangle_index;
device_vector<float> vertex_data;
protected:
friend class BVH;
BVHHIPRT(const BVHParams &params,
const vector<Geometry *> &geometry,
const vector<Object *> &objects,
Device *in_device);
virtual ~BVHHIPRT();
private:
/* Non-owning pointer to the device the buffers above live on. */
Device *device;
};
CCL_NAMESPACE_END
#endif

View File

@@ -41,20 +41,32 @@ endif()
# HIP # HIP
########################################################################### ###########################################################################
if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP) if(WITH_CYCLES_DEVICE_HIP)
if(UNIX) if(WITH_CYCLES_HIP_BINARIES)
# Disabled until there is a HIP 5.5 release for Linux. if(UNIX)
set(WITH_CYCLES_HIP_BINARIES OFF) # Disabled until there is a HIP 5.5 release for Linux.
message(STATUS "HIP temporarily disabled due to compiler bugs") set(WITH_CYCLES_HIP_BINARIES OFF)
else() message(STATUS "HIP temporarily disabled due to compiler bugs")
# Need at least HIP 5.5 to solve compiler bug affecting the kernel. else()
find_package(HIP 5.5.0) # Need at least HIP 5.5 to solve compiler bug affecting the kernel.
set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES) find_package(HIP 5.5.0)
set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES)
if(HIP_FOUND) if(HIP_FOUND)
message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})") message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})")
endif()
endif() endif()
endif() endif()
# HIP RT
if(WITH_CYCLES_DEVICE_HIP AND WITH_CYCLES_DEVICE_HIPRT)
find_package(HIPRT)
set_and_warn_library_found("HIP RT" HIPRT_FOUND WITH_CYCLES_DEVICE_HIPRT)
endif()
endif()
if(NOT WITH_CYCLES_DEVICE_HIP)
set(WITH_CYCLES_DEVICE_HIPRT OFF)
endif() endif()
if(NOT WITH_HIP_DYNLOAD) if(NOT WITH_HIP_DYNLOAD)

View File

@@ -66,6 +66,13 @@ set(SRC_HIP
hip/util.h hip/util.h
) )
set(SRC_HIPRT
hiprt/device_impl.cpp
hiprt/device_impl.h
hiprt/queue.cpp
hiprt/queue.h
)
set(SRC_ONEAPI set(SRC_ONEAPI
oneapi/device_impl.cpp oneapi/device_impl.cpp
oneapi/device_impl.h oneapi/device_impl.h
@@ -124,6 +131,7 @@ set(SRC
${SRC_CPU} ${SRC_CPU}
${SRC_CUDA} ${SRC_CUDA}
${SRC_HIP} ${SRC_HIP}
${SRC_HIPRT}
${SRC_DUMMY} ${SRC_DUMMY}
${SRC_MULTI} ${SRC_MULTI}
${SRC_OPTIX} ${SRC_OPTIX}
@@ -209,6 +217,7 @@ source_group("cpu" FILES ${SRC_CPU})
source_group("cuda" FILES ${SRC_CUDA}) source_group("cuda" FILES ${SRC_CUDA})
source_group("dummy" FILES ${SRC_DUMMY}) source_group("dummy" FILES ${SRC_DUMMY})
source_group("hip" FILES ${SRC_HIP}) source_group("hip" FILES ${SRC_HIP})
source_group("hiprt" FILES ${SRC_HIPRT})
source_group("multi" FILES ${SRC_MULTI}) source_group("multi" FILES ${SRC_MULTI})
source_group("metal" FILES ${SRC_METAL}) source_group("metal" FILES ${SRC_METAL})
source_group("optix" FILES ${SRC_OPTIX}) source_group("optix" FILES ${SRC_OPTIX})

View File

@@ -14,6 +14,7 @@
#include "device/cuda/device.h" #include "device/cuda/device.h"
#include "device/dummy/device.h" #include "device/dummy/device.h"
#include "device/hip/device.h" #include "device/hip/device.h"
#include "device/hiprt/device_impl.h"
#include "device/metal/device.h" #include "device/metal/device.h"
#include "device/multi/device.h" #include "device/multi/device.h"
#include "device/oneapi/device.h" #include "device/oneapi/device.h"
@@ -135,6 +136,8 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_METAL; return DEVICE_METAL;
else if (strcmp(name, "ONEAPI") == 0) else if (strcmp(name, "ONEAPI") == 0)
return DEVICE_ONEAPI; return DEVICE_ONEAPI;
else if (strcmp(name, "HIPRT") == 0)
return DEVICE_HIPRT;
return DEVICE_NONE; return DEVICE_NONE;
} }
@@ -155,6 +158,8 @@ string Device::string_from_type(DeviceType type)
return "METAL"; return "METAL";
else if (type == DEVICE_ONEAPI) else if (type == DEVICE_ONEAPI)
return "ONEAPI"; return "ONEAPI";
else if (type == DEVICE_HIPRT)
return "HIPRT";
return ""; return "";
} }
@@ -177,6 +182,10 @@ vector<DeviceType> Device::available_types()
#endif #endif
#ifdef WITH_ONEAPI #ifdef WITH_ONEAPI
types.push_back(DEVICE_ONEAPI); types.push_back(DEVICE_ONEAPI);
#endif
#ifdef WITH_HIPRT
if (hiprtewInit())
types.push_back(DEVICE_HIPRT);
#endif #endif
return types; return types;
} }

View File

@@ -40,6 +40,7 @@ enum DeviceType {
DEVICE_MULTI, DEVICE_MULTI,
DEVICE_OPTIX, DEVICE_OPTIX,
DEVICE_HIP, DEVICE_HIP,
DEVICE_HIPRT,
DEVICE_METAL, DEVICE_METAL,
DEVICE_ONEAPI, DEVICE_ONEAPI,
DEVICE_DUMMY, DEVICE_DUMMY,
@@ -79,8 +80,7 @@ class DeviceInfo {
bool has_profiling; /* Supports runtime collection of profiling info. */ bool has_profiling; /* Supports runtime collection of profiling info. */
bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
bool has_gpu_queue; /* Device supports GPU queue. */ bool has_gpu_queue; /* Device supports GPU queue. */
bool use_hardware_raytracing; /* Use hardware ray tracing to accelerate ray queries in a backend. bool use_hardware_raytracing; /* Use hardware instructions to accelerate ray tracing. */
*/
KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing
* kernels (Metal only). */ * kernels (Metal only). */
DenoiserTypeMask denoisers; /* Supported denoiser types. */ DenoiserTypeMask denoisers; /* Supported denoiser types. */

View File

@@ -13,6 +13,10 @@
# include "util/windows.h" # include "util/windows.h"
#endif /* WITH_HIP */ #endif /* WITH_HIP */
#ifdef WITH_HIPRT
# include "device/hiprt/device_impl.h"
#endif
CCL_NAMESPACE_BEGIN CCL_NAMESPACE_BEGIN
bool device_hip_init() bool device_hip_init()
@@ -65,7 +69,12 @@ bool device_hip_init()
Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{ {
#ifdef WITH_HIP #ifdef WITH_HIPRT
if (info.use_hardware_raytracing)
return new HIPRTDevice(info, stats, profiler);
else
return new HIPDevice(info, stats, profiler);
#elif defined(WITH_HIP)
return new HIPDevice(info, stats, profiler); return new HIPDevice(info, stats, profiler);
#else #else
(void)info; (void)info;
@@ -115,6 +124,12 @@ void device_hip_info(vector<DeviceInfo> &devices)
return; return;
} }
# ifdef WITH_HIPRT
const bool has_hardware_raytracing = hiprtewInit();
# else
const bool has_hardware_raytracing = false;
# endif
vector<DeviceInfo> display_devices; vector<DeviceInfo> display_devices;
for (int num = 0; num < count; num++) { for (int num = 0; num < count; num++) {
@@ -150,6 +165,8 @@ void device_hip_info(vector<DeviceInfo> &devices)
} }
} }
info.use_hardware_raytracing = has_hardware_raytracing;
int pci_location[3] = {0, 0, 0}; int pci_location[3] = {0, 0, 0};
hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num); hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num);
hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num); hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num);
@@ -176,6 +193,7 @@ void device_hip_info(vector<DeviceInfo> &devices)
VLOG_INFO << "Device has compute preemption or is not used for display."; VLOG_INFO << "Device has compute preemption or is not used for display.";
devices.push_back(info); devices.push_back(info);
} }
VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\"."; VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\".";
} }

View File

@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: Apache-2.0 /* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */ * Copyright 2011-2022 Blender Foundation */
#pragma once
#ifdef WITH_HIP #ifdef WITH_HIP
# include "device/device.h" # include "device/device.h"
@@ -49,9 +51,11 @@ class HIPDevice : public GPUDevice {
bool use_adaptive_compilation(); bool use_adaptive_compilation();
string compile_kernel_get_common_cflags(const uint kernel_features); virtual string compile_kernel_get_common_cflags(const uint kernel_features);
string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip"); virtual string compile_kernel(const uint kernel_features,
const char *name,
const char *base = "hip");
virtual bool load_kernels(const uint kernel_features) override; virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features); void reserve_local_memory(const uint kernel_features);

View File

@@ -0,0 +1,1054 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2023 Blender Foundation */
#ifdef WITH_HIPRT
# include "device/hiprt/device_impl.h"
# include "util/debug.h"
# include "util/foreach.h"
# include "util/log.h"
# include "util/map.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/progress.h"
# include "util/string.h"
# include "util/system.h"
# include "util/time.h"
# include "util/types.h"
# include "util/windows.h"
# include "bvh/hiprt.h"
# include "scene/hair.h"
# include "scene/mesh.h"
# include "scene/object.h"
# include "scene/pointcloud.h"
CCL_NAMESPACE_BEGIN
/* Copy the three rows (x, y, z) of a Cycles affine Transform into the
 * row-major 3x4 float matrix layout expected by HIP RT. */
static void get_hiprt_transform(float matrix[][4], Transform &tfm)
{
  matrix[0][0] = tfm.x.x;
  matrix[0][1] = tfm.x.y;
  matrix[0][2] = tfm.x.z;
  matrix[0][3] = tfm.x.w;

  matrix[1][0] = tfm.y.x;
  matrix[1][1] = tfm.y.y;
  matrix[1][2] = tfm.y.z;
  matrix[1][3] = tfm.y.w;

  matrix[2][0] = tfm.z.x;
  matrix[2][1] = tfm.z.y;
  matrix[2][2] = tfm.z.z;
  matrix[2][3] = tfm.z.w;
}
class HIPRTDevice;
/* The HIP RT device always builds its BVH through HIP RT, regardless of the
 * requested kernel features. */
BVHLayoutMask HIPRTDevice::get_bvh_layout_mask(const uint /* kernel_features */) const
{
return BVH_LAYOUT_HIPRT;
}
/* Construct a HIP RT device on top of the base HIP device.
 *
 * Creates the HIP RT context from the already-initialized HIP context and
 * device, then the function table used for custom primitive intersection.
 * On failure an error is set on the device and construction stops early. */
HIPRTDevice::HIPRTDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: HIPDevice(info, stats, profiler),
global_stack_buffer(this, "global_stack_buffer", MEM_DEVICE_ONLY),
hiprt_context(NULL),
scene(NULL),
functions_table(NULL),
scratch_buffer_size(0),
scratch_buffer(this, "scratch_buffer", MEM_DEVICE_ONLY),
visibility(this, "visibility", MEM_READ_ONLY),
instance_transform_matrix(this, "instance_transform_matrix", MEM_READ_ONLY),
transform_headers(this, "transform_headers", MEM_READ_ONLY),
user_instance_id(this, "user_instance_id", MEM_GLOBAL),
hiprt_blas_ptr(this, "hiprt_blas_ptr", MEM_READ_WRITE),
blas_ptr(this, "blas_ptr", MEM_GLOBAL),
custom_prim_info(this, "custom_prim_info", MEM_GLOBAL),
custom_prim_info_offset(this, "custom_prim_info_offset", MEM_GLOBAL),
prims_time(this, "prims_time", MEM_GLOBAL),
prim_time_offset(this, "prim_time_offset", MEM_GLOBAL)
{
HIPContextScope scope(this);
/* Bind the HIP RT context to the existing HIP context/device. */
hiprtContextCreationInput hiprt_context_input = {0};
hiprt_context_input.ctxt = hipContext;
hiprt_context_input.device = hipDevice;
hiprt_context_input.deviceType = hiprtDeviceAMD;
hiprtError rt_result = hiprtCreateContext(
HIPRT_API_VERSION, hiprt_context_input, &hiprt_context);
if (rt_result != hiprtSuccess) {
set_error(string_printf("Failed to create HIPRT context"));
return;
}
/* Function table for intersection/filter callbacks per primitive type. */
rt_result = hiprtCreateFuncTable(
hiprt_context, Max_Primitive_Type, Max_Intersect_Filter_Function, &functions_table);
if (rt_result != hiprtSuccess) {
set_error(string_printf("Failed to create HIPRT Function Table"));
return;
}
}
/* Free all device buffers, then tear down HIP RT objects.
 *
 * Buffers are released before the function table, scene and context they
 * were used with; the context is destroyed last. */
HIPRTDevice::~HIPRTDevice()
{
HIPContextScope scope(this);
user_instance_id.free();
visibility.free();
hiprt_blas_ptr.free();
blas_ptr.free();
instance_transform_matrix.free();
transform_headers.free();
custom_prim_info_offset.free();
custom_prim_info.free();
prim_time_offset.free();
prims_time.free();
global_stack_buffer.free();
hiprtDestroyFuncTable(hiprt_context, functions_table);
hiprtDestroyScene(hiprt_context, scene);
hiprtDestroyContext(hiprt_context);
}
/* Create a queue that dispatches kernels through the HIP RT-aware queue
 * implementation instead of the plain HIP one. */
unique_ptr<DeviceQueue> HIPRTDevice::gpu_queue_create()
{
return make_unique<HIPRTDeviceQueue>(this);
}
/* Compiler flags for HIP RT kernels: the base HIP flags plus the define
 * that enables the HIP RT code paths in the kernel source. */
string HIPRTDevice::compile_kernel_get_common_cflags(const uint kernel_features)
{
  return HIPDevice::compile_kernel_get_common_cflags(kernel_features) + " -D __HIPRT__ ";
}
/* Compile (or locate) the HIP RT kernel fat binary for the current device.
 *
 * Lookup order: pre-compiled kernel shipped with the build, then the local
 * kernel cache, then on-demand compilation with hipcc followed by linking
 * against the HIP RT bitcode library from the SDK. Returns the path to the
 * .hipfb file, or an empty string (with an error set) on failure.
 *
 * Fix over original: getenv("HIPRT_ROOT_DIR") may return NULL when the
 * environment variable is unset; constructing std::string from NULL is
 * undefined behavior. Now checked explicitly with a proper error. */
string HIPRTDevice::compile_kernel(const uint kernel_features, const char *name, const char *base)
{
  int major, minor;
  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
  hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
  hipDeviceProp_t props;
  hipGetDeviceProperties(&props, hipDevId);

  /* gcnArchName may carry feature suffixes (e.g. "gfx1030:sramecc+:xnack-");
   * keep only the architecture token. */
  char *arch = strtok(props.gcnArchName, ":");
  if (arch == NULL) {
    arch = props.gcnArchName;
  }

  /* Use pre-compiled kernel shipped with the build, if available. */
  if (!use_adaptive_compilation()) {
    const string fatbin = path_get(string_printf("lib/%s_rt_gfx.hipfb", name));
    VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
    if (path_exists(fatbin)) {
      VLOG(1) << "Using precompiled kernel.";
      return fatbin;
    }
  }

  /* Cache key covers both the kernel sources and the compiler flags, so a
   * change to either triggers recompilation. */
  string source_path = path_get("source");
  const string source_md5 = path_files_md5_hash(source_path);
  string common_cflags = compile_kernel_get_common_cflags(kernel_features);
  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
  const string include_path = source_path;
  const string bitcode_file = string_printf("cycles_%s_%s_%s.bc", name, arch, kernel_md5.c_str());
  const string bitcode = path_cache_get(path_join("kernels", bitcode_file));
  const string fatbin_file = string_printf(
      "cycles_%s_%s_%s.hipfb", name, arch, kernel_md5.c_str());
  const string fatbin = path_cache_get(path_join("kernels", fatbin_file));
  VLOG(1) << "Testing for locally compiled kernel " << fatbin << ".";
  if (path_exists(fatbin)) {
    VLOG(1) << "Using locally compiled kernel.";
    return fatbin;
  }

# ifdef _WIN32
  /* Official builds ship pre-compiled kernels; if they are missing for this
   * GPU, report why instead of attempting a local compile. */
  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
    if (!hipSupportsDevice(hipDevId)) {
      set_error(
          string_printf("HIP backend requires compute capability 10.1 or up, but found %d.%d. "
                        "Your GPU is not supported.",
                        major,
                        minor));
    }
    else {
      set_error(
          string_printf("HIP binary kernel for this graphics card compute "
                        "capability (%d.%d) not found.",
                        major,
                        minor));
    }
    return string();
  }
# endif

  /* Compile on demand; requires hipcc from a HIP toolkit installation. */
  const char *const hipcc = hipewCompilerPath();
  if (hipcc == NULL) {
    set_error(
        "HIP hipcc compiler not found. "
        "Install HIP toolkit in default location.");
    return string();
  }

  const int hipcc_hip_version = hipewCompilerVersion();
  VLOG_INFO << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << ".";
  if (hipcc_hip_version < 40) {
    printf(
        "Unsupported HIP version %d.%d detected, "
        "you need HIP 4.0 or newer.\n",
        hipcc_hip_version / 10,
        hipcc_hip_version % 10);
    return string();
  }

  path_create_directories(fatbin);
  source_path = path_join(path_join(source_path, "kernel"),
                          path_join("device", path_join(base, string_printf("%s.cpp", name))));

  printf("Compiling %s and caching to %s", source_path.c_str(), fatbin.c_str());

  double starttime = time_dt();

  /* HIPRT_ROOT_DIR locates the HIP RT SDK (headers and traversal bitcode).
   * getenv() returns NULL when unset; constructing std::string from NULL is
   * undefined behavior, so fail with a clear error instead. */
  const char *hiprt_root = getenv("HIPRT_ROOT_DIR");
  if (hiprt_root == NULL) {
    set_error(
        "HIPRT_ROOT_DIR environment variable not set, "
        "required to compile the HIP RT kernel.");
    return string();
  }
  const string hiprt_path = hiprt_root;

  /* First, app kernels are compiled into bitcode, without access to the
   * implementation of HIP RT functions. */
  if (!path_exists(bitcode)) {
    std::string rtc_options;

    rtc_options.append(" --offload-arch=").append(arch);
    rtc_options.append(" -D __HIPRT__");
    rtc_options.append(" -ffast-math -O3 -std=c++17");
    rtc_options.append(" -fgpu-rdc -c --gpu-bundle-output -c -emit-llvm");

    string command = string_printf("%s %s -I %s -I %s %s -o \"%s\"",
                                   hipcc,
                                   rtc_options.c_str(),
                                   include_path.c_str(),
                                   hiprt_path.c_str(),
                                   source_path.c_str(),
                                   bitcode.c_str());

    printf("Compiling %sHIP kernel ...\n%s\n",
           (use_adaptive_compilation()) ? "adaptive " : "",
           command.c_str());

# ifdef _WIN32
    command = "call " + command;
# endif
    if (system(command.c_str()) != 0) {
      set_error(
          "Failed to execute compilation command, "
          "see console for details.");
      return string();
    }
  }

  /* After compilation, the bitcode produced is linked with the HIP RT bitcode
   * (containing implementations of HIP RT functions, e.g. traversal) to
   * produce the final executable code. */
  string linker_options;
  linker_options.append(" --offload-arch=").append(arch);
  linker_options.append(" -fgpu-rdc --hip-link --cuda-device-only ");
  string hiprt_ver(HIPRT_VERSION_STR);
  /* NOTE(review): Windows path separator hard-coded; consistent with HIP RT
   * currently being Windows-only. */
  string hiprt_bc;
  hiprt_bc = hiprt_path + "\\hiprt" + hiprt_ver + "_amd_lib_win.bc";

  string linker_command = string_printf("clang++ %s \"%s\" %s -o \"%s\"",
                                        linker_options.c_str(),
                                        bitcode.c_str(),
                                        hiprt_bc.c_str(),
                                        fatbin.c_str());
# ifdef _WIN32
  linker_command = "call " + linker_command;
# endif
  if (system(linker_command.c_str()) != 0) {
    set_error(
        "Failed to execute linking command, "
        "see console for details.");
    return string();
  }

  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

  return fatbin;
}
/* Load (compiling if necessary) the HIP RT kernel module.
 *
 * Returns true on success. After loading, a trivial shade-surface kernel is
 * enqueued with zero work; a why-comment for this warm-up launch is not
 * given in the code — presumably it forces module finalization/validation
 * up front (TODO confirm). */
bool HIPRTDevice::load_kernels(const uint kernel_features)
{
/* Module already loaded; adaptive recompilation is not supported here. */
if (hipModule) {
if (use_adaptive_compilation()) {
VLOG(1) << "Skipping HIP kernel reload for adaptive compilation, not currently supported.";
}
return true;
}
if (hipContext == 0)
return false;
if (!support_device(kernel_features)) {
return false;
}
/* get kernel */
const char *kernel_name = "kernel";
string fatbin = compile_kernel(kernel_features, kernel_name);
if (fatbin.empty())
return false;
/* open module */
HIPContextScope scope(this);
string fatbin_data;
hipError_t result;
if (path_read_text(fatbin, fatbin_data)) {
result = hipModuleLoadData(&hipModule, fatbin_data.c_str());
}
else
result = hipErrorFileNotFound;
if (result != hipSuccess)
set_error(string_printf(
"Failed to load HIP kernel from '%s' (%s)", fatbin.c_str(), hipewErrorString(result)));
if (result == hipSuccess) {
kernels.load(this);
{
/* Pick the most feature-complete shade-surface variant that the
 * requested kernel features require. */
const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
(kernel_features & KERNEL_FEATURE_MNEE) ?
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
/* Launch with null buffers and zero work size. */
HIPRTDeviceQueue queue(this);
device_ptr d_path_index = 0;
device_ptr d_render_buffer = 0;
int d_work_size = 0;
DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
queue.init_execution();
queue.enqueue(test_kernel, 1, args);
queue.synchronize();
}
}
return (result == hipSuccess);
}
/* Copy a named constant-memory block into the kernel parameter struct on the
 * device.
 *
 * For the "data" block the HIP RT scene handle is patched into the host copy
 * (as the device BVH pointer) before upload. Dispatch over block names is
 * generated by the KERNEL_DATA_ARRAY macro, expanded once per entry here and
 * once per entry in kernel/data_arrays.h. */
void HIPRTDevice::const_copy_to(const char *name, void *host, size_t size)
{
HIPContextScope scope(this);
hipDeviceptr_t mem;
size_t bytes;
/* Patch the HIP RT scene handle into KernelData before it is uploaded. */
if (strcmp(name, "data") == 0) {
assert(size <= sizeof(KernelData));
KernelData *const data = (KernelData *)host;
*(hiprtScene *)&data->device_bvh = scene;
}
hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, "kernel_params"));
assert(bytes == sizeof(KernelParamsHIPRT));
/* Each expansion copies `host` into the matching member of the device-side
 * KernelParamsHIPRT struct and returns on a name match. */
# define KERNEL_DATA_ARRAY(data_type, data_name) \
if (strcmp(name, #data_name) == 0) { \
hip_assert(hipMemcpyHtoD(mem + offsetof(KernelParamsHIPRT, data_name), host, size)); \
return; \
}
KERNEL_DATA_ARRAY(KernelData, data)
KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
KERNEL_DATA_ARRAY(int, user_instance_id)
KERNEL_DATA_ARRAY(uint64_t, blas_ptr)
KERNEL_DATA_ARRAY(int2, custom_prim_info_offset)
KERNEL_DATA_ARRAY(int2, custom_prim_info)
KERNEL_DATA_ARRAY(int, prim_time_offset)
KERNEL_DATA_ARRAY(float2, prims_time)
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}
/* Build the HIP RT geometry input for a triangle mesh (or volume mesh).
 *
 * Two paths:
 * - Motion blur with per-step BVH motion triangles: triangles are expanded into
 *   one AABB per motion sub-interval and submitted as a custom AABB list.
 * - Otherwise: the mesh is submitted directly as a HIP RT triangle mesh.
 *
 * The returned build input references device buffers owned by `bvh`. */
hiprtGeometryBuildInput HIPRTDevice::prepare_triangle_blas(BVHHIPRT *bvh, Mesh *mesh)
{
  hiprtGeometryBuildInput geom_input;
  geom_input.geomType = Triangle;
  if (mesh->has_motion_blur() &&
      !(bvh->params.num_motion_triangle_steps == 0 || bvh->params.use_spatial_split)) {
    const Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
    const size_t num_triangles = mesh->num_triangles();
    /* 2 * steps + 1 sample points over the shutter interval. */
    const int num_bvh_steps = bvh->params.num_motion_triangle_steps * 2 + 1;
    const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
    int num_bounds = 0;
    bvh->custom_primitive_bound.alloc(num_triangles * num_bvh_steps);
    for (uint j = 0; j < num_triangles; j++) {
      Mesh::Triangle t = mesh->get_triangle(j);
      const float3 *verts = mesh->get_verts().data();
      const size_t num_verts = mesh->get_verts().size();
      const size_t num_steps = mesh->get_motion_steps();
      const float3 *vert_steps = attr_mP->data_float3();
      /* Bounds at the start of the first sub-interval. */
      float3 prev_verts[3];
      t.motion_verts(verts, vert_steps, num_verts, num_steps, 0.0f, prev_verts);
      BoundBox prev_bounds = BoundBox::empty;
      prev_bounds.grow(prev_verts[0]);
      prev_bounds.grow(prev_verts[1]);
      prev_bounds.grow(prev_verts[2]);
      for (int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
        const float curr_time = (float)(bvh_step)*num_bvh_steps_inv_1;
        float3 curr_verts[3];
        t.motion_verts(verts, vert_steps, num_verts, num_steps, curr_time, curr_verts);
        BoundBox curr_bounds = BoundBox::empty;
        curr_bounds.grow(curr_verts[0]);
        curr_bounds.grow(curr_verts[1]);
        curr_bounds.grow(curr_verts[2]);
        /* Each emitted AABB covers the union of bounds at both ends of the
         * sub-interval [prev_time, curr_time]. */
        BoundBox bounds = prev_bounds;
        bounds.grow(curr_bounds);
        if (bounds.valid()) {
          const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
          bvh->custom_primitive_bound[num_bounds] = bounds;
          bvh->custom_prim_info[num_bounds].x = j;
          bvh->custom_prim_info[num_bounds].y = mesh->primitive_type();
          /* NOTE(review): stored as (x = interval end, y = interval start);
           * matches the curve/point code below — confirm kernel side reads it
           * this way. */
          bvh->prims_time[num_bounds].x = curr_time;
          bvh->prims_time[num_bounds].y = prev_time;
          num_bounds++;
        }
        prev_bounds = curr_bounds;
      }
    }
    /* NOTE(review): aabbCount is the allocated capacity (num_triangles *
     * num_bvh_steps), not num_bounds, so trailing entries may be uninitialized
     * when some bounds were invalid; the curve path uses num_bounds instead —
     * TODO confirm which is intended. */
    bvh->custom_prim_aabb.aabbCount = bvh->custom_primitive_bound.size();
    bvh->custom_prim_aabb.aabbStride = sizeof(BoundBox);
    bvh->custom_primitive_bound.copy_to_device();
    bvh->custom_prim_aabb.aabbs = (void *)bvh->custom_primitive_bound.device_pointer;
    geom_input.type = hiprtPrimitiveTypeAABBList;
    geom_input.aabbList.primitive = &bvh->custom_prim_aabb;
    geom_input.geomType = Motion_Triangle;
  }
  else {
    size_t triangle_size = mesh->get_triangles().size();
    void *triangle_data = mesh->get_triangles().data();
    size_t vertex_size = mesh->get_verts().size();
    void *vertex_data = mesh->get_verts().data();
    bvh->triangle_mesh.triangleCount = mesh->num_triangles();
    bvh->triangle_mesh.triangleStride = 3 * sizeof(int);
    bvh->triangle_mesh.vertexCount = vertex_size;
    bvh->triangle_mesh.vertexStride = sizeof(float3);
    /* Borrow the mesh index data for the device upload. */
    bvh->triangle_index.host_pointer = triangle_data;
    bvh->triangle_index.data_elements = 1;
    bvh->triangle_index.data_type = TYPE_INT;
    bvh->triangle_index.data_size = triangle_size;
    bvh->triangle_index.copy_to_device();
    bvh->triangle_mesh.triangleIndices = (void *)(bvh->triangle_index.device_pointer);
    /* Detach the borrowed host pointer so the device_vector does not free or
     * reuse memory owned by the mesh. (Either has to set the host pointer to
     * zero, or increment the refcount on triangle_data.) */
    bvh->triangle_index.host_pointer = 0;
    bvh->vertex_data.host_pointer = vertex_data;
    bvh->vertex_data.data_elements = 4;
    bvh->vertex_data.data_type = TYPE_FLOAT;
    bvh->vertex_data.data_size = vertex_size;
    bvh->vertex_data.copy_to_device();
    bvh->triangle_mesh.vertices = (void *)(bvh->vertex_data.device_pointer);
    /* Same detach as for triangle_index above. */
    bvh->vertex_data.host_pointer = 0;
    geom_input.type = hiprtPrimitiveTypeTriangleMesh;
    geom_input.triangleMesh.primitive = &(bvh->triangle_mesh);
  }
  return geom_input;
}
/* Build the HIP RT geometry input for hair/curves.
 *
 * Curves are always submitted as a custom AABB list, one box per segment
 * (static case) or per motion sub-interval per segment (motion blur case).
 *
 * Fix: the motion-vertex attribute is now looked up *before* the allocation
 * branch. Previously `curve_attr_mP` was still NULL when tested, so the
 * motion-blur path (which only assigned it inside the `else`) was
 * unreachable and motion-blurred curves silently used static bounds. This
 * mirrors how prepare_triangle_blas/prepare_point_blas handle the attribute. */
hiprtGeometryBuildInput HIPRTDevice::prepare_curve_blas(BVHHIPRT *bvh, Hair *hair)
{
  hiprtGeometryBuildInput geom_input;
  const PrimitiveType primitive_type = hair->primitive_type();
  const size_t num_curves = hair->num_curves();
  const size_t num_segments = hair->num_segments();
  const Attribute *curve_attr_mP = NULL;
  if (hair->has_motion_blur()) {
    curve_attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
  }
  if (curve_attr_mP == NULL || bvh->params.num_motion_curve_steps == 0) {
    /* Static case: one bound per segment. */
    bvh->custom_prim_info.resize(num_segments);
    bvh->custom_primitive_bound.alloc(num_segments);
  }
  else {
    /* Motion case: 2 * steps sub-intervals per segment. */
    size_t num_boxes = bvh->params.num_motion_curve_steps * 2 * num_segments;
    bvh->custom_prim_info.resize(num_boxes);
    bvh->custom_primitive_bound.alloc(num_boxes);
  }
  int num_bounds = 0;
  float3 *curve_keys = hair->get_curve_keys().data();
  for (uint j = 0; j < num_curves; j++) {
    const Hair::Curve curve = hair->get_curve(j);
    const float *curve_radius = &hair->get_curve_radius()[0];
    int first_key = curve.first_key;
    for (int k = 0; k < curve.num_keys - 1; k++) {
      if (curve_attr_mP == NULL || bvh->params.num_motion_curve_steps == 0) {
        /* Gather the 4 control keys of the Catmull-Rom segment, clamped at
         * curve ends. */
        float3 current_keys[4];
        current_keys[0] = curve_keys[max(first_key + k - 1, first_key)];
        current_keys[1] = curve_keys[first_key + k];
        current_keys[2] = curve_keys[first_key + k + 1];
        current_keys[3] = curve_keys[min(first_key + k + 2, first_key + curve.num_keys - 1)];
        /* Skip fully degenerate segments where all 4 keys coincide. */
        if (current_keys[0].x == current_keys[1].x && current_keys[1].x == current_keys[2].x &&
            current_keys[2].x == current_keys[3].x && current_keys[0].y == current_keys[1].y &&
            current_keys[1].y == current_keys[2].y && current_keys[2].y == current_keys[3].y &&
            current_keys[0].z == current_keys[1].z && current_keys[1].z == current_keys[2].z &&
            current_keys[2].z == current_keys[3].z)
          continue;
        BoundBox bounds = BoundBox::empty;
        curve.bounds_grow(k, &hair->get_curve_keys()[0], curve_radius, bounds);
        if (bounds.valid()) {
          /* Pack curve index + segment index for the intersection kernels. */
          int type = PRIMITIVE_PACK_SEGMENT(primitive_type, k);
          bvh->custom_prim_info[num_bounds].x = j;
          bvh->custom_prim_info[num_bounds].y = type;
          bvh->custom_primitive_bound[num_bounds] = bounds;
          num_bounds++;
        }
      }
      else {
        const int num_bvh_steps = bvh->params.num_motion_curve_steps * 2 + 1;
        const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
        const size_t num_steps = hair->get_motion_steps();
        const float4 *key_steps = curve_attr_mP->data_float4();
        const size_t num_keys = hair->get_curve_keys().size();
        /* Keys at the start of the first sub-interval. */
        float4 prev_keys[4];
        curve.cardinal_motion_keys(curve_keys,
                                   curve_radius,
                                   key_steps,
                                   num_keys,
                                   num_steps,
                                   0.0f,
                                   k - 1,
                                   k,
                                   k + 1,
                                   k + 2,
                                   prev_keys);
        BoundBox prev_bounds = BoundBox::empty;
        curve.bounds_grow(prev_keys, prev_bounds);
        for (int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
          const float curr_time = (float)(bvh_step)*num_bvh_steps_inv_1;
          float4 curr_keys[4];
          curve.cardinal_motion_keys(curve_keys,
                                     curve_radius,
                                     key_steps,
                                     num_keys,
                                     num_steps,
                                     curr_time,
                                     k - 1,
                                     k,
                                     k + 1,
                                     k + 2,
                                     curr_keys);
          BoundBox curr_bounds = BoundBox::empty;
          curve.bounds_grow(curr_keys, curr_bounds);
          /* Union of bounds at both ends of the sub-interval. */
          BoundBox bounds = prev_bounds;
          bounds.grow(curr_bounds);
          if (bounds.valid()) {
            const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
            int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k);
            bvh->custom_prim_info[num_bounds].x = j;
            bvh->custom_prim_info[num_bounds].y = packed_type; /* Curve + segment `k`. */
            bvh->custom_primitive_bound[num_bounds] = bounds;
            /* NOTE(review): prims_time is written here but not resized in this
             * function; it must be preallocated large enough elsewhere — TODO
             * confirm. */
            bvh->prims_time[num_bounds].x = curr_time;
            bvh->prims_time[num_bounds].y = prev_time;
            num_bounds++;
          }
          prev_bounds = curr_bounds;
        }
      }
    }
  }
  bvh->custom_prim_aabb.aabbCount = num_bounds;
  bvh->custom_prim_aabb.aabbStride = sizeof(BoundBox);
  bvh->custom_primitive_bound.copy_to_device();
  bvh->custom_prim_aabb.aabbs = (void *)bvh->custom_primitive_bound.device_pointer;
  geom_input.type = hiprtPrimitiveTypeAABBList;
  geom_input.aabbList.primitive = &bvh->custom_prim_aabb;
  geom_input.geomType = Curve;
  return geom_input;
}
/* Build the HIP RT geometry input for a point cloud.
 *
 * Points are always submitted as a custom AABB list. Three paths:
 * - No motion attribute: one bound per point.
 * - Motion attribute but num_motion_point_steps == 0: one bound per point,
 *   grown over all motion steps.
 * - Otherwise: one bound per motion sub-interval per point. */
hiprtGeometryBuildInput HIPRTDevice::prepare_point_blas(BVHHIPRT *bvh, PointCloud *pointcloud)
{
  hiprtGeometryBuildInput geom_input;
  const Attribute *point_attr_mP = NULL;
  if (pointcloud->has_motion_blur()) {
    point_attr_mP = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
  }
  const float3 *points_data = pointcloud->get_points().data();
  const float *radius_data = pointcloud->get_radius().data();
  const size_t num_points = pointcloud->num_points();
  const float3 *motion_data = (point_attr_mP) ? point_attr_mP->data_float3() : NULL;
  const size_t num_steps = pointcloud->get_motion_steps();
  int num_bounds = 0;
  /* NOTE(review): custom_prim_info is written in all branches below but never
   * resized in this function (unlike prepare_curve_blas); it appears to rely
   * on preallocation elsewhere — TODO confirm. */
  if (point_attr_mP == NULL) {
    /* Static points: one bound per point. */
    bvh->custom_primitive_bound.alloc(num_points);
    for (uint j = 0; j < num_points; j++) {
      const PointCloud::Point point = pointcloud->get_point(j);
      BoundBox bounds = BoundBox::empty;
      point.bounds_grow(points_data, radius_data, bounds);
      if (bounds.valid()) {
        bvh->custom_primitive_bound[num_bounds] = bounds;
        bvh->custom_prim_info[num_bounds].x = j;
        bvh->custom_prim_info[num_bounds].y = PRIMITIVE_POINT;
        num_bounds++;
      }
    }
  }
  else if (bvh->params.num_motion_point_steps == 0) {
    /* Motion data present, but no per-step BVH: a single bound per point that
     * covers the point at every motion step. */
    bvh->custom_primitive_bound.alloc(num_points * num_steps);
    for (uint j = 0; j < num_points; j++) {
      const PointCloud::Point point = pointcloud->get_point(j);
      BoundBox bounds = BoundBox::empty;
      point.bounds_grow(points_data, radius_data, bounds);
      for (size_t step = 0; step < num_steps - 1; step++) {
        point.bounds_grow(motion_data + step * num_points, radius_data, bounds);
      }
      if (bounds.valid()) {
        bvh->custom_primitive_bound[num_bounds] = bounds;
        bvh->custom_prim_info[num_bounds].x = j;
        bvh->custom_prim_info[num_bounds].y = PRIMITIVE_POINT;
        num_bounds++;
      }
    }
  }
  else {
    /* Per-step motion bounds: 2 * steps sub-intervals per point. */
    const int num_bvh_steps = bvh->params.num_motion_point_steps * 2 + 1;
    const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
    bvh->custom_primitive_bound.alloc(num_points * num_bvh_steps);
    for (uint j = 0; j < num_points; j++) {
      const PointCloud::Point point = pointcloud->get_point(j);
      const size_t num_steps = pointcloud->get_motion_steps();
      const float3 *point_steps = point_attr_mP->data_float3();
      /* Bound at the start of the first sub-interval. */
      float4 prev_key = point.motion_key(
          points_data, radius_data, point_steps, num_points, num_steps, 0.0f, j);
      BoundBox prev_bounds = BoundBox::empty;
      point.bounds_grow(prev_key, prev_bounds);
      for (int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
        const float curr_time = (float)(bvh_step)*num_bvh_steps_inv_1;
        float4 curr_key = point.motion_key(
            points_data, radius_data, point_steps, num_points, num_steps, curr_time, j);
        BoundBox curr_bounds = BoundBox::empty;
        point.bounds_grow(curr_key, curr_bounds);
        /* Union of bounds at both ends of the sub-interval. */
        BoundBox bounds = prev_bounds;
        bounds.grow(curr_bounds);
        if (bounds.valid()) {
          const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
          bvh->custom_primitive_bound[num_bounds] = bounds;
          bvh->custom_prim_info[num_bounds].x = j;
          bvh->custom_prim_info[num_bounds].y = PRIMITIVE_MOTION_POINT;
          /* NOTE(review): prims_time written but not resized here — relies on
           * preallocation elsewhere, TODO confirm. */
          bvh->prims_time[num_bounds].x = curr_time;
          bvh->prims_time[num_bounds].y = prev_time;
          num_bounds++;
        }
        prev_bounds = curr_bounds;
      }
    }
  }
  /* NOTE(review): aabbCount is the allocated capacity, not num_bounds;
   * trailing entries may be uninitialized when some bounds were invalid —
   * TODO confirm (the curve path uses num_bounds). */
  bvh->custom_prim_aabb.aabbCount = bvh->custom_primitive_bound.size();
  bvh->custom_prim_aabb.aabbStride = sizeof(BoundBox);
  bvh->custom_primitive_bound.copy_to_device();
  bvh->custom_prim_aabb.aabbs = (void *)bvh->custom_primitive_bound.device_pointer;
  geom_input.type = hiprtPrimitiveTypeAABBList;
  geom_input.aabbList.primitive = &bvh->custom_prim_aabb;
  geom_input.geomType = Point;
  return geom_input;
}
/* Build the bottom-level acceleration structure for a single geometry.
 *
 * Dispatches to the per-geometry-type prepare_*_blas() helper, then creates
 * and builds the HIP RT geometry. The shared scratch buffer is grown under a
 * mutex since BLAS builds may run from multiple threads. */
void HIPRTDevice::build_blas(BVHHIPRT *bvh, Geometry *geom, hiprtBuildOptions options)
{
  hiprtGeometryBuildInput build_input = {};

  switch (geom->geometry_type) {
    case Geometry::MESH:
    case Geometry::VOLUME: {
      Mesh *mesh = static_cast<Mesh *>(geom);
      if (mesh->num_triangles() == 0) {
        return;
      }
      build_input = prepare_triangle_blas(bvh, mesh);
      break;
    }
    case Geometry::HAIR: {
      Hair *const hair = static_cast<Hair *const>(geom);
      if (hair->num_segments() == 0) {
        return;
      }
      build_input = prepare_curve_blas(bvh, hair);
      break;
    }
    case Geometry::POINTCLOUD: {
      PointCloud *pointcloud = static_cast<PointCloud *>(geom);
      if (pointcloud->num_points() == 0) {
        return;
      }
      build_input = prepare_point_blas(bvh, pointcloud);
      break;
    }
    default:
      assert(build_input.geomType != hiprtInvalidValue);
  }

  /* Query how much temporary memory this build needs. */
  size_t required_scratch_size = 0;
  hiprtError result = hiprtGetGeometryBuildTemporaryBufferSize(
      hiprt_context, &build_input, &options, &required_scratch_size);
  if (result != hiprtSuccess) {
    set_error(string_printf("Failed to get scratch buffer size for BLAS!"));
  }

  result = hiprtCreateGeometry(hiprt_context, &build_input, &options, &bvh->hiprt_geom);
  if (result != hiprtSuccess) {
    set_error(string_printf("Failed to create BLAS!"));
  }
  bvh->geom_input = build_input;

  {
    /* The scratch buffer is shared across concurrent BLAS builds. */
    thread_scoped_lock lock(hiprt_mutex);
    if (required_scratch_size > scratch_buffer_size) {
      scratch_buffer.alloc(required_scratch_size);
      scratch_buffer_size = required_scratch_size;
      scratch_buffer.zero_to_device();
    }
    result = hiprtBuildGeometry(hiprt_context,
                                hiprtBuildOperationBuild,
                                &bvh->geom_input,
                                &options,
                                (void *)(scratch_buffer.device_pointer),
                                0,
                                bvh->hiprt_geom);
  }
  if (result != hiprtSuccess) {
    set_error(string_printf("Failed to build BLAS"));
  }
}
/* Build (or refit) the top-level acceleration structure over all objects.
 *
 * Gathers per-instance data (visibility mask, transform frames, BLAS pointer,
 * custom-primitive and motion-time offsets), uploads it, builds the HIP RT
 * scene and finally patches the intersection function table pointers into the
 * kernel_params global on the device.
 *
 * Fixes vs. original:
 * - The motion-transform time step divided by (motion_size - 1) while the
 *   assert above only permits motion_size == 1, yielding 1/0 = inf and a NaN
 *   frame time; the interval is now guarded (0 for a single step).
 * - The prim_info_map iterator is reused instead of a second find(). */
hiprtScene HIPRTDevice::build_tlas(BVHHIPRT *bvh,
                                   vector<Object *> objects,
                                   hiprtBuildOptions options,
                                   bool refit)
{
  hiprtBuildOperation build_operation = refit ? hiprtBuildOperationUpdate :
                                                hiprtBuildOperationBuild;
  array<hiprtFrameMatrix> transform_matrix;
  /* Maps a shared geometry to its (info offset, primitive offset) so instanced
   * geometry is only copied once. */
  unordered_map<Geometry *, int2> prim_info_map;
  size_t custom_prim_offset = 0;
  unordered_map<Geometry *, int> prim_time_map;
  /* HIP RT only counts instances with a valid BLAS, so the HIP RT instance id
   * (num_instances) and the Blender object id (blender_instance_id) diverge. */
  size_t num_instances = 0;
  int blender_instance_id = 0;
  size_t num_object = objects.size();
  user_instance_id.alloc(num_object);
  visibility.alloc(num_object);
  hiprt_blas_ptr.alloc(num_object);
  blas_ptr.alloc(num_object);
  transform_headers.alloc(num_object);
  custom_prim_info_offset.alloc(num_object);
  prim_time_offset.alloc(num_object);
  foreach (Object *ob, objects) {
    uint32_t mask = 0;
    if (ob->is_traceable()) {
      mask = ob->visibility_for_tracing();
    }
    Transform current_transform = ob->get_tfm();
    Geometry *geom = ob->get_geometry();
    bool transform_applied = geom->transform_applied;
    BVHHIPRT *current_bvh = static_cast<BVHHIPRT *>(geom->bvh);
    bool is_valid_geometry = current_bvh->geom_input.geomType != hiprtInvalidValue;
    hiprtGeometry hiprt_geom_current = current_bvh->hiprt_geom;
    hiprtFrameMatrix hiprt_transform_matrix = {{{0}}};
    Transform identity_matrix = transform_identity();
    get_hiprt_transform(hiprt_transform_matrix.matrix, identity_matrix);
    if (is_valid_geometry) {
      bool is_custom_prim = current_bvh->custom_prim_info.size() > 0;
      if (is_custom_prim) {
        bool has_motion_blur = current_bvh->prims_time.size() > 0;
        unordered_map<Geometry *, int2>::iterator it = prim_info_map.find(geom);
        if (it != prim_info_map.end()) {
          /* Geometry already merged for another instance: reuse its offsets. */
          custom_prim_info_offset[blender_instance_id] = it->second;
          if (has_motion_blur) {
            prim_time_offset[blender_instance_id] = prim_time_map[geom];
          }
        }
        else {
          int offset = bvh->custom_prim_info.size();
          prim_info_map[geom].x = offset;
          prim_info_map[geom].y = custom_prim_offset;
          bvh->custom_prim_info.resize(offset + current_bvh->custom_prim_info.size());
          memcpy(bvh->custom_prim_info.data() + offset,
                 current_bvh->custom_prim_info.data(),
                 current_bvh->custom_prim_info.size() * sizeof(int2));
          custom_prim_info_offset[blender_instance_id].x = offset;
          custom_prim_info_offset[blender_instance_id].y = custom_prim_offset;
          if (geom->geometry_type == Geometry::HAIR) {
            custom_prim_offset += ((Hair *)geom)->num_curves();
          }
          else if (geom->geometry_type == Geometry::POINTCLOUD) {
            custom_prim_offset += ((PointCloud *)geom)->num_points();
          }
          else {
            custom_prim_offset += ((Mesh *)geom)->num_triangles();
          }
          if (has_motion_blur) {
            int time_offset = bvh->prims_time.size();
            prim_time_map[geom] = time_offset;
            /* NOTE(review): unlike custom_prim_info above, bvh->prims_time is
             * not resized before this copy; this relies on it being
             * preallocated large enough elsewhere — TODO confirm. */
            memcpy(bvh->prims_time.data() + time_offset,
                   current_bvh->prims_time.data(),
                   current_bvh->prims_time.size() * sizeof(float2));
            prim_time_offset[blender_instance_id] = time_offset;
          }
          else {
            prim_time_offset[blender_instance_id] = -1;
          }
        }
      }
      else {
        custom_prim_info_offset[blender_instance_id] = {-1, -1};
      }
      hiprtTransformHeader current_header = {0};
      current_header.frameCount = 1;
      current_header.frameIndex = transform_matrix.size();
      if (ob->get_motion().size()) {
        int motion_size = ob->get_motion().size();
        assert(motion_size == 1);
        array<Transform> tfm_array = ob->get_motion();
        /* Guard against motion_size == 1 (the only case the assert above
         * permits): the original 1 / (motion_size - 1) divided by zero and
         * produced NaN frame times. */
        const float time_interval = (motion_size > 1) ? 1.0f / (float)(motion_size - 1) : 0.0f;
        current_header.frameCount = motion_size;
        vector<hiprtFrameMatrix> tfm_hiprt_mb;
        tfm_hiprt_mb.resize(motion_size);
        for (int i = 0; i < motion_size; i++) {
          get_hiprt_transform(tfm_hiprt_mb[i].matrix, tfm_array[i]);
          tfm_hiprt_mb[i].time = (float)i * time_interval;
          transform_matrix.push_back_slow(tfm_hiprt_mb[i]);
        }
      }
      else {
        if (transform_applied) {
          /* Transform is baked into the geometry, instance uses identity. */
          current_transform = identity_matrix;
        }
        get_hiprt_transform(hiprt_transform_matrix.matrix, current_transform);
        transform_matrix.push_back_slow(hiprt_transform_matrix);
      }
      transform_headers[num_instances] = current_header;
      user_instance_id[num_instances] = blender_instance_id;
      visibility[num_instances] = mask;
      hiprt_blas_ptr[num_instances] = (uint64_t)hiprt_geom_current;
      num_instances++;
    }
    /* blas_ptr is indexed by the Blender object id and keeps entries for
     * invalid geometry too (used in subsurface scattering lookups). */
    blas_ptr[blender_instance_id] = (uint64_t)hiprt_geom_current;
    blender_instance_id++;
  }
  int frame_count = transform_matrix.size();
  hiprtSceneBuildInput scene_input_ptr = {0};
  scene_input_ptr.instanceCount = num_instances;
  scene_input_ptr.frameCount = frame_count;
  scene_input_ptr.frameType = hiprtFrameTypeMatrix;
  user_instance_id.copy_to_device();
  visibility.copy_to_device();
  hiprt_blas_ptr.copy_to_device();
  blas_ptr.copy_to_device();
  transform_headers.copy_to_device();
  {
    /* Borrow the host array for the upload and detach it afterwards so the
     * device_vector does not free memory it does not own. */
    instance_transform_matrix.alloc(frame_count);
    instance_transform_matrix.host_pointer = transform_matrix.data();
    instance_transform_matrix.data_elements = sizeof(hiprtFrameMatrix);
    instance_transform_matrix.data_type = TYPE_UCHAR;
    instance_transform_matrix.data_size = frame_count;
    instance_transform_matrix.copy_to_device();
    instance_transform_matrix.host_pointer = 0;
  }
  scene_input_ptr.instanceMasks = (void *)visibility.device_pointer;
  scene_input_ptr.instanceGeometries = (void *)hiprt_blas_ptr.device_pointer;
  scene_input_ptr.instanceTransformHeaders = (void *)transform_headers.device_pointer;
  scene_input_ptr.instanceFrames = (void *)instance_transform_matrix.device_pointer;
  hiprtScene scene = 0;
  hiprtError rt_err = hiprtCreateScene(hiprt_context, &scene_input_ptr, &options, &scene);
  if (rt_err != hiprtSuccess) {
    set_error(string_printf("Failed to create TLAS"));
  }
  size_t tlas_scratch_buffer_size;
  rt_err = hiprtGetSceneBuildTemporaryBufferSize(
      hiprt_context, &scene_input_ptr, &options, &tlas_scratch_buffer_size);
  if (rt_err != hiprtSuccess) {
    set_error(string_printf("Failed to get scratch buffer size for TLAS"));
  }
  if (tlas_scratch_buffer_size > scratch_buffer_size) {
    scratch_buffer.alloc(tlas_scratch_buffer_size);
    scratch_buffer.zero_to_device();
  }
  rt_err = hiprtBuildScene(hiprt_context,
                           build_operation,
                           &scene_input_ptr,
                           &options,
                           (void *)scratch_buffer.device_pointer,
                           0,
                           scene);
  if (rt_err != hiprtSuccess) {
    set_error(string_printf("Failed to build TLAS"));
  }
  /* Scratch memory is only needed during builds. */
  scratch_buffer.free();
  scratch_buffer_size = 0;
  if (bvh->custom_prim_info.size()) {
    size_t data_size = bvh->custom_prim_info.size();
    custom_prim_info.alloc(data_size);
    custom_prim_info.host_pointer = bvh->custom_prim_info.data();
    custom_prim_info.data_elements = 2;
    custom_prim_info.data_type = TYPE_INT;
    custom_prim_info.data_size = data_size;
    custom_prim_info.copy_to_device();
    custom_prim_info.host_pointer = 0;
    custom_prim_info_offset.copy_to_device();
  }
  if (bvh->prims_time.size()) {
    size_t data_size = bvh->prims_time.size();
    prims_time.alloc(data_size);
    prims_time.host_pointer = bvh->prims_time.data();
    prims_time.data_elements = 2;
    prims_time.data_type = TYPE_FLOAT;
    prims_time.data_size = data_size;
    prims_time.copy_to_device();
    prims_time.host_pointer = 0;
    prim_time_offset.copy_to_device();
  }
  /* Patch the intersection function table pointer into every table_* slot of
   * the kernel_params global on the device. */
  size_t table_ptr_size = 0;
  hipDeviceptr_t table_device_ptr;
  hip_assert(hipModuleGetGlobal(&table_device_ptr, &table_ptr_size, hipModule, "kernel_params"));
  size_t kernel_param_offset[4];
  int table_index = 0;
  kernel_param_offset[table_index++] = offsetof(KernelParamsHIPRT, table_closest_intersect);
  kernel_param_offset[table_index++] = offsetof(KernelParamsHIPRT, table_shadow_intersect);
  kernel_param_offset[table_index++] = offsetof(KernelParamsHIPRT, table_local_intersect);
  kernel_param_offset[table_index++] = offsetof(KernelParamsHIPRT, table_volume_intersect);
  for (int index = 0; index < table_index; index++) {
    hip_assert(hipMemcpyHtoD(
        table_device_ptr + kernel_param_offset[index], &functions_table, sizeof(device_ptr)));
  }
  return scene;
}
/* Entry point for BVH builds: dispatches to the TLAS build for the top-level
 * BVH, otherwise builds the BLAS of the single geometry the BVH wraps. */
void HIPRTDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
{
  progress.set_substatus("Building HIPRT acceleration structure");

  hiprtBuildOptions options;
  options.buildFlags = hiprtBuildFlagBitPreferHighQualityBuild;

  BVHHIPRT *const bvh_rt = static_cast<BVHHIPRT *>(bvh);
  const HIPContextScope scope(this);

  if (bvh_rt->is_tlas()) {
    scene = build_tlas(bvh_rt, bvh_rt->objects, options, refit);
  }
  else {
    /* A bottom-level BVH always wraps exactly one geometry. */
    assert(bvh_rt->geometry.size() == 1);
    build_blas(bvh_rt, bvh_rt->geometry[0], options);
  }
}
CCL_NAMESPACE_END
#endif

View File

@@ -0,0 +1,126 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2023 Blender Foundation */
#pragma once
#ifdef WITH_HIPRT
# include "device/hip/device_impl.h"
# include "device/hip/kernel.h"
# include "device/hip/queue.h"
# include "device/hiprt/queue.h"
# ifdef WITH_HIP_DYNLOAD
# include "hiprtew.h"
# else
# include <hiprt/hiprt_types.h>
# endif
# include "kernel/device/hiprt/globals.h"
CCL_NAMESPACE_BEGIN
class Mesh;
class Hair;
class PointCloud;
class Geometry;
class Object;
class BVHHIPRT;
/* AMD hardware ray tracing device. Extends the regular HIP device with HIP RT
 * acceleration structure builds (build_bvh and the prepare_*_blas helpers) and
 * HIP RT-specific kernel compilation. */
class HIPRTDevice : public HIPDevice {
 public:
  virtual BVHLayoutMask get_bvh_layout_mask(const uint kernel_features) const override;
  HIPRTDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
  virtual ~HIPRTDevice();
  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
  string compile_kernel_get_common_cflags(const uint kernel_features) override;
  virtual string compile_kernel(const uint kernel_features,
                                const char *name,
                                const char *base = "hiprt") override;
  virtual bool load_kernels(const uint kernel_features) override;
  virtual void const_copy_to(const char *name, void *host, size_t size) override;
  virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
  hiprtContext get_hiprt_context()
  {
    return hiprt_context;
  }
  /* Traversal stack shared by intersection kernels; allocated lazily in
   * HIPRTDeviceQueue::enqueue on the first intersection kernel launch. */
  device_vector<int> global_stack_buffer;

 protected:
  /* Indices into the intersection filter/primitive function tables; the
   * Primitive_Type values are also used as hiprtGeometryBuildInput::geomType
   * by the prepare_*_blas helpers. */
  enum Filter_Function { Closest = 0, Shadows, Local, Volume, Max_Intersect_Filter_Function };
  enum Primitive_Type { Triangle = 0, Curve, Motion_Triangle, Point, Max_Primitive_Type };
  hiprtGeometryBuildInput prepare_triangle_blas(BVHHIPRT *bvh, Mesh *mesh);
  hiprtGeometryBuildInput prepare_curve_blas(BVHHIPRT *bvh, Hair *hair);
  hiprtGeometryBuildInput prepare_point_blas(BVHHIPRT *bvh, PointCloud *pointcloud);
  void build_blas(BVHHIPRT *bvh, Geometry *geom, hiprtBuildOptions options);
  hiprtScene build_tlas(BVHHIPRT *bvh,
                        vector<Object *> objects,
                        hiprtBuildOptions options,
                        bool refit);
  hiprtContext hiprt_context;
  hiprtScene scene;
  hiprtFuncTable functions_table;
  /* Guards the shared scratch_buffer during concurrent BLAS builds. */
  thread_mutex hiprt_mutex;
  size_t scratch_buffer_size;
  device_vector<char> scratch_buffer;
  /* The following vectors are to transfer scene information available on the host to the GPU
   * visibility, instance_transform_matrix, transform_headers, and hiprt_blas_ptr are passed to
   * hiprt to build bvh the rest are directly used in traversal functions/intersection kernels and
   * are defined on the GPU side as members of KernelParamsHIPRT struct the host memory is copied
   * to GPU through const_copy_to() function. */
  device_vector<uint32_t> visibility;
  /* instance_transform_matrix passes transform matrix of instances converted from Cycles Transform
   * format to instanceFrames member of hiprtSceneBuildInput. */
  device_vector<hiprtFrameMatrix> instance_transform_matrix;
  /* Movement over a time interval for motion blur is captured through multiple transform matrices.
   * In this case transform matrix of an instance cannot be directly retrieved by looking up
   * instance_transform_matrix give the instance id. transform_headers maps the instance id to the
   * appropriate index to retrieve instance transform matrix (frameIndex member of
   * hiprtTransformHeader). transform_headers also has the information on how many transform
   * matrices are associated with an instance (frameCount member of hiprtTransformHeader)
   * transform_headers is passed to hiprt through instanceTransformHeaders member of
   * hiprtSceneBuildInput. */
  device_vector<hiprtTransformHeader> transform_headers;
  /* Instance/object ids are not explicitly passed to hiprt.
   * HIP RT assigns the ids based on the order blas pointers are passed to it (through
   * instanceGeometries member of hiprtSceneBuildInput). If blas is absent for a particular
   * geometry (e.g. a plane), HIP RT removes that entry and in scenes with objects with no blas,
   * the instance id that hiprt returns for a hit point will not necessarily match the instance id
   * of the application. user_instance_id provides a map for retrieving original instance id from
   * what HIP RT returns as instance id. hiprt_blas_ptr is the list of all the valid blas pointers.
   * blas_ptr has all the valid pointers and null pointers and blas for any geometry can be
   * directly retrieved from this array (used in subsurface scattering). */
  device_vector<int> user_instance_id;
  device_vector<uint64_t> hiprt_blas_ptr;
  device_vector<uint64_t> blas_ptr;
  /* custom_prim_info stores custom information for custom primitives for all the primitives in a
   * scene. Primitive id that HIP RT returns is local to the geometry that was hit.
   * custom_prim_info_offset returns the offset required to add to the primitive id to retrieve
   * primitive info from custom_prim_info. */
  device_vector<int2> custom_prim_info;
  device_vector<int2> custom_prim_info_offset;
  /* prims_time stores primitive time for geometries with motion blur.
   * prim_time_offset returns the offset to add to primitive id to retrieve primitive time. */
  device_vector<float2> prims_time;
  device_vector<int> prim_time_offset;
};
CCL_NAMESPACE_END
#endif

View File

@@ -0,0 +1,68 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#ifdef WITH_HIPRT
# include "device/hiprt/queue.h"
# include "device/hip/graphics_interop.h"
# include "device/hip/kernel.h"
# include "device/hiprt/device_impl.h"
CCL_NAMESPACE_BEGIN
/* Construct a queue on the given HIPRT device. Regular kernel dispatch is
 * delegated to the base HIP queue; only enqueue() of intersection kernels is
 * specialized below. */
HIPRTDeviceQueue::HIPRTDeviceQueue(HIPRTDevice *device)
    : HIPDeviceQueue((HIPDevice *)device), hiprt_device_(device)
{
}
/* Launch a kernel. Non-intersection kernels use the plain HIP path; kernels
 * that traverse the scene additionally receive the lazily-created global
 * traversal stack buffer as an extra argument and are launched with the
 * HIP RT thread-group size. */
bool HIPRTDeviceQueue::enqueue(DeviceKernel kernel,
                               const int work_size,
                               DeviceKernelArguments const &args)
{
  if (hiprt_device_->have_error()) {
    return false;
  }

  /* Only intersection kernels need the HIP RT-specific launch setup. */
  if (!device_kernel_has_intersection(kernel)) {
    return HIPDeviceQueue::enqueue(kernel, work_size, args);
  }

  debug_enqueue_begin(kernel, work_size);

  const HIPContextScope scope(hiprt_device_);
  const HIPDeviceKernel &hip_kernel = hiprt_device_->kernels.get(kernel);

  /* Create the shared traversal stack on first use. */
  if (!hiprt_device_->global_stack_buffer.device_pointer) {
    const int max_path = num_concurrent_states(0);
    hiprt_device_->global_stack_buffer.alloc(max_path * HIPRT_SHARED_STACK_SIZE * sizeof(int));
    hiprt_device_->global_stack_buffer.zero_to_device();
  }

  /* Append the stack buffer pointer to the caller-provided arguments. */
  DeviceKernelArguments launch_args = args;
  launch_args.add(&hiprt_device_->global_stack_buffer.device_pointer);

  const int threads_per_block = HIPRT_THREAD_GROUP_SIZE;
  const int block_count = divide_up(work_size, threads_per_block);

  assert_success(hipModuleLaunchKernel(hip_kernel.function,
                                       block_count,
                                       1,
                                       1,
                                       threads_per_block,
                                       1,
                                       1,
                                       /*sharedMemBytes=*/0,
                                       hip_stream_,
                                       const_cast<void **>(launch_args.values),
                                       0),
                 "enqueue");

  return !(hiprt_device_->have_error());
}
CCL_NAMESPACE_END
#endif /* WITH_HIPRT */

View File

@@ -0,0 +1,33 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#pragma once
#ifdef WITH_HIPRT
# include "device/kernel.h"
# include "device/memory.h"
# include "device/queue.h"
# include "device/hip/queue.h"
# include "device/hip/util.h"
CCL_NAMESPACE_BEGIN
class HIPRTDevice;
/* Device queue for the HIPRT device. Behaves like the HIP queue except that
 * enqueue() gives intersection kernels access to the device's global traversal
 * stack buffer. */
class HIPRTDeviceQueue : public HIPDeviceQueue {
 public:
  HIPRTDeviceQueue(HIPRTDevice *device);
  ~HIPRTDeviceQueue() {}
  virtual bool enqueue(DeviceKernel kernel,
                       const int work_size,
                       DeviceKernelArguments const &args) override;

 protected:
  /* Owning device, kept as the derived type for HIPRT-specific members. */
  HIPRTDevice *hiprt_device_;
};
CCL_NAMESPACE_END
#endif /* WITH_HIPRT */

View File

@@ -117,6 +117,10 @@ class MultiDevice : public Device {
return BVH_LAYOUT_MULTI_METAL; return BVH_LAYOUT_MULTI_METAL;
} }
if (bvh_layout_mask == BVH_LAYOUT_HIPRT) {
return BVH_LAYOUT_MULTI_HIPRT;
}
/* When devices do not share a common BVH layout, fall back to creating one for each */ /* When devices do not share a common BVH layout, fall back to creating one for each */
const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
@@ -158,8 +162,10 @@ class MultiDevice : public Device {
assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL || bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL ||
bvh->params.bvh_layout == BVH_LAYOUT_MULTI_HIPRT ||
bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE || bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE ||
bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE); bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE ||
bvh->params.bvh_layout == BVH_LAYOUT_MULTI_HIPRT_EMBREE);
BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh); BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
bvh_multi->sub_bvhs.resize(devices.size()); bvh_multi->sub_bvhs.resize(devices.size());
@@ -184,12 +190,17 @@ class MultiDevice : public Device {
params.bvh_layout = BVH_LAYOUT_OPTIX; params.bvh_layout = BVH_LAYOUT_OPTIX;
else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL) else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL)
params.bvh_layout = BVH_LAYOUT_METAL; params.bvh_layout = BVH_LAYOUT_METAL;
else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_HIPRT)
params.bvh_layout = BVH_LAYOUT_HIPRT;
else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX : params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
BVH_LAYOUT_EMBREE; BVH_LAYOUT_EMBREE;
else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE) else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE)
params.bvh_layout = sub.device->info.type == DEVICE_METAL ? BVH_LAYOUT_METAL : params.bvh_layout = sub.device->info.type == DEVICE_METAL ? BVH_LAYOUT_METAL :
BVH_LAYOUT_EMBREE; BVH_LAYOUT_EMBREE;
else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_HIPRT_EMBREE)
params.bvh_layout = sub.device->info.type == DEVICE_HIPRT ? BVH_LAYOUT_HIPRT :
BVH_LAYOUT_EMBREE;
/* Skip building a bottom level acceleration structure for non-instanced geometry on Embree /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
* (since they are put into the top level directly, see bvh_embree.cpp) */ * (since they are put into the top level directly, see bvh_embree.cpp) */

View File

@@ -1149,6 +1149,8 @@ static const char *device_type_for_description(const DeviceType type)
return "OptiX"; return "OptiX";
case DEVICE_HIP: case DEVICE_HIP:
return "HIP"; return "HIP";
case DEVICE_HIPRT:
return "HIPRT";
case DEVICE_ONEAPI: case DEVICE_ONEAPI:
return "oneAPI"; return "oneAPI";
case DEVICE_DUMMY: case DEVICE_DUMMY:

View File

@@ -26,6 +26,10 @@ set(SRC_KERNEL_DEVICE_HIP
device/hip/kernel.cpp device/hip/kernel.cpp
) )
set(SRC_KERNEL_DEVICE_HIPRT
device/hiprt/kernel.cpp
)
set(SRC_KERNEL_DEVICE_METAL set(SRC_KERNEL_DEVICE_METAL
device/metal/kernel.metal device/metal/kernel.metal
) )
@@ -77,6 +81,13 @@ set(SRC_KERNEL_DEVICE_HIP_HEADERS
device/hip/globals.h device/hip/globals.h
) )
set(SRC_KERNEL_DEVICE_HIPRT_HEADERS
device/hiprt/bvh.h
device/hiprt/common.h
device/hiprt/globals.h
device/hiprt/hiprt_kernels.h
)
set(SRC_KERNEL_DEVICE_OPTIX_HEADERS set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
device/optix/bvh.h device/optix/bvh.h
device/optix/compat.h device/optix/compat.h
@@ -643,6 +654,86 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
cycles_set_solution_folder(cycles_kernel_hip) cycles_set_solution_folder(cycles_kernel_hip)
endif() endif()
# HIP RT module
if(WITH_CYCLES_DEVICE_HIPRT AND WITH_CYCLES_HIP_BINARIES)
set(hiprt_sources device/hiprt/kernel.cpp
${SRC_KERNEL_HEADERS}
${SRC_KERNEL_DEVICE_GPU_HEADERS}
${SRC_KERNEL_DEVICE_HIPRT_HEADERS}
${SRC_UTIL_HEADERS})
set(bitcode_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.bc)
set(hiprt_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.hipfb)
set(kernel_sources ${sources})
set(hiprt_kernel_src "/device/hiprt/kernel.cpp")
if(WIN32)
set(hiprt_compile_command ${CMAKE_COMMAND})
set(hiprt_compile_flags
-E env "HIP_PATH=${HIP_ROOT_DIR}"
${HIP_HIPCC_EXECUTABLE}.bat)
else()
set(hiprt_compile_command ${HIP_HIPCC_EXECUTABLE})
set(hiprt_compile_flags)
endif()
set(target_gpus)
foreach(arch ${CYCLES_HIP_BINARIES_ARCH})
list(APPEND target_gpus "--offload-arch=${arch}")
endforeach()
set(hiprt_compile_flags
${hiprt_compile_flags}
${target_gpus}
${HIP_HIPCC_FLAGS}
${CMAKE_CURRENT_SOURCE_DIR}${hiprt_kernel_src}
${flags}
-D CCL_NAMESPACE_BEGIN=
-D CCL_NAMESPACE_END=
-D HIPCC
-D __HIPRT__
-std=c++17
-fgpu-rdc
-c
--gpu-bundle-output
-emit-llvm
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
-I ${CMAKE_CURRENT_SOURCE_DIR}/device/hiprt
-I ${HIPRT_INCLUDE_DIR}
-Wno-parentheses-equality
-Wno-unused-value
--hipcc-func-supp
-ffast-math
-o ${bitcode_file})
if(WITH_CYCLES_DEBUG)
set(hiprt_compile_flags ${hiprt_compile_flags} -D WITH_CYCLES_DEBUG)
endif()
add_custom_command(
OUTPUT ${bitcode_file}
COMMAND ${hiprt_compile_command} ${hiprt_compile_flags}
DEPENDS ${kernel_sources})
if(WIN32)
set(hiprt_link_command ${CMAKE_COMMAND})
set(hiprt_link_flags -E env "HIP_PATH=${HIP_ROOT_DIR}"
${HIP_LINKER_EXECUTABLE})
else()
# not implemented yet
endif()
set(hiprt_link_flags
${hiprt_link_flags}
${target_gpus}
-fgpu-rdc
--hip-link
--cuda-device-only
${bitcode_file}
${HIPRT_BITCODE}
-o ${hiprt_file})
add_custom_command(
OUTPUT ${hiprt_file}
COMMAND ${hiprt_link_command} ${hiprt_link_flags}
DEPENDS ${bitcode_file})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hiprt_file}" ${CYCLES_INSTALL_PATH}/lib)
add_custom_target(cycles_kernel_hiprt ALL DEPENDS ${hiprt_file})
cycles_set_solution_folder(cycles_kernel_hiprt)
endif()
# OptiX PTX modules # OptiX PTX modules
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
@@ -1033,6 +1124,7 @@ cycles_add_library(cycles_kernel "${LIB}"
${SRC_KERNEL_DEVICE_CPU} ${SRC_KERNEL_DEVICE_CPU}
${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_CUDA}
${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP}
${SRC_KERNEL_DEVICE_HIPRT}
${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX}
${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL}
${SRC_KERNEL_HEADERS} ${SRC_KERNEL_HEADERS}
@@ -1040,6 +1132,7 @@ cycles_add_library(cycles_kernel "${LIB}"
${SRC_KERNEL_DEVICE_GPU_HEADERS} ${SRC_KERNEL_DEVICE_GPU_HEADERS}
${SRC_KERNEL_DEVICE_CUDA_HEADERS} ${SRC_KERNEL_DEVICE_CUDA_HEADERS}
${SRC_KERNEL_DEVICE_HIP_HEADERS} ${SRC_KERNEL_DEVICE_HIP_HEADERS}
${SRC_KERNEL_DEVICE_HIPRT_HEADERS}
${SRC_KERNEL_DEVICE_OPTIX_HEADERS} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
${SRC_KERNEL_DEVICE_METAL_HEADERS} ${SRC_KERNEL_DEVICE_METAL_HEADERS}
${SRC_KERNEL_DEVICE_ONEAPI_HEADERS} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}
@@ -1053,6 +1146,7 @@ source_group("device\\cpu" FILES ${SRC_KERNEL_DEVICE_CPU} ${SRC_KERNEL_DEVICE_CP
source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_CUDA_HEADERS}) source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_CUDA_HEADERS})
source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS}) source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS}) source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
source_group("device\\hiprt" FILES ${SRC_KERNEL_DEVICE_HIPRT} ${SRC_KERNEL_DEVICE_HIPRT_HEADERS})
source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}) source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS}) source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
source_group("device\\oneapi" FILES ${SRC_KERNEL_DEVICE_ONEAPI} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}) source_group("device\\oneapi" FILES ${SRC_KERNEL_DEVICE_ONEAPI} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS})
@@ -1090,6 +1184,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_CUDA_HEADERS}"
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIPRT}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hiprt)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIPRT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hiprt)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)

View File

@@ -17,6 +17,8 @@
# include "kernel/device/metal/bvh.h" # include "kernel/device/metal/bvh.h"
#elif defined(__KERNEL_OPTIX__) #elif defined(__KERNEL_OPTIX__)
# include "kernel/device/optix/bvh.h" # include "kernel/device/optix/bvh.h"
#elif defined(__HIPRT__)
# include "kernel/device/hiprt/bvh.h"
#else #else
# define __BVH2__ # define __BVH2__
#endif #endif

View File

@@ -48,6 +48,9 @@
#include "kernel/film/read.h" #include "kernel/film/read.h"
#if defined(__HIPRT__)
# include "kernel/device/hiprt/hiprt_kernels.h"
#endif
/* -------------------------------------------------------------------- /* --------------------------------------------------------------------
* Integrator. * Integrator.
*/ */
@@ -128,11 +131,13 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
} }
ccl_gpu_kernel_postfix ccl_gpu_kernel_postfix
#if !defined(__HIPRT__)
/* Intersection kernels need access to the kernel handler for specialization constants to work /* Intersection kernels need access to the kernel handler for specialization constants to work
* properly. */ * properly. */
#ifdef __KERNEL_ONEAPI__ # ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/context_intersect_begin.h" # include "kernel/device/oneapi/context_intersect_begin.h"
#endif # endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel_signature(integrator_intersect_closest, ccl_gpu_kernel_signature(integrator_intersect_closest,
@@ -191,8 +196,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
} }
ccl_gpu_kernel_postfix ccl_gpu_kernel_postfix
#ifdef __KERNEL_ONEAPI__ # ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/context_intersect_end.h" # include "kernel/device/oneapi/context_intersect_end.h"
# endif
#endif #endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
@@ -259,11 +266,13 @@ ccl_gpu_kernel_postfix
constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]]; constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
#endif #endif
#if !defined(__HIPRT__)
/* Kernels using intersections need access to the kernel handler for specialization constants to /* Kernels using intersections need access to the kernel handler for specialization constants to
* work properly. */ * work properly. */
#ifdef __KERNEL_ONEAPI__ # ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/context_intersect_begin.h" # include "kernel/device/oneapi/context_intersect_begin.h"
#endif # endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel_signature(integrator_shade_surface_raytrace, ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
@@ -276,15 +285,15 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { if (ccl_gpu_kernel_within_bounds(global_index, work_size)) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index; const int state = (path_index_array) ? path_index_array[global_index] : global_index;
#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__) # if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
KernelGlobals kg = NULL; KernelGlobals kg = NULL;
/* Workaround Ambient Occlusion and Bevel nodes not working with Metal. /* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
* Dummy offset should not affect result, but somehow fixes bug! */ * Dummy offset should not affect result, but somehow fixes bug! */
kg += __dummy_constant; kg += __dummy_constant;
ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer)); ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer));
#else # else
ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer)); ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
#endif # endif
} }
} }
ccl_gpu_kernel_postfix ccl_gpu_kernel_postfix
@@ -303,8 +312,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
} }
} }
ccl_gpu_kernel_postfix ccl_gpu_kernel_postfix
#ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/context_intersect_end.h" # ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/context_intersect_end.h"
# endif
#endif #endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)

View File

@@ -20,6 +20,10 @@
#define GPU_KERNEL_BLOCK_NUM_THREADS 1024 #define GPU_KERNEL_BLOCK_NUM_THREADS 1024
#define GPU_KERNEL_MAX_REGISTERS 64 #define GPU_KERNEL_MAX_REGISTERS 64
/* For performance tuning of hiprt kernels we might have to change the number
* that's why we don't use GPU_KERNEL_BLOCK_NUM_THREADS. */
#define GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS 1024
/* Compute number of threads per block and minimum blocks per multiprocessor /* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */ * given the maximum number of registers per thread. */
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \

View File

@@ -0,0 +1,219 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#pragma once
#include "kernel/device/hiprt/common.h"
CCL_NAMESPACE_BEGIN
ccl_device_inline bool scene_intersect_valid(ccl_private const Ray *ray)
{
  /* A ray is traceable when its origin and direction are finite and the
   * direction is not degenerate (zero length). */
  if (!isfinite_safe(ray->P.x) || !isfinite_safe(ray->D.x)) {
    return false;
  }
  return len_squared(ray->D) != 0.0f;
}
ccl_device_intersect bool scene_intersect(KernelGlobals kg,
                                          ccl_private const Ray *ray,
                                          const uint visibility,
                                          ccl_private Intersection *isect)
{
  /* Trace a ray against the scene and return the closest intersection.
   * Returns true and fills *isect on a hit; *isect is reset to "no hit"
   * values either way. */
  isect->t = ray->tmax;
  isect->u = 0.0f;
  isect->v = 0.0f;
  isect->prim = PRIM_NONE;
  isect->object = OBJECT_NONE;
  isect->type = PRIMITIVE_NONE;
  if (!scene_intersect_valid(ray)) {
    isect->t = ray->tmax;
    isect->type = PRIMITIVE_NONE;
    return false;
  }
  hiprtRay ray_hip;
  SET_HIPRT_RAY(ray_hip, ray)
  /* The payload is shared with the custom intersection callbacks, which
   * report the exact primitive type of non-triangle hits through it. */
  RayPayload payload;
  payload.self = ray->self;
  payload.kg = kg;
  payload.visibility = visibility;
  payload.prim_type = PRIMITIVE_NONE;
  payload.ray_time = ray->time;
  hiprtHit hit = {};
  GET_TRAVERSAL_STACK()
  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
    /* For opaque shadow rays any blocker terminates the ray, so the cheaper
     * any-hit traversal suffices. */
    GET_TRAVERSAL_ANY_HIT(table_closest_intersect, 0)
    hit = traversal.getNextHit();
  }
  else {
    GET_TRAVERSAL_CLOSEST_HIT(table_closest_intersect, 0)
    hit = traversal.getNextHit();
  }
  if (hit.hasHit()) {
    set_intersect_point(kg, hit, isect);
    /* Should be applied only for curves; custom primitives report their exact
     * type and index through the payload.
     * NOTE(review): the `> 1` test presumably means "any non-triangle type" —
     * confirm against the PRIMITIVE_* enum values. */
    if (isect->type > 1) {
      isect->type = payload.prim_type;
      isect->prim = hit.primID;
    }
    return true;
  }
  return false;
}
#ifdef __BVH_LOCAL__
ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
                                                ccl_private const Ray *ray,
                                                ccl_private LocalIntersection *local_isect,
                                                int local_object,
                                                ccl_private uint *lcg_state,
                                                int max_hits)
{
  /* Trace a ray against a single object's geometry only (used for subsurface
   * scattering). Hits are recorded into *local_isect by
   * local_intersection_filter(); returns true if any hit was found. */
  if (!scene_intersect_valid(ray)) {
    if (local_isect) {
      local_isect->num_hits = 0;
    }
    return false;
  }
  float3 P = ray->P;
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
  if (local_isect != NULL) {
    local_isect->num_hits = 0;
  }
  const int object_flag = kernel_data_fetch(object_flag, local_object);
  if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
    /* Transform the ray into object space, since the object's BLAS is
     * traversed directly and no instance transform is applied by HIP RT. */
# if BVH_FEATURE(BVH_MOTION)
    bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir);
# else
    bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
# endif
  }
  hiprtRay ray_hip;
  ray_hip.origin = P;
  ray_hip.direction = dir;
  ray_hip.maxT = ray->tmax;
  ray_hip.minT = ray->tmin;
  LocalPayload payload = {0};
  payload.kg = kg;
  payload.self = ray->self;
  payload.local_object = local_object;
  payload.max_hits = max_hits;
  payload.lcg_state = lcg_state;
  payload.local_isect = local_isect;
  GET_TRAVERSAL_STACK()
  /* Traverse the object's BLAS directly instead of the scene-level TLAS. */
  void *local_geom = (void *)(kernel_data_fetch(blas_ptr, local_object));
  /* We don't need custom intersection functions for SSR. */
# ifdef HIPRT_SHARED_STACK
  hiprtGeomTraversalAnyHitCustomStack<Stack> traversal(local_geom,
                                                       ray_hip,
                                                       stack,
                                                       hiprtTraversalHintDefault,
                                                       &payload,
                                                       kernel_params.table_local_intersect,
                                                       2);
# else
  hiprtGeomTraversalAnyHit traversal(
      local_geom, ray_hip, table, hiprtTraversalHintDefault, &payload);
# endif
  hiprtHit hit = traversal.getNextHit();
  return hit.hasHit();
}
#endif //__BVH_LOCAL__
#ifdef __SHADOW_RECORD_ALL__
ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
                                                     IntegratorShadowState state,
                                                     ccl_private const Ray *ray,
                                                     uint visibility,
                                                     uint max_hits,
                                                     ccl_private uint *num_recorded_hits,
                                                     ccl_private float *throughput)
{
  /* Trace a shadow ray and record all transparent intersections.
   *
   * Hits are written into the integrator shadow state by
   * shadow_intersection_filter(), which also updates *num_recorded_hits and
   * *throughput through the pointers stored in the payload. Returns true if
   * an opaque (blocking) hit terminated the traversal. */
  *throughput = 1.0f;
  *num_recorded_hits = 0;
  if (!scene_intersect_valid(ray)) {
    return false;
  }
  hiprtRay ray_hip;
  SET_HIPRT_RAY(ray_hip, ray)
  ShadowPayload payload;
  payload.kg = kg;
  payload.self = ray->self;
  payload.in_state = state;
  payload.max_hits = max_hits;
  payload.visibility = visibility;
  payload.prim_type = PRIMITIVE_TRIANGLE;
  payload.ray_time = ray->time;
  payload.num_hits = 0;
  /* The filter callback writes results directly through these pointers, so
   * no copy-back is needed after traversal (the previous pointer
   * self-assignments here were no-ops and have been removed). */
  payload.r_num_recorded_hits = num_recorded_hits;
  payload.r_throughput = throughput;
  GET_TRAVERSAL_STACK()
  GET_TRAVERSAL_ANY_HIT(table_shadow_intersect, 1)
  hiprtHit hit = traversal.getNextHit();
  return hit.hasHit();
}
#endif /* __SHADOW_RECORD_ALL__ */
#ifdef __VOLUME__
ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
                                                 ccl_private const Ray *ray,
                                                 ccl_private Intersection *isect,
                                                 const uint visibility)
{
  /* Trace a ray and return the closest intersection with a volume-containing
   * object; non-volume hits are rejected by volume_intersection_filter().
   * Returns true and fills *isect on a hit, false otherwise. */
  isect->t = ray->tmax;
  isect->u = 0.0f;
  isect->v = 0.0f;
  isect->prim = PRIM_NONE;
  isect->object = OBJECT_NONE;
  isect->type = PRIMITIVE_NONE;
  if (!scene_intersect_valid(ray)) {
    return false;
  }
  hiprtRay ray_hip;
  SET_HIPRT_RAY(ray_hip, ray)
  RayPayload payload;
  payload.self = ray->self;
  payload.kg = kg;
  payload.visibility = visibility;
  payload.prim_type = PRIMITIVE_NONE;
  payload.ray_time = ray->time;
  GET_TRAVERSAL_STACK()
  GET_TRAVERSAL_CLOSEST_HIT(table_volume_intersect, 3)
  hiprtHit hit = traversal.getNextHit();
  if (!hit.hasHit()) {
    return false;
  }
  set_intersect_point(kg, hit, isect);
  /* Should be applied only for curves; custom primitives report their exact
   * type and index through the payload.
   * NOTE(review): the `> 1` test presumably means "any non-triangle type" —
   * confirm against the PRIMITIVE_* enum values. */
  if (isect->type > 1) {
    isect->type = payload.prim_type;
    isect->prim = hit.primID;
  }
  return true;
}
#endif /* __VOLUME__ */
CCL_NAMESPACE_END

View File

@@ -0,0 +1,637 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#ifdef __HIPRT__
/* Per-ray data passed through HIP RT traversal to the custom intersection and
 * filter callbacks, for regular (closest / opaque-shadow) and volume rays. */
struct RayPayload {
  KernelGlobals kg;
  RaySelfPrimitives self; /* Primitives to skip for self-intersection. */
  uint visibility;        /* Visibility mask a hit must match. */
  int prim_type;          /* Out: exact type of a non-triangle hit. */
  float ray_time;         /* Ray time, for motion blur intersection. */
};
/* Payload for transparent-shadow traversal: shadow_intersection_filter()
 * records hits into the integrator shadow state and accumulates throughput
 * through the pointers stored here. */
struct ShadowPayload {
  KernelGlobals kg;
  RaySelfPrimitives self; /* Primitives to skip for self-intersection. */
  uint visibility;        /* Visibility mask a hit must match. */
  int prim_type;          /* Out: exact type of a non-triangle hit. */
  float ray_time;         /* Ray time, for motion blur intersection. */
  int in_state;           /* IntegratorShadowState receiving recorded hits. */
  uint max_hits;          /* Maximum number of transparent hits to consider. */
  uint num_hits;          /* Hits seen so far (including curve hits). */
  uint *r_num_recorded_hits; /* Out: number of hits recorded in the state. */
  float *r_throughput;       /* Out: accumulated shadow transparency. */
};
/* Payload for local (single-object) traversal, used for subsurface rays. */
struct LocalPayload {
  KernelGlobals kg;
  RaySelfPrimitives self; /* Primitives to skip for self-intersection. */
  int prim_type;          /* Out: exact type of a non-triangle hit. */
  float ray_time;         /* Ray time, for motion blur intersection. */
  int local_object;       /* Only intersections with this object count. */
  uint max_hits;          /* 0 = only detect whether any hit exists. */
  uint *lcg_state;        /* RNG state for hit reservoir sampling; may be NULL. */
  LocalIntersection *local_isect; /* Out: recorded hits. */
};
/* Copy a Cycles Ray into a hiprtRay (origin, direction, [minT, maxT] range). */
# define SET_HIPRT_RAY(RAY_RT, RAY) \
    RAY_RT.direction = RAY->D; \
    RAY_RT.origin = RAY->P; \
    RAY_RT.maxT = RAY->tmax; \
    RAY_RT.minT = RAY->tmin;

/* Declare the traversal stack `stack`: a fast shared-memory stack backed by
 * a per-thread global-memory buffer (sizes defined in globals.h). */
# if defined(HIPRT_SHARED_STACK)
#  define GET_TRAVERSAL_STACK() \
     Stack stack(&kg->global_stack_buffer[0], \
                 HIPRT_THREAD_STACK_SIZE, \
                 kg->shared_stack, \
                 HIPRT_SHARED_STACK_SIZE);
# else
#  define GET_TRAVERSAL_STACK()
# endif

/* Declare scene traversal objects `traversal` (with custom function table and
 * payload) and `traversal_simple` (no callbacks).
 * NOTE(review): the non-shared-stack variants below take a single
 * FUNCTION_TABLE parameter while the call sites in bvh.h pass
 * (table, ray_type); they only compile because HIPRT_SHARED_STACK is always
 * defined in globals.h — confirm before ever removing that define. */
# ifdef HIPRT_SHARED_STACK
#  define GET_TRAVERSAL_ANY_HIT(FUNCTION_TABLE, RAY_TYPE) \
    hiprtSceneTraversalAnyHitCustomStack<Stack> traversal(kernel_data.device_bvh, \
                                                          ray_hip, \
                                                          stack, \
                                                          visibility, \
                                                          hiprtTraversalHintDefault, \
                                                          &payload, \
                                                          kernel_params.FUNCTION_TABLE, \
                                                          RAY_TYPE); \
    hiprtSceneTraversalAnyHitCustomStack<Stack> traversal_simple( \
        kernel_data.device_bvh, ray_hip, stack, visibility);
#  define GET_TRAVERSAL_CLOSEST_HIT(FUNCTION_TABLE, RAY_TYPE) \
    hiprtSceneTraversalClosestCustomStack<Stack> traversal(kernel_data.device_bvh, \
                                                           ray_hip, \
                                                           stack, \
                                                           visibility, \
                                                           hiprtTraversalHintDefault, \
                                                           &payload, \
                                                           kernel_params.FUNCTION_TABLE, \
                                                           RAY_TYPE); \
    hiprtSceneTraversalClosestCustomStack<Stack> traversal_simple( \
        kernel_data.device_bvh, ray_hip, stack, visibility);
# else
#  define GET_TRAVERSAL_ANY_HIT(FUNCTION_TABLE) \
    hiprtSceneTraversalAnyHit traversal(kernel_data.device_bvh, \
                                        ray_hip, \
                                        visibility, \
                                        FUNCTION_TABLE, \
                                        hiprtTraversalHintDefault, \
                                        &payload); \
    hiprtSceneTraversalAnyHit traversal_simple(kernel_data.device_bvh, ray_hip, visibility);
#  define GET_TRAVERSAL_CLOSEST_HIT(FUNCTION_TABLE) \
    hiprtSceneTraversalClosest traversal(kernel_data.device_bvh, \
                                         ray_hip, \
                                         visibility, \
                                         FUNCTION_TABLE, \
                                         hiprtTraversalHintDefault, \
                                         &payload); \
    hiprtSceneTraversalClosest traversal_simple(kernel_data.device_bvh, ray_hip, visibility);
# endif
ccl_device_inline void set_intersect_point(KernelGlobals kg,
                                           hiprtHit &hit,
                                           ccl_private Intersection *isect)
{
  /* Convert a hiprtHit into a Cycles Intersection: map the HIP RT instance to
   * the Cycles object, and offset the geometry-local primitive index to the
   * scene-global one. */
  const int object = kernel_data_fetch(user_instance_id, hit.instanceID);
  const int offset = kernel_data_fetch(object_prim_offset, object);

  isect->object = object;
  isect->prim = hit.primID + offset;
  isect->type = kernel_data_fetch(objects, object).primitive_type;
  isect->u = hit.uv.x;
  isect->v = hit.uv.y;
  isect->t = hit.t;
}
// custom intersection functions
/* Custom HIP RT intersection for curve primitives. Also used for shadow rays:
 * ShadowPayload shares the fields read here, so casting to RayPayload works
 * for both and no separate shadow variant is needed. */
ccl_device_inline bool curve_custom_intersect(const hiprtRay &ray,
                                              const void *userPtr,
                                              void *payload,
                                              hiprtHit &hit)
{
  Intersection isect;
  RayPayload *local_payload = (RayPayload *)payload;
  KernelGlobals kg = local_payload->kg;
  int object_id = kernel_data_fetch(user_instance_id, hit.instanceID);
  int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id);
  /* data_offset.x: start of this object's (prim id, type) records in the
   * custom_prim_info array; hit.primID is local to this particular geometry,
   * so the offset maps it to the record the next stage expects.
   * data_offset.y: offset added to a geometry-local primitive index to get
   * the global one (equals kernel_data_fetch(object_prim_offset, object_id)). */
  int prim_offset = data_offset.y;
  int curve_index = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).x;
  int key_value = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).y;
  if (intersection_skip_self_shadow(local_payload->self, object_id, curve_index + prim_offset))
    return false;
  float ray_time = local_payload->ray_time;
  if ((key_value & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
    /* Motion blur: reject hits outside the segment's valid time window. */
    int time_offset = kernel_data_fetch(prim_time_offset, object_id);
    float2 prims_time = kernel_data_fetch(prims_time, hit.primID + time_offset);
    if (ray_time < prims_time.x || ray_time > prims_time.y) {
      return false;
    }
  }
  bool b_hit = curve_intersect(kg,
                               &isect,
                               ray.origin,
                               ray.direction,
                               ray.minT,
                               ray.maxT,
                               object_id,
                               curve_index + prim_offset,
                               ray_time,
                               key_value);
  if (b_hit) {
    /* Report the precise hit back to HIP RT, and the packed curve type to
     * the caller through the payload. */
    hit.uv.x = isect.u;
    hit.uv.y = isect.v;
    hit.t = isect.t;
    hit.primID = isect.prim;
    local_payload->prim_type = isect.type; /* packed_curve_type */
  }
  return b_hit;
}
/* Custom intersection for motion-blurred triangles (also used for shadow
 * rays). Compiled out (always misses) when motion blur is disabled. */
ccl_device_inline bool motion_triangle_custom_intersect(const hiprtRay &ray,
                                                        const void *userPtr,
                                                        void *payload,
                                                        hiprtHit &hit)
{
# ifdef MOTION_BLUR
  RayPayload *local_payload = (RayPayload *)payload;
  KernelGlobals kg = local_payload->kg;
  int object_id = kernel_data_fetch(user_instance_id, hit.instanceID);
  int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id);
  int prim_offset = kernel_data_fetch(object_prim_offset, object_id);
  /* Map the geometry-local primitive index to the scene-global one. */
  int prim_id_local = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).x;
  int prim_id_global = prim_id_local + prim_offset;
  if (intersection_skip_self_shadow(local_payload->self, object_id, prim_id_global))
    return false;
  Intersection isect;
  bool b_hit = motion_triangle_intersect(kg,
                                         &isect,
                                         ray.origin,
                                         ray.direction,
                                         ray.minT,
                                         ray.maxT,
                                         local_payload->ray_time,
                                         local_payload->visibility,
                                         object_id,
                                         prim_id_global,
                                         prim_id_local);
  if (b_hit) {
    /* Report the precise hit back to HIP RT. */
    hit.uv.x = isect.u;
    hit.uv.y = isect.v;
    hit.t = isect.t;
    hit.primID = isect.prim;
    local_payload->prim_type = isect.type;
  }
  return b_hit;
# else
  return false;
# endif
}
/* Custom intersection for motion-blurred triangles on local (single-object)
 * rays. Compiled out (always misses) when motion blur is disabled. */
ccl_device_inline bool motion_triangle_custom_local_intersect(const hiprtRay &ray,
                                                              const void *userPtr,
                                                              void *payload,
                                                              hiprtHit &hit)
{
# ifdef MOTION_BLUR
  LocalPayload *local_payload = (LocalPayload *)payload;
  KernelGlobals kg = local_payload->kg;
  /* Local rays intersect only the payload's object. */
  int object_id = local_payload->local_object;
  int prim_offset = kernel_data_fetch(object_prim_offset, object_id);
  int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id);
  /* Map the geometry-local primitive index to the scene-global one. */
  int prim_id_local = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).x;
  int prim_id_global = prim_id_local + prim_offset;
  if (intersection_skip_self_local(local_payload->self, prim_id_global))
    return false;
  /* Hits are recorded directly into local_isect by the intersect routine. */
  LocalIntersection *local_isect = local_payload->local_isect;
  bool b_hit = motion_triangle_intersect_local(kg,
                                               local_isect,
                                               ray.origin,
                                               ray.direction,
                                               local_payload->ray_time,
                                               object_id,
                                               prim_id_global,
                                               prim_id_local,
                                               ray.minT,
                                               ray.maxT,
                                               local_payload->lcg_state,
                                               local_payload->max_hits);
  if (b_hit) {
    local_payload->prim_type = PRIMITIVE_MOTION_TRIANGLE;
  }
  return b_hit;
# else
  return false;
# endif
}
/* Custom intersection for motion-blurred triangles on volume rays: only
 * objects with a volume shader are considered. Compiled out (always misses)
 * when motion blur is disabled. */
ccl_device_inline bool motion_triangle_custom_volume_intersect(const hiprtRay &ray,
                                                               const void *userPtr,
                                                               void *payload,
                                                               hiprtHit &hit)
{
# ifdef MOTION_BLUR
  RayPayload *local_payload = (RayPayload *)payload;
  KernelGlobals kg = local_payload->kg;
  int object_id = kernel_data_fetch(user_instance_id, hit.instanceID);
  int object_flag = kernel_data_fetch(object_flag, object_id);
  /* Volume rays only intersect objects that contain a volume. */
  if (!(object_flag & SD_OBJECT_HAS_VOLUME))
    return false;
  int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id);
  int prim_offset = kernel_data_fetch(object_prim_offset, object_id);
  /* Map the geometry-local primitive index to the scene-global one. */
  int prim_id_local = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).x;
  int prim_id_global = prim_id_local + prim_offset;
  if (intersection_skip_self_shadow(local_payload->self, object_id, prim_id_global))
    return false;
  Intersection isect;
  bool b_hit = motion_triangle_intersect(kg,
                                         &isect,
                                         ray.origin,
                                         ray.direction,
                                         ray.minT,
                                         ray.maxT,
                                         local_payload->ray_time,
                                         local_payload->visibility,
                                         object_id,
                                         prim_id_global,
                                         prim_id_local);
  if (b_hit) {
    /* Report the precise hit back to HIP RT. */
    hit.uv.x = isect.u;
    hit.uv.y = isect.v;
    hit.t = isect.t;
    hit.primID = isect.prim;
    local_payload->prim_type = isect.type;
  }
  return b_hit;
# else
  return false;
# endif
}
/* Custom HIP RT intersection for point-cloud primitives (also used for shadow
 * rays). Compiled out (always misses) when point clouds are disabled. */
ccl_device_inline bool point_custom_intersect(const hiprtRay &ray,
                                              const void *userPtr,
                                              void *payload,
                                              hiprtHit &hit)
{
# ifdef POINT_CLOUD
  RayPayload *local_payload = (RayPayload *)payload;
  KernelGlobals kg = local_payload->kg;
  int object_id = kernel_data_fetch(user_instance_id, hit.instanceID);
  int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id);
  int prim_offset = kernel_data_fetch(object_prim_offset, object_id);
  /* custom_prim_info holds (geometry-local prim index, primitive type). */
  int2 prim_info = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x);
  int prim_id_local = prim_info.x;
  int prim_id_global = prim_id_local + prim_offset;
  int type = prim_info.y;
  if (intersection_skip_self_shadow(local_payload->self, object_id, prim_id_global))
    return false;
  float ray_time = local_payload->ray_time;
  if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
    /* Motion blur: reject hits outside the primitive's valid time window. */
    int time_offset = kernel_data_fetch(prim_time_offset, object_id);
    float2 prims_time = kernel_data_fetch(prims_time, hit.primID + time_offset);
    if (ray_time < prims_time.x || ray_time > prims_time.y) {
      return false;
    }
  }
  Intersection isect;
  bool b_hit = point_intersect(kg,
                               &isect,
                               ray.origin,
                               ray.direction,
                               ray.minT,
                               ray.maxT,
                               object_id,
                               prim_id_global,
                               ray_time,
                               type);
  if (b_hit) {
    /* Report the precise hit back to HIP RT. */
    hit.uv.x = isect.u;
    hit.uv.y = isect.v;
    hit.t = isect.t;
    hit.primID = isect.prim;
    local_payload->prim_type = isect.type;
  }
  return b_hit;
# else
  return false;
# endif
}
// intersection filters
ccl_device_inline bool closest_intersection_filter(const hiprtRay &ray,
                                                   const void *data,
                                                   void *user_data,
                                                   const hiprtHit &hit)
{
  /* Filter for closest-hit traversal: returning true rejects the hit (the
   * ray hit one of its own originating primitives) and continues traversal. */
  const RayPayload *payload = (RayPayload *)user_data;
  const int object = kernel_data_fetch(user_instance_id, hit.instanceID);
  const int prim = hit.primID + kernel_data_fetch(object_prim_offset, object);
  return intersection_skip_self_shadow(payload->self, object, prim);
}
/* Filter for transparent-shadow traversal. Records up to max_record_hits
 * transparent hits into the integrator shadow state and accumulates curve
 * shadow transparency into the throughput. Returning true continues
 * traversal (hit rejected or recorded); false stops it (opaque blocker or
 * hit limit reached). */
ccl_device_inline bool shadow_intersection_filter(const hiprtRay &ray,
                                                  const void *data,
                                                  void *user_data,
                                                  const hiprtHit &hit)
{
  ShadowPayload *payload = (ShadowPayload *)user_data;
  uint num_hits = payload->num_hits;
  uint num_recorded_hits = *(payload->r_num_recorded_hits);
  uint max_hits = payload->max_hits;
  int state = payload->in_state;
  KernelGlobals kg = payload->kg;
  RaySelfPrimitives self = payload->self;
  int object = kernel_data_fetch(user_instance_id, hit.instanceID);
  int prim_offset = kernel_data_fetch(object_prim_offset, object);
  int prim = hit.primID + prim_offset;
  float ray_tmax = hit.t;
# ifdef __VISIBILITY_FLAG__
  if ((kernel_data_fetch(objects, object).visibility & payload->visibility) == 0) {
    return true; /* No hit - continue traversal. */
  }
# endif
  if (intersection_skip_self_shadow(self, object, prim)) {
    return true; /* No hit - continue traversal. */
  }
  float u = hit.uv.x;
  float v = hit.uv.y;
  int type = kernel_data_fetch(objects, object).primitive_type;
# ifdef __HAIR__
  if (type & (PRIMITIVE_CURVE_THICK | PRIMITIVE_CURVE_RIBBON)) {
    /* For curves, the per-segment type and primitive index replace the
     * object-level ones. */
    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
    type = segment.type;
    prim = segment.prim;
  }
# endif
# ifndef __TRANSPARENT_SHADOWS__
  /* No transparent shadows: any hit is an opaque blocker. */
  return false;
# else
  if (num_hits >= max_hits ||
      !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
    /* Hit limit reached or shader is opaque: the ray is blocked. */
    return false;
  }
  if (type & PRIMITIVE_CURVE) {
    /* Curve shadows accumulate transparency instead of recording individual
     * hits; stop once the accumulated throughput is negligible. */
    float throughput = *payload->r_throughput;
    throughput *= intersection_curve_shadow_transparency(kg, object, prim, type, u);
    *payload->r_throughput = throughput;
    payload->num_hits += 1;
    if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
      return false;
    }
    else {
      return true;
    }
  }
  uint record_index = num_recorded_hits;
  num_hits += 1;
  num_recorded_hits += 1;
  payload->num_hits = num_hits;
  *(payload->r_num_recorded_hits) = num_recorded_hits;
  const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
  if (record_index >= max_record_hits) {
    /* The hit array is full: replace the farthest recorded hit (nearer hits
     * matter most), or skip this hit if it is farther than all of them. */
    float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
    uint max_recorded_hit = 0;
    for (int i = 1; i < max_record_hits; i++) {
      const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
      if (isect_t > max_recorded_t) {
        max_recorded_t = isect_t;
        max_recorded_hit = i;
      }
    }
    if (ray_tmax >= max_recorded_t) {
      return true;
    }
    record_index = max_recorded_hit;
  }
  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = ray_tmax;
  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
  return true;
# endif /* __TRANSPARENT_SHADOWS__ */
}
/* Filter for local (single-object) traversal, used for subsurface rays.
 * Records up to max_hits hits in payload->local_isect, reservoir-sampling
 * with lcg_state when more hits are found than fit. Returning true continues
 * traversal, false stops it. */
ccl_device_inline bool local_intersection_filter(const hiprtRay &ray,
                                                 const void *data,
                                                 void *user_data,
                                                 const hiprtHit &hit)
{
# ifdef __BVH_LOCAL__
  LocalPayload *payload = (LocalPayload *)user_data;
  KernelGlobals kg = payload->kg;
  int object_id = payload->local_object;
  int prim_offset = kernel_data_fetch(object_prim_offset, object_id);
  int prim = hit.primID + prim_offset;
#  ifndef __RAY_OFFSET__
  if (intersection_skip_self_local(payload->self, prim)) {
    return true; /* Continue search. */
  }
#  endif
  uint max_hits = payload->max_hits;
  if (max_hits == 0) {
    /* Special case: only detect whether anything is hit, record nothing. */
    return false; /* Stop search. */
  }
  int hit_index = 0;
  if (payload->lcg_state) {
    /* Record a random subset of hits; reject duplicates at identical t. */
    for (int i = min(max_hits, payload->local_isect->num_hits) - 1; i >= 0; --i) {
      if (hit.t == payload->local_isect->hits[i].t) {
        return true; /* Continue search. */
      }
    }
    hit_index = payload->local_isect->num_hits++;
    if (payload->local_isect->num_hits > max_hits) {
      /* Reservoir sampling: replace a random already-recorded hit. */
      hit_index = lcg_step_uint(payload->lcg_state) % payload->local_isect->num_hits;
      if (hit_index >= max_hits) {
        return true; /* Continue search. */
      }
    }
  }
  else {
    /* Record only the closest hit. */
    if (payload->local_isect->num_hits && hit.t > payload->local_isect->hits[0].t) {
      return true;
    }
    payload->local_isect->num_hits = 1;
  }
  Intersection *isect = &payload->local_isect->hits[hit_index];
  isect->t = hit.t;
  isect->prim = prim;
  isect->object = object_id;
  isect->type = PRIMITIVE_TRIANGLE; /* Local BVH only contains triangles. */
  isect->u = hit.uv.x;
  isect->v = hit.uv.y;
  payload->local_isect->Ng[hit_index] = hit.normal;
  return true;
# else
  /* Fix: previously the function fell off the end without a return value when
   * __BVH_LOCAL__ is disabled (undefined behavior). The filter is never
   * registered in that configuration, so reject the hit and continue. */
  return true;
# endif
}
ccl_device_inline bool volume_intersection_filter(const hiprtRay &ray,
                                                  const void *data,
                                                  void *user_data,
                                                  const hiprtHit &hit)
{
  /* Filter for volume traversal: returning true rejects the hit and continues
   * traversal. Self-hits and objects without a volume are rejected. */
  const RayPayload *payload = (RayPayload *)user_data;
  const int object = kernel_data_fetch(user_instance_id, hit.instanceID);
  const int prim = hit.primID + kernel_data_fetch(object_prim_offset, object);
  const int flags = kernel_data_fetch(object_flag, object);

  if (intersection_skip_self(payload->self, object, prim)) {
    return true;
  }
  return (flags & SD_OBJECT_HAS_VOLUME) == 0;
}
/* Entry point HIP RT calls for custom (non-triangle) primitive intersection.
 * The (geomType, rayType) pair indexes the function table configured on the
 * host side; see the *_Intersect_* enum values. */
HIPRT_DEVICE bool intersectFunc(u32 geomType,
                                u32 rayType,
                                const hiprtFuncTableHeader &tableHeader,
                                const hiprtRay &ray,
                                void *payload,
                                hiprtHit &hit)
{
  const u32 index = tableHeader.numGeomTypes * rayType + geomType;
  /* NOTE(review): this reads filterFuncData while filterFunc() reads
   * intersectFuncData — the two look swapped. Harmless for now because none
   * of the callbacks use the data pointer, but confirm against the HIP RT
   * function-table API. */
  const void *data = tableHeader.funcDataSets[index].filterFuncData;
  switch (index) {
    case Curve_Intersect_Function:
    case Curve_Intersect_Shadow:
      return curve_custom_intersect(ray, data, payload, hit);
    case Motion_Triangle_Intersect_Function:
    case Motion_Triangle_Intersect_Shadow:
      return motion_triangle_custom_intersect(ray, data, payload, hit);
    case Motion_Triangle_Intersect_Local:
      return motion_triangle_custom_local_intersect(ray, data, payload, hit);
    case Motion_Triangle_Intersect_Volume:
      return motion_triangle_custom_volume_intersect(ray, data, payload, hit);
    case Point_Intersect_Function:
    case Point_Intersect_Shadow:
      return point_custom_intersect(ray, data, payload, hit);
    default:
      break;
  }
  /* Unknown table entry: report no intersection. */
  return false;
}
/* Hit-filter dispatch. HIP RT calls this after an intersection is found; the selected
 * filter decides whether the hit is accepted and whether traversal continues. Unhandled
 * indices return false. */
HIPRT_DEVICE bool filterFunc(u32 geomType,
                             u32 rayType,
                             const hiprtFuncTableHeader &tableHeader,
                             const hiprtRay &ray,
                             void *payload,
                             const hiprtHit &hit)
{
  const u32 slot = tableHeader.numGeomTypes * rayType + geomType;
  const void *data = tableHeader.funcDataSets[slot].intersectFuncData;

  /* Closest-hit filtering is only needed for hardware triangles. */
  if (slot == Triangle_Filter_Closest) {
    return closest_intersection_filter(ray, data, payload, hit);
  }
  /* All primitive types share one shadow filter. */
  if (slot == Triangle_Filter_Shadow || slot == Curve_Filter_Shadow ||
      slot == Motion_Triangle_Filter_Shadow || slot == Point_Filter_Shadow)
  {
    return shadow_intersection_filter(ray, data, payload, hit);
  }
  /* Local (SSS) and volume filters apply to triangles and motion triangles only. */
  if (slot == Triangle_Filter_Local || slot == Motion_Triangle_Filter_Local) {
    return local_intersection_filter(ray, data, payload, hit);
  }
  if (slot == Triangle_Filter_Volume || slot == Motion_Triangle_Filter_Volume) {
    return volume_intersection_filter(ray, data, payload, hit);
  }
  return false;
}
#endif

View File

@@ -0,0 +1,158 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#pragma once
#include "kernel/types.h"
#include "kernel/integrator/state.h"
#include "kernel/util/profiling.h"
#define HIPRT_SHARED_STACK
/* The size of the global stack available to each thread (memory reserved for each thread in
 * global_stack_buffer). */
#define HIPRT_THREAD_STACK_SIZE 64
/* LDS (Local Data Storage) allocation for each thread, the number is obtained empirically. */
#define HIPRT_SHARED_STACK_SIZE 24
/* HIPRT_THREAD_GROUP_SIZE is the number of threads per work group for intersection kernels.
 * The default number of threads per workgroup is 1024; however, since HIP RT intersection
 * kernels use local memory, and the local memory size in those kernels scales up with the
 * number of threads, the thread count is scaled down to 256 to avoid exceeding the maximum
 * local memory and to strike a balance between memory access and the number of waves.
 *
 * Total local stack size would be number of threads * HIPRT_SHARED_STACK_SIZE. */
#define HIPRT_THREAD_GROUP_SIZE 256
CCL_NAMESPACE_BEGIN
/* Per-kernel globals for the HIP RT device. BVH traversal needs scratch stacks; pointers to
 * both the global-memory and the shared-memory stack are carried here. */
struct KernelGlobalsGPU {
  /* Per-thread traversal stack in global memory
   * (HIPRT_THREAD_STACK_SIZE ints reserved per thread in global_stack_buffer). */
  int *global_stack_buffer;
#ifdef HIPRT_SHARED_STACK
  /* Workgroup's shared-memory stack, wired up by HIPRT_INIT_KERNEL_GLOBAL(). */
  int *shared_stack;
#endif
};
typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
#if defined(HIPRT_SHARED_STACK)
/* Sets up KernelGlobals at the top of a kernel body: allocates the workgroup's shared-memory
 * traversal stack and wires both stacks into `kg` so intersection functions can reach them.
 * A `stack_buffer` kernel argument must be in scope at the expansion site. */
#  define HIPRT_INIT_KERNEL_GLOBAL() \
    ccl_gpu_shared int shared_stack[HIPRT_SHARED_STACK_SIZE * HIPRT_THREAD_GROUP_SIZE]; \
    ccl_global KernelGlobalsGPU kg_gpu; \
    KernelGlobals kg = &kg_gpu; \
    kg->shared_stack = &shared_stack[0]; \
    kg->global_stack_buffer = stack_buffer;
#else
/* NOTE(review): this fallback dereferences a NULL `kg`. It is currently dead code because
 * HIPRT_SHARED_STACK is always defined above — confirm before ever enabling this path. */
#  define HIPRT_INIT_KERNEL_GLOBAL() \
    KernelGlobals kg = NULL; \
    kg->global_stack_buffer = stack_buffer;
#endif
/* Launch parameters for HIP RT kernels; an instance lives in device constant memory as
 * `kernel_params` (see below in this file). */
struct KernelParamsHIPRT {
  KernelData data;
#define KERNEL_DATA_ARRAY(type, name) const type *name;
  /* Data arrays specific to the HIP RT device. */
  KERNEL_DATA_ARRAY(int, user_instance_id)
  KERNEL_DATA_ARRAY(uint64_t, blas_ptr)
  KERNEL_DATA_ARRAY(int2, custom_prim_info)
  KERNEL_DATA_ARRAY(int2, custom_prim_info_offset)
  KERNEL_DATA_ARRAY(float2, prims_time)
  KERNEL_DATA_ARRAY(int, prim_time_offset)
  /* Shared scene data arrays; expands to one pointer member per array. */
#include "kernel/data_arrays.h"

  /* Integrator state */
  IntegratorStateGPU integrator_state;

  /* Per-ray-type tables of custom intersection and filter functions. */
  hiprtFuncTable table_closest_intersect;
  hiprtFuncTable table_shadow_intersect;
  hiprtFuncTable table_local_intersect;
  hiprtFuncTable table_volume_intersect;
};
/* Intersection_Function_Table_Index defines index values used to retrieve custom
 * intersection functions from the function table. The index is computed as
 * `numGeomTypes * rayType + geomType` (see intersectFunc). */
enum Intersection_Function_Table_Index {
  /* Triangles use the intersection function provided by HIP RT and don't need custom
   * intersection functions. */

  /* Custom intersection functions for closest intersect. */
  Curve_Intersect_Function = 1,       /* Custom intersection for curves. */
  Motion_Triangle_Intersect_Function, /* Custom intersection for triangles with vertex
                                       * motion blur attributes. */
  Point_Intersect_Function,           /* Custom intersection for point cloud. */

  /* Custom intersection functions for shadow rendering are the same as the functions for
   * closest intersect; however, the table indices are different. */
  Triangle_Intersect_Shadow_None,
  Curve_Intersect_Shadow,
  Motion_Triangle_Intersect_Shadow,
  Point_Intersect_Shadow,

  /* Custom intersection functions for subsurface scattering.
   * Only motion triangles have a valid custom intersection function. */
  Triangle_Intersect_Local_None,
  Curve_Intersect_Local_None,
  Motion_Triangle_Intersect_Local,
  Point_Intersect_Local_None,

  /* Custom intersection functions for volume rendering.
   * Only motion triangles have a valid custom intersection function. */
  Triangle_Intersect_Volume_None,
  Curve_Intersect_Volume_None,
  Motion_Triangle_Intersect_Volume,
  Point_Intersect_Volume_None,
};
/* Filter functions test whether a hit should be accepted or not, and whether traversal
 * should stop or continue. Indexing follows the same scheme as
 * Intersection_Function_Table_Index (see filterFunc). */
enum Filter_Function_Table_Index {
  Triangle_Filter_Closest = 0, /* Filter function for triangles for closest intersect; no
                                * custom intersection function is needed. */
  Curve_Filter_Opaque_None,    /* No filter function is needed; everything is handled in
                                * the intersection function. */
  Motion_Triangle_Filter_Opaque_None, /* No filter function is needed; handled in the
                                       * intersection function. */
  Point_Filter_Opaque_Non, /* No filter function is needed.
                            * NOTE(review): identifier is missing its trailing 'e'
                            * ("_None"); renaming would be an interface change, so it is
                            * left as-is here. */

  /* Filter function for all primitives for shadow intersection.
   * All primitives use the same function, but each has a different index in the table. */
  Triangle_Filter_Shadow,
  Curve_Filter_Shadow,
  Motion_Triangle_Filter_Shadow,
  Point_Filter_Shadow,

  /* Filter functions for subsurface scattering. Triangles and motion triangles need a
   * function assignment; their indices point to the same function. Points and curves don't
   * need any function since subsurface scattering is not applied to either. */
  Triangle_Filter_Local, /* Filter function for triangles. */
  Curve_Filter_Local_None, /* Subsurface scattering is not applied on curves; no filter
                            * function is needed. */
  Motion_Triangle_Filter_Local,
  Point_Filter_Local_None,

  /* Filter functions for volume rendering.
   * Volume rendering only applies to triangles and motion triangles, and both use the same
   * filter function. */
  Triangle_Filter_Volume,
  Curve_Filter_Volume_None,
  Motion_Triangle_Filter_Volume,
  Point_Filter_Volume_None,
};
#ifdef __KERNEL_GPU__
/* Device-side kernel parameters, placed in constant memory. */
__constant__ KernelParamsHIPRT kernel_params;
# ifdef HIPRT_SHARED_STACK
/* Traversal stack type provided by HIP RT when the shared-memory stack is used. */
typedef hiprtGlobalStack Stack;
# endif
#endif

/* Abstraction macros so device-independent kernel code can access data uniformly. */
#define kernel_data kernel_params.data
#define kernel_data_fetch(name, index) kernel_params.name[(index)]
#define kernel_data_array(name) (kernel_params.name)
#define kernel_integrator_state kernel_params.integrator_state

View File

@@ -0,0 +1,101 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2022 Blender Foundation */
#ifdef __HIPRT__
/* Closest-hit intersection kernel; one thread per scheduled path. */
ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS)
    ccl_gpu_kernel_signature(integrator_intersect_closest,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size,
                             ccl_global int *stack_buffer)
{
  const int thread_index = ccl_gpu_global_id_x();
  /* Threads past the scheduled work count do nothing. */
  if (thread_index < work_size) {
    HIPRT_INIT_KERNEL_GLOBAL()
    /* Remap through `path_index_array` when one is provided. */
    int state = thread_index;
    if (path_index_array) {
      state = path_index_array[thread_index];
    }
    ccl_gpu_kernel_call(integrator_intersect_closest(kg, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix
/* Shadow-ray intersection kernel; one thread per scheduled path. */
ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS)
    ccl_gpu_kernel_signature(integrator_intersect_shadow,
                             ccl_global const int *path_index_array,
                             const int work_size,
                             ccl_global int *stack_buffer)
{
  const int thread_index = ccl_gpu_global_id_x();
  /* Threads past the scheduled work count do nothing. */
  if (thread_index < work_size) {
    HIPRT_INIT_KERNEL_GLOBAL()
    /* Remap through `path_index_array` when one is provided. */
    int state = thread_index;
    if (path_index_array) {
      state = path_index_array[thread_index];
    }
    ccl_gpu_kernel_call(integrator_intersect_shadow(kg, state));
  }
}
ccl_gpu_kernel_postfix
/* Subsurface-scattering intersection kernel; one thread per scheduled path. */
ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS)
    ccl_gpu_kernel_signature(integrator_intersect_subsurface,
                             ccl_global const int *path_index_array,
                             const int work_size,
                             ccl_global int *stack_buffer)
{
  const int thread_index = ccl_gpu_global_id_x();
  /* Threads past the scheduled work count do nothing. */
  if (thread_index < work_size) {
    HIPRT_INIT_KERNEL_GLOBAL()
    /* Remap through `path_index_array` when one is provided. */
    int state = thread_index;
    if (path_index_array) {
      state = path_index_array[thread_index];
    }
    ccl_gpu_kernel_call(integrator_intersect_subsurface(kg, state));
  }
}
ccl_gpu_kernel_postfix
/* Volume-stack intersection kernel; one thread per scheduled path. */
ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS)
    ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
                             ccl_global const int *path_index_array,
                             const int work_size,
                             ccl_global int *stack_buffer)
{
  const int thread_index = ccl_gpu_global_id_x();
  /* Threads past the scheduled work count do nothing. */
  if (thread_index < work_size) {
    HIPRT_INIT_KERNEL_GLOBAL()
    /* Remap through `path_index_array` when one is provided. */
    int state = thread_index;
    if (path_index_array) {
      state = path_index_array[thread_index];
    }
    ccl_gpu_kernel_call(integrator_intersect_volume_stack(kg, state));
  }
}
ccl_gpu_kernel_postfix
/* Surface shading kernel for shaders that ray-trace (AO/bevel); one thread per path. */
ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS)
    ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size,
                             ccl_global int *stack_buffer)
{
  const int thread_index = ccl_gpu_global_id_x();
  /* Threads past the scheduled work count do nothing. */
  if (thread_index < work_size) {
    HIPRT_INIT_KERNEL_GLOBAL()
    /* Remap through `path_index_array` when one is provided. */
    int state = thread_index;
    if (path_index_array) {
      state = path_index_array[thread_index];
    }
    ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix
/* Surface shading kernel for manifold next-event estimation; one thread per path. */
ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS)
    ccl_gpu_kernel_signature(integrator_shade_surface_mnee,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size,
                             ccl_global int *stack_buffer)
{
  const int thread_index = ccl_gpu_global_id_x();
  /* Threads past the scheduled work count do nothing. */
  if (thread_index < work_size) {
    HIPRT_INIT_KERNEL_GLOBAL()
    /* Remap through `path_index_array` when one is provided. */
    int state = thread_index;
    if (path_index_array) {
      state = path_index_array[thread_index];
    }
    ccl_gpu_kernel_call(integrator_shade_surface_mnee(kg, state, render_buffer));
  }
}
ccl_gpu_kernel_postfix
#endif /* __HIPRT__ */

View File

@@ -0,0 +1,16 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2011-2023 Blender Foundation */
#ifdef __HIP_DEVICE_COMPILE__
# include "kernel/device/hip/compat.h"
# include "kernel/device/hip/config.h"
# include <hiprt/hiprt_device.h>
# include "kernel/device/hiprt/globals.h"
# include "kernel/device/gpu/image.h"
# include "kernel/device/gpu/kernel.h"
#endif

View File

@@ -1169,10 +1169,14 @@ typedef enum KernelBVHLayout {
BVH_LAYOUT_METAL = (1 << 5), BVH_LAYOUT_METAL = (1 << 5),
BVH_LAYOUT_MULTI_METAL = (1 << 6), BVH_LAYOUT_MULTI_METAL = (1 << 6),
BVH_LAYOUT_MULTI_METAL_EMBREE = (1 << 7), BVH_LAYOUT_MULTI_METAL_EMBREE = (1 << 7),
BVH_LAYOUT_HIPRT = (1 << 8),
BVH_LAYOUT_MULTI_HIPRT = (1 << 9),
BVH_LAYOUT_MULTI_HIPRT_EMBREE = (1 << 10),
/* Default BVH layout to use for CPU. */ /* Default BVH layout to use for CPU. */
BVH_LAYOUT_AUTO = BVH_LAYOUT_EMBREE, BVH_LAYOUT_AUTO = BVH_LAYOUT_EMBREE,
BVH_LAYOUT_ALL = BVH_LAYOUT_BVH2 | BVH_LAYOUT_EMBREE | BVH_LAYOUT_OPTIX | BVH_LAYOUT_METAL, BVH_LAYOUT_ALL = BVH_LAYOUT_BVH2 | BVH_LAYOUT_EMBREE | BVH_LAYOUT_OPTIX | BVH_LAYOUT_METAL |
BVH_LAYOUT_HIPRT | BVH_LAYOUT_MULTI_HIPRT | BVH_LAYOUT_MULTI_HIPRT_EMBREE,
} KernelBVHLayout; } KernelBVHLayout;
/* Specialized struct that can become constants in dynamic compilation. */ /* Specialized struct that can become constants in dynamic compilation. */
@@ -1225,6 +1229,8 @@ typedef struct KernelData {
OptixTraversableHandle device_bvh; OptixTraversableHandle device_bvh;
#elif defined __METALRT__ #elif defined __METALRT__
metalrt_as_type device_bvh; metalrt_as_type device_bvh;
#elif defined(__HIPRT__)
void *device_bvh;
#else #else
# ifdef __EMBREE__ # ifdef __EMBREE__
RTCScene device_bvh; RTCScene device_bvh;

View File

@@ -114,7 +114,9 @@ bool Geometry::need_build_bvh(BVHLayout layout) const
{ {
return is_instanced() || layout == BVH_LAYOUT_OPTIX || layout == BVH_LAYOUT_MULTI_OPTIX || return is_instanced() || layout == BVH_LAYOUT_OPTIX || layout == BVH_LAYOUT_MULTI_OPTIX ||
layout == BVH_LAYOUT_METAL || layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE || layout == BVH_LAYOUT_METAL || layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE ||
layout == BVH_LAYOUT_MULTI_METAL || layout == BVH_LAYOUT_MULTI_METAL_EMBREE; layout == BVH_LAYOUT_MULTI_METAL || layout == BVH_LAYOUT_MULTI_METAL_EMBREE ||
layout == BVH_LAYOUT_HIPRT || layout == BVH_LAYOUT_MULTI_HIPRT ||
layout == BVH_LAYOUT_MULTI_HIPRT_EMBREE;
} }
bool Geometry::is_instanced() const bool Geometry::is_instanced() const

View File

@@ -597,7 +597,8 @@ void ObjectManager::device_update_prim_offsets(Device *device, DeviceScene *dsce
if (!scene->integrator->get_use_light_tree()) { if (!scene->integrator->get_use_light_tree()) {
BVHLayoutMask layout_mask = device->get_bvh_layout_mask(dscene->data.kernel_features); BVHLayoutMask layout_mask = device->get_bvh_layout_mask(dscene->data.kernel_features);
if (layout_mask != BVH_LAYOUT_METAL && layout_mask != BVH_LAYOUT_MULTI_METAL && if (layout_mask != BVH_LAYOUT_METAL && layout_mask != BVH_LAYOUT_MULTI_METAL &&
layout_mask != BVH_LAYOUT_MULTI_METAL_EMBREE) { layout_mask != BVH_LAYOUT_MULTI_METAL_EMBREE && layout_mask != BVH_LAYOUT_HIPRT &&
layout_mask != BVH_LAYOUT_MULTI_HIPRT && layout_mask != BVH_LAYOUT_MULTI_HIPRT_EMBREE) {
return; return;
} }
} }