diff --git a/CMakeLists.txt b/CMakeLists.txt index 8cd3afc2e12..a6a4136a4cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -525,6 +525,12 @@ if(NOT APPLE) set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for") mark_as_advanced(WITH_CYCLES_DEVICE_HIP) mark_as_advanced(CYCLES_HIP_BINARIES_ARCH) + + # HIPRT is only available on Windows for now. + if(WIN32) + option(WITH_CYCLES_DEVICE_HIPRT "Enable Cycles AMD HIPRT support" OFF) + mark_as_advanced(WITH_CYCLES_DEVICE_HIPRT) + endif() endif() # Apple Metal @@ -1981,10 +1987,13 @@ if(FIRST_RUN) info_cfg_option(WITH_CYCLES_DEVICE_OPTIX) info_cfg_option(WITH_CYCLES_DEVICE_CUDA) info_cfg_option(WITH_CYCLES_CUDA_BINARIES) - info_cfg_option(WITH_CYCLES_DEVICE_HIP) - info_cfg_option(WITH_CYCLES_HIP_BINARIES) info_cfg_option(WITH_CYCLES_DEVICE_ONEAPI) info_cfg_option(WITH_CYCLES_ONEAPI_BINARIES) + info_cfg_option(WITH_CYCLES_DEVICE_HIP) + info_cfg_option(WITH_CYCLES_HIP_BINARIES) + endif() + if(WIN32) + info_cfg_option(WITH_CYCLES_DEVICE_HIPRT) endif() endif() diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index f28c14ad4cc..68316e3cd9b 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -249,6 +249,13 @@ endif() if(WITH_CYCLES_DEVICE_HIP) add_definitions(-DWITH_HIP) + if(WITH_CYCLES_DEVICE_HIPRT) + include_directories( + ${HIPRT_INCLUDE_DIR} + ) + add_definitions(-DWITH_HIPRT) + endif() + if(WITH_HIP_DYNLOAD) include_directories( ../../extern/hipew/include diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 9d16b4983a7..0d236ee4b12 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1507,7 +1507,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, 
has_optix, has_hip, has_metal, has_oneapi = _cycles.get_device_types() + has_cuda, has_optix, has_hip, has_metal, has_oneapi, has_hiprt = _cycles.get_device_types() list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: @@ -1544,6 +1544,13 @@ class CyclesPreferences(bpy.types.AddonPreferences): default=False, ) + use_hiprt: BoolProperty( + name="HIP RT (Experimental)", + description="HIP RT enables AMD hardware ray tracing on RDNA2 and above, with shader fallback on older cards. " + "This feature is experimental and some scenes may render incorrectly", + default=False, + ) + use_oneapirt: BoolProperty( name="Embree on GPU (Experimental)", description="Embree GPU execution will allow to use hardware ray tracing on Intel GPUs, which will provide better performance. " @@ -1770,7 +1777,13 @@ class CyclesPreferences(bpy.types.AddonPreferences): col.prop(self, "kernel_optimization_level") col.prop(self, "use_metalrt") - if compute_device_type == 'ONEAPI' and _cycles.with_embree_gpu: + if compute_device_type == 'HIP': + has_cuda, has_optix, has_hip, has_metal, has_oneapi, has_hiprt = _cycles.get_device_types() + row = layout.row() + row.enabled = has_hiprt + row.prop(self, "use_hiprt") + + elif compute_device_type == 'ONEAPI' and _cycles.with_embree_gpu: row = layout.row() row.use_property_split = True row.prop(self, "use_oneapirt") diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp index b5fc5b0df44..4e1f77eb9cb 100644 --- a/intern/cycles/blender/device.cpp +++ b/intern/cycles/blender/device.cpp @@ -124,6 +124,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, info.use_hardware_raytracing = false; } + if (info.type == DEVICE_HIP && !get_boolean(cpreferences, "use_hiprt")) { + info.use_hardware_raytracing = false; + } + /* There is an accumulative logic here, because Multi-devices are support only for * the same backend + CPU in Blender right now, and both oneAPI and Metal have a * global boolean backend 
setting (see above) for enabling/disabling HW RT, diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp index 682d7075455..1afd1e9bd1c 100644 --- a/intern/cycles/blender/python.cpp +++ b/intern/cycles/blender/python.cpp @@ -876,20 +876,23 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector device_types = Device::available_types(); - bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false, has_oneapi = false; + bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false, has_oneapi = false, + has_hiprt = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); has_optix |= (device_type == DEVICE_OPTIX); has_hip |= (device_type == DEVICE_HIP); has_metal |= (device_type == DEVICE_METAL); has_oneapi |= (device_type == DEVICE_ONEAPI); + has_hiprt |= (device_type == DEVICE_HIPRT); } - PyObject *list = PyTuple_New(5); + PyObject *list = PyTuple_New(6); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix)); PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip)); PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal)); PyTuple_SET_ITEM(list, 4, PyBool_FromLong(has_oneapi)); + PyTuple_SET_ITEM(list, 5, PyBool_FromLong(has_hiprt)); return list; } diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index 8ccc38660b3..f3242e6de97 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -14,6 +14,7 @@ set(SRC binning.cpp build.cpp embree.cpp + hiprt.cpp multi.cpp node.cpp optix.cpp @@ -39,6 +40,7 @@ set(SRC_HEADERS binning.h build.h embree.h + hiprt.h multi.h node.h optix.h diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 2a7bbfe968b..6db59e81a89 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -6,6 
+6,7 @@ #include "bvh/bvh2.h" #include "bvh/embree.h" +#include "bvh/hiprt.h" #include "bvh/metal.h" #include "bvh/multi.h" #include "bvh/optix.h" @@ -30,10 +31,14 @@ const char *bvh_layout_name(BVHLayout layout) return "OPTIX"; case BVH_LAYOUT_METAL: return "METAL"; + case BVH_LAYOUT_HIPRT: + return "HIPRT"; case BVH_LAYOUT_MULTI_OPTIX: case BVH_LAYOUT_MULTI_METAL: + case BVH_LAYOUT_MULTI_HIPRT: case BVH_LAYOUT_MULTI_OPTIX_EMBREE: case BVH_LAYOUT_MULTI_METAL_EMBREE: + case BVH_LAYOUT_MULTI_HIPRT_EMBREE: return "MULTI"; case BVH_LAYOUT_ALL: return "ALL"; @@ -101,11 +106,20 @@ BVH *BVH::create(const BVHParams ¶ms, #else (void)device; break; +#endif + case BVH_LAYOUT_HIPRT: +#ifdef WITH_HIPRT + return new BVHHIPRT(params, geometry, objects, device); +#else + (void)device; + break; #endif case BVH_LAYOUT_MULTI_OPTIX: case BVH_LAYOUT_MULTI_METAL: + case BVH_LAYOUT_MULTI_HIPRT: case BVH_LAYOUT_MULTI_OPTIX_EMBREE: case BVH_LAYOUT_MULTI_METAL_EMBREE: + case BVH_LAYOUT_MULTI_HIPRT_EMBREE: return new BVHMulti(params, geometry, objects); case BVH_LAYOUT_NONE: case BVH_LAYOUT_ALL: diff --git a/intern/cycles/bvh/hiprt.cpp b/intern/cycles/bvh/hiprt.cpp new file mode 100644 index 00000000000..05f317ee70e --- /dev/null +++ b/intern/cycles/bvh/hiprt.cpp @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2023 Blender Foundation */ + +#ifdef WITH_HIPRT + +# include "bvh/hiprt.h" + +# include "scene/mesh.h" +# include "scene/object.h" + +# include "util/foreach.h" +# include "util/progress.h" + +# include "device/hiprt/device_impl.h" + +CCL_NAMESPACE_BEGIN + +BVHHIPRT::BVHHIPRT(const BVHParams ¶ms, + const vector &geometry, + const vector &objects, + Device *in_device) + : BVH(params, geometry, objects), + hiprt_geom(0), + custom_primitive_bound(in_device, "Custom Primitive Bound", MEM_READ_ONLY), + triangle_index(in_device, "HIPRT Triangle Index", MEM_READ_ONLY), + vertex_data(in_device, "vertex_data", MEM_READ_ONLY), + device(in_device) +{ + triangle_mesh = 
{0}; + custom_prim_aabb = {0}; +} + +BVHHIPRT::~BVHHIPRT() +{ + HIPRTDevice *hiprt_device = static_cast(device); + hiprtContext hiprt_context = hiprt_device->get_hiprt_context(); + custom_primitive_bound.free(); + triangle_index.free(); + vertex_data.free(); + hiprtDestroyGeometry(hiprt_context, hiprt_geom); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/bvh/hiprt.h b/intern/cycles/bvh/hiprt.h new file mode 100644 index 00000000000..fd155a1ae0c --- /dev/null +++ b/intern/cycles/bvh/hiprt.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2023 Blender Foundation */ + +#ifdef WITH_HIPRT + +# pragma once + +# include "bvh/bvh.h" +# include "bvh/params.h" + +# ifdef WITH_HIP_DYNLOAD +# include "hiprtew.h" +# else +# include +# endif + +# include "device/memory.h" + +CCL_NAMESPACE_BEGIN + +class BVHHIPRT : public BVH { + public: + friend class HIPDevice; + + bool is_tlas() + { + return params.top_level; + } + + hiprtGeometry hiprt_geom; + hiprtTriangleMeshPrimitive triangle_mesh; + hiprtAABBListPrimitive custom_prim_aabb; + hiprtGeometryBuildInput geom_input; + + vector custom_prim_info; /* x: prim_id, y: prim_type */ + vector prims_time; + + /* Custom primitives. 
*/ + device_vector custom_primitive_bound; + device_vector triangle_index; + device_vector vertex_data; + + protected: + friend class BVH; + BVHHIPRT(const BVHParams ¶ms, + const vector &geometry, + const vector &objects, + Device *in_device); + + virtual ~BVHHIPRT(); + + private: + Device *device; +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 536639b3154..e43275f2398 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -41,20 +41,32 @@ endif() # HIP ########################################################################### -if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP) - if(UNIX) - # Disabled until there is a HIP 5.5 release for Linux. - set(WITH_CYCLES_HIP_BINARIES OFF) - message(STATUS "HIP temporarily disabled due to compiler bugs") - else() - # Need at least HIP 5.5 to solve compiler bug affecting the kernel. - find_package(HIP 5.5.0) - set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES) +if(WITH_CYCLES_DEVICE_HIP) + if(WITH_CYCLES_HIP_BINARIES) + if(UNIX) + # Disabled until there is a HIP 5.5 release for Linux. + set(WITH_CYCLES_HIP_BINARIES OFF) + message(STATUS "HIP temporarily disabled due to compiler bugs") + else() + # Need at least HIP 5.5 to solve compiler bug affecting the kernel. 
+ find_package(HIP 5.5.0) + set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES) - if(HIP_FOUND) - message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})") + if(HIP_FOUND) + message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})") + endif() endif() endif() + + # HIP RT + if(WITH_CYCLES_DEVICE_HIP AND WITH_CYCLES_DEVICE_HIPRT) + find_package(HIPRT) + set_and_warn_library_found("HIP RT" HIPRT_FOUND WITH_CYCLES_DEVICE_HIPRT) + endif() +endif() + +if(NOT WITH_CYCLES_DEVICE_HIP) + set(WITH_CYCLES_DEVICE_HIPRT OFF) endif() if(NOT WITH_HIP_DYNLOAD) diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 127ad1d4d1e..aceaac5b2a6 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -66,6 +66,13 @@ set(SRC_HIP hip/util.h ) +set(SRC_HIPRT + hiprt/device_impl.cpp + hiprt/device_impl.h + hiprt/queue.cpp + hiprt/queue.h +) + set(SRC_ONEAPI oneapi/device_impl.cpp oneapi/device_impl.h @@ -124,6 +131,7 @@ set(SRC ${SRC_CPU} ${SRC_CUDA} ${SRC_HIP} + ${SRC_HIPRT} ${SRC_DUMMY} ${SRC_MULTI} ${SRC_OPTIX} @@ -209,6 +217,7 @@ source_group("cpu" FILES ${SRC_CPU}) source_group("cuda" FILES ${SRC_CUDA}) source_group("dummy" FILES ${SRC_DUMMY}) source_group("hip" FILES ${SRC_HIP}) +source_group("hiprt" FILES ${SRC_HIPRT}) source_group("multi" FILES ${SRC_MULTI}) source_group("metal" FILES ${SRC_METAL}) source_group("optix" FILES ${SRC_OPTIX}) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 0ed03f1e94d..f36bb7d2fda 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -14,6 +14,7 @@ #include "device/cuda/device.h" #include "device/dummy/device.h" #include "device/hip/device.h" +#include "device/hiprt/device_impl.h" #include "device/metal/device.h" #include "device/multi/device.h" #include "device/oneapi/device.h" @@ -135,6 +136,8 @@ DeviceType Device::type_from_string(const char *name) return 
DEVICE_METAL; else if (strcmp(name, "ONEAPI") == 0) return DEVICE_ONEAPI; + else if (strcmp(name, "HIPRT") == 0) + return DEVICE_HIPRT; return DEVICE_NONE; } @@ -155,6 +158,8 @@ string Device::string_from_type(DeviceType type) return "METAL"; else if (type == DEVICE_ONEAPI) return "ONEAPI"; + else if (type == DEVICE_HIPRT) + return "HIPRT"; return ""; } @@ -177,6 +182,10 @@ vector Device::available_types() #endif #ifdef WITH_ONEAPI types.push_back(DEVICE_ONEAPI); +#endif +#ifdef WITH_HIPRT + if (hiprtewInit()) + types.push_back(DEVICE_HIPRT); #endif return types; } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 1a665efcd2e..2b6b42f0822 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -40,6 +40,7 @@ enum DeviceType { DEVICE_MULTI, DEVICE_OPTIX, DEVICE_HIP, + DEVICE_HIPRT, DEVICE_METAL, DEVICE_ONEAPI, DEVICE_DUMMY, @@ -79,8 +80,7 @@ class DeviceInfo { bool has_profiling; /* Supports runtime collection of profiling info. */ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ bool has_gpu_queue; /* Device supports GPU queue. */ - bool use_hardware_raytracing; /* Use hardware ray tracing to accelerate ray queries in a backend. - */ + bool use_hardware_raytracing; /* Use hardware instructions to accelerate ray tracing. */ KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing * kernels (Metal only). */ DenoiserTypeMask denoisers; /* Supported denoiser types. 
*/ diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp index c853114cea7..b982dd39ed3 100644 --- a/intern/cycles/device/hip/device.cpp +++ b/intern/cycles/device/hip/device.cpp @@ -13,6 +13,10 @@ # include "util/windows.h" #endif /* WITH_HIP */ +#ifdef WITH_HIPRT +# include "device/hiprt/device_impl.h" +#endif + CCL_NAMESPACE_BEGIN bool device_hip_init() @@ -65,7 +69,12 @@ bool device_hip_init() Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { -#ifdef WITH_HIP +#ifdef WITH_HIPRT + if (info.use_hardware_raytracing) + return new HIPRTDevice(info, stats, profiler); + else + return new HIPDevice(info, stats, profiler); +#elif defined(WITH_HIP) return new HIPDevice(info, stats, profiler); #else (void)info; @@ -115,6 +124,12 @@ void device_hip_info(vector &devices) return; } +# ifdef WITH_HIPRT + const bool has_hardware_raytracing = hiprtewInit(); +# else + const bool has_hardware_raytracing = false; +# endif + vector display_devices; for (int num = 0; num < count; num++) { @@ -150,6 +165,8 @@ void device_hip_info(vector &devices) } } + info.use_hardware_raytracing = has_hardware_raytracing; + int pci_location[3] = {0, 0, 0}; hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num); hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num); @@ -176,6 +193,7 @@ void device_hip_info(vector &devices) VLOG_INFO << "Device has compute preemption or is not used for display."; devices.push_back(info); } + VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\"."; } diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h index 021d8d1651f..97b8ba29490 100644 --- a/intern/cycles/device/hip/device_impl.h +++ b/intern/cycles/device/hip/device_impl.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ +#pragma once + #ifdef WITH_HIP # include "device/device.h" @@ -49,9 +51,11 
@@ class HIPDevice : public GPUDevice { bool use_adaptive_compilation(); - string compile_kernel_get_common_cflags(const uint kernel_features); + virtual string compile_kernel_get_common_cflags(const uint kernel_features); - string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip"); + virtual string compile_kernel(const uint kernel_features, + const char *name, + const char *base = "hip"); virtual bool load_kernels(const uint kernel_features) override; void reserve_local_memory(const uint kernel_features); diff --git a/intern/cycles/device/hiprt/device_impl.cpp b/intern/cycles/device/hiprt/device_impl.cpp new file mode 100644 index 00000000000..beb255deba4 --- /dev/null +++ b/intern/cycles/device/hiprt/device_impl.cpp @@ -0,0 +1,1054 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2023 Blender Foundation */ + +#ifdef WITH_HIPRT + +# include "device/hiprt/device_impl.h" + +# include "util/debug.h" +# include "util/foreach.h" +# include "util/log.h" +# include "util/map.h" +# include "util/md5.h" +# include "util/path.h" +# include "util/progress.h" +# include "util/string.h" +# include "util/system.h" +# include "util/time.h" +# include "util/types.h" +# include "util/windows.h" + +# include "bvh/hiprt.h" + +# include "scene/hair.h" +# include "scene/mesh.h" +# include "scene/object.h" +# include "scene/pointcloud.h" + +CCL_NAMESPACE_BEGIN + +static void get_hiprt_transform(float matrix[][4], Transform &tfm) +{ + int row = 0; + int col = 0; + matrix[row][col++] = tfm.x.x; + matrix[row][col++] = tfm.x.y; + matrix[row][col++] = tfm.x.z; + matrix[row][col++] = tfm.x.w; + row++; + col = 0; + matrix[row][col++] = tfm.y.x; + matrix[row][col++] = tfm.y.y; + matrix[row][col++] = tfm.y.z; + matrix[row][col++] = tfm.y.w; + row++; + col = 0; + matrix[row][col++] = tfm.z.x; + matrix[row][col++] = tfm.z.y; + matrix[row][col++] = tfm.z.z; + matrix[row][col++] = tfm.z.w; +} + +class HIPRTDevice; + +BVHLayoutMask 
HIPRTDevice::get_bvh_layout_mask(const uint /* kernel_features */) const +{ + return BVH_LAYOUT_HIPRT; +} + +HIPRTDevice::HIPRTDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : HIPDevice(info, stats, profiler), + global_stack_buffer(this, "global_stack_buffer", MEM_DEVICE_ONLY), + hiprt_context(NULL), + scene(NULL), + functions_table(NULL), + scratch_buffer_size(0), + scratch_buffer(this, "scratch_buffer", MEM_DEVICE_ONLY), + visibility(this, "visibility", MEM_READ_ONLY), + instance_transform_matrix(this, "instance_transform_matrix", MEM_READ_ONLY), + transform_headers(this, "transform_headers", MEM_READ_ONLY), + user_instance_id(this, "user_instance_id", MEM_GLOBAL), + hiprt_blas_ptr(this, "hiprt_blas_ptr", MEM_READ_WRITE), + blas_ptr(this, "blas_ptr", MEM_GLOBAL), + custom_prim_info(this, "custom_prim_info", MEM_GLOBAL), + custom_prim_info_offset(this, "custom_prim_info_offset", MEM_GLOBAL), + prims_time(this, "prims_time", MEM_GLOBAL), + prim_time_offset(this, "prim_time_offset", MEM_GLOBAL) +{ + HIPContextScope scope(this); + hiprtContextCreationInput hiprt_context_input = {0}; + hiprt_context_input.ctxt = hipContext; + hiprt_context_input.device = hipDevice; + hiprt_context_input.deviceType = hiprtDeviceAMD; + hiprtError rt_result = hiprtCreateContext( + HIPRT_API_VERSION, hiprt_context_input, &hiprt_context); + + if (rt_result != hiprtSuccess) { + set_error(string_printf("Failed to create HIPRT context")); + return; + } + + rt_result = hiprtCreateFuncTable( + hiprt_context, Max_Primitive_Type, Max_Intersect_Filter_Function, &functions_table); + + if (rt_result != hiprtSuccess) { + set_error(string_printf("Failed to create HIPRT Function Table")); + return; + } +} + +HIPRTDevice::~HIPRTDevice() +{ + HIPContextScope scope(this); + user_instance_id.free(); + visibility.free(); + hiprt_blas_ptr.free(); + blas_ptr.free(); + instance_transform_matrix.free(); + transform_headers.free(); + custom_prim_info_offset.free(); + custom_prim_info.free(); 
+ prim_time_offset.free(); + prims_time.free(); + global_stack_buffer.free(); + hiprtDestroyFuncTable(hiprt_context, functions_table); + hiprtDestroyScene(hiprt_context, scene); + hiprtDestroyContext(hiprt_context); +} + +unique_ptr HIPRTDevice::gpu_queue_create() +{ + return make_unique(this); +} + +string HIPRTDevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + string cflags = HIPDevice::compile_kernel_get_common_cflags(kernel_features); + + cflags += " -D __HIPRT__ "; + + return cflags; +} + +string HIPRTDevice::compile_kernel(const uint kernel_features, const char *name, const char *base) +{ + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + hipDeviceProp_t props; + hipGetDeviceProperties(&props, hipDevId); + + char *arch = strtok(props.gcnArchName, ":"); + if (arch == NULL) { + arch = props.gcnArchName; + } + + if (!use_adaptive_compilation()) { + const string fatbin = path_get(string_printf("lib/%s_rt_gfx.hipfb", name)); + VLOG(1) << "Testing for pre-compiled kernel " << fatbin << "."; + if (path_exists(fatbin)) { + VLOG(1) << "Using precompiled kernel."; + return fatbin; + } + } + + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const string include_path = source_path; + const string bitcode_file = string_printf("cycles_%s_%s_%s.bc", name, arch, kernel_md5.c_str()); + const string bitcode = path_cache_get(path_join("kernels", bitcode_file)); + const string fatbin_file = string_printf( + "cycles_%s_%s_%s.hipfb", name, arch, kernel_md5.c_str()); + const string fatbin = path_cache_get(path_join("kernels", fatbin_file)); + + VLOG(1) << "Testing for locally compiled kernel " << fatbin << "."; + 
if (path_exists(fatbin)) { + VLOG(1) << "Using locally compiled kernel."; + return fatbin; + } + +# ifdef _WIN32 + if (!use_adaptive_compilation() && have_precompiled_kernels()) { + if (!hipSupportsDevice(hipDevId)) { + set_error( + string_printf("HIP backend requires compute capability 10.1 or up, but found %d.%d. " + "Your GPU is not supported.", + major, + minor)); + } + else { + set_error( + string_printf("HIP binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return string(); + } +# endif + + const char *const hipcc = hipewCompilerPath(); + if (hipcc == NULL) { + set_error( + "HIP hipcc compiler not found. " + "Install HIP toolkit in default location."); + return string(); + } + + const int hipcc_hip_version = hipewCompilerVersion(); + VLOG_INFO << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << "."; + if (hipcc_hip_version < 40) { + printf( + "Unsupported HIP version %d.%d detected, " + "you need HIP 4.0 or newer.\n", + hipcc_hip_version / 10, + hipcc_hip_version % 10); + return string(); + } + + path_create_directories(fatbin); + + source_path = path_join(path_join(source_path, "kernel"), + path_join("device", path_join(base, string_printf("%s.cpp", name)))); + + printf("Compiling %s and caching to %s", source_path.c_str(), fatbin.c_str()); + + double starttime = time_dt(); + + const string hiprt_path = getenv("HIPRT_ROOT_DIR"); + // First, app kernels are compiled into bitcode, without access to implementation of HIP RT + // functions + if (!path_exists(bitcode)) { + + std::string rtc_options; + + rtc_options.append(" --offload-arch=").append(arch); + rtc_options.append(" -D __HIPRT__"); + rtc_options.append(" -ffast-math -O3 -std=c++17"); + rtc_options.append(" -fgpu-rdc -c --gpu-bundle-output -c -emit-llvm"); + + string command = string_printf("%s %s -I %s -I %s %s -o \"%s\"", + hipcc, + rtc_options.c_str(), + include_path.c_str(), + hiprt_path.c_str(), + source_path.c_str(), + 
bitcode.c_str()); + + printf("Compiling %sHIP kernel ...\n%s\n", + (use_adaptive_compilation()) ? "adaptive " : "", + command.c_str()); + +# ifdef _WIN32 + command = "call " + command; +# endif + if (system(command.c_str()) != 0) { + set_error( + "Failed to execute compilation command, " + "see console for details."); + return string(); + } + } + + // After compilation, the bitcode produced is linked with HIP RT bitcode (containing + // implementations of HIP RT functions, e.g. traversal, to produce the final executable code + string linker_options; + linker_options.append(" --offload-arch=").append(arch); + linker_options.append(" -fgpu-rdc --hip-link --cuda-device-only "); + string hiprt_ver(HIPRT_VERSION_STR); + string hiprt_bc; + hiprt_bc = hiprt_path + "\\hiprt" + hiprt_ver + "_amd_lib_win.bc"; + + string linker_command = string_printf("clang++ %s \"%s\" %s -o \"%s\"", + linker_options.c_str(), + bitcode.c_str(), + hiprt_bc.c_str(), + fatbin.c_str()); + +# ifdef _WIN32 + linker_command = "call " + linker_command; +# endif + if (system(linker_command.c_str()) != 0) { + set_error( + "Failed to execute linking command, " + "see console for details."); + return string(); + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return fatbin; +} + +bool HIPRTDevice::load_kernels(const uint kernel_features) +{ + if (hipModule) { + if (use_adaptive_compilation()) { + VLOG(1) << "Skipping HIP kernel reload for adaptive compilation, not currently supported."; + } + return true; + } + + if (hipContext == 0) + return false; + + if (!support_device(kernel_features)) { + return false; + } + + /* get kernel */ + const char *kernel_name = "kernel"; + string fatbin = compile_kernel(kernel_features, kernel_name); + if (fatbin.empty()) + return false; + + /* open module */ + HIPContextScope scope(this); + + string fatbin_data; + hipError_t result; + + if (path_read_text(fatbin, fatbin_data)) { + + result = hipModuleLoadData(&hipModule, 
fatbin_data.c_str()); + } + else + result = hipErrorFileNotFound; + + if (result != hipSuccess) + set_error(string_printf( + "Failed to load HIP kernel from '%s' (%s)", fatbin.c_str(), hipewErrorString(result))); + + if (result == hipSuccess) { + kernels.load(this); + { + const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE : + (kernel_features & KERNEL_FEATURE_MNEE) ? + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE : + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE; + + HIPRTDeviceQueue queue(this); + + device_ptr d_path_index = 0; + device_ptr d_render_buffer = 0; + int d_work_size = 0; + DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size); + + queue.init_execution(); + queue.enqueue(test_kernel, 1, args); + queue.synchronize(); + } + } + + return (result == hipSuccess); +} + +void HIPRTDevice::const_copy_to(const char *name, void *host, size_t size) +{ + HIPContextScope scope(this); + hipDeviceptr_t mem; + size_t bytes; + + if (strcmp(name, "data") == 0) { + assert(size <= sizeof(KernelData)); + KernelData *const data = (KernelData *)host; + *(hiprtScene *)&data->device_bvh = scene; + } + + hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, "kernel_params")); + assert(bytes == sizeof(KernelParamsHIPRT)); + +# define KERNEL_DATA_ARRAY(data_type, data_name) \ + if (strcmp(name, #data_name) == 0) { \ + hip_assert(hipMemcpyHtoD(mem + offsetof(KernelParamsHIPRT, data_name), host, size)); \ + return; \ + } + KERNEL_DATA_ARRAY(KernelData, data) + KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state) + KERNEL_DATA_ARRAY(int, user_instance_id) + KERNEL_DATA_ARRAY(uint64_t, blas_ptr) + KERNEL_DATA_ARRAY(int2, custom_prim_info_offset) + KERNEL_DATA_ARRAY(int2, custom_prim_info) + KERNEL_DATA_ARRAY(int, prim_time_offset) + KERNEL_DATA_ARRAY(float2, prims_time) + +# include "kernel/data_arrays.h" +# undef KERNEL_DATA_ARRAY +} + +hiprtGeometryBuildInput 
HIPRTDevice::prepare_triangle_blas(BVHHIPRT *bvh, Mesh *mesh) +{ + hiprtGeometryBuildInput geom_input; + geom_input.geomType = Triangle; + + if (mesh->has_motion_blur() && + !(bvh->params.num_motion_triangle_steps == 0 || bvh->params.use_spatial_split)) { + + const Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + const size_t num_triangles = mesh->num_triangles(); + + const int num_bvh_steps = bvh->params.num_motion_triangle_steps * 2 + 1; + const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1); + + int num_bounds = 0; + bvh->custom_primitive_bound.alloc(num_triangles * num_bvh_steps); + + for (uint j = 0; j < num_triangles; j++) { + Mesh::Triangle t = mesh->get_triangle(j); + const float3 *verts = mesh->get_verts().data(); + + const size_t num_verts = mesh->get_verts().size(); + const size_t num_steps = mesh->get_motion_steps(); + const float3 *vert_steps = attr_mP->data_float3(); + + float3 prev_verts[3]; + t.motion_verts(verts, vert_steps, num_verts, num_steps, 0.0f, prev_verts); + BoundBox prev_bounds = BoundBox::empty; + prev_bounds.grow(prev_verts[0]); + prev_bounds.grow(prev_verts[1]); + prev_bounds.grow(prev_verts[2]); + + for (int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) { + const float curr_time = (float)(bvh_step)*num_bvh_steps_inv_1; + float3 curr_verts[3]; + t.motion_verts(verts, vert_steps, num_verts, num_steps, curr_time, curr_verts); + BoundBox curr_bounds = BoundBox::empty; + curr_bounds.grow(curr_verts[0]); + curr_bounds.grow(curr_verts[1]); + curr_bounds.grow(curr_verts[2]); + BoundBox bounds = prev_bounds; + bounds.grow(curr_bounds); + if (bounds.valid()) { + const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1; + bvh->custom_primitive_bound[num_bounds] = bounds; + bvh->custom_prim_info[num_bounds].x = j; + bvh->custom_prim_info[num_bounds].y = mesh->primitive_type(); + bvh->prims_time[num_bounds].x = curr_time; + bvh->prims_time[num_bounds].y = prev_time; + num_bounds++; + } + 
prev_bounds = curr_bounds; + } + } + + bvh->custom_prim_aabb.aabbCount = bvh->custom_primitive_bound.size(); + bvh->custom_prim_aabb.aabbStride = sizeof(BoundBox); + bvh->custom_primitive_bound.copy_to_device(); + bvh->custom_prim_aabb.aabbs = (void *)bvh->custom_primitive_bound.device_pointer; + + geom_input.type = hiprtPrimitiveTypeAABBList; + geom_input.aabbList.primitive = &bvh->custom_prim_aabb; + geom_input.geomType = Motion_Triangle; + } + else { + + size_t triangle_size = mesh->get_triangles().size(); + void *triangle_data = mesh->get_triangles().data(); + + size_t vertex_size = mesh->get_verts().size(); + void *vertex_data = mesh->get_verts().data(); + + bvh->triangle_mesh.triangleCount = mesh->num_triangles(); + bvh->triangle_mesh.triangleStride = 3 * sizeof(int); + bvh->triangle_mesh.vertexCount = vertex_size; + bvh->triangle_mesh.vertexStride = sizeof(float3); + + bvh->triangle_index.host_pointer = triangle_data; + bvh->triangle_index.data_elements = 1; + bvh->triangle_index.data_type = TYPE_INT; + bvh->triangle_index.data_size = triangle_size; + bvh->triangle_index.copy_to_device(); + bvh->triangle_mesh.triangleIndices = (void *)(bvh->triangle_index.device_pointer); + // either has to set the host pointer to zero, or increment the refcount on triangle_data + bvh->triangle_index.host_pointer = 0; + bvh->vertex_data.host_pointer = vertex_data; + bvh->vertex_data.data_elements = 4; + bvh->vertex_data.data_type = TYPE_FLOAT; + bvh->vertex_data.data_size = vertex_size; + bvh->vertex_data.copy_to_device(); + bvh->triangle_mesh.vertices = (void *)(bvh->vertex_data.device_pointer); + bvh->vertex_data.host_pointer = 0; + + geom_input.type = hiprtPrimitiveTypeTriangleMesh; + geom_input.triangleMesh.primitive = &(bvh->triangle_mesh); + } + return geom_input; +} + +hiprtGeometryBuildInput HIPRTDevice::prepare_curve_blas(BVHHIPRT *bvh, Hair *hair) +{ + hiprtGeometryBuildInput geom_input; + + const PrimitiveType primitive_type = hair->primitive_type(); + const 
size_t num_curves = hair->num_curves(); + const size_t num_segments = hair->num_segments(); + const Attribute *curve_attr_mP = NULL; + + if (curve_attr_mP == NULL || bvh->params.num_motion_curve_steps == 0) { + + bvh->custom_prim_info.resize(num_segments); + bvh->custom_primitive_bound.alloc(num_segments); + } + else { + size_t num_boxes = bvh->params.num_motion_curve_steps * 2 * num_segments; + bvh->custom_prim_info.resize(num_boxes); + bvh->custom_primitive_bound.alloc(num_boxes); + curve_attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + } + + int num_bounds = 0; + float3 *curve_keys = hair->get_curve_keys().data(); + + for (uint j = 0; j < num_curves; j++) { + const Hair::Curve curve = hair->get_curve(j); + const float *curve_radius = &hair->get_curve_radius()[0]; + int first_key = curve.first_key; + for (int k = 0; k < curve.num_keys - 1; k++) { + if (curve_attr_mP == NULL || bvh->params.num_motion_curve_steps == 0) { + float3 current_keys[4]; + current_keys[0] = curve_keys[max(first_key + k - 1, first_key)]; + current_keys[1] = curve_keys[first_key + k]; + current_keys[2] = curve_keys[first_key + k + 1]; + current_keys[3] = curve_keys[min(first_key + k + 2, first_key + curve.num_keys - 1)]; + + if (current_keys[0].x == current_keys[1].x && current_keys[1].x == current_keys[2].x && + current_keys[2].x == current_keys[3].x && current_keys[0].y == current_keys[1].y && + current_keys[1].y == current_keys[2].y && current_keys[2].y == current_keys[3].y && + current_keys[0].z == current_keys[1].z && current_keys[1].z == current_keys[2].z && + current_keys[2].z == current_keys[3].z) + continue; + + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(k, &hair->get_curve_keys()[0], curve_radius, bounds); + if (bounds.valid()) { + int type = PRIMITIVE_PACK_SEGMENT(primitive_type, k); + bvh->custom_prim_info[num_bounds].x = j; + bvh->custom_prim_info[num_bounds].y = type; + bvh->custom_primitive_bound[num_bounds] = bounds; + num_bounds++; + } + } + 
else { + + const int num_bvh_steps = bvh->params.num_motion_curve_steps * 2 + 1; + const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1); + const size_t num_steps = hair->get_motion_steps(); + const float3 *curve_keys = &hair->get_curve_keys()[0]; + const float4 *key_steps = curve_attr_mP->data_float4(); + const size_t num_keys = hair->get_curve_keys().size(); + + float4 prev_keys[4]; + curve.cardinal_motion_keys(curve_keys, + curve_radius, + key_steps, + num_keys, + num_steps, + 0.0f, + k - 1, + k, + k + 1, + k + 2, + prev_keys); + BoundBox prev_bounds = BoundBox::empty; + curve.bounds_grow(prev_keys, prev_bounds); + + for (int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) { + const float curr_time = (float)(bvh_step)*num_bvh_steps_inv_1; + float4 curr_keys[4]; + curve.cardinal_motion_keys(curve_keys, + curve_radius, + key_steps, + num_keys, + num_steps, + curr_time, + k - 1, + k, + k + 1, + k + 2, + curr_keys); + BoundBox curr_bounds = BoundBox::empty; + curve.bounds_grow(curr_keys, curr_bounds); + BoundBox bounds = prev_bounds; + bounds.grow(curr_bounds); + if (bounds.valid()) { + const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1; + int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k); + bvh->custom_prim_info[num_bounds].x = j; + bvh->custom_prim_info[num_bounds].y = packed_type; // k + bvh->custom_primitive_bound[num_bounds] = bounds; + bvh->prims_time[num_bounds].x = curr_time; + bvh->prims_time[num_bounds].y = prev_time; + num_bounds++; + } + prev_bounds = curr_bounds; + } + } + } + } + + bvh->custom_prim_aabb.aabbCount = num_bounds; + bvh->custom_prim_aabb.aabbStride = sizeof(BoundBox); + bvh->custom_primitive_bound.copy_to_device(); + bvh->custom_prim_aabb.aabbs = (void *)bvh->custom_primitive_bound.device_pointer; + + geom_input.type = hiprtPrimitiveTypeAABBList; + geom_input.aabbList.primitive = &bvh->custom_prim_aabb; + geom_input.geomType = Curve; + + return geom_input; +} + +hiprtGeometryBuildInput 
HIPRTDevice::prepare_point_blas(BVHHIPRT *bvh, PointCloud *pointcloud) +{ + hiprtGeometryBuildInput geom_input; + + const Attribute *point_attr_mP = NULL; + if (pointcloud->has_motion_blur()) { + point_attr_mP = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + } + + const float3 *points_data = pointcloud->get_points().data(); + const float *radius_data = pointcloud->get_radius().data(); + const size_t num_points = pointcloud->num_points(); + const float3 *motion_data = (point_attr_mP) ? point_attr_mP->data_float3() : NULL; + const size_t num_steps = pointcloud->get_motion_steps(); + + int num_bounds = 0; + + if (point_attr_mP == NULL) { + bvh->custom_primitive_bound.alloc(num_points); + for (uint j = 0; j < num_points; j++) { + const PointCloud::Point point = pointcloud->get_point(j); + BoundBox bounds = BoundBox::empty; + point.bounds_grow(points_data, radius_data, bounds); + if (bounds.valid()) { + bvh->custom_primitive_bound[num_bounds] = bounds; + bvh->custom_prim_info[num_bounds].x = j; + bvh->custom_prim_info[num_bounds].y = PRIMITIVE_POINT; + num_bounds++; + } + } + } + else if (bvh->params.num_motion_point_steps == 0) { + + bvh->custom_primitive_bound.alloc(num_points * num_steps); + + for (uint j = 0; j < num_points; j++) { + const PointCloud::Point point = pointcloud->get_point(j); + BoundBox bounds = BoundBox::empty; + point.bounds_grow(points_data, radius_data, bounds); + for (size_t step = 0; step < num_steps - 1; step++) { + point.bounds_grow(motion_data + step * num_points, radius_data, bounds); + } + if (bounds.valid()) { + bvh->custom_primitive_bound[num_bounds] = bounds; + bvh->custom_prim_info[num_bounds].x = j; + bvh->custom_prim_info[num_bounds].y = PRIMITIVE_POINT; + num_bounds++; + } + } + } + else { + + const int num_bvh_steps = bvh->params.num_motion_point_steps * 2 + 1; + const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1); + + bvh->custom_primitive_bound.alloc(num_points * num_bvh_steps); + + for (uint j = 0; j < 
num_points; j++) { + const PointCloud::Point point = pointcloud->get_point(j); + const size_t num_steps = pointcloud->get_motion_steps(); + const float3 *point_steps = point_attr_mP->data_float3(); + + float4 prev_key = point.motion_key( + points_data, radius_data, point_steps, num_points, num_steps, 0.0f, j); + BoundBox prev_bounds = BoundBox::empty; + point.bounds_grow(prev_key, prev_bounds); + + for (int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) { + const float curr_time = (float)(bvh_step)*num_bvh_steps_inv_1; + float4 curr_key = point.motion_key( + points_data, radius_data, point_steps, num_points, num_steps, curr_time, j); + BoundBox curr_bounds = BoundBox::empty; + point.bounds_grow(curr_key, curr_bounds); + BoundBox bounds = prev_bounds; + bounds.grow(curr_bounds); + if (bounds.valid()) { + const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1; + bvh->custom_primitive_bound[num_bounds] = bounds; + bvh->custom_prim_info[num_bounds].x = j; + bvh->custom_prim_info[num_bounds].y = PRIMITIVE_MOTION_POINT; + bvh->prims_time[num_bounds].x = curr_time; + bvh->prims_time[num_bounds].y = prev_time; + num_bounds++; + } + prev_bounds = curr_bounds; + } + } + } + + bvh->custom_prim_aabb.aabbCount = bvh->custom_primitive_bound.size(); + bvh->custom_prim_aabb.aabbStride = sizeof(BoundBox); + bvh->custom_primitive_bound.copy_to_device(); + bvh->custom_prim_aabb.aabbs = (void *)bvh->custom_primitive_bound.device_pointer; + + geom_input.type = hiprtPrimitiveTypeAABBList; + geom_input.aabbList.primitive = &bvh->custom_prim_aabb; + geom_input.geomType = Point; + + return geom_input; +} + +void HIPRTDevice::build_blas(BVHHIPRT *bvh, Geometry *geom, hiprtBuildOptions options) +{ + hiprtGeometryBuildInput geom_input = {}; + + switch (geom->geometry_type) { + case Geometry::MESH: + case Geometry::VOLUME: { + Mesh *mesh = static_cast(geom); + + if (mesh->num_triangles() == 0) + return; + + geom_input = prepare_triangle_blas(bvh, mesh); + break; + } + + case 
Geometry::HAIR: { + Hair *const hair = static_cast(geom); + + if (hair->num_segments() == 0) + return; + + geom_input = prepare_curve_blas(bvh, hair); + break; + } + + case Geometry::POINTCLOUD: { + PointCloud *pointcloud = static_cast(geom); + if (pointcloud->num_points() == 0) + return; + + geom_input = prepare_point_blas(bvh, pointcloud); + break; + } + + default: + assert(geom_input.geomType != hiprtInvalidValue); + } + + size_t blas_scratch_buffer_size = 0; + hiprtError rt_err = hiprtGetGeometryBuildTemporaryBufferSize( + hiprt_context, &geom_input, &options, &blas_scratch_buffer_size); + + if (rt_err != hiprtSuccess) { + set_error(string_printf("Failed to get scratch buffer size for BLAS!")); + } + + rt_err = hiprtCreateGeometry(hiprt_context, &geom_input, &options, &bvh->hiprt_geom); + + if (rt_err != hiprtSuccess) { + set_error(string_printf("Failed to create BLAS!")); + } + bvh->geom_input = geom_input; + { + thread_scoped_lock lock(hiprt_mutex); + if (blas_scratch_buffer_size > scratch_buffer_size) { + scratch_buffer.alloc(blas_scratch_buffer_size); + scratch_buffer_size = blas_scratch_buffer_size; + scratch_buffer.zero_to_device(); + } + rt_err = hiprtBuildGeometry(hiprt_context, + hiprtBuildOperationBuild, + &bvh->geom_input, + &options, + (void *)(scratch_buffer.device_pointer), + 0, + bvh->hiprt_geom); + } + if (rt_err != hiprtSuccess) { + set_error(string_printf("Failed to build BLAS")); + } +} + +hiprtScene HIPRTDevice::build_tlas(BVHHIPRT *bvh, + vector objects, + hiprtBuildOptions options, + bool refit) +{ + hiprtBuildOperation build_operation = refit ? 
hiprtBuildOperationUpdate : + hiprtBuildOperationBuild; + + array transform_matrix; + + unordered_map prim_info_map; + size_t custom_prim_offset = 0; + + unordered_map prim_time_map; + + size_t num_instances = 0; + int blender_instance_id = 0; + + size_t num_object = objects.size(); + user_instance_id.alloc(num_object); + visibility.alloc(num_object); + hiprt_blas_ptr.alloc(num_object); + blas_ptr.alloc(num_object); + transform_headers.alloc(num_object); + custom_prim_info_offset.alloc(num_object); + prim_time_offset.alloc(num_object); + + foreach (Object *ob, objects) { + uint32_t mask = 0; + if (ob->is_traceable()) { + mask = ob->visibility_for_tracing(); + } + + Transform current_transform = ob->get_tfm(); + Geometry *geom = ob->get_geometry(); + bool transform_applied = geom->transform_applied; + + BVHHIPRT *current_bvh = static_cast(geom->bvh); + bool is_valid_geometry = current_bvh->geom_input.geomType != hiprtInvalidValue; + hiprtGeometry hiprt_geom_current = current_bvh->hiprt_geom; + + hiprtFrameMatrix hiprt_transform_matrix = {{{0}}}; + Transform identity_matrix = transform_identity(); + get_hiprt_transform(hiprt_transform_matrix.matrix, identity_matrix); + + if (is_valid_geometry) { + bool is_custom_prim = current_bvh->custom_prim_info.size() > 0; + + if (is_custom_prim) { + + bool has_motion_blur = current_bvh->prims_time.size() > 0; + + unordered_map::iterator it = prim_info_map.find(geom); + + if (prim_info_map.find(geom) != prim_info_map.end()) { + + custom_prim_info_offset[blender_instance_id] = it->second; + + if (has_motion_blur) { + + prim_time_offset[blender_instance_id] = prim_time_map[geom]; + } + } + else { + int offset = bvh->custom_prim_info.size(); + + prim_info_map[geom].x = offset; + prim_info_map[geom].y = custom_prim_offset; + + bvh->custom_prim_info.resize(offset + current_bvh->custom_prim_info.size()); + memcpy(bvh->custom_prim_info.data() + offset, + current_bvh->custom_prim_info.data(), + current_bvh->custom_prim_info.size() * 
sizeof(int2)); + + custom_prim_info_offset[blender_instance_id].x = offset; + custom_prim_info_offset[blender_instance_id].y = custom_prim_offset; + + if (geom->geometry_type == Geometry::HAIR) { + custom_prim_offset += ((Hair *)geom)->num_curves(); + } + else if (geom->geometry_type == Geometry::POINTCLOUD) { + custom_prim_offset += ((PointCloud *)geom)->num_points(); + } + else { + custom_prim_offset += ((Mesh *)geom)->num_triangles(); + } + + if (has_motion_blur) { + int time_offset = bvh->prims_time.size(); + prim_time_map[geom] = time_offset; + + memcpy(bvh->prims_time.data() + time_offset, + current_bvh->prims_time.data(), + current_bvh->prims_time.size() * sizeof(float2)); + + prim_time_offset[blender_instance_id] = time_offset; + } + else + prim_time_offset[blender_instance_id] = -1; + } + } + else + custom_prim_info_offset[blender_instance_id] = {-1, -1}; + + hiprtTransformHeader current_header = {0}; + current_header.frameCount = 1; + current_header.frameIndex = transform_matrix.size(); + if (ob->get_motion().size()) { + int motion_size = ob->get_motion().size(); + assert(motion_size == 1); + + array tfm_array = ob->get_motion(); + float time_iternval = 1 / (float)(motion_size - 1); + current_header.frameCount = motion_size; + + vector tfm_hiprt_mb; + tfm_hiprt_mb.resize(motion_size); + for (int i = 0; i < motion_size; i++) { + get_hiprt_transform(tfm_hiprt_mb[i].matrix, tfm_array[i]); + tfm_hiprt_mb[i].time = (float)i * time_iternval; + transform_matrix.push_back_slow(tfm_hiprt_mb[i]); + } + } + else { + if (transform_applied) + current_transform = identity_matrix; + get_hiprt_transform(hiprt_transform_matrix.matrix, current_transform); + transform_matrix.push_back_slow(hiprt_transform_matrix); + } + + transform_headers[num_instances] = current_header; + + user_instance_id[num_instances] = blender_instance_id; + visibility[num_instances] = mask; + hiprt_blas_ptr[num_instances] = (uint64_t)hiprt_geom_current; + num_instances++; + } + 
blas_ptr[blender_instance_id] = (uint64_t)hiprt_geom_current; + blender_instance_id++; + } + + int frame_count = transform_matrix.size(); + hiprtSceneBuildInput scene_input_ptr = {0}; + scene_input_ptr.instanceCount = num_instances; + scene_input_ptr.frameCount = frame_count; + scene_input_ptr.frameType = hiprtFrameTypeMatrix; + + user_instance_id.copy_to_device(); + visibility.copy_to_device(); + hiprt_blas_ptr.copy_to_device(); + blas_ptr.copy_to_device(); + transform_headers.copy_to_device(); + { + instance_transform_matrix.alloc(frame_count); + instance_transform_matrix.host_pointer = transform_matrix.data(); + instance_transform_matrix.data_elements = sizeof(hiprtFrameMatrix); + instance_transform_matrix.data_type = TYPE_UCHAR; + instance_transform_matrix.data_size = frame_count; + instance_transform_matrix.copy_to_device(); + instance_transform_matrix.host_pointer = 0; + } + + scene_input_ptr.instanceMasks = (void *)visibility.device_pointer; + scene_input_ptr.instanceGeometries = (void *)hiprt_blas_ptr.device_pointer; + scene_input_ptr.instanceTransformHeaders = (void *)transform_headers.device_pointer; + scene_input_ptr.instanceFrames = (void *)instance_transform_matrix.device_pointer; + + hiprtScene scene = 0; + + hiprtError rt_err = hiprtCreateScene(hiprt_context, &scene_input_ptr, &options, &scene); + + if (rt_err != hiprtSuccess) { + set_error(string_printf("Failed to create TLAS")); + } + + size_t tlas_scratch_buffer_size; + rt_err = hiprtGetSceneBuildTemporaryBufferSize( + hiprt_context, &scene_input_ptr, &options, &tlas_scratch_buffer_size); + + if (rt_err != hiprtSuccess) { + set_error(string_printf("Failed to get scratch buffer size for TLAS")); + } + + if (tlas_scratch_buffer_size > scratch_buffer_size) { + scratch_buffer.alloc(tlas_scratch_buffer_size); + scratch_buffer.zero_to_device(); + } + + rt_err = hiprtBuildScene(hiprt_context, + build_operation, + &scene_input_ptr, + &options, + (void *)scratch_buffer.device_pointer, + 0, + scene); + + if 
(rt_err != hiprtSuccess) { + set_error(string_printf("Failed to build TLAS")); + } + + scratch_buffer.free(); + scratch_buffer_size = 0; + + if (bvh->custom_prim_info.size()) { + size_t data_size = bvh->custom_prim_info.size(); + custom_prim_info.alloc(data_size); + custom_prim_info.host_pointer = bvh->custom_prim_info.data(); + custom_prim_info.data_elements = 2; + custom_prim_info.data_type = TYPE_INT; + custom_prim_info.data_size = data_size; + custom_prim_info.copy_to_device(); + custom_prim_info.host_pointer = 0; + + custom_prim_info_offset.copy_to_device(); + } + + if (bvh->prims_time.size()) { + size_t data_size = bvh->prims_time.size(); + prims_time.alloc(data_size); + prims_time.host_pointer = bvh->prims_time.data(); + prims_time.data_elements = 2; + prims_time.data_type = TYPE_FLOAT; + prims_time.data_size = data_size; + prims_time.copy_to_device(); + prims_time.host_pointer = 0; + + prim_time_offset.copy_to_device(); + } + + size_t table_ptr_size = 0; + hipDeviceptr_t table_device_ptr; + + hip_assert(hipModuleGetGlobal(&table_device_ptr, &table_ptr_size, hipModule, "kernel_params")); + + size_t kernel_param_offset[4]; + int table_index = 0; + kernel_param_offset[table_index++] = offsetof(KernelParamsHIPRT, table_closest_intersect); + kernel_param_offset[table_index++] = offsetof(KernelParamsHIPRT, table_shadow_intersect); + kernel_param_offset[table_index++] = offsetof(KernelParamsHIPRT, table_local_intersect); + kernel_param_offset[table_index++] = offsetof(KernelParamsHIPRT, table_volume_intersect); + + for (int index = 0; index < table_index; index++) { + + hip_assert(hipMemcpyHtoD( + table_device_ptr + kernel_param_offset[index], &functions_table, sizeof(device_ptr))); + } + + return scene; +} + +void HIPRTDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ + progress.set_substatus("Building HIPRT acceleration structure"); + + hiprtBuildOptions options; + options.buildFlags = hiprtBuildFlagBitPreferHighQualityBuild; + + BVHHIPRT *bvh_rt = 
static_cast(bvh); + HIPContextScope scope(this); + + if (!bvh_rt->is_tlas()) { + vector geometry = bvh_rt->geometry; + assert(geometry.size() == 1); + Geometry *geom = geometry[0]; + build_blas(bvh_rt, geom, options); + } + else { + + const vector objects = bvh_rt->objects; + scene = build_tlas(bvh_rt, objects, options, refit); + } +} +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hiprt/device_impl.h b/intern/cycles/device/hiprt/device_impl.h new file mode 100644 index 00000000000..b5005cbccb5 --- /dev/null +++ b/intern/cycles/device/hiprt/device_impl.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2023 Blender Foundation */ + +#pragma once + +#ifdef WITH_HIPRT + +# include "device/hip/device_impl.h" +# include "device/hip/kernel.h" +# include "device/hip/queue.h" +# include "device/hiprt/queue.h" + +# ifdef WITH_HIP_DYNLOAD +# include "hiprtew.h" +# else +# include +# endif + +# include "kernel/device/hiprt/globals.h" + +CCL_NAMESPACE_BEGIN + +class Mesh; +class Hair; +class PointCloud; +class Geometry; +class Object; +class BVHHIPRT; + +class HIPRTDevice : public HIPDevice { + + public: + virtual BVHLayoutMask get_bvh_layout_mask(const uint kernel_features) const override; + + HIPRTDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + + virtual ~HIPRTDevice(); + virtual unique_ptr gpu_queue_create() override; + string compile_kernel_get_common_cflags(const uint kernel_features) override; + virtual string compile_kernel(const uint kernel_features, + const char *name, + const char *base = "hiprt") override; + + virtual bool load_kernels(const uint kernel_features) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + hiprtContext get_hiprt_context() + { + return hiprt_context; + } + + device_vector global_stack_buffer; + + protected: + enum Filter_Function { Closest = 0, Shadows, 
Local, Volume, Max_Intersect_Filter_Function }; + enum Primitive_Type { Triangle = 0, Curve, Motion_Triangle, Point, Max_Primitive_Type }; + + hiprtGeometryBuildInput prepare_triangle_blas(BVHHIPRT *bvh, Mesh *mesh); + hiprtGeometryBuildInput prepare_curve_blas(BVHHIPRT *bvh, Hair *hair); + hiprtGeometryBuildInput prepare_point_blas(BVHHIPRT *bvh, PointCloud *pointcloud); + void build_blas(BVHHIPRT *bvh, Geometry *geom, hiprtBuildOptions options); + hiprtScene build_tlas(BVHHIPRT *bvh, + vector objects, + hiprtBuildOptions options, + bool refit); + + hiprtContext hiprt_context; + hiprtScene scene; + hiprtFuncTable functions_table; + + thread_mutex hiprt_mutex; + size_t scratch_buffer_size; + device_vector scratch_buffer; + + /* The following vectors are to transfer scene information available on the host to the GPU + * visibility, instance_transform_matrix, transform_headers, and hiprt_blas_ptr are passed to + * hiprt to build bvh the rest are directly used in traversal functions/intersection kernels and + * are defined on the GPU side as members of KernelParamsHIPRT struct the host memory is copied + * to GPU through const_copy_to() function. */ + + device_vector visibility; + + /* instance_transform_matrix passes transform matrix of instances converted from Cycles Transform + * format to instanceFrames member of hiprtSceneBuildInput. */ + device_vector instance_transform_matrix; + /* Movement over a time interval for motion blur is captured through multiple transform matrices. + * In this case transform matrix of an instance cannot be directly retrieved by looking up + * instance_transform_matrix give the instance id. transform_headers maps the instance id to the + * appropriate index to retrieve instance transform matrix (frameIndex member of + * hiprtTransformHeader). 
transform_headers also has the information on how many transform + * matrices are associated with an instance (frameCount member of hiprtTransformHeader) + * transform_headers is passed to hiprt through instanceTransformHeaders member of + * hiprtSceneBuildInput. */ + device_vector transform_headers; + + /* Instance/object ids are not explicitly passed to hiprt. + * HIP RT assigns the ids based on the order blas pointers are passed to it (through + * instanceGeometries member of hiprtSceneBuildInput). If blas is absent for a particular + * geometry (e.g. a plane), HIP RT removes that entry and in scenes with objects with no blas, + * the instance id that hiprt returns for a hit point will not necessarily match the instance id + * of the application. user_instance_id provides a map for retrieving original instance id from + * what HIP RT returns as instance id. hiprt_blas_ptr is the list of all the valid blas pointers. + * blas_ptr has all the valid pointers and null pointers and blas for any geometry can be + * directly retrieved from this array (used in subsurface scattering). */ + device_vector user_instance_id; + device_vector hiprt_blas_ptr; + device_vector blas_ptr; + + /* custom_prim_info stores custom information for custom primitives for all the primitives in a + * scene. Primitive id that HIP RT returns is local to the geometry that was hit. + * custom_prim_info_offset returns the offset required to add to the primitive id to retrieve + * primitive info from custom_prim_info. */ + device_vector custom_prim_info; + device_vector custom_prim_info_offset; + + /* prims_time stores primitive time for geometries with motion blur. + * prim_time_offset returns the offset to add to primitive id to retrieve primitive time. 
*/ + device_vector prims_time; + device_vector prim_time_offset; +}; +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hiprt/queue.cpp b/intern/cycles/device/hiprt/queue.cpp new file mode 100644 index 00000000000..28cc6673c87 --- /dev/null +++ b/intern/cycles/device/hiprt/queue.cpp @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#ifdef WITH_HIPRT + +# include "device/hiprt/queue.h" + +# include "device/hip/graphics_interop.h" +# include "device/hip/kernel.h" +# include "device/hiprt/device_impl.h" + +CCL_NAMESPACE_BEGIN + +HIPRTDeviceQueue::HIPRTDeviceQueue(HIPRTDevice *device) + : HIPDeviceQueue((HIPDevice *)device), hiprt_device_(device) +{ +} + +bool HIPRTDeviceQueue::enqueue(DeviceKernel kernel, + const int work_size, + DeviceKernelArguments const &args) +{ + if (hiprt_device_->have_error()) { + return false; + } + + if (!device_kernel_has_intersection(kernel)) { + return HIPDeviceQueue::enqueue(kernel, work_size, args); + } + + debug_enqueue_begin(kernel, work_size); + + const HIPContextScope scope(hiprt_device_); + const HIPDeviceKernel &hip_kernel = hiprt_device_->kernels.get(kernel); + + if (!hiprt_device_->global_stack_buffer.device_pointer) { + int max_path = num_concurrent_states(0); + hiprt_device_->global_stack_buffer.alloc(max_path * HIPRT_SHARED_STACK_SIZE * sizeof(int)); + hiprt_device_->global_stack_buffer.zero_to_device(); + } + + DeviceKernelArguments args_copy = args; + args_copy.add(&hiprt_device_->global_stack_buffer.device_pointer); + + /* Compute kernel launch parameters. 
*/ + const int num_threads_per_block = HIPRT_THREAD_GROUP_SIZE; + const int num_blocks = divide_up(work_size, num_threads_per_block); + int shared_mem_bytes = 0; + + assert_success(hipModuleLaunchKernel(hip_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + hip_stream_, + const_cast(args_copy.values), + 0), + "enqueue"); + + return !(hiprt_device_->have_error()); +} + +CCL_NAMESPACE_END + +#endif /* WITH_HIPRT */ diff --git a/intern/cycles/device/hiprt/queue.h b/intern/cycles/device/hiprt/queue.h new file mode 100644 index 00000000000..1318e2c15f0 --- /dev/null +++ b/intern/cycles/device/hiprt/queue.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +#ifdef WITH_HIPRT + +# include "device/kernel.h" +# include "device/memory.h" +# include "device/queue.h" + +# include "device/hip/queue.h" +# include "device/hip/util.h" + +CCL_NAMESPACE_BEGIN + +class HIPRTDevice; + +class HIPRTDeviceQueue : public HIPDeviceQueue { + public: + HIPRTDeviceQueue(HIPRTDevice *device); + ~HIPRTDeviceQueue() {} + virtual bool enqueue(DeviceKernel kernel, + const int work_size, + DeviceKernelArguments const &args) override; + + protected: + HIPRTDevice *hiprt_device_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_HIPRT */ diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp index aae812604eb..fb5534ab9cc 100644 --- a/intern/cycles/device/multi/device.cpp +++ b/intern/cycles/device/multi/device.cpp @@ -117,6 +117,10 @@ class MultiDevice : public Device { return BVH_LAYOUT_MULTI_METAL; } + if (bvh_layout_mask == BVH_LAYOUT_HIPRT) { + return BVH_LAYOUT_MULTI_HIPRT; + } + /* When devices do not share a common BVH layout, fall back to creating one for each */ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { @@ -158,8 +162,10 
@@ class MultiDevice : public Device { assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_HIPRT || bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE); + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_HIPRT_EMBREE); BVHMulti *const bvh_multi = static_cast(bvh); bvh_multi->sub_bvhs.resize(devices.size()); @@ -184,12 +190,17 @@ class MultiDevice : public Device { params.bvh_layout = BVH_LAYOUT_OPTIX; else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL) params.bvh_layout = BVH_LAYOUT_METAL; + else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_HIPRT) + params.bvh_layout = BVH_LAYOUT_HIPRT; else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX : BVH_LAYOUT_EMBREE; else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE) params.bvh_layout = sub.device->info.type == DEVICE_METAL ? BVH_LAYOUT_METAL : BVH_LAYOUT_EMBREE; + else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_HIPRT_EMBREE) + params.bvh_layout = sub.device->info.type == DEVICE_HIPRT ? 
BVH_LAYOUT_HIPRT : + BVH_LAYOUT_EMBREE; /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree * (since they are put into the top level directly, see bvh_embree.cpp) */ diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index da2e38730e2..4812ff2614d 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -1149,6 +1149,8 @@ static const char *device_type_for_description(const DeviceType type) return "OptiX"; case DEVICE_HIP: return "HIP"; + case DEVICE_HIPRT: + return "HIPRT"; case DEVICE_ONEAPI: return "oneAPI"; case DEVICE_DUMMY: diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 71466ace54c..b22146eaf83 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -26,6 +26,10 @@ set(SRC_KERNEL_DEVICE_HIP device/hip/kernel.cpp ) +set(SRC_KERNEL_DEVICE_HIPRT + device/hiprt/kernel.cpp +) + set(SRC_KERNEL_DEVICE_METAL device/metal/kernel.metal ) @@ -77,6 +81,13 @@ set(SRC_KERNEL_DEVICE_HIP_HEADERS device/hip/globals.h ) +set(SRC_KERNEL_DEVICE_HIPRT_HEADERS + device/hiprt/bvh.h + device/hiprt/common.h + device/hiprt/globals.h + device/hiprt/hiprt_kernels.h +) + set(SRC_KERNEL_DEVICE_OPTIX_HEADERS device/optix/bvh.h device/optix/compat.h @@ -643,6 +654,86 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP) cycles_set_solution_folder(cycles_kernel_hip) endif() +# HIP RT module + +if(WITH_CYCLES_DEVICE_HIPRT AND WITH_CYCLES_HIP_BINARIES) + set(hiprt_sources device/hiprt/kernel.cpp + ${SRC_KERNEL_HEADERS} + ${SRC_KERNEL_DEVICE_GPU_HEADERS} + ${SRC_KERNEL_DEVICE_HIPRT_HEADERS} + ${SRC_UTIL_HEADERS}) + set(bitcode_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.bc) + set(hiprt_file ${CMAKE_CURRENT_BINARY_DIR}/kernel_rt_gfx.hipfb) + set(kernel_sources ${sources}) + set(hiprt_kernel_src "/device/hiprt/kernel.cpp") + if(WIN32) + set(hiprt_compile_command ${CMAKE_COMMAND}) 
+ set(hiprt_compile_flags + -E env "HIP_PATH=${HIP_ROOT_DIR}" + ${HIP_HIPCC_EXECUTABLE}.bat) + else() + set(hiprt_compile_command ${HIP_HIPCC_EXECUTABLE}) + set(hiprt_compile_flags) + endif() + set(target_gpus) + foreach(arch ${CYCLES_HIP_BINARIES_ARCH}) + list(APPEND target_gpus "--offload-arch=${arch}") + endforeach() + set(hiprt_compile_flags + ${hiprt_compile_flags} + ${target_gpus} + ${HIP_HIPCC_FLAGS} + ${CMAKE_CURRENT_SOURCE_DIR}${hiprt_kernel_src} + ${flags} + -D CCL_NAMESPACE_BEGIN= + -D CCL_NAMESPACE_END= + -D HIPCC + -D __HIPRT__ + -std=c++17 + -fgpu-rdc + -c + --gpu-bundle-output + -emit-llvm + -I ${CMAKE_CURRENT_SOURCE_DIR}/.. + -I ${CMAKE_CURRENT_SOURCE_DIR}/device/hiprt + -I ${HIPRT_INCLUDE_DIR} + -Wno-parentheses-equality + -Wno-unused-value + --hipcc-func-supp + -ffast-math + -o ${bitcode_file}) + if(WITH_CYCLES_DEBUG) + set(hiprt_compile_flags ${hiprt_compile_flags} -D WITH_CYCLES_DEBUG) + endif() + add_custom_command( + OUTPUT ${bitcode_file} + COMMAND ${hiprt_compile_command} ${hiprt_compile_flags} + DEPENDS ${kernel_sources}) + if(WIN32) + set(hiprt_link_command ${CMAKE_COMMAND}) + set(hiprt_link_flags -E env "HIP_PATH=${HIP_ROOT_DIR}" + ${HIP_LINKER_EXECUTABLE}) + else() + # not implemented yet + endif() + set(hiprt_link_flags + ${hiprt_link_flags} + ${target_gpus} + -fgpu-rdc + --hip-link + --cuda-device-only + ${bitcode_file} + ${HIPRT_BITCODE} + -o ${hiprt_file}) + add_custom_command( + OUTPUT ${hiprt_file} + COMMAND ${hiprt_link_command} ${hiprt_link_flags} + DEPENDS ${bitcode_file}) + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${hiprt_file}" ${CYCLES_INSTALL_PATH}/lib) + add_custom_target(cycles_kernel_hiprt ALL DEPENDS ${hiprt_file}) + cycles_set_solution_folder(cycles_kernel_hiprt) +endif() + # OptiX PTX modules if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) @@ -1033,6 +1124,7 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_KERNEL_DEVICE_CPU} ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_HIP} + 
${SRC_KERNEL_DEVICE_HIPRT} ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_HEADERS} @@ -1040,6 +1132,7 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_KERNEL_DEVICE_GPU_HEADERS} ${SRC_KERNEL_DEVICE_CUDA_HEADERS} ${SRC_KERNEL_DEVICE_HIP_HEADERS} + ${SRC_KERNEL_DEVICE_HIPRT_HEADERS} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS} ${SRC_KERNEL_DEVICE_METAL_HEADERS} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS} @@ -1053,6 +1146,7 @@ source_group("device\\cpu" FILES ${SRC_KERNEL_DEVICE_CPU} ${SRC_KERNEL_DEVICE_CP source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_CUDA_HEADERS}) source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS}) source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS}) +source_group("device\\hiprt" FILES ${SRC_KERNEL_DEVICE_HIPRT} ${SRC_KERNEL_DEVICE_HIPRT_HEADERS}) source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}) source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS}) source_group("device\\oneapi" FILES ${SRC_KERNEL_DEVICE_ONEAPI} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}) @@ -1090,6 +1184,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_CUDA_HEADERS}" delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIPRT}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hiprt) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIPRT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hiprt) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" 
${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal) diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index ceee9aa39fb..b39731bd0ad 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -17,6 +17,8 @@ # include "kernel/device/metal/bvh.h" #elif defined(__KERNEL_OPTIX__) # include "kernel/device/optix/bvh.h" +#elif defined(__HIPRT__) +# include "kernel/device/hiprt/bvh.h" #else # define __BVH2__ #endif diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index b4c84fd0f0b..fcb99328950 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -48,6 +48,9 @@ #include "kernel/film/read.h" +#if defined(__HIPRT__) +# include "kernel/device/hiprt/hiprt_kernels.h" +#endif /* -------------------------------------------------------------------- * Integrator. */ @@ -128,11 +131,13 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel_postfix +#if !defined(__HIPRT__) + /* Intersection kernels need access to the kernel handler for specialization constants to work * properly. 
*/ -#ifdef __KERNEL_ONEAPI__ -# include "kernel/device/oneapi/context_intersect_begin.h" -#endif +# ifdef __KERNEL_ONEAPI__ +# include "kernel/device/oneapi/context_intersect_begin.h" +# endif ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel_signature(integrator_intersect_closest, @@ -191,8 +196,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel_postfix -#ifdef __KERNEL_ONEAPI__ -# include "kernel/device/oneapi/context_intersect_end.h" +# ifdef __KERNEL_ONEAPI__ +# include "kernel/device/oneapi/context_intersect_end.h" +# endif + #endif ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) @@ -259,11 +266,13 @@ ccl_gpu_kernel_postfix constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]]; #endif +#if !defined(__HIPRT__) + /* Kernels using intersections need access to the kernel handler for specialization constants to * work properly. */ -#ifdef __KERNEL_ONEAPI__ -# include "kernel/device/oneapi/context_intersect_begin.h" -#endif +# ifdef __KERNEL_ONEAPI__ +# include "kernel/device/oneapi/context_intersect_begin.h" +# endif ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel_signature(integrator_shade_surface_raytrace, @@ -276,15 +285,15 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (ccl_gpu_kernel_within_bounds(global_index, work_size)) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; -#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__) +# if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__) KernelGlobals kg = NULL; /* Workaround Ambient Occlusion and Bevel nodes not working with Metal. * Dummy offset should not affect result, but somehow fixes bug! 
*/ kg += __dummy_constant; ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer)); -#else +# else ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer)); -#endif +# endif } } ccl_gpu_kernel_postfix @@ -303,8 +312,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } } ccl_gpu_kernel_postfix -#ifdef __KERNEL_ONEAPI__ -# include "kernel/device/oneapi/context_intersect_end.h" + +# ifdef __KERNEL_ONEAPI__ +# include "kernel/device/oneapi/context_intersect_end.h" +# endif + #endif ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h index 6b1a9464b34..7239acb2a71 100644 --- a/intern/cycles/kernel/device/hip/config.h +++ b/intern/cycles/kernel/device/hip/config.h @@ -20,6 +20,10 @@ #define GPU_KERNEL_BLOCK_NUM_THREADS 1024 #define GPU_KERNEL_MAX_REGISTERS 64 +/* For performance tuning of hiprt kernels we might have to change the number + * that's why we don't use GPU_KERNEL_BLOCK_NUM_THREADS. */ +#define GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS 1024 + /* Compute number of threads per block and minimum blocks per multiprocessor * given the maximum number of registers per thread. 
*/ #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ diff --git a/intern/cycles/kernel/device/hiprt/bvh.h b/intern/cycles/kernel/device/hiprt/bvh.h new file mode 100644 index 00000000000..2c7daa2f2fa --- /dev/null +++ b/intern/cycles/kernel/device/hiprt/bvh.h @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +#include "kernel/device/hiprt/common.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline bool scene_intersect_valid(ccl_private const Ray *ray) +{ + return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f; +} + +ccl_device_intersect bool scene_intersect(KernelGlobals kg, + ccl_private const Ray *ray, + const uint visibility, + ccl_private Intersection *isect) +{ + isect->t = ray->tmax; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + isect->type = PRIMITIVE_NONE; + if (!scene_intersect_valid(ray)) { + isect->t = ray->tmax; + isect->type = PRIMITIVE_NONE; + return false; + } + + hiprtRay ray_hip; + + SET_HIPRT_RAY(ray_hip, ray) + + RayPayload payload; + payload.self = ray->self; + payload.kg = kg; + payload.visibility = visibility; + payload.prim_type = PRIMITIVE_NONE; + payload.ray_time = ray->time; + + hiprtHit hit = {}; + + GET_TRAVERSAL_STACK() + + if (visibility & PATH_RAY_SHADOW_OPAQUE) { + GET_TRAVERSAL_ANY_HIT(table_closest_intersect, 0) + hit = traversal.getNextHit(); + } + else { + GET_TRAVERSAL_CLOSEST_HIT(table_closest_intersect, 0) + hit = traversal.getNextHit(); + } + if (hit.hasHit()) { + set_intersect_point(kg, hit, isect); + if (isect->type > 1) { // should be applied only for curves + isect->type = payload.prim_type; + isect->prim = hit.primID; + } + return true; + } + return false; +} + +#ifdef __BVH_LOCAL__ +ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, + ccl_private const Ray *ray, + ccl_private LocalIntersection *local_isect, + int local_object, + 
ccl_private uint *lcg_state, + int max_hits) +{ + if (!scene_intersect_valid(ray)) { + if (local_isect) { + local_isect->num_hits = 0; + } + return false; + } + + float3 P = ray->P; + float3 dir = bvh_clamp_direction(ray->D); + float3 idir = bvh_inverse_direction(dir); + + if (local_isect != NULL) { + local_isect->num_hits = 0; + } + + const int object_flag = kernel_data_fetch(object_flag, local_object); + if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + +# if BVH_FEATURE(BVH_MOTION) + bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir); +# else + bvh_instance_push(kg, local_object, ray, &P, &dir, &idir); +# endif + } + + hiprtRay ray_hip; + ray_hip.origin = P; + ray_hip.direction = dir; + ray_hip.maxT = ray->tmax; + ray_hip.minT = ray->tmin; + + LocalPayload payload = {0}; + payload.kg = kg; + payload.self = ray->self; + payload.local_object = local_object; + payload.max_hits = max_hits; + payload.lcg_state = lcg_state; + payload.local_isect = local_isect; + + GET_TRAVERSAL_STACK() + + void *local_geom = (void *)(kernel_data_fetch(blas_ptr, local_object)); + // we don't need custom intersection functions for SSR +# ifdef HIPRT_SHARED_STACK + hiprtGeomTraversalAnyHitCustomStack traversal(local_geom, + ray_hip, + stack, + hiprtTraversalHintDefault, + &payload, + kernel_params.table_local_intersect, + 2); +# else + hiprtGeomTraversalAnyHit traversal( + local_geom, ray_hip, table, hiprtTraversalHintDefault, &payload); +# endif + hiprtHit hit = traversal.getNextHit(); + return hit.hasHit(); +} +#endif //__BVH_LOCAL__ + +#ifdef __SHADOW_RECORD_ALL__ +ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, + IntegratorShadowState state, + ccl_private const Ray *ray, + uint visibility, + uint max_hits, + ccl_private uint *num_recorded_hits, + ccl_private float *throughput) +{ + *throughput = 1.0f; + *num_recorded_hits = 0; + + if (!scene_intersect_valid(ray)) { + return false; + } + + hiprtRay ray_hip; + + SET_HIPRT_RAY(ray_hip, ray) + 
ShadowPayload payload; + payload.kg = kg; + payload.self = ray->self; + payload.in_state = state; + payload.max_hits = max_hits; + payload.visibility = visibility; + payload.prim_type = PRIMITIVE_TRIANGLE; + payload.ray_time = ray->time; + payload.num_hits = 0; + payload.r_num_recorded_hits = num_recorded_hits; + payload.r_throughput = throughput; + GET_TRAVERSAL_STACK() + GET_TRAVERSAL_ANY_HIT(table_shadow_intersect, 1) + hiprtHit hit = traversal.getNextHit(); + num_recorded_hits = payload.r_num_recorded_hits; + throughput = payload.r_throughput; + return hit.hasHit(); +} +#endif /* __SHADOW_RECORD_ALL__ */ + +#ifdef __VOLUME__ +ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg, + ccl_private const Ray *ray, + ccl_private Intersection *isect, + const uint visibility) +{ + isect->t = ray->tmax; + isect->u = 0.0f; + isect->v = 0.0f; + isect->prim = PRIM_NONE; + isect->object = OBJECT_NONE; + isect->type = PRIMITIVE_NONE; + + if (!scene_intersect_valid(ray)) { + return false; + } + + hiprtRay ray_hip; + + SET_HIPRT_RAY(ray_hip, ray) + + RayPayload payload; + payload.self = ray->self; + payload.kg = kg; + payload.visibility = visibility; + payload.prim_type = PRIMITIVE_NONE; + payload.ray_time = ray->time; + + GET_TRAVERSAL_STACK() + + GET_TRAVERSAL_CLOSEST_HIT(table_volume_intersect, 3) + hiprtHit hit = traversal.getNextHit(); + // return hit.hasHit(); + if (hit.hasHit()) { + set_intersect_point(kg, hit, isect); + if (isect->type > 1) { // should be applied only for curves + isect->type = payload.prim_type; + isect->prim = hit.primID; + } + return true; + } + else + return false; +} +#endif /* __VOLUME__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/hiprt/common.h b/intern/cycles/kernel/device/hiprt/common.h new file mode 100644 index 00000000000..1017857d8f9 --- /dev/null +++ b/intern/cycles/kernel/device/hiprt/common.h @@ -0,0 +1,637 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#ifdef 
__HIPRT__ + +struct RayPayload { + KernelGlobals kg; + RaySelfPrimitives self; + uint visibility; + int prim_type; + float ray_time; +}; + +struct ShadowPayload { + KernelGlobals kg; + RaySelfPrimitives self; + uint visibility; + int prim_type; + float ray_time; + int in_state; + uint max_hits; + uint num_hits; + uint *r_num_recorded_hits; + float *r_throughput; +}; + +struct LocalPayload { + KernelGlobals kg; + RaySelfPrimitives self; + int prim_type; + float ray_time; + int local_object; + uint max_hits; + uint *lcg_state; + LocalIntersection *local_isect; +}; + +# define SET_HIPRT_RAY(RAY_RT, RAY) \ + RAY_RT.direction = RAY->D; \ + RAY_RT.origin = RAY->P; \ + RAY_RT.maxT = RAY->tmax; \ + RAY_RT.minT = RAY->tmin; + +# if defined(HIPRT_SHARED_STACK) +# define GET_TRAVERSAL_STACK() \ + Stack stack(&kg->global_stack_buffer[0], \ + HIPRT_THREAD_STACK_SIZE, \ + kg->shared_stack, \ + HIPRT_SHARED_STACK_SIZE); +# else +# define GET_TRAVERSAL_STACK() +# endif + +# ifdef HIPRT_SHARED_STACK +# define GET_TRAVERSAL_ANY_HIT(FUNCTION_TABLE, RAY_TYPE) \ + hiprtSceneTraversalAnyHitCustomStack traversal(kernel_data.device_bvh, \ + ray_hip, \ + stack, \ + visibility, \ + hiprtTraversalHintDefault, \ + &payload, \ + kernel_params.FUNCTION_TABLE, \ + RAY_TYPE); \ + hiprtSceneTraversalAnyHitCustomStack traversal_simple( \ + kernel_data.device_bvh, ray_hip, stack, visibility); +# define GET_TRAVERSAL_CLOSEST_HIT(FUNCTION_TABLE, RAY_TYPE) \ + hiprtSceneTraversalClosestCustomStack traversal(kernel_data.device_bvh, \ + ray_hip, \ + stack, \ + visibility, \ + hiprtTraversalHintDefault, \ + &payload, \ + kernel_params.FUNCTION_TABLE, \ + RAY_TYPE); \ + hiprtSceneTraversalClosestCustomStack traversal_simple( \ + kernel_data.device_bvh, ray_hip, stack, visibility); +# else +# define GET_TRAVERSAL_ANY_HIT(FUNCTION_TABLE) \ + hiprtSceneTraversalAnyHit traversal(kernel_data.device_bvh, \ + ray_hip, \ + visibility, \ + FUNCTION_TABLE, \ + hiprtTraversalHintDefault, \ + &payload); \ + 
hiprtSceneTraversalAnyHit traversal_simple(kernel_data.device_bvh, ray_hip, visibility); +# define GET_TRAVERSAL_CLOSEST_HIT(FUNCTION_TABLE) \ + hiprtSceneTraversalClosest traversal(kernel_data.device_bvh, \ + ray_hip, \ + visibility, \ + FUNCTION_TABLE, \ + hiprtTraversalHintDefault, \ + &payload); \ + hiprtSceneTraversalClosest traversal_simple(kernel_data.device_bvh, ray_hip, visibility); +# endif + +ccl_device_inline void set_intersect_point(KernelGlobals kg, + hiprtHit &hit, + ccl_private Intersection *isect) +{ + int prim_offset = 0; + int object_id = kernel_data_fetch(user_instance_id, hit.instanceID); + prim_offset = kernel_data_fetch(object_prim_offset, object_id); + isect->type = kernel_data_fetch(objects, object_id).primitive_type; + + isect->t = hit.t; + isect->prim = hit.primID + prim_offset; + isect->object = object_id; + isect->u = hit.uv.x; + isect->v = hit.uv.y; +} + +// custom intersection functions + +ccl_device_inline bool curve_custom_intersect(const hiprtRay &ray, + const void *userPtr, + void *payload, + hiprtHit &hit) + +{ + Intersection isect; + RayPayload *local_payload = (RayPayload *)payload; + // could also cast shadow payload to get the elements needed to do the intersection + // no need to write a separate function for shadow intersection + + KernelGlobals kg = local_payload->kg; + + int object_id = kernel_data_fetch(user_instance_id, hit.instanceID); + int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id); + // data_offset.x: where the data (prim id, type )for the geometry of the current object begins + // the prim_id that is in hiprtHit hit is local to the partciular geometry so we add the above + // ofstream + // to map prim id in hiprtHit to the one compatible to what next stage expects + + // data_offset.y: the offset that has to be added to a local primitive to get the global + // primitive id = kernel_data_fetch(object_prim_offset, object_id); + + int prim_offset = data_offset.y; + + int curve_index = 
kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).x; + int key_value = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).y; + + if (intersection_skip_self_shadow(local_payload->self, object_id, curve_index + prim_offset)) + return false; + + float ray_time = local_payload->ray_time; + + if ((key_value & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) { + + int time_offset = kernel_data_fetch(prim_time_offset, object_id); + float2 prims_time = kernel_data_fetch(prims_time, hit.primID + time_offset); + + if (ray_time < prims_time.x || ray_time > prims_time.y) { + return false; + } + } + + bool b_hit = curve_intersect(kg, + &isect, + ray.origin, + ray.direction, + ray.minT, + ray.maxT, + object_id, + curve_index + prim_offset, + ray_time, + key_value); + if (b_hit) { + hit.uv.x = isect.u; + hit.uv.y = isect.v; + hit.t = isect.t; + hit.primID = isect.prim; + local_payload->prim_type = isect.type; // packed_curve_type; + } + return b_hit; +} + +ccl_device_inline bool motion_triangle_custom_intersect(const hiprtRay &ray, + const void *userPtr, + void *payload, + hiprtHit &hit) +{ +# ifdef MOTION_BLUR + RayPayload *local_payload = (RayPayload *)payload; + KernelGlobals kg = local_payload->kg; + int object_id = kernel_data_fetch(user_instance_id, hit.instanceID); + int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id); + int prim_offset = kernel_data_fetch(object_prim_offset, object_id); + + int prim_id_local = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).x; + int prim_id_global = prim_id_local + prim_offset; + + if (intersection_skip_self_shadow(local_payload->self, object_id, prim_id_global)) + return false; + + Intersection isect; + + bool b_hit = motion_triangle_intersect(kg, + &isect, + ray.origin, + ray.direction, + ray.minT, + ray.maxT, + local_payload->ray_time, + local_payload->visibility, + object_id, + prim_id_global, + prim_id_local); + + if (b_hit) { + hit.uv.x = isect.u; + hit.uv.y = 
isect.v; + hit.t = isect.t; + hit.primID = isect.prim; + local_payload->prim_type = isect.type; + } + return b_hit; +# else + return false; +# endif +} + +ccl_device_inline bool motion_triangle_custom_local_intersect(const hiprtRay &ray, + const void *userPtr, + void *payload, + hiprtHit &hit) +{ +# ifdef MOTION_BLUR + LocalPayload *local_payload = (LocalPayload *)payload; + KernelGlobals kg = local_payload->kg; + int object_id = local_payload->local_object; + + int prim_offset = kernel_data_fetch(object_prim_offset, object_id); + int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id); + + int prim_id_local = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).x; + int prim_id_global = prim_id_local + prim_offset; + + if (intersection_skip_self_local(local_payload->self, prim_id_global)) + return false; + + LocalIntersection *local_isect = local_payload->local_isect; + + bool b_hit = motion_triangle_intersect_local(kg, + local_isect, + ray.origin, + ray.direction, + local_payload->ray_time, + object_id, + prim_id_global, + prim_id_local, + ray.minT, + ray.maxT, + local_payload->lcg_state, + local_payload->max_hits); + + if (b_hit) { + local_payload->prim_type = PRIMITIVE_MOTION_TRIANGLE; + } + return b_hit; +# else + return false; +# endif +} + +ccl_device_inline bool motion_triangle_custom_volume_intersect(const hiprtRay &ray, + const void *userPtr, + void *payload, + hiprtHit &hit) +{ +# ifdef MOTION_BLUR + + RayPayload *local_payload = (RayPayload *)payload; + KernelGlobals kg = local_payload->kg; + int object_id = kernel_data_fetch(user_instance_id, hit.instanceID); + int object_flag = kernel_data_fetch(object_flag, object_id); + + if (!(object_flag & SD_OBJECT_HAS_VOLUME)) + return false; + + int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id); + int prim_offset = kernel_data_fetch(object_prim_offset, object_id); + + int prim_id_local = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x).x; + int 
prim_id_global = prim_id_local + prim_offset; + + if (intersection_skip_self_shadow(local_payload->self, object_id, prim_id_global)) + return false; + + Intersection isect; + + bool b_hit = motion_triangle_intersect(kg, + &isect, + ray.origin, + ray.direction, + ray.minT, + ray.maxT, + local_payload->ray_time, + local_payload->visibility, + object_id, + prim_id_global, + prim_id_local); + + if (b_hit) { + hit.uv.x = isect.u; + hit.uv.y = isect.v; + hit.t = isect.t; + hit.primID = isect.prim; + local_payload->prim_type = isect.type; + } + return b_hit; +# else + return false; +# endif +} + +ccl_device_inline bool point_custom_intersect(const hiprtRay &ray, + const void *userPtr, + void *payload, + hiprtHit &hit) +{ +# ifdef POINT_CLOUD + RayPayload *local_payload = (RayPayload *)payload; + KernelGlobals kg = local_payload->kg; + int object_id = kernel_data_fetch(user_instance_id, hit.instanceID); + + int2 data_offset = kernel_data_fetch(custom_prim_info_offset, object_id); + int prim_offset = kernel_data_fetch(object_prim_offset, object_id); + + int2 prim_info = kernel_data_fetch(custom_prim_info, hit.primID + data_offset.x); + int prim_id_local = prim_info.x; + int prim_id_global = prim_id_local + prim_offset; + + int type = prim_info.y; + + if (intersection_skip_self_shadow(local_payload->self, object_id, prim_id_global)) + return false; + + float ray_time = local_payload->ray_time; + + if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) { + + int time_offset = kernel_data_fetch(prim_time_offset, object_id); + float2 prims_time = kernel_data_fetch(prims_time, hit.primID + time_offset); + + if (ray_time < prims_time.x || ray_time > prims_time.y) { + return false; + } + } + + Intersection isect; + + bool b_hit = point_intersect(kg, + &isect, + ray.origin, + ray.direction, + ray.minT, + ray.maxT, + object_id, + prim_id_global, + ray_time, + type); + + if (b_hit) { + hit.uv.x = isect.u; + hit.uv.y = isect.v; + hit.t = isect.t; + hit.primID = isect.prim; + 
local_payload->prim_type = isect.type; + } + return b_hit; +# else + return false; +# endif +} + +// intersection filters + +ccl_device_inline bool closest_intersection_filter(const hiprtRay &ray, + const void *data, + void *user_data, + const hiprtHit &hit) +{ + RayPayload *payload = (RayPayload *)user_data; + int object_id = kernel_data_fetch(user_instance_id, hit.instanceID); + int prim_offset = kernel_data_fetch(object_prim_offset, object_id); + int prim = hit.primID + prim_offset; + + if (intersection_skip_self_shadow(payload->self, object_id, prim)) + return true; + else + return false; +} + +ccl_device_inline bool shadow_intersection_filter(const hiprtRay &ray, + const void *data, + void *user_data, + const hiprtHit &hit) + +{ + ShadowPayload *payload = (ShadowPayload *)user_data; + + uint num_hits = payload->num_hits; + uint num_recorded_hits = *(payload->r_num_recorded_hits); + uint max_hits = payload->max_hits; + int state = payload->in_state; + KernelGlobals kg = payload->kg; + RaySelfPrimitives self = payload->self; + + int object = kernel_data_fetch(user_instance_id, hit.instanceID); + int prim_offset = kernel_data_fetch(object_prim_offset, object); + int prim = hit.primID + prim_offset; + + float ray_tmax = hit.t; + +# ifdef __VISIBILITY_FLAG__ + + if ((kernel_data_fetch(objects, object).visibility & payload->visibility) == 0) { + return true; // no hit - continue traversal + } +# endif + + if (intersection_skip_self_shadow(self, object, prim)) { + return true; // no hit -continue traversal + } + + float u = hit.uv.x; + float v = hit.uv.y; + int type = kernel_data_fetch(objects, object).primitive_type; +# ifdef __HAIR__ + if (type & (PRIMITIVE_CURVE_THICK | PRIMITIVE_CURVE_RIBBON)) { + + const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim); + type = segment.type; + prim = segment.prim; + } +# endif + +# ifndef __TRANSPARENT_SHADOWS__ + + return false; + +# else + + if (num_hits >= max_hits || + 
!(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { + return false; + } + + if (type & PRIMITIVE_CURVE) { + float throughput = *payload->r_throughput; + throughput *= intersection_curve_shadow_transparency(kg, object, prim, type, u); + *payload->r_throughput = throughput; + payload->num_hits += 1; + + if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { + return false; + } + else { + return true; + } + } + + uint record_index = num_recorded_hits; + + num_hits += 1; + num_recorded_hits += 1; + payload->num_hits = num_hits; + *(payload->r_num_recorded_hits) = num_recorded_hits; + + const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE); + if (record_index >= max_record_hits) { + float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); + uint max_recorded_hit = 0; + + for (int i = 1; i < max_record_hits; i++) { + const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); + if (isect_t > max_recorded_t) { + max_recorded_t = isect_t; + max_recorded_hit = i; + } + } + + if (ray_tmax >= max_recorded_t) { + + return true; + } + + record_index = max_recorded_hit; + } + + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = ray_tmax; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; + return true; + +# endif /* __TRANSPARENT_SHADOWS__ */ +} + +ccl_device_inline bool local_intersection_filter(const hiprtRay &ray, + const void *data, + void *user_data, + const hiprtHit &hit) +{ +# ifdef __BVH_LOCAL__ + LocalPayload *payload = (LocalPayload *)user_data; + KernelGlobals kg = payload->kg; + int object_id = payload->local_object; + int prim_offset = 
kernel_data_fetch(object_prim_offset, object_id); + int prim = hit.primID + prim_offset; +# ifndef __RAY_OFFSET__ + if (intersection_skip_self_local(payload->self, prim)) { + return true; // continue search + } +# endif + uint max_hits = payload->max_hits; + if (max_hits == 0) { + return false; // stop search + } + int hit_index = 0; + if (payload->lcg_state) { + for (int i = min(max_hits, payload->local_isect->num_hits) - 1; i >= 0; --i) { + if (hit.t == payload->local_isect->hits[i].t) { + return true; // continue search + } + } + hit_index = payload->local_isect->num_hits++; + if (payload->local_isect->num_hits > max_hits) { + hit_index = lcg_step_uint(payload->lcg_state) % payload->local_isect->num_hits; + if (hit_index >= max_hits) { + return true; // continue search + } + } + } + else { + if (payload->local_isect->num_hits && hit.t > payload->local_isect->hits[0].t) { + return true; + } + payload->local_isect->num_hits = 1; + } + Intersection *isect = &payload->local_isect->hits[hit_index]; + isect->t = hit.t; + isect->prim = prim; + isect->object = object_id; + isect->type = PRIMITIVE_TRIANGLE; // kernel_data_fetch(__objects, object_id).primitive_type; + + isect->u = hit.uv.x; + isect->v = hit.uv.y; + + payload->local_isect->Ng[hit_index] = hit.normal; + + return true; + +# endif +} + +ccl_device_inline bool volume_intersection_filter(const hiprtRay &ray, + const void *data, + void *user_data, + const hiprtHit &hit) +{ + RayPayload *payload = (RayPayload *)user_data; + int object_id = kernel_data_fetch(user_instance_id, hit.instanceID); + int prim_offset = kernel_data_fetch(object_prim_offset, object_id); + int prim = hit.primID + prim_offset; + int object_flag = kernel_data_fetch(object_flag, object_id); + + if (intersection_skip_self(payload->self, object_id, prim)) + return true; + else if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) + return true; + else + return false; +} + +HIPRT_DEVICE bool intersectFunc(u32 geomType, + u32 rayType, + const 
hiprtFuncTableHeader &tableHeader, + const hiprtRay &ray, + void *payload, + hiprtHit &hit) +{ + const u32 index = tableHeader.numGeomTypes * rayType + geomType; + const void *data = tableHeader.funcDataSets[index].filterFuncData; + switch (index) { + case Curve_Intersect_Function: + case Curve_Intersect_Shadow: + return curve_custom_intersect(ray, data, payload, hit); + case Motion_Triangle_Intersect_Function: + case Motion_Triangle_Intersect_Shadow: + return motion_triangle_custom_intersect(ray, data, payload, hit); + case Motion_Triangle_Intersect_Local: + return motion_triangle_custom_local_intersect(ray, data, payload, hit); + case Motion_Triangle_Intersect_Volume: + return motion_triangle_custom_volume_intersect(ray, data, payload, hit); + case Point_Intersect_Function: + case Point_Intersect_Shadow: + return point_custom_intersect(ray, data, payload, hit); + default: + break; + } + return false; +} + +HIPRT_DEVICE bool filterFunc(u32 geomType, + u32 rayType, + const hiprtFuncTableHeader &tableHeader, + const hiprtRay &ray, + void *payload, + const hiprtHit &hit) +{ + const u32 index = tableHeader.numGeomTypes * rayType + geomType; + const void *data = tableHeader.funcDataSets[index].intersectFuncData; + switch (index) { + case Triangle_Filter_Closest: + return closest_intersection_filter(ray, data, payload, hit); + case Triangle_Filter_Shadow: + case Curve_Filter_Shadow: + case Motion_Triangle_Filter_Shadow: + case Point_Filter_Shadow: + return shadow_intersection_filter(ray, data, payload, hit); + case Triangle_Filter_Local: + case Motion_Triangle_Filter_Local: + return local_intersection_filter(ray, data, payload, hit); + case Triangle_Filter_Volume: + case Motion_Triangle_Filter_Volume: + return volume_intersection_filter(ray, data, payload, hit); + default: + break; + } + + return false; +} + +#endif diff --git a/intern/cycles/kernel/device/hiprt/globals.h b/intern/cycles/kernel/device/hiprt/globals.h new file mode 100644 index 00000000000..7fff15b97b8 
--- /dev/null +++ b/intern/cycles/kernel/device/hiprt/globals.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +#include "kernel/types.h" + +#include "kernel/integrator/state.h" + +#include "kernel/util/profiling.h" + +#define HIPRT_SHARED_STACK + +/* The size of global stack available to each thread (memory reserved for each thread in + * global_stack_buffer). */ +#define HIPRT_THREAD_STACK_SIZE 64 + +/* LDS (Local Data Storage) allocation for each thread, the number is obtained empirically. */ +#define HIPRT_SHARED_STACK_SIZE 24 + +/* HIPRT_THREAD_GROUP_SIZE is the number of threads per work group for intersection kernels. + * The default number of threads per workgroup is 1024, however, since HIP RT intersection kernels + * use local memory, and the local memory size in those kernels scales up with the number of + * threads, the number of threads is scaled down to 256 to avoid going over maximum local memory + * and to strike a balance between memory access and the number of waves. + * + * Total local stack size would be number of threads * HIPRT_SHARED_STACK_SIZE. */ +#define HIPRT_THREAD_GROUP_SIZE 256 + +CCL_NAMESPACE_BEGIN + +struct KernelGlobalsGPU { + int *global_stack_buffer; +#ifdef HIPRT_SHARED_STACK + int *shared_stack; +#endif +}; + +typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals; + +#if defined(HIPRT_SHARED_STACK) + +/* This macro allocates shared memory and to pass the shared memory down to intersection functions + * KernelGlobals is used. 
*/ +# define HIPRT_INIT_KERNEL_GLOBAL() \ + ccl_gpu_shared int shared_stack[HIPRT_SHARED_STACK_SIZE * HIPRT_THREAD_GROUP_SIZE]; \ + ccl_global KernelGlobalsGPU kg_gpu; \ + KernelGlobals kg = &kg_gpu; \ + kg->shared_stack = &shared_stack[0]; \ + kg->global_stack_buffer = stack_buffer; +#else +# define HIPRT_INIT_KERNEL_GLOBAL() \ + KernelGlobals kg = NULL; \ + kg->global_stack_buffer = stack_buffer; +#endif + +struct KernelParamsHIPRT { + KernelData data; +#define KERNEL_DATA_ARRAY(type, name) const type *name; + KERNEL_DATA_ARRAY(int, user_instance_id) + KERNEL_DATA_ARRAY(uint64_t, blas_ptr) + KERNEL_DATA_ARRAY(int2, custom_prim_info) + KERNEL_DATA_ARRAY(int2, custom_prim_info_offset) + KERNEL_DATA_ARRAY(float2, prims_time) + KERNEL_DATA_ARRAY(int, prim_time_offset) +#include "kernel/data_arrays.h" + + /* Integrator state */ + IntegratorStateGPU integrator_state; + + hiprtFuncTable table_closest_intersect; + hiprtFuncTable table_shadow_intersect; + hiprtFuncTable table_local_intersect; + hiprtFuncTable table_volume_intersect; +}; + +/* Intersection_Function_Table_Index defines index values to retrieve custom intersection + * functions from function table. */ + +enum Intersection_Function_Table_Index { + // Triangles use the intersection function provided by HIP RT and don't need custom intersection + // functions + // Custom intersection functions for closest intersect. + Curve_Intersect_Function = 1, // Custom intersection for curves + Motion_Triangle_Intersect_Function, // Custom intersection for triangles with vertex motion blur + // attributes. + Point_Intersect_Function, // Custom intersection for point cloud. + // Custom intersection functions for shadow rendering are the same as the function for closest + // intersect. + // However, the table indices are different + Triangle_Intersect_Shadow_None, + Curve_Intersect_Shadow, + Motion_Triangle_Intersect_Shadow, + Point_Intersect_Shadow, + // Custom intersection functions for subsurface scattering. 
+ // Only motion triangles have valid custom intersection function + Triangle_Intersect_Local_None, + Curve_Intersect_Local_None, + Motion_Triangle_Intersect_Local, + Point_Intersect_Local_None, + // Custom intersection functions for volume rendering. + // Only motion triangles have valid custom intersection function + Triangle_Intersect_Volume_None, + Curve_Intersect_Volume_None, + Motion_Triangle_Intersect_Volume, + Point_Intersect_Volume_None, +}; + +// Filter functions, filter hits, i.e. test whether a hit should be accepted or not, and whether +// traversal should stop or continue. +enum Filter_Function_Table_Index { + Triangle_Filter_Closest = 0, // Filter function for triangles for closest intersect, no custom + // intersection function is needed. + Curve_Filter_Opaque_None, // No filter function is needed and everything is handled in the + // intersection function. + Motion_Triangle_Filter_Opaque_None, // No filter function is needed and everything is handled in + // intersection function. + Point_Filter_Opaque_Non, // No filter function is needed. + // Filter function for all primitives for shadow intersection. + // All primitives use the same function but each has a different index in the table. + Triangle_Filter_Shadow, + Curve_Filter_Shadow, + Motion_Triangle_Filter_Shadow, + Point_Filter_Shadow, + // Filter functions for subsurface scattering. Triangles and motion triangles need function + // assignment. They indices for triangles and motion triangles point to the same function. Points + // and curves dont need any function since subsurface scattering is not applied on either. + Triangle_Filter_Local, // Filter functions for triangles + Curve_Filter_Local_None, // Subsurface scattering is not applied on curves, no filter function + // is + // needed. + Motion_Triangle_Filter_Local, + Point_Filter_Local_None, + // Filter functions for volume rendering. + // Volume rendering only applies to triangles and motion triangles. 
+ // Triangles and motion triangles use the same filter functions for volume rendering + Triangle_Filter_Volume, + Curve_Filter_Volume_None, + Motion_Triangle_Filter_Volume, + Point_Filter_Volume_None, +}; + +#ifdef __KERNEL_GPU__ +__constant__ KernelParamsHIPRT kernel_params; + +# ifdef HIPRT_SHARED_STACK +typedef hiprtGlobalStack Stack; +# endif + +#endif + +/* Abstraction macros */ +#define kernel_data kernel_params.data +#define kernel_data_fetch(name, index) kernel_params.name[(index)] +#define kernel_data_array(name) (kernel_params.name) +#define kernel_integrator_state kernel_params.integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/hiprt/hiprt_kernels.h b/intern/cycles/kernel/device/hiprt/hiprt_kernels.h new file mode 100644 index 00000000000..cd92e9ff30c --- /dev/null +++ b/intern/cycles/kernel/device/hiprt/hiprt_kernels.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#ifdef __HIPRT__ + +ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS) + ccl_gpu_kernel_signature(integrator_intersect_closest, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size, + ccl_global int *stack_buffer) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + HIPRT_INIT_KERNEL_GLOBAL() + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + ccl_gpu_kernel_call(integrator_intersect_closest(kg, state, render_buffer)); + } +} +ccl_gpu_kernel_postfix + +ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS) + ccl_gpu_kernel_signature(integrator_intersect_shadow, + ccl_global const int *path_index_array, + const int work_size, + ccl_global int *stack_buffer) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + HIPRT_INIT_KERNEL_GLOBAL() + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + ccl_gpu_kernel_call(integrator_intersect_shadow(kg, state)); + } +} +ccl_gpu_kernel_postfix + +ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS) + ccl_gpu_kernel_signature(integrator_intersect_subsurface, + ccl_global const int *path_index_array, + const int work_size, + ccl_global int *stack_buffer) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + HIPRT_INIT_KERNEL_GLOBAL() + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + ccl_gpu_kernel_call(integrator_intersect_subsurface(kg, state)); + } +} +ccl_gpu_kernel_postfix + +ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS) + ccl_gpu_kernel_signature(integrator_intersect_volume_stack, + ccl_global const int *path_index_array, + const int work_size, + ccl_global int *stack_buffer) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + HIPRT_INIT_KERNEL_GLOBAL() + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + ccl_gpu_kernel_call(integrator_intersect_volume_stack(kg, state)); + } +} +ccl_gpu_kernel_postfix +ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS) + ccl_gpu_kernel_signature(integrator_shade_surface_raytrace, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size, + ccl_global int *stack_buffer) +{ + const int global_index = ccl_gpu_global_id_x(); + if (global_index < work_size) { + HIPRT_INIT_KERNEL_GLOBAL() + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer)); + } +} +ccl_gpu_kernel_postfix +ccl_gpu_kernel_threads(GPU_HIPRT_KERNEL_BLOCK_NUM_THREADS) + ccl_gpu_kernel_signature(integrator_shade_surface_mnee, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size, + ccl_global int *stack_buffer) +{ + const int global_index = ccl_gpu_global_id_x(); + if (global_index < work_size) { + HIPRT_INIT_KERNEL_GLOBAL() + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + ccl_gpu_kernel_call(integrator_shade_surface_mnee(kg, state, render_buffer)); + } +} +ccl_gpu_kernel_postfix + +#endif /* __HIPRT__ */ diff --git a/intern/cycles/kernel/device/hiprt/kernel.cpp b/intern/cycles/kernel/device/hiprt/kernel.cpp new file mode 100644 index 00000000000..34456e1d7c0 --- /dev/null +++ b/intern/cycles/kernel/device/hiprt/kernel.cpp @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2023 Blender Foundation */ + +#ifdef __HIP_DEVICE_COMPILE__ + +# include "kernel/device/hip/compat.h" +# include "kernel/device/hip/config.h" + +# include + +# include "kernel/device/hiprt/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index 4fa379e4b15..d833290a34d 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -1169,10 +1169,14 @@ typedef enum KernelBVHLayout { BVH_LAYOUT_METAL = (1 << 5), BVH_LAYOUT_MULTI_METAL = (1 << 6), BVH_LAYOUT_MULTI_METAL_EMBREE = (1 << 7), + BVH_LAYOUT_HIPRT = (1 << 8), + BVH_LAYOUT_MULTI_HIPRT = (1 << 9), + BVH_LAYOUT_MULTI_HIPRT_EMBREE = (1 << 10), /* Default BVH layout to use for CPU. 
*/ BVH_LAYOUT_AUTO = BVH_LAYOUT_EMBREE, - BVH_LAYOUT_ALL = BVH_LAYOUT_BVH2 | BVH_LAYOUT_EMBREE | BVH_LAYOUT_OPTIX | BVH_LAYOUT_METAL, + BVH_LAYOUT_ALL = BVH_LAYOUT_BVH2 | BVH_LAYOUT_EMBREE | BVH_LAYOUT_OPTIX | BVH_LAYOUT_METAL | + BVH_LAYOUT_HIPRT | BVH_LAYOUT_MULTI_HIPRT | BVH_LAYOUT_MULTI_HIPRT_EMBREE, } KernelBVHLayout; /* Specialized struct that can become constants in dynamic compilation. */ @@ -1225,6 +1229,8 @@ typedef struct KernelData { OptixTraversableHandle device_bvh; #elif defined __METALRT__ metalrt_as_type device_bvh; +#elif defined(__HIPRT__) + void *device_bvh; #else # ifdef __EMBREE__ RTCScene device_bvh; diff --git a/intern/cycles/scene/geometry.cpp b/intern/cycles/scene/geometry.cpp index 21d3b6a52a6..d268b5570f4 100644 --- a/intern/cycles/scene/geometry.cpp +++ b/intern/cycles/scene/geometry.cpp @@ -114,7 +114,9 @@ bool Geometry::need_build_bvh(BVHLayout layout) const { return is_instanced() || layout == BVH_LAYOUT_OPTIX || layout == BVH_LAYOUT_MULTI_OPTIX || layout == BVH_LAYOUT_METAL || layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE || - layout == BVH_LAYOUT_MULTI_METAL || layout == BVH_LAYOUT_MULTI_METAL_EMBREE; + layout == BVH_LAYOUT_MULTI_METAL || layout == BVH_LAYOUT_MULTI_METAL_EMBREE || + layout == BVH_LAYOUT_HIPRT || layout == BVH_LAYOUT_MULTI_HIPRT || + layout == BVH_LAYOUT_MULTI_HIPRT_EMBREE; } bool Geometry::is_instanced() const diff --git a/intern/cycles/scene/object.cpp b/intern/cycles/scene/object.cpp index c19a28583e9..8d17b0803a7 100644 --- a/intern/cycles/scene/object.cpp +++ b/intern/cycles/scene/object.cpp @@ -597,7 +597,8 @@ void ObjectManager::device_update_prim_offsets(Device *device, DeviceScene *dsce if (!scene->integrator->get_use_light_tree()) { BVHLayoutMask layout_mask = device->get_bvh_layout_mask(dscene->data.kernel_features); if (layout_mask != BVH_LAYOUT_METAL && layout_mask != BVH_LAYOUT_MULTI_METAL && - layout_mask != BVH_LAYOUT_MULTI_METAL_EMBREE) { + layout_mask != BVH_LAYOUT_MULTI_METAL_EMBREE && layout_mask != 
BVH_LAYOUT_HIPRT && + layout_mask != BVH_LAYOUT_MULTI_HIPRT && layout_mask != BVH_LAYOUT_MULTI_HIPRT_EMBREE) { return; } }