This change switches Cycles to the open-source HIP-RT library, which
implements hardware ray tracing. This library is now used on
both Windows and Linux. While there should be no noticeable changes
on Windows, on Linux this adds support for hardware ray tracing on
AMD GPUs.
The majority of the change is typical platform code to add a new
library to the dependency builder, and a change in how
ahead-of-time (AoT) kernels are compiled. There are changes in
Cycles itself, but they are rather straightforward: some APIs
changed in the open-source version of the library.
There are a couple of extra files which are needed for this to
work: hiprt02003_6.1_amd.hipfb and oro_compiled_kernels.hipfb.
The HIP-RT library makes some assumptions about where these files
are located. Currently they follow the same rule as the AoT
kernels for oneAPI:
- On Windows they are next to blender.exe
- On Linux they are in the lib/ folder
Performance comparison on Ubuntu 22.04.5:
```
GPU: AMD Radeon PRO W7800
Driver: amdgpu-install_6.1.60103-1_all.deb
main hip-rt
attic 0.1414s 0.0932s
barbershop_interior 0.1563s 0.1258s
bistro 0.2134s 0.1597s
bmw27 0.0119s 0.0099s
classroom 0.1006s 0.0803s
fishy_cat 0.0248s 0.0178s
junkshop 0.0916s 0.0713s
koro 0.0589s 0.0720s
monster 0.0435s 0.0385s
pabellon 0.0543s 0.0391s
sponza 0.0223s 0.0180s
spring 0.1026s 1.5145s
victor 0.1901s 0.1239s
wdas_cloud 0.1153s 0.1125s
```
Co-authored-by: Brecht Van Lommel <brecht@blender.org>
Co-authored-by: Ray Molenkamp <github@lazydodo.com>
Co-authored-by: Sergey Sharybin <sergey@blender.org>
Pull Request: https://projects.blender.org/blender/blender/pulls/121050
162 lines
6.3 KiB
C
162 lines
6.3 KiB
C
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0 */
|
|
|
|
#pragma once
|
|
|
|
#include "kernel/types.h"
|
|
|
|
#include "kernel/integrator/state.h"
|
|
|
|
#include "kernel/util/profiling.h"
|
|
|
|
/* When defined, KernelGlobalsGPU carries a shared stack buffer and HIPRT_INIT_KERNEL_GLOBAL()
 * allocates the shared memory that backs it. */
#define HIPRT_SHARED_STACK

/* The size of global stack available to each thread (memory reserved for each thread in
 * global_stack_buffer). */
#define HIPRT_THREAD_STACK_SIZE 64

/* LDS (Local Data Storage) allocation for each thread, the number is obtained empirically. */
#define HIPRT_SHARED_STACK_SIZE 24

/* HIPRT_THREAD_GROUP_SIZE is the number of threads per work group for intersection kernels.
 * The default number of threads per work-group is 1024, however, since HIP RT intersection kernels
 * use local memory, and the local memory size in those kernels scales up with the number of
 * threads, the number of threads is scaled down to 256 to avoid going over maximum local memory
 * and to strike a balance between memory access and the number of waves.
 *
 * Total local stack size would be number of threads * HIPRT_SHARED_STACK_SIZE. */
#define HIPRT_THREAD_GROUP_SIZE 256
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
struct KernelGlobalsGPU {
|
|
hiprtGlobalStackBuffer global_stack_buffer;
|
|
#ifdef HIPRT_SHARED_STACK
|
|
hiprtSharedStackBuffer shared_stack;
|
|
#endif
|
|
};
|
|
|
|
typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
|
|
|
|
#if defined(HIPRT_SHARED_STACK)

/* This macro allocates shared memory; KernelGlobals is used to pass the shared memory down to
 * the intersection functions.
 *
 * The expansion declares `shared_stack`, `kg_gpu` and `kg` in the enclosing scope and reads a
 * variable named `stack_buffer`, which must already be in scope where the macro is expanded. */
#  define HIPRT_INIT_KERNEL_GLOBAL() \
    ccl_gpu_shared int shared_stack[HIPRT_SHARED_STACK_SIZE * HIPRT_THREAD_GROUP_SIZE]; \
    ccl_global KernelGlobalsGPU kg_gpu; \
    KernelGlobals kg = &kg_gpu; \
    kg->shared_stack.stackData = &shared_stack[0]; \
    kg->shared_stack.stackSize = HIPRT_SHARED_STACK_SIZE; \
    kg->global_stack_buffer = stack_buffer;
#else
/* Variant without a shared stack: only the global stack buffer is set up.
 *
 * `kg` has to point at real storage before it is written through; initializing it to NULL and
 * then assigning `kg->global_stack_buffer` would dereference a null pointer. As in the shared
 * variant, `stack_buffer` must already be in scope where the macro is expanded. */
#  define HIPRT_INIT_KERNEL_GLOBAL() \
    ccl_global KernelGlobalsGPU kg_gpu; \
    KernelGlobals kg = &kg_gpu; \
    kg->global_stack_buffer = stack_buffer;
#endif
|
|
|
|
/* Parameter block for HIP RT kernels. On the GPU side a single instance named `kernel_params`
 * is declared, and the kernel_data* abstraction macros read from it. Member order is part of
 * the layout and must not be changed. */
struct KernelParamsHIPRT {
  KernelData data;

/* Each KERNEL_DATA_ARRAY(type, name) entry becomes a `const type *name;` member.
 * NOTE(review): the macro is not undefined here; presumably "kernel/data_arrays.h" takes care
 * of that after expanding its own entries - confirm. */
#define KERNEL_DATA_ARRAY(type, name) const type *name;
  /* HIP RT specific arrays, declared ahead of the shared list from data_arrays.h. */
  KERNEL_DATA_ARRAY(int, user_instance_id)
  KERNEL_DATA_ARRAY(uint64_t, blas_ptr)
  KERNEL_DATA_ARRAY(int2, custom_prim_info)
  KERNEL_DATA_ARRAY(int2, custom_prim_info_offset)
  KERNEL_DATA_ARRAY(float2, prims_time)
  KERNEL_DATA_ARRAY(int, prim_time_offset)
#include "kernel/data_arrays.h"

  /* Integrator state */
  IntegratorStateGPU integrator_state;

  /* Function tables, one per intersection query type. */
  hiprtFuncTable table_closest_intersect;
  hiprtFuncTable table_shadow_intersect;
  hiprtFuncTable table_local_intersect;
  hiprtFuncTable table_volume_intersect;
};
|
|
|
|
/* Intersection_Function_Table_Index defines index values to retrieve custom intersection
 * functions from function table. */

enum Intersection_Function_Table_Index {
  // Triangles use the intersection function provided by HIP RT and don't need custom
  // intersection functions, so the first named entry starts at 1.

  // Custom intersection functions for closest intersect.
  Curve_Intersect_Function = 1,        // Custom intersection for curves.
  Motion_Triangle_Intersect_Function,  // Custom intersection for triangles with vertex motion
                                       // blur attributes.
  Point_Intersect_Function,            // Custom intersection for point cloud.

  // Custom intersection functions for shadow rendering are the same as the function for closest
  // intersect. However, the table indices are different.
  Triangle_Intersect_Shadow_None,
  Curve_Intersect_Shadow,
  Motion_Triangle_Intersect_Shadow,
  Point_Intersect_Shadow,

  // Custom intersection functions for subsurface scattering.
  // Only motion triangles have valid custom intersection function.
  Triangle_Intersect_Local_None,
  Curve_Intersect_Local_None,
  Motion_Triangle_Intersect_Local,
  Point_Intersect_Local_None,

  // Custom intersection functions for volume rendering.
  // Only motion triangles have valid custom intersection function.
  Triangle_Intersect_Volume_None,
  Curve_Intersect_Volume_None,
  Motion_Triangle_Intersect_Volume,
  Point_Intersect_Volume_None,
};
|
|
|
|
// Filter functions, filter hits, i.e. test whether a hit should be accepted or not, and whether
// traversal should stop or continue.
enum Filter_Function_Table_Index {
  Triangle_Filter_Closest = 0,         // Filter function for triangles for closest intersect, no
                                       // custom intersection function is needed.
  Curve_Filter_Opaque_None,            // No filter function is needed and everything is handled
                                       // in the intersection function.
  Motion_Triangle_Filter_Opaque_None,  // No filter function is needed and everything is handled
                                       // in the intersection function.
  Point_Filter_Opaque_Non,             // No filter function is needed. (Original, misspelled
                                       // name; kept so existing users keep compiling.)
  // Correctly spelled alias with the same value. The enumerator that follows still continues
  // from this value, so no index shifts.
  Point_Filter_Opaque_None = Point_Filter_Opaque_Non,

  // Filter function for all primitives for shadow intersection.
  // All primitives use the same function but each has a different index in the table.
  Triangle_Filter_Shadow,
  Curve_Filter_Shadow,
  Motion_Triangle_Filter_Shadow,
  Point_Filter_Shadow,

  // Filter functions for subsurface scattering. Triangles and motion triangles need function
  // assignment. The indices for triangles and motion triangles point to the same function.
  // Points and curves don't need any function since subsurface scattering is not applied on
  // either.
  Triangle_Filter_Local,    // Filter functions for triangles.
  Curve_Filter_Local_None,  // Subsurface scattering is not applied on curves, no filter function
                            // is needed.
  Motion_Triangle_Filter_Local,
  Point_Filter_Local_None,

  // Filter functions for volume rendering.
  // Volume rendering only applies to triangles and motion triangles.
  // Triangles and motion triangles use the same filter functions for volume rendering.
  Triangle_Filter_Volume,
  Curve_Filter_Volume_None,
  Motion_Triangle_Filter_Volume,
  Point_Filter_Volume_None,
};
|
|
|
|
#ifdef __KERNEL_GPU__
/* The single KernelParamsHIPRT instance that the kernel_data* abstraction macros read from. */
__constant__ KernelParamsHIPRT kernel_params;

/* Stack type aliases. These are only provided when HIPRT_SHARED_STACK is defined; there is no
 * alternative definition for the case where it is not. */
#  ifdef HIPRT_SHARED_STACK
typedef hiprtGlobalStack Stack;
typedef hiprtEmptyInstanceStack Instance_Stack;
#  endif

#endif
|
|
|
|
/* Abstraction macros: device-independent names for the members of `kernel_params`. */

/* The KernelData member. */
#define kernel_data kernel_params.data
/* Element `index` of the data array member `name`. */
#define kernel_data_fetch(name, index) kernel_params.name[(index)]
/* The data array member `name` itself (the pointer, not an element). */
#define kernel_data_array(name) (kernel_params.name)
/* The integrator state member. */
#define kernel_integrator_state kernel_params.integrator_state
|
|
|
|
CCL_NAMESPACE_END
|