From 70892e82ac458905913fd8cd41a6d431a1f0a83a Mon Sep 17 00:00:00 2001 From: Xavier Hallade Date: Thu, 6 Apr 2023 10:51:49 +0200 Subject: [PATCH] Cycles: oneAPI: use specialization constant to compile with/without Embree on GPU --- intern/cycles/device/oneapi/device_impl.cpp | 16 +- intern/cycles/device/oneapi/device_impl.h | 2 +- intern/cycles/kernel/CMakeLists.txt | 2 +- intern/cycles/kernel/bvh/bvh.h | 155 +++++++++++----- intern/cycles/kernel/device/cpu/bvh.h | 2 - intern/cycles/kernel/device/gpu/kernel.h | 13 ++ intern/cycles/kernel/device/oneapi/kernel.cpp | 166 +++++++++++------- 7 files changed, 233 insertions(+), 123 deletions(-) diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp index 245193968d0..64ea7336072 100644 --- a/intern/cycles/device/oneapi/device_impl.cpp +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -120,18 +120,16 @@ bool OneapiDevice::check_peer_access(Device * /*peer_device*/) return false; } -bool OneapiDevice::can_use_hardware_raytracing_for_features(uint kernel_features) const +bool OneapiDevice::can_use_hardware_raytracing_for_features(uint requested_features) const { /* MNEE and Raytrace kernels currently don't work correctly with HWRT. */ - if ((kernel_features & KERNEL_FEATURE_MNEE || kernel_features & KERNEL_FEATURE_NODE_RAYTRACE)) { - return false; - } - return true; + return !(requested_features & (KERNEL_FEATURE_MNEE | KERNEL_FEATURE_NODE_RAYTRACE)); } -BVHLayoutMask OneapiDevice::get_bvh_layout_mask(uint kernel_features) const +BVHLayoutMask OneapiDevice::get_bvh_layout_mask(uint requested_features) const { - return (use_hardware_raytracing && can_use_hardware_raytracing_for_features(kernel_features)) ? + return (use_hardware_raytracing && + can_use_hardware_raytracing_for_features(requested_features)) ? BVH_LAYOUT_EMBREE : BVH_LAYOUT_BVH2; } @@ -175,8 +173,8 @@ bool OneapiDevice::load_kernels(const uint requested_features) } if (use_hardware_raytracing && !can_use_hardware_raytracing_for_features(requested_features)) { - VLOG_INFO << "Requested features don't work properly together with Hardware Raytracing yet " - "in oneAPI backend. Hardware Raytracing is now disabled."; + VLOG_INFO + << "Hardware ray tracing disabled, not supported yet by oneAPI for requested features."; use_hardware_raytracing = false; } diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h index 0076f121beb..ea6b6c040e6 100644 --- a/intern/cycles/device/oneapi/device_impl.h +++ b/intern/cycles/device/oneapi/device_impl.h @@ -118,7 +118,7 @@ class OneapiDevice : public Device { SyclQueue *sycl_queue(); protected: - bool can_use_hwrt_for_features(uint kernel_features) const; + bool can_use_hardware_raytracing_for_features(uint kernel_features) const; void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host); bool create_queue(SyclQueue *&external_queue, int device_index, void *embree_device); void free_queue(SyclQueue *queue); diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 57ab5beb030..f3c970fd8bf 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -779,7 +779,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI) # Host execution won't use GPU binaries, no need to compile them. if(WITH_CYCLES_ONEAPI_BINARIES AND NOT WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION) # AoT binaries aren't currently reused when calling sycl::build. - list(APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD) + list(APPEND sycl_compiler_flags -DWITH_CYCLES_ONEAPI_BINARIES) # Iterate over all targest and their options list(JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string) list(APPEND sycl_compiler_flags -fsycl-targets=${targets_string}) diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 29789a15b28..50ec52c6a9a 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -21,6 +21,28 @@ # define __BVH2__ #endif +#if defined(__KERNEL_ONEAPI__) && defined(WITH_EMBREE_GPU) +/* bool is apparently not tested for specialization constants: + * https://github.com/intel/llvm/blob/39d1c65272a786b2b13a6f094facfddf9408406d/sycl/test/basic_tests/SYCL-2020-spec-constants.cpp#L25-L27 + * Instead of adding one more bool specialization constant, we reuse existing embree_features one + * and use RTC_FEATURE_FLAG_NONE as value to test for avoiding to call Embree on GPU. + */ +/* We set it to RTC_FEATURE_FLAG_NONE by default so AoT binaries contain MNE and raytrace kernels + * precompiled without Embree. + * Changing this default value would require updating the logic in oneapi_load_kernels(). */ +static constexpr sycl::specialization_id oneapi_embree_features{ + RTC_FEATURE_FLAG_NONE}; +# define IF_USING_EMBREE \ + if (kernel_handler.get_specialization_constant() != \ + RTC_FEATURE_FLAG_NONE) +# define IF_NOT_USING_EMBREE \ + if (kernel_handler.get_specialization_constant() == \ + RTC_FEATURE_FLAG_NONE) +#else +# define IF_USING_EMBREE +# define IF_NOT_USING_EMBREE +#endif + CCL_NAMESPACE_BEGIN #ifdef __BVH2__ @@ -74,30 +96,39 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg, } # ifdef __EMBREE__ - if (kernel_data.device_bvh) { - return kernel_embree_intersect(kg, ray, visibility, isect); + IF_USING_EMBREE + { + if (kernel_data.device_bvh) { + return kernel_embree_intersect(kg, ray, visibility, isect); + } } # endif + IF_NOT_USING_EMBREE + { # ifdef __OBJECT_MOTION__ - if (kernel_data.bvh.have_motion) { + if (kernel_data.bvh.have_motion) { # ifdef __HAIR__ - if (kernel_data.bvh.have_curves) { - return bvh_intersect_hair_motion(kg, ray, isect, visibility); - } + if (kernel_data.bvh.have_curves) { + return bvh_intersect_hair_motion(kg, ray, isect, visibility); + } # endif /* __HAIR__ */ - return bvh_intersect_motion(kg, ray, isect, visibility); - } + return bvh_intersect_motion(kg, ray, isect, visibility); + } # endif /* __OBJECT_MOTION__ */ # ifdef __HAIR__ - if (kernel_data.bvh.have_curves) { - return bvh_intersect_hair(kg, ray, isect, visibility); - } + if (kernel_data.bvh.have_curves) { + return bvh_intersect_hair(kg, ray, isect, visibility); + } # endif /* __HAIR__ */ - return bvh_intersect(kg, ray, isect, visibility); + return bvh_intersect(kg, ray, isect, visibility); + } + + kernel_assert(false); + return false; } /* Single object BVH traversal, for SSS/AO/bevel. */ @@ -129,17 +160,27 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, } # ifdef __EMBREE__ - if (kernel_data.device_bvh) { - return kernel_embree_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits); + IF_USING_EMBREE + { + if (kernel_data.device_bvh) { + return kernel_embree_intersect_local( + kg, ray, local_isect, local_object, lcg_state, max_hits); + } } # endif + IF_NOT_USING_EMBREE + { # ifdef __OBJECT_MOTION__ - if (kernel_data.bvh.have_motion) { - return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits); - } + if (kernel_data.bvh.have_motion) { + return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits); + } # endif /* __OBJECT_MOTION__ */ - return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits); + return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits); + } + + kernel_assert(false); + return false; } # endif @@ -184,35 +225,44 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, } # ifdef __EMBREE__ - if (kernel_data.device_bvh) { - return kernel_embree_intersect_shadow_all( - kg, state, ray, visibility, max_hits, num_recorded_hits, throughput); + IF_USING_EMBREE + { + if (kernel_data.device_bvh) { + return kernel_embree_intersect_shadow_all( + kg, state, ray, visibility, max_hits, num_recorded_hits, throughput); + } } # endif + IF_NOT_USING_EMBREE + { # ifdef __OBJECT_MOTION__ - if (kernel_data.bvh.have_motion) { + if (kernel_data.bvh.have_motion) { # ifdef __HAIR__ - if (kernel_data.bvh.have_curves) { - return bvh_intersect_shadow_all_hair_motion( - kg, ray, state, visibility, max_hits, num_recorded_hits, throughput); - } + if (kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair_motion( + kg, ray, state, visibility, max_hits, num_recorded_hits, throughput); + } # endif /* __HAIR__ */ - return bvh_intersect_shadow_all_motion( - kg, ray, state, visibility, max_hits, num_recorded_hits, throughput); - } + return bvh_intersect_shadow_all_motion( + kg, ray, state, visibility, max_hits, num_recorded_hits, throughput); + } # endif /* __OBJECT_MOTION__ */ # ifdef __HAIR__ - if (kernel_data.bvh.have_curves) { - return bvh_intersect_shadow_all_hair( - kg, ray, state, visibility, max_hits, num_recorded_hits, throughput); - } + if (kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair( + kg, ray, state, visibility, max_hits, num_recorded_hits, throughput); + } # endif /* __HAIR__ */ - return bvh_intersect_shadow_all( - kg, ray, state, visibility, max_hits, num_recorded_hits, throughput); + return bvh_intersect_shadow_all( + kg, ray, state, visibility, max_hits, num_recorded_hits, throughput); + } + + kernel_assert(false); + return false; } # endif /* __SHADOW_RECORD_ALL__ */ @@ -239,13 +289,19 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg, return false; } + IF_NOT_USING_EMBREE + { # ifdef __OBJECT_MOTION__ - if (kernel_data.bvh.have_motion) { - return bvh_intersect_volume_motion(kg, ray, isect, visibility); - } + if (kernel_data.bvh.have_motion) { + return bvh_intersect_volume_motion(kg, ray, isect, visibility); + } # endif /* __OBJECT_MOTION__ */ - return bvh_intersect_volume(kg, ray, isect, visibility); + return bvh_intersect_volume(kg, ray, isect, visibility); + } + + kernel_assert(false); + return false; } # endif /* defined(__VOLUME__) && !defined(__VOLUME_RECORD_ALL__) */ @@ -275,18 +331,27 @@ ccl_device_intersect uint scene_intersect_volume(KernelGlobals kg, } # ifdef __EMBREE__ - if (kernel_data.device_bvh) { - return kernel_embree_intersect_volume(kg, ray, isect, max_hits, visibility); + IF_USING_EMBREE + { + if (kernel_data.device_bvh) { + return kernel_embree_intersect_volume(kg, ray, isect, max_hits, visibility); + } } # endif + IF_NOT_USING_EMBREE + { # ifdef __OBJECT_MOTION__ - if (kernel_data.bvh.have_motion) { - return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility); - } + if (kernel_data.bvh.have_motion) { + return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility); + } # endif /* __OBJECT_MOTION__ */ - return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility); + return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility); + } + + kernel_assert(false); + return false; } # endif /* defined(__VOLUME__) && defined(__VOLUME_RECORD_ALL__) */ diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h index 65aed44dbbe..8e2b0be3cdc 100644 --- a/intern/cycles/kernel/device/cpu/bvh.h +++ b/intern/cycles/kernel/device/cpu/bvh.h @@ -39,8 +39,6 @@ using numhit_t = uint32_t; #endif #ifdef __KERNEL_ONEAPI__ -static constexpr sycl::specialization_id oneapi_embree_features{ - (const RTCFeatureFlags)(0)}; # define CYCLES_EMBREE_USED_FEATURES \ (kernel_handler.get_specialization_constant()) #else diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 97f699cbe05..b4c84fd0f0b 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -191,6 +191,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel_postfix +#ifdef __KERNEL_ONEAPI__ +# include "kernel/device/oneapi/context_intersect_end.h" +#endif + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel_signature(integrator_shade_background, ccl_global const int *path_index_array, @@ -255,6 +259,12 @@ ccl_gpu_kernel_postfix constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]]; #endif +/* Kernels using intersections need access to the kernel handler for specialization constants to + * work properly. */ +#ifdef __KERNEL_ONEAPI__ +# include "kernel/device/oneapi/context_intersect_begin.h" +#endif + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel_signature(integrator_shade_surface_raytrace, ccl_global const int *path_index_array, @@ -293,6 +303,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } } ccl_gpu_kernel_postfix +#ifdef __KERNEL_ONEAPI__ +# include "kernel/device/oneapi/context_intersect_end.h" +#endif ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel_signature(integrator_shade_volume, diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index e2f4f3963f8..fa68237914a 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -155,54 +155,93 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue, return std::min(limit_work_group_size, preferred_work_group_size); } -bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features) +bool oneapi_kernel_is_required_for_features(const std::string &kernel_name, + const uint kernel_features) +{ + if ((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0 && + kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE)) != + std::string::npos) + return false; + if ((kernel_features & KERNEL_FEATURE_MNEE) == 0 && + kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE)) != + std::string::npos) + return false; + if ((kernel_features & KERNEL_FEATURE_VOLUME) == 0 && + kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK)) != + std::string::npos) + return false; + + return true; +} + +bool oneapi_kernel_is_using_embree(const std::string &kernel_name) +{ +# ifdef WITH_EMBREE_GPU + /* MNEE and Raytrace kernels aren't yet enabled to use Embree. */ + for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) { + DeviceKernel kernel = (DeviceKernel)i; + if (device_kernel_has_intersection(kernel)) { + if (kernel_name.find(device_kernel_as_string(kernel)) != std::string::npos) { + return !(kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE); + } + } + } +# endif + return false; +} + +bool oneapi_load_kernels(SyclQueue *queue_, + const uint kernel_features, + bool use_hardware_raytracing) { assert(queue_); sycl::queue *queue = reinterpret_cast(queue_); + # ifdef WITH_EMBREE_GPU - /* Preloading intersection kernels is mandatory with Embree on GPU execution, - * because AoT will be not fully performant. */ - try { - sycl::kernel_bundle all_kernels_bundle = - sycl::get_kernel_bundle(queue->get_context(), - {queue->get_device()}); + /* For best performance, we always JIT compile the kernels that are using Embree. */ + if (use_hardware_raytracing) { + try { + sycl::kernel_bundle all_kernels_bundle = + sycl::get_kernel_bundle(queue->get_context(), + {queue->get_device()}); - for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) { - const std::string &kernel_name = kernel_id.get_name(); + for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) { + const std::string &kernel_name = kernel_id.get_name(); - /* NOTE(@nsirgien): Names in this conditions below should match names from - * oneapi_call macro in oneapi_enqueue_kernel below */ - /* Also, here we handle only intersection kernels (and skip the rest) */ - if (kernel_name.find("_intersect_") == std::string::npos) { - continue; + if (!oneapi_kernel_is_required_for_features(kernel_name, kernel_features) || + !oneapi_kernel_is_using_embree(kernel_name)) { + continue; + } + + sycl::kernel_bundle one_kernel_bundle_input = + sycl::get_kernel_bundle(queue->get_context(), {kernel_id}); + + /* Hair requires embree curves support. */ + if (kernel_features & KERNEL_FEATURE_HAIR) { + one_kernel_bundle_input + .set_specialization_constant( + CYCLES_ONEAPI_EMBREE_ALL_FEATURES); + sycl::build(one_kernel_bundle_input); + } + else { + one_kernel_bundle_input + .set_specialization_constant( + CYCLES_ONEAPI_EMBREE_BASIC_FEATURES); + sycl::build(one_kernel_bundle_input); + } } - - if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) && - kernel_name.find("_intersect_volume") != std::string::npos) { - continue; + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); } - - sycl::kernel_bundle one_kernel_bundle = - sycl::get_kernel_bundle(queue->get_context(), {kernel_id}); - - one_kernel_bundle.set_specialization_constant( - CYCLES_ONEAPI_EMBREE_BASIC_FEATURES); - sycl::build(one_kernel_bundle); - - one_kernel_bundle.set_specialization_constant( - CYCLES_ONEAPI_EMBREE_ALL_FEATURES); - sycl::build(one_kernel_bundle); + return false; } } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } # endif -# ifdef SYCL_SKIP_KERNELS_PRELOAD +# ifdef WITH_CYCLES_ONEAPI_BINARIES (void)queue_; (void)kernel_features; # else @@ -214,27 +253,25 @@ bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features) for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) { const std::string &kernel_name = kernel_id.get_name(); - /* NOTE(@nsirgien): Names in this conditions below should match names from - * oneapi_call macro in oneapi_enqueue_kernel below */ - if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) && - kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) { + /* In case HWRT is on, compilation of kernels using Embree is already handled in previous + * block. */ + if (!oneapi_kernel_is_required_for_features(kernel_name, kernel_features) || + (use_hardware_raytracing && oneapi_kernel_is_using_embree(kernel_name))) { continue; } - if (((requested_features & KERNEL_FEATURE_MNEE) == 0) && - kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) { - continue; - } - - if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) && - kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") != - std::string::npos) { - continue; - } - - sycl::kernel_bundle one_kernel_bundle = + sycl::kernel_bundle one_kernel_bundle_input = sycl::get_kernel_bundle(queue->get_context(), {kernel_id}); - sycl::build(one_kernel_bundle); +# ifdef WITH_EMBREE_GPU + /* This is expected to be the default, we set it again to be sure. */ + if (one_kernel_bundle_input + .has_specialization_constant()) { + one_kernel_bundle_input + .set_specialization_constant( + RTC_FEATURE_FLAG_NONE); + } +# endif + sycl::build(one_kernel_bundle_input); } } catch (sycl::exception const &e) { @@ -303,23 +340,22 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, # pragma GCC diagnostic error "-Wswitch" # endif -# ifdef WITH_EMBREE_GPU - bool is_with_rthw_kernel = device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || - device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || - device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK; - const RTCFeatureFlags used_embree_features = (is_with_rthw_kernel && with_hwrt && - !with_curve_features) ? - CYCLES_ONEAPI_EMBREE_BASIC_FEATURES : - CYCLES_ONEAPI_EMBREE_ALL_FEATURES; -# endif - try { queue->submit([&](sycl::handler &cgh) { # ifdef WITH_EMBREE_GPU - if (is_with_rthw_kernel) + /* Spec says it has no effect if the called kernel doesn't support the below specialization + * constant but it can still trigger a recompilation, so we set it only if needed. */ + if (device_kernel_has_intersection(device_kernel)) { + const RTCFeatureFlags used_embree_features = !use_hardware_raytracing ? + RTC_FEATURE_FLAG_NONE : + !(kernel_features & KERNEL_FEATURE_HAIR) ? + CYCLES_ONEAPI_EMBREE_BASIC_FEATURES : + CYCLES_ONEAPI_EMBREE_ALL_FEATURES; cgh.set_specialization_constant( used_embree_features); + } +# else + (void)kernel_features; # endif switch (device_kernel) { case DEVICE_KERNEL_INTEGRATOR_RESET: {