Recently, performance with oneAPI has regressed due to some recent
changes in Blender itself. This commit resolves the regression
and also improves compilation time for the oneAPI backend's first
execution (or Blender compilation time in case of AoT).
The regression appeared after 5152c7c152 and is not related to the
changes themselves, but to the increase in kernel complexity introduced
with them. This commit marks some Blender functions as
noinline for the oneAPI backend, which helps the GPU compiler deal with
this complexity without any negative side effects on performance.
207 lines
7.1 KiB
C++
207 lines
7.1 KiB
C++
/* SPDX-License-Identifier: Apache-2.0
|
|
* Copyright 2021-2022 Intel Corporation */
|
|
|
|
#pragma once
|
|
|
|
/* Namespace markers: both expand to nothing in this header. */
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END

/* Build markers that the rest of the kernel code can test with #ifdef. */
#define __KERNEL_GPU__
#define __KERNEL_ONEAPI__
|
|
|
|
#include <cstdint>
|
|
|
|
/* Defaults for the node configuration macros. A value that is already
 * defined when this header is included is left untouched. */
#if !defined(__NODES_MAX_GROUP__)
#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
#endif

#if !defined(__NODES_FEATURES__)
#  define __NODES_FEATURES__ NODE_FEATURE_ALL
#endif
|
|
|
|
/* This one does not have an abstraction.
 * It's used by other devices directly. */
#define __device__

/* Qualifier wrappers for different names on different devices */

/* Inlining control, expressed through compiler attributes. */
#define ccl_always_inline __attribute__((always_inline))
#define ccl_noinline __attribute__((noinline))

/* Function qualifiers. `ccl_device` is empty here, so every wrapper built
 * on top of it only contributes the extra keyword or attribute shown. */
#define ccl_device
#define ccl_device_inline inline
#define ccl_device_forceinline __attribute__((always_inline))
#define ccl_device_noinline ccl_device ccl_noinline
#define ccl_device_noinline_cpu ccl_device
#define ccl_device_inline_method ccl_device

/* Address-space style qualifiers: empty, except `ccl_constant` which maps to `const`. */
#define ccl_global
#define ccl_private
#define ccl_constant const

/* Constant declarations. */
#define ccl_inline_constant const constexpr
#define ccl_static_constant const

/* Pointer aliasing and alignment. */
#define ccl_restrict __restrict__
#define ccl_may_alias
#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
#define ccl_align(n) __attribute__((aligned(n)))

/* Miscellaneous helpers. `kernel_assert` discards its condition entirely,
 * so the condition is never evaluated. */
#define ccl_loop_no_unroll
#define ccl_optional_struct_init
#define ATTR_FALLTHROUGH __attribute__((fallthrough))
#define kernel_assert(cond)
|
|
|
|
/* clang-format off */
|
|
|
|
/* kernel.h adapters */

/* Both launch-configuration hints take arguments but expand to nothing here. */
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define ccl_gpu_kernel_threads(block_num_threads)

/* KG_ND_ITEMS is expanded at the top of the kernel lambda opened by
 * ccl_gpu_kernel_signature. When WITH_ONEAPI_SYCL_HOST_ENABLED is defined it
 * copies the ids and ranges of `item` into fields of `kg`; otherwise it is empty. */
#if defined(WITH_ONEAPI_SYCL_HOST_ENABLED)
#  define KG_ND_ITEMS \
    kg->nd_item_local_id_0 = item.get_local_id(0); \
    kg->nd_item_local_range_0 = item.get_local_range(0); \
    kg->nd_item_group_0 = item.get_group(0); \
    kg->nd_item_group_range_0 = item.get_group_range(0); \
    kg->nd_item_global_id_0 = item.get_global_id(0); \
    kg->nd_item_global_range_0 = item.get_global_range(0);
#else
#  define KG_ND_ITEMS
#endif
|
|
|
|
/* Opens the definition of `oneapi_kernel_<name>`.
 *
 * The generated function takes the kernel globals, the global and local sizes
 * of a one-dimensional nd_range, the SYCL command group handler, and then the
 * kernel-specific parameters passed as the variadic arguments. It submits
 * `cgh.parallel_for` with a by-value capturing lambda; the text following the
 * macro invocation becomes the body of that lambda.
 *
 * Every use must be closed with `ccl_gpu_kernel_postfix`.
 *
 * `(void)(kg)` marks the parameter as intentionally unused so that the macro
 * does not leave a no-effect expression statement behind. */
#define ccl_gpu_kernel_signature(name, ...) \
  void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
                            size_t kernel_global_size, \
                            size_t kernel_local_size, \
                            sycl::handler &cgh, \
                            __VA_ARGS__) { \
    (void)(kg); \
    cgh.parallel_for<class kernel_##name>( \
        sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
        [=](sycl::nd_item<1> item) { \
          KG_ND_ITEMS

/* Closes the lambda, the `parallel_for` call and the function body opened by
 * `ccl_gpu_kernel_signature`. */
#define ccl_gpu_kernel_postfix \
        }); \
  }
|
|
|
|
/* Accesses `x` through `kg` after casting it to `ONEAPIKernelContext *`. */
#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x

/* Declares a local functor type `KernelLambda` and an instance named
 * `ccl_gpu_kernel_lambda_pass`, constructed from the enclosing `kg`.
 *
 * `func` is the expression returned by `operator()(const int state)`; it can
 * refer to `state`, to the stored `kg`, and to any members declared through
 * the variadic arguments, which are pasted into the struct body verbatim. */
#define ccl_gpu_kernel_lambda(func, ...) \
  struct KernelLambda \
  { \
    ccl_private const ONEAPIKernelContext *kg; \
    KernelLambda(const ONEAPIKernelContext *kg_in) : kg(kg_in) {} \
    __VA_ARGS__; \
    int operator()(const int state) const { return (func); } \
  } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
|
|
|
|
/* GPU thread, block, grid size and index */

/* Without WITH_ONEAPI_SYCL_HOST_ENABLED the ids and ranges are queried from
 * the current nd_item; with it they are read from the fields of `kg` that
 * KG_ND_ITEMS fills in. */
#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
#  define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
#  define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
#  define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
#  define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))

#  define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
#  define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
#else
#  define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
#  define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
#  define ccl_gpu_block_idx_x (kg->nd_item_group_0)
#  define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)

#  define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
#  define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
#endif

/* The sub-group helpers are the same in both configurations, so they are
 * defined once.
 *
 * `thread_warp` is parenthesized so that an argument such as `a + b` is
 * subtracted as a whole instead of expanding to `size - a + b`.
 * NOTE(review): the shift count is `ccl_gpu_warp_size - thread_warp`; a count
 * of 32 or more on the 32-bit literal is undefined, so callers presumably
 * keep it below that - confirm. */
#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - (thread_warp)))
|
|
|
|
|
|
/* GPU warp synchronization */

/* Barrier on the current nd_item; the "local" variant passes the local fence space. */
#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)

/* On the device pass this is the `count()` of the sub-group ballot of
 * `predicate`. Elsewhere it falls back to 1 or 0 for the single caller.
 *
 * The argument is parenthesized in both variants: without that, a predicate
 * containing `?:` or an assignment is re-associated with the fallback's own
 * conditional and yields a value other than 0 or 1. */
#ifdef __SYCL_DEVICE_ONLY__
#  define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), (predicate)).count())
#else
#  define ccl_gpu_ballot(predicate) ((predicate) ? 1 : 0)
#endif
|
|
|
|
/* Debug defines */

/* `CONSTANT` adds the opencl_constant attribute on the device pass and is
 * empty otherwise. */
#ifdef __SYCL_DEVICE_ONLY__
#  define CONSTANT __attribute__((opencl_constant))
#else
#  define CONSTANT
#endif

/* Prints through the oneAPI experimental printf. The format string is first
 * copied into a static `CONSTANT` array named `fmt`. At least one argument
 * must follow the format; use `sycl_printf_` when there is none.
 * NOTE(review): the expansion is a bare `{ ... }` block, so
 * `if (c) sycl_printf(...); else ...` does not compile - wrap such calls in
 * braces. */
#define sycl_printf(format, ...) { \
    static const CONSTANT char fmt[] = format; \
    sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \
  }

/* Same as `sycl_printf`, for a format string with no arguments. */
#define sycl_printf_(format) { \
    static const CONSTANT char fmt[] = format; \
    sycl::ext::oneapi::experimental::printf(fmt); \
  }
|
|
|
|
/* GPU texture objects */
|
|
|
|
/* clang-format on */
|
|
|
|
/* Types */
|
|
/* It's not possible to use sycl types like sycl::float3, sycl::int3, etc
|
|
* because these types have different interfaces from blender version */
|
|
|
|
using uchar = unsigned char;
|
|
using sycl::half;
|
|
|
|
struct float3 {
|
|
float x, y, z;
|
|
};
|
|
|
|
/* Builds a float3 from three explicit components.
 *
 * `inline` is spelled out next to the always-inline attribute because the
 * attribute alone does not make this an inline function. Without the keyword,
 * including this header from two translation units would define the function
 * twice and break the one-definition rule. */
inline ccl_always_inline float3 make_float3(float x, float y, float z)
{
  return {x, y, z};
}

/* Builds a float3 whose three components are all `x`. */
inline ccl_always_inline float3 make_float3(float x)
{
  return {x, x, x};
}
|
|
|
|
/* math functions */

/* Each C single-precision math name is redirected to the sycl:: function of
 * the corresponding unsuffixed name. Arguments are forwarded in the order
 * given, each wrapped in parentheses. */

/* One-argument functions. */
#define fabsf(v) sycl::fabs((v))
#define asinf(v) sycl::asin((v))
#define acosf(v) sycl::acos((v))
#define atanf(v) sycl::atan((v))
#define floorf(v) sycl::floor((v))
#define ceilf(v) sycl::ceil((v))
#define sinhf(v) sycl::sinh((v))
#define coshf(v) sycl::cosh((v))
#define tanhf(v) sycl::tanh((v))
#define lgammaf(v) sycl::lgamma((v))

/* Two-argument functions. */
#define copysignf(a, b) sycl::copysign((a), (b))
#define hypotf(a, b) sycl::hypot((a), (b))
#define atan2f(a, b) sycl::atan2((a), (b))
#define fmaxf(a, b) sycl::fmax((a), (b))
#define fminf(a, b) sycl::fmin((a), (b))
#define fmodf(a, b) sycl::fmod((a), (b))

#define __forceinline __attribute__((always_inline))
|
|
|
|
/* Types */
|
|
#include "util/half.h"
|
|
#include "util/types.h"
|
|
|
|
/* NOTE(@nsirgien): These macros must be declared after the types headers.
 * Those headers include oneAPI headers, which transitively include math.h.
 * math.h uses the same names, so defining the macros before it is included
 * would cause redefinitions of the math defines and is actually UB. */

/* Fast math: take the implementations from the sycl::native namespace. Every
 * argument is cast to float before the call.
 * NOTE(review): `powf` maps to `native::powr`, which the SYCL specification
 * describes for a base >= 0 - confirm that callers never pass a negative base. */
#define cosf(v) sycl::native::cos(((float)(v)))
#define sinf(v) sycl::native::sin(((float)(v)))
#define tanf(v) sycl::native::tan(((float)(v)))
#define logf(v) sycl::native::log(((float)(v)))
#define expf(v) sycl::native::exp(((float)(v)))
#define powf(base, exponent) sycl::native::powr(((float)(base)), ((float)(exponent)))
|