test2/intern/cycles/kernel/device/oneapi/compat.h

/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2021-2022 Intel Corporation */

#pragma once

#define __KERNEL_GPU__
#define __KERNEL_ONEAPI__

#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END

#include <cstdint>

#ifndef __NODES_MAX_GROUP__
#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
#endif
#ifndef __NODES_FEATURES__
#  define __NODES_FEATURES__ NODE_FEATURE_ALL
#endif

/* This one does not have an abstraction.
 * It's used by other devices directly.
 */

#define __device__

/* Qualifier wrappers for different names on different devices */

#define ccl_device
#define ccl_global
#define ccl_always_inline __attribute__((always_inline))
#define ccl_device_inline inline
#define ccl_noinline __attribute__((noinline))
#define ccl_inline_constant const constexpr
#define ccl_static_constant const
#define ccl_device_forceinline __attribute__((always_inline))
#define ccl_device_noinline ccl_device ccl_noinline
#define ccl_device_noinline_cpu ccl_device
#define ccl_device_inline_method ccl_device
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
#define ccl_optional_struct_init
#define ccl_private
#define ATTR_FALLTHROUGH __attribute__((fallthrough))
#define ccl_constant const
#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
#define ccl_align(n) __attribute__((aligned(n)))
#define kernel_assert(cond)
#define ccl_may_alias

/* clang-format off */

/* kernel.h adapters */
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define ccl_gpu_kernel_threads(block_num_threads)

#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
#  define KG_ND_ITEMS \
  kg->nd_item_local_id_0 = item.get_local_id(0); \
  kg->nd_item_local_range_0 = item.get_local_range(0); \
  kg->nd_item_group_0 = item.get_group(0); \
  kg->nd_item_group_range_0 = item.get_group_range(0); \
  kg->nd_item_global_id_0 = item.get_global_id(0); \
  kg->nd_item_global_range_0 = item.get_global_range(0);
#else
# define KG_ND_ITEMS
#endif

#define ccl_gpu_kernel_signature(name, ...) \
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
                          size_t kernel_global_size, \
                          size_t kernel_local_size, \
                          sycl::handler &cgh, \
                          __VA_ARGS__) { \
      (kg); \
      cgh.parallel_for<class kernel_##name>( \
          sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
          [=](sycl::nd_item<1> item) { \
            KG_ND_ITEMS

#define ccl_gpu_kernel_postfix \
          }); \
    }

#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x

#define ccl_gpu_kernel_lambda(func, ...) \
  struct KernelLambda \
  { \
    KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {} \
    ccl_private const ONEAPIKernelContext *kg; \
    __VA_ARGS__; \
    int operator()(const int state) const { return (func); } \
  } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)

/* GPU thread, block, grid size and index */
#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
#  define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
#  define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
#  define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
#  define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))

#  define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
#  define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
#else
#  define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
#  define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
#  define ccl_gpu_block_idx_x (kg->nd_item_group_0)
#  define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))

#  define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
#  define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
#endif


/* GPU warp synchronization */

#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
#ifdef __SYCL_DEVICE_ONLY__
  #define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count())
#else
  #define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
#endif

/* Debug defines */
#if defined(__SYCL_DEVICE_ONLY__)
#  define CONSTANT __attribute__((opencl_constant))
#else
#  define CONSTANT
#endif

#define sycl_printf(format, ...) {               \
    static const CONSTANT char fmt[] = format;               \
    sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ );    \
  }

#define sycl_printf_(format) {               \
    static const CONSTANT char fmt[] = format;               \
    sycl::ext::oneapi::experimental::printf(fmt);                  \
  }

/* GPU texture objects */

/* clang-format on */

/* Types */
/* It's not possible to use sycl types like sycl::float3, sycl::int3, etc
 * because these types have different interfaces from blender version */

using uchar = unsigned char;
using sycl::half;

struct float3 {
  float x, y, z;
};

ccl_always_inline float3 make_float3(float x, float y, float z)
{
  return {x, y, z};
}
ccl_always_inline float3 make_float3(float x)
{
  return {x, x, x};
}

/* math functions */
#define fabsf(x) sycl::fabs((x))
#define copysignf(x, y) sycl::copysign((x), (y))
#define asinf(x) sycl::asin((x))
#define acosf(x) sycl::acos((x))
#define atanf(x) sycl::atan((x))
#define floorf(x) sycl::floor((x))
#define ceilf(x) sycl::ceil((x))
#define sinhf(x) sycl::sinh((x))
#define coshf(x) sycl::cosh((x))
#define tanhf(x) sycl::tanh((x))
#define hypotf(x, y) sycl::hypot((x), (y))
#define atan2f(x, y) sycl::atan2((x), (y))
#define fmaxf(x, y) sycl::fmax((x), (y))
#define fminf(x, y) sycl::fmin((x), (y))
#define fmodf(x, y) sycl::fmod((x), (y))
#define lgammaf(x) sycl::lgamma((x))

#define __forceinline __attribute__((always_inline))

/* Types */
#include "util/half.h"
#include "util/types.h"

/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they
 * include oneAPI headers, which transitively include math.h headers which will cause redefinitions
 * of the math defines because math.h also uses them and having them defined before math.h include
 * is actually UB. */
/* Use fast math functions - get them from sycl::native namespace for native math function
 * implementations */
#define cosf(x) sycl::native::cos(((float)(x)))
#define sinf(x) sycl::native::sin(((float)(x)))
#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y)))
#define tanf(x) sycl::native::tan(((float)(x)))
#define logf(x) sycl::native::log(((float)(x)))
#define expf(x) sycl::native::exp(((float)(x)))