This patch adds MetalRT support to Cycles kernel code. It is mostly additive in nature or confined to Metal-specific code, however there are a few areas where this interacts with other code: - MetalRT closely follows the Optix implementation, and in some cases (notably handling of transforms) it makes sense to extend Optix special-casing to MetalRT. For these generalisations we now have `__KERNEL_GPU_RAYTRACING__` instead of `__KERNEL_OPTIX__`. - MetalRT doesn't support primitive offsetting (as with `primitiveIndexOffset` in Optix), so we define and populate a new kernel texture, `__object_prim_offset`, containing per-object primitive / curve-segment offsets. This is referenced and applied in MetalRT intersection handlers. - Two new BVH layout enum values have been added: `BVH_LAYOUT_METAL` and `BVH_LAYOUT_MULTI_METAL_EMBREE` (for XPU mode). Some host-side enum case handling has been updated where it is trivial to do so. Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13353
136 lines
3.9 KiB
C++
136 lines
3.9 KiB
C++
/*
|
|
* Copyright 2019, NVIDIA Corporation.
|
|
* Copyright 2019, Blender Foundation.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#pragma once

/* Keep the OptiX header from pulling in the full CUDA headers; this file
 * supplies the small subset of CUDA definitions the kernels actually need. */
#define OPTIX_DONT_INCLUDE_CUDA
#include <optix.h>

/* Feature flags identifying this compilation as a GPU ray-tracing (OptiX)
 * build of the Cycles kernels. */
#define __KERNEL_GPU__
#define __KERNEL_GPU_RAYTRACING__
#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
#define __KERNEL_OPTIX__

/* Namespaces are not used in device code; make the markers no-ops. */
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END

#ifndef ATTR_FALLTHROUGH
# define ATTR_FALLTHROUGH
#endif
|
|
/* Manual definitions so we can compile without CUDA toolkit. */

#ifdef __CUDACC_RTC__
/* NVRTC compilation: no host headers available, so define the fixed-width
 * integer types by hand. */
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif

#ifdef CYCLES_CUBIN_CC
/* Standalone cubin build: spell out the float limits normally provided by
 * <float.h>. */
# define FLT_MIN 1.175494350822287507969e-38f
# define FLT_MAX 340282346638528859811704183484516925440.0f
# define FLT_EPSILON 1.192092896e-07F
#endif
|
|
/* Qualifier abstractions: map Cycles' portable ccl_* keywords onto their
 * CUDA equivalents for the OptiX backend. */

#define ccl_device \
  __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
#define ccl_device_inline ccl_device
#define ccl_device_forceinline ccl_device
#define ccl_device_inline_method ccl_device
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
/* Address-space qualifiers that have no CUDA counterpart expand to nothing. */
#define ccl_global
#define ccl_inline_constant __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)

/* Zero initialize structs to help the compiler figure out scoping */
#define ccl_optional_struct_init = {}
|
/* No assert supported for CUDA */

#define kernel_assert(cond)

/* GPU thread, block, grid size and index */

#define ccl_gpu_thread_idx_x (threadIdx.x)
#define ccl_gpu_block_dim_x (blockDim.x)
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
/* Bit mask with the low `thread_warp` bits set (one bit per lane). */
#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))

/* Flattened 1D global thread index and total 1D grid size. */
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
|
/* GPU warp synchronization.
 *
 * Thin wrappers over the CUDA block/warp synchronization primitives so that
 * shared kernel code stays backend-agnostic. The ballot and shuffle wrappers
 * hard-code a full-warp participation mask (0xFFFFFFFF). */

#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
/* Macro parameter renamed from the misspelled "detla"; purely internal, so
 * call sites are unaffected. */
#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
|
|
/* GPU texture objects */

/* Defined by hand (matching the driver API's 64-bit handle) so the CUDA
 * headers are not required. */
typedef unsigned long long CUtexObject;
typedef CUtexObject ccl_gpu_tex_object;
|
/* Sample a 2D texture object at (x, y), returning a value of type T.
 * Thin wrapper over the CUDA tex2D builtin so shared kernel code does not
 * reference CUDA intrinsics directly. */
template<typename T>
ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
                                                    const float x,
                                                    const float y)
{
  return tex2D<T>(texobj, x, y);
}
|
/* Sample a 3D texture object at (x, y, z), returning a value of type T.
 * Thin wrapper over the CUDA tex3D builtin, mirroring the 2D variant above. */
template<typename T>
ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
                                                    const float x,
                                                    const float y,
                                                    const float z)
{
  return tex3D<T>(texobj, x, y, z);
}
|
|
/* Half */

/* Storage-only half type: just the raw 16-bit pattern. Conversions are done
 * with the PTX cvt instructions below instead of the CUDA headers. */
typedef unsigned short half;

/* Convert a float to half precision using PTX cvt.rn.f16.f32
 * (.rn = round-to-nearest). */
__device__ half __float2half(const float f)
{
  half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
  return val;
}

/* Convert a half back to full float precision using PTX cvt.f32.f16
 * (exact; no rounding needed widening f16 -> f32). */
__device__ float __half2float(const half h)
{
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
  return val;
}
|
|
/* Types */
|
|
|
|
#include "util/half.h"
|
|
#include "util/types.h"
|