Cycles: oneAPI: Optimize texture access by using GPU HW sampler
The current usage of software-based texture operations in the oneAPI implementation puts additional register pressure on the GPU compiler during register allocation. And it also creates code that requires maintenance. This commit is intended to address this situation by utilizing a recently productized SYCL bindless texture API to enable HW-based texture operations using Intel GPUs' hardware sampler. This currently translates to 1-11% rendering speedups (scene-specific) on my Arc A770 and Arc B580. At the moment, there are small performance regressions with NanoVDB texture operations on Arc B580 and small performance regressions in shade surface MNEE and Raytrace kernels on Arc A770, but they look recoverable and will be handled in the future. Pull Request: https://projects.blender.org/blender/blender/pulls/133457
This commit is contained in:
committed by
Brecht Van Lommel
parent
a0b7ad436b
commit
2bab4ae370
@@ -20,7 +20,7 @@ buildbot:
|
||||
optix:
|
||||
version: '7.4.0'
|
||||
ocloc:
|
||||
version: '101.5972'
|
||||
version: '101.6557'
|
||||
cmake:
|
||||
default:
|
||||
version: any
|
||||
|
||||
@@ -1787,7 +1787,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
||||
elif device_type == 'ONEAPI':
|
||||
import sys
|
||||
if sys.platform.startswith("win"):
|
||||
driver_version = "XX.X.101.5730"
|
||||
driver_version = "XX.X.101.6557"
|
||||
col.label(text=rpt_("Requires Intel GPU with Xe-HPG architecture"), icon='BLANK1', translate=False)
|
||||
col.label(text=rpt_("and Windows driver version %s or newer") % driver_version,
|
||||
icon='BLANK1', translate=False)
|
||||
|
||||
@@ -57,9 +57,12 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
|
||||
kg_memory_size_(0)
|
||||
{
|
||||
/* Verify that base class types can be used with specific backend types */
|
||||
static_assert(sizeof(texMemObject) == sizeof(void *));
|
||||
static_assert(sizeof(arrayMemObject) == sizeof(void *));
|
||||
static_assert(sizeof(texMemObject) ==
|
||||
sizeof(sycl::ext::oneapi::experimental::sampled_image_handle));
|
||||
static_assert(sizeof(arrayMemObject) ==
|
||||
sizeof(sycl::ext::oneapi::experimental::image_mem_handle));
|
||||
|
||||
need_texture_info = false;
|
||||
use_hardware_raytracing = info.use_hardware_raytracing;
|
||||
|
||||
oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
|
||||
@@ -636,23 +639,188 @@ void OneapiDevice::global_free(device_memory &mem)
|
||||
}
|
||||
}
|
||||
|
||||
static sycl::ext::oneapi::experimental::image_descriptor image_desc(const device_texture &mem)
|
||||
{
|
||||
/* Image Texture Storage */
|
||||
sycl::image_channel_type channel_type;
|
||||
|
||||
switch (mem.data_type) {
|
||||
case TYPE_UCHAR:
|
||||
channel_type = sycl::image_channel_type::unorm_int8;
|
||||
break;
|
||||
case TYPE_UINT16:
|
||||
channel_type = sycl::image_channel_type::unorm_int16;
|
||||
break;
|
||||
case TYPE_FLOAT:
|
||||
channel_type = sycl::image_channel_type::fp32;
|
||||
break;
|
||||
case TYPE_HALF:
|
||||
channel_type = sycl::image_channel_type::fp16;
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
||||
sycl::ext::oneapi::experimental::image_descriptor param;
|
||||
param.width = mem.data_width;
|
||||
param.height = mem.data_height;
|
||||
param.depth = mem.data_depth == 1 ? 0 : mem.data_depth;
|
||||
param.num_channels = mem.data_elements;
|
||||
param.channel_type = channel_type;
|
||||
|
||||
param.verify();
|
||||
|
||||
return param;
|
||||
}
|
||||
|
||||
void OneapiDevice::tex_alloc(device_texture &mem)
|
||||
{
|
||||
generic_alloc(mem);
|
||||
generic_copy_to(mem);
|
||||
assert(device_queue_);
|
||||
|
||||
{
|
||||
/* Update texture info. */
|
||||
thread_scoped_lock lock(texture_info_mutex);
|
||||
const uint slot = mem.slot;
|
||||
if (slot >= texture_info.size()) {
|
||||
/* Allocate some slots in advance, to reduce amount of re-allocations. */
|
||||
texture_info.resize(slot + 128);
|
||||
size_t size = mem.memory_size();
|
||||
|
||||
sycl::addressing_mode address_mode = sycl::addressing_mode::none;
|
||||
switch (mem.info.extension) {
|
||||
case EXTENSION_REPEAT:
|
||||
address_mode = sycl::addressing_mode::repeat;
|
||||
break;
|
||||
case EXTENSION_EXTEND:
|
||||
address_mode = sycl::addressing_mode::clamp_to_edge;
|
||||
break;
|
||||
case EXTENSION_CLIP:
|
||||
address_mode = sycl::addressing_mode::clamp;
|
||||
break;
|
||||
case EXTENSION_MIRROR:
|
||||
address_mode = sycl::addressing_mode::mirrored_repeat;
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
|
||||
sycl::filtering_mode filter_mode;
|
||||
if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
|
||||
filter_mode = sycl::filtering_mode::nearest;
|
||||
}
|
||||
else {
|
||||
filter_mode = sycl::filtering_mode::linear;
|
||||
}
|
||||
|
||||
/* Image Texture Storage */
|
||||
sycl::image_channel_type channel_type;
|
||||
|
||||
switch (mem.data_type) {
|
||||
case TYPE_UCHAR:
|
||||
channel_type = sycl::image_channel_type::unorm_int8;
|
||||
break;
|
||||
case TYPE_UINT16:
|
||||
channel_type = sycl::image_channel_type::unorm_int16;
|
||||
break;
|
||||
case TYPE_FLOAT:
|
||||
channel_type = sycl::image_channel_type::fp32;
|
||||
break;
|
||||
case TYPE_HALF:
|
||||
channel_type = sycl::image_channel_type::fp16;
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
return;
|
||||
}
|
||||
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
|
||||
|
||||
try {
|
||||
Mem *cmem = nullptr;
|
||||
sycl::ext::oneapi::experimental::image_mem_handle memHandle{0};
|
||||
sycl::ext::oneapi::experimental::image_descriptor desc{};
|
||||
|
||||
if (mem.data_height > 0) {
|
||||
/* 2D/3D texture -- Tile optimized */
|
||||
size_t depth = mem.data_depth == 1 ? 0 : mem.data_depth;
|
||||
desc = sycl::ext::oneapi::experimental::image_descriptor(
|
||||
{mem.data_width, mem.data_height, depth}, mem.data_elements, channel_type);
|
||||
|
||||
VLOG_WORK << "Array 2D/3D allocate: " << mem.name << ", "
|
||||
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
||||
<< string_human_readable_size(mem.memory_size()) << ")";
|
||||
|
||||
sycl::ext::oneapi::experimental::image_mem_handle memHandle =
|
||||
sycl::ext::oneapi::experimental::alloc_image_mem(desc, *queue);
|
||||
|
||||
/* Copy data from host to the texture properly based on the texture description */
|
||||
queue->ext_oneapi_copy(mem.host_pointer, memHandle, desc);
|
||||
|
||||
mem.device_pointer = (device_ptr)memHandle.raw_handle;
|
||||
mem.device_size = size;
|
||||
stats.mem_alloc(size);
|
||||
|
||||
thread_scoped_lock lock(device_mem_map_mutex);
|
||||
cmem = &device_mem_map[&mem];
|
||||
cmem->texobject = 0;
|
||||
cmem->array = (arrayMemObject)(memHandle.raw_handle);
|
||||
}
|
||||
else {
|
||||
/* 1D texture -- Linear memory */
|
||||
desc = sycl::ext::oneapi::experimental::image_descriptor(
|
||||
{mem.data_width}, mem.data_elements, channel_type);
|
||||
cmem = generic_alloc(mem);
|
||||
if (!cmem) {
|
||||
return;
|
||||
}
|
||||
|
||||
queue->memcpy((void *)mem.device_pointer, mem.host_pointer, size);
|
||||
}
|
||||
|
||||
queue->wait_and_throw();
|
||||
|
||||
/* Set Mapping and tag that we need to (re-)upload to device */
|
||||
TextureInfo tex_info = mem.info;
|
||||
tex_info.data = (uint64_t)mem.device_pointer;
|
||||
texture_info[slot] = tex_info;
|
||||
need_texture_info = true;
|
||||
|
||||
sycl::ext::oneapi::experimental::bindless_image_sampler samp(
|
||||
address_mode, sycl::coordinate_normalization_mode::normalized, filter_mode);
|
||||
|
||||
if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
|
||||
mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 &&
|
||||
mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN &&
|
||||
mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16)
|
||||
{
|
||||
sycl::ext::oneapi::experimental::sampled_image_handle imgHandle;
|
||||
|
||||
if (memHandle.raw_handle) {
|
||||
/* Create 2D/3D texture handle */
|
||||
imgHandle = sycl::ext::oneapi::experimental::create_image(memHandle, samp, desc, *queue);
|
||||
}
|
||||
else {
|
||||
/* Create 1D texture */
|
||||
imgHandle = sycl::ext::oneapi::experimental::create_image(
|
||||
(void *)mem.device_pointer, 0, samp, desc, *queue);
|
||||
}
|
||||
|
||||
thread_scoped_lock lock(device_mem_map_mutex);
|
||||
cmem = &device_mem_map[&mem];
|
||||
cmem->texobject = (texMemObject)(imgHandle.raw_handle);
|
||||
|
||||
tex_info.data = (uint64_t)cmem->texobject;
|
||||
}
|
||||
else {
|
||||
tex_info.data = (uint64_t)mem.device_pointer;
|
||||
}
|
||||
|
||||
{
|
||||
/* Update texture info. */
|
||||
thread_scoped_lock lock(texture_info_mutex);
|
||||
const uint slot = mem.slot;
|
||||
if (slot >= texture_info.size()) {
|
||||
/* Allocate some slots in advance, to reduce amount of re-allocations. */
|
||||
texture_info.resize(slot + 128);
|
||||
}
|
||||
texture_info[slot] = tex_info;
|
||||
need_texture_info = true;
|
||||
}
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
set_error("oneAPI texture allocation error: got runtime exception \"" + string(e.what()) +
|
||||
"\"");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -662,15 +830,73 @@ void OneapiDevice::tex_copy_to(device_texture &mem)
|
||||
tex_alloc(mem);
|
||||
}
|
||||
else {
|
||||
generic_copy_to(mem);
|
||||
if (mem.data_height > 0) {
|
||||
/* 2D/3D texture -- Tile optimized */
|
||||
sycl::ext::oneapi::experimental::image_descriptor desc = image_desc(mem);
|
||||
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
|
||||
|
||||
try {
|
||||
/* Copy data from host to the texture properly based on the texture description */
|
||||
thread_scoped_lock lock(device_mem_map_mutex);
|
||||
const Mem &cmem = device_mem_map[&mem];
|
||||
sycl::ext::oneapi::experimental::image_mem_handle image_handle{
|
||||
(sycl::ext::oneapi::experimental::image_mem_handle::raw_handle_type)cmem.array};
|
||||
queue->ext_oneapi_copy(mem.host_pointer, image_handle, desc);
|
||||
|
||||
# ifdef WITH_CYCLES_DEBUG
|
||||
queue->wait_and_throw();
|
||||
# endif
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
set_error("oneAPI texture copy error: got runtime exception \"" + string(e.what()) + "\"");
|
||||
}
|
||||
}
|
||||
else {
|
||||
generic_copy_to(mem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void OneapiDevice::tex_free(device_texture &mem)
|
||||
{
|
||||
/* There is no texture memory in SYCL. */
|
||||
if (mem.device_pointer) {
|
||||
generic_free(mem);
|
||||
thread_scoped_lock lock(device_mem_map_mutex);
|
||||
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
|
||||
const Mem &cmem = device_mem_map[&mem];
|
||||
|
||||
sycl::queue *queue = reinterpret_cast<sycl::queue *>(device_queue_);
|
||||
|
||||
if (cmem.texobject) {
|
||||
/* Free bindless texture itself. */
|
||||
sycl::ext::oneapi::experimental::sampled_image_handle image(cmem.texobject);
|
||||
sycl::ext::oneapi::experimental::destroy_image_handle(image, *queue);
|
||||
}
|
||||
|
||||
if (cmem.array) {
|
||||
/* Free texture memory. */
|
||||
sycl::ext::oneapi::experimental::image_mem_handle imgHandle{
|
||||
(sycl::ext::oneapi::experimental::image_mem_handle::raw_handle_type)cmem.array};
|
||||
|
||||
try {
|
||||
/* We have allocated only standard textures, so we also dellocate only them. */
|
||||
sycl::ext::oneapi::experimental::free_image_mem(
|
||||
imgHandle, sycl::ext::oneapi::experimental::image_type::standard, *queue);
|
||||
}
|
||||
catch (sycl::exception const &e) {
|
||||
set_error("oneAPI texture deallocation error: got runtime exception \"" +
|
||||
string(e.what()) + "\"");
|
||||
}
|
||||
|
||||
stats.mem_free(mem.memory_size());
|
||||
mem.device_pointer = 0;
|
||||
mem.device_size = 0;
|
||||
device_mem_map.erase(device_mem_map.find(&mem));
|
||||
}
|
||||
else {
|
||||
lock.unlock();
|
||||
generic_free(mem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1061,11 +1287,11 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
|
||||
|
||||
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
|
||||
* since Windows driver 101.3268. */
|
||||
static const int lowest_supported_driver_version_win = 1015730;
|
||||
static const int lowest_supported_driver_version_win = 1016554;
|
||||
# ifdef _WIN32
|
||||
/* For Windows driver 101.5730, compute-runtime version is 29550.
|
||||
/* For Windows driver 101.6557, compute-runtime version is 31896.
|
||||
* This information is returned by `ocloc query OCL_DRIVER_VERSION`.*/
|
||||
static const int lowest_supported_driver_version_neo = 29550;
|
||||
static const int lowest_supported_driver_version_neo = 31896;
|
||||
# else
|
||||
static const int lowest_supported_driver_version_neo = 31740;
|
||||
# endif
|
||||
|
||||
@@ -134,7 +134,6 @@ set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS
|
||||
device/oneapi/context_intersect_begin.h
|
||||
device/oneapi/context_intersect_end.h
|
||||
device/oneapi/globals.h
|
||||
device/oneapi/image.h
|
||||
device/oneapi/kernel.h
|
||||
device/oneapi/kernel_templates.h
|
||||
device/cpu/bvh.h
|
||||
|
||||
@@ -236,3 +236,78 @@ ccl_device_forceinline int __float_as_int(const float x)
|
||||
/* Types */
|
||||
#include "util/half.h"
|
||||
#include "util/types.h"
|
||||
|
||||
static_assert(
|
||||
sizeof(sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type) ==
|
||||
sizeof(uint64_t));
|
||||
typedef uint64_t ccl_gpu_tex_object_2D;
|
||||
typedef uint64_t ccl_gpu_tex_object_3D;
|
||||
|
||||
template<typename T>
|
||||
ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object_2D texobj,
|
||||
const float x,
|
||||
const float y)
|
||||
{
|
||||
/* Generic implementation not possible due to limitation with SYCL bindless sampled images
|
||||
* not being able to read in a format, which is different from the supported data type of
|
||||
* the texture.
|
||||
* But looks it looks like this is not a problem at the moment. */
|
||||
static_assert(false);
|
||||
return T();
|
||||
}
|
||||
|
||||
template<>
|
||||
ccl_device_forceinline float ccl_gpu_tex_object_read_2D<float>(const ccl_gpu_tex_object_2D texobj,
|
||||
const float x,
|
||||
const float y)
|
||||
{
|
||||
sycl::ext::oneapi::experimental::sampled_image_handle image(
|
||||
(sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type)texobj);
|
||||
return sycl::ext::oneapi::experimental::sample_image<float>(image, sycl::float2{x, y});
|
||||
}
|
||||
|
||||
template<>
|
||||
ccl_device_forceinline float4 ccl_gpu_tex_object_read_2D<float4>(
|
||||
const ccl_gpu_tex_object_2D texobj, const float x, const float y)
|
||||
{
|
||||
sycl::ext::oneapi::experimental::sampled_image_handle image(
|
||||
(sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type)texobj);
|
||||
return sycl::ext::oneapi::experimental::sample_image<float4, sycl::vec<float, 4>>(
|
||||
image, sycl::float2{x, y});
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D texobj,
|
||||
const float x,
|
||||
const float y,
|
||||
const float z)
|
||||
{
|
||||
/* A generic implementation is not possible due to limitations with SYCL bindless sampled images
|
||||
* not being able to read in a format that is different from the supported data type of
|
||||
* the texture.
|
||||
* However, it looks like this is not a problem at the moment, but I am leaving a static
|
||||
* assert in order to easily detect if it becomes a problem in the future. */
|
||||
static_assert(false);
|
||||
return T();
|
||||
}
|
||||
|
||||
template<>
|
||||
ccl_device_forceinline float ccl_gpu_tex_object_read_3D<float>(const ccl_gpu_tex_object_3D texobj,
|
||||
const float x,
|
||||
const float y,
|
||||
const float z)
|
||||
{
|
||||
sycl::ext::oneapi::experimental::sampled_image_handle image(
|
||||
(sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type)texobj);
|
||||
return sycl::ext::oneapi::experimental::sample_image<float>(image, sycl::float3{x, y, z});
|
||||
}
|
||||
|
||||
template<>
|
||||
ccl_device_forceinline float4 ccl_gpu_tex_object_read_3D<float4>(
|
||||
const ccl_gpu_tex_object_3D texobj, const float x, const float y, const float z)
|
||||
{
|
||||
sycl::ext::oneapi::experimental::sampled_image_handle image(
|
||||
(sycl::ext::oneapi::experimental::sampled_image_handle::raw_image_handle_type)texobj);
|
||||
return sycl::ext::oneapi::experimental::sample_image<float4, sycl::vec<float, 4>>(
|
||||
image, sycl::float3{x, y, z});
|
||||
}
|
||||
|
||||
@@ -7,5 +7,5 @@
|
||||
/* clang-format off */
|
||||
struct ONEAPIKernelContext : public KernelGlobalsGPU {
|
||||
public:
|
||||
# include "kernel/device/oneapi/image.h"
|
||||
# include "kernel/device/gpu/image.h"
|
||||
/* clang-format on */
|
||||
|
||||
@@ -1,433 +0,0 @@
|
||||
/* SPDX-FileCopyrightText: 2021-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0 */
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* For oneAPI implementation we do manual lookup and interpolation. */
|
||||
/* TODO: share implementation with ../cpu/image.h. */
|
||||
|
||||
template<typename T> ccl_device_forceinline T tex_fetch(const TextureInfo &info, const int index)
|
||||
{
|
||||
return reinterpret_cast<ccl_global T *>(info.data)[index];
|
||||
}
|
||||
|
||||
ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
|
||||
{
|
||||
x %= width;
|
||||
if (x < 0) {
|
||||
x += width;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
ccl_device_inline int svm_image_texture_wrap_clamp(const int x, const int width)
|
||||
{
|
||||
return clamp(x, 0, width - 1);
|
||||
}
|
||||
|
||||
ccl_device_inline int svm_image_texture_wrap_mirror(const int x, const int width)
|
||||
{
|
||||
const int m = abs(x + (x < 0)) % (2 * width);
|
||||
if (m >= width) {
|
||||
return 2 * width - m - 1;
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
ccl_device_inline float4 svm_image_texture_read(const TextureInfo &info,
|
||||
const int x,
|
||||
int y,
|
||||
const int z)
|
||||
{
|
||||
const int data_offset = x + info.width * y + info.width * info.height * z;
|
||||
const int texture_type = info.data_type;
|
||||
|
||||
/* Float4 */
|
||||
if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
|
||||
return tex_fetch<float4>(info, data_offset);
|
||||
}
|
||||
/* Byte4 */
|
||||
if (texture_type == IMAGE_DATA_TYPE_BYTE4) {
|
||||
uchar4 r = tex_fetch<uchar4>(info, data_offset);
|
||||
float f = 1.0f / 255.0f;
|
||||
return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
|
||||
}
|
||||
/* Ushort4 */
|
||||
if (texture_type == IMAGE_DATA_TYPE_USHORT4) {
|
||||
ushort4 r = tex_fetch<ushort4>(info, data_offset);
|
||||
float f = 1.0f / 65535.f;
|
||||
return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
|
||||
}
|
||||
/* Float */
|
||||
if (texture_type == IMAGE_DATA_TYPE_FLOAT) {
|
||||
float f = tex_fetch<float>(info, data_offset);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
/* UShort */
|
||||
if (texture_type == IMAGE_DATA_TYPE_USHORT) {
|
||||
ushort r = tex_fetch<ushort>(info, data_offset);
|
||||
float f = r * (1.0f / 65535.0f);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
if (texture_type == IMAGE_DATA_TYPE_HALF) {
|
||||
float f = tex_fetch<half>(info, data_offset);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
if (texture_type == IMAGE_DATA_TYPE_HALF4) {
|
||||
half4 r = tex_fetch<half4>(info, data_offset);
|
||||
return make_float4(r.x, r.y, r.z, r.w);
|
||||
}
|
||||
/* Byte */
|
||||
uchar r = tex_fetch<uchar>(info, data_offset);
|
||||
float f = r * (1.0f / 255.0f);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
|
||||
ccl_device_inline float4 svm_image_texture_read_2d(const int id, int x, int y)
|
||||
{
|
||||
const TextureInfo &info = kernel_data_fetch(texture_info, id);
|
||||
|
||||
/* Wrap */
|
||||
if (info.extension == EXTENSION_REPEAT) {
|
||||
x = svm_image_texture_wrap_periodic(x, info.width);
|
||||
y = svm_image_texture_wrap_periodic(y, info.height);
|
||||
}
|
||||
else if (info.extension == EXTENSION_EXTEND) {
|
||||
x = svm_image_texture_wrap_clamp(x, info.width);
|
||||
y = svm_image_texture_wrap_clamp(y, info.height);
|
||||
}
|
||||
else if (info.extension == EXTENSION_MIRROR) {
|
||||
x = svm_image_texture_wrap_mirror(x, info.width);
|
||||
y = svm_image_texture_wrap_mirror(y, info.height);
|
||||
}
|
||||
else {
|
||||
if (x < 0 || x >= info.width || y < 0 || y >= info.height) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
return svm_image_texture_read(info, x, y, 0);
|
||||
}
|
||||
|
||||
ccl_device_inline float4 svm_image_texture_read_3d(const int id, int x, int y, int z)
|
||||
{
|
||||
const TextureInfo &info = kernel_data_fetch(texture_info, id);
|
||||
|
||||
/* Wrap */
|
||||
if (info.extension == EXTENSION_REPEAT) {
|
||||
x = svm_image_texture_wrap_periodic(x, info.width);
|
||||
y = svm_image_texture_wrap_periodic(y, info.height);
|
||||
z = svm_image_texture_wrap_periodic(z, info.depth);
|
||||
}
|
||||
else if (info.extension == EXTENSION_EXTEND) {
|
||||
x = svm_image_texture_wrap_clamp(x, info.width);
|
||||
y = svm_image_texture_wrap_clamp(y, info.height);
|
||||
z = svm_image_texture_wrap_clamp(z, info.depth);
|
||||
}
|
||||
else if (info.extension == EXTENSION_MIRROR) {
|
||||
x = svm_image_texture_wrap_mirror(x, info.width);
|
||||
y = svm_image_texture_wrap_mirror(y, info.height);
|
||||
z = svm_image_texture_wrap_mirror(z, info.depth);
|
||||
}
|
||||
else {
|
||||
if (x < 0 || x >= info.width || y < 0 || y >= info.height || z < 0 || z >= info.depth) {
|
||||
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
return svm_image_texture_read(info, x, y, z);
|
||||
}
|
||||
|
||||
static float svm_image_texture_frac(const float x, int *ix)
|
||||
{
|
||||
int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
|
||||
*ix = i;
|
||||
return x - (float)i;
|
||||
}
|
||||
|
||||
#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
|
||||
{ \
|
||||
u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
|
||||
u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
|
||||
u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
|
||||
u[3] = (1.0f / 6.0f) * t * t * t; \
|
||||
} \
|
||||
(void)0
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, const int id, float x, float y)
|
||||
{
|
||||
const TextureInfo &info = kernel_data_fetch(texture_info, id);
|
||||
|
||||
if (info.interpolation == INTERPOLATION_CLOSEST) {
|
||||
/* Closest interpolation. */
|
||||
int ix, iy;
|
||||
svm_image_texture_frac(x * info.width, &ix);
|
||||
svm_image_texture_frac(y * info.height, &iy);
|
||||
|
||||
return svm_image_texture_read_2d(id, ix, iy);
|
||||
}
|
||||
if (info.interpolation == INTERPOLATION_LINEAR) {
|
||||
/* Bilinear interpolation. */
|
||||
int ix, iy;
|
||||
float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix);
|
||||
float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy);
|
||||
|
||||
float4 r;
|
||||
r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy);
|
||||
r += (1.0f - ty) * tx * svm_image_texture_read_2d(id, ix + 1, iy);
|
||||
r += ty * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy + 1);
|
||||
r += ty * tx * svm_image_texture_read_2d(id, ix + 1, iy + 1);
|
||||
return r;
|
||||
}
|
||||
/* Bicubic interpolation. */
|
||||
int ix, iy;
|
||||
float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix);
|
||||
float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy);
|
||||
|
||||
float u[4], v[4];
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
|
||||
float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
for (int y = 0; y < 4; y++) {
|
||||
for (int x = 0; x < 4; x++) {
|
||||
float weight = u[x] * v[y];
|
||||
r += weight * svm_image_texture_read_2d(id, ix + x - 1, iy + y - 1);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef WITH_NANOVDB
|
||||
template<typename TexT, typename OutT> struct NanoVDBInterpolator {
|
||||
|
||||
static ccl_always_inline float read(const float r)
|
||||
{
|
||||
return r;
|
||||
}
|
||||
|
||||
static ccl_always_inline float4 read(const packed_float3 r)
|
||||
{
|
||||
return make_float4(r.x, r.y, r.z, 1.0f);
|
||||
}
|
||||
|
||||
template<typename Acc>
|
||||
static ccl_always_inline OutT
|
||||
interp_3d_closest(const Acc &acc, const float x, float y, const float z)
|
||||
{
|
||||
const nanovdb::Coord coord(int32_t(rintf(x)), int32_t(rintf(y)), int32_t(rintf(z)));
|
||||
return read(acc.getValue(coord));
|
||||
}
|
||||
|
||||
template<typename Acc>
|
||||
static ccl_always_inline OutT
|
||||
interp_3d_linear(const Acc &acc, const float x, float y, const float z)
|
||||
{
|
||||
int ix, iy, iz;
|
||||
const float tx = svm_image_texture_frac(x - 0.5f, &ix);
|
||||
const float ty = svm_image_texture_frac(y - 0.5f, &iy);
|
||||
const float tz = svm_image_texture_frac(z - 0.5f, &iz);
|
||||
|
||||
return mix(mix(mix(read(acc.getValue(nanovdb::Coord(ix, iy, iz))),
|
||||
read(acc.getValue(nanovdb::Coord(ix, iy, iz + 1))),
|
||||
tz),
|
||||
mix(read(acc.getValue(nanovdb::Coord(ix, iy + 1, iz + 1))),
|
||||
read(acc.getValue(nanovdb::Coord(ix, iy + 1, iz))),
|
||||
1.0f - tz),
|
||||
ty),
|
||||
mix(mix(read(acc.getValue(nanovdb::Coord(ix + 1, iy + 1, iz))),
|
||||
read(acc.getValue(nanovdb::Coord(ix + 1, iy + 1, iz + 1))),
|
||||
tz),
|
||||
mix(read(acc.getValue(nanovdb::Coord(ix + 1, iy, iz + 1))),
|
||||
read(acc.getValue(nanovdb::Coord(ix + 1, iy, iz))),
|
||||
1.0f - tz),
|
||||
1.0f - ty),
|
||||
tx);
|
||||
}
|
||||
|
||||
/* Tricubic b-spline interpolation. */
|
||||
template<typename Acc>
|
||||
static ccl_always_inline OutT
|
||||
interp_3d_cubic(const Acc &acc, const float x, float y, const float z)
|
||||
{
|
||||
int ix, iy, iz;
|
||||
int nix, niy, niz;
|
||||
int pix, piy, piz;
|
||||
int nnix, nniy, nniz;
|
||||
|
||||
/* A -0.5 offset is used to center the cubic samples around the sample point. */
|
||||
const float tx = svm_image_texture_frac(x - 0.5f, &ix);
|
||||
const float ty = svm_image_texture_frac(y - 0.5f, &iy);
|
||||
const float tz = svm_image_texture_frac(z - 0.5f, &iz);
|
||||
|
||||
pix = ix - 1;
|
||||
piy = iy - 1;
|
||||
piz = iz - 1;
|
||||
nix = ix + 1;
|
||||
niy = iy + 1;
|
||||
niz = iz + 1;
|
||||
nnix = ix + 2;
|
||||
nniy = iy + 2;
|
||||
nniz = iz + 2;
|
||||
|
||||
const int xc[4] = {pix, ix, nix, nnix};
|
||||
const int yc[4] = {piy, iy, niy, nniy};
|
||||
const int zc[4] = {piz, iz, niz, nniz};
|
||||
float u[4], v[4], w[4];
|
||||
|
||||
/* Some helper macros to keep code size reasonable.
|
||||
* Lets the compiler inline all the matrix multiplications.
|
||||
*/
|
||||
# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
|
||||
# define COL_TERM(col, row) \
|
||||
(v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
|
||||
u[3] * DATA(3, col, row)))
|
||||
# define ROW_TERM(row) \
|
||||
(w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
|
||||
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(w, tz);
|
||||
|
||||
/* Actual interpolation. */
|
||||
return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
|
||||
|
||||
# undef COL_TERM
|
||||
# undef ROW_TERM
|
||||
# undef DATA
|
||||
}
|
||||
|
||||
static ccl_always_inline OutT
|
||||
interp_3d(const TextureInfo &info, const float x, float y, const float z, const int interp)
|
||||
{
|
||||
using namespace nanovdb;
|
||||
|
||||
NanoGrid<TexT> *const grid = (NanoGrid<TexT> *)info.data;
|
||||
|
||||
switch (interp) {
|
||||
case INTERPOLATION_CLOSEST: {
|
||||
ReadAccessor<TexT> acc(grid->tree().root());
|
||||
return interp_3d_closest(acc, x, y, z);
|
||||
}
|
||||
case INTERPOLATION_LINEAR: {
|
||||
CachedReadAccessor<TexT> acc(grid->tree().root());
|
||||
return interp_3d_linear(acc, x, y, z);
|
||||
}
|
||||
default: {
|
||||
CachedReadAccessor<TexT> acc(grid->tree().root());
|
||||
return interp_3d_cubic(acc, x, y, z);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif /* WITH_NANOVDB */
|
||||
|
||||
ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
|
||||
const int id,
|
||||
float3 P,
|
||||
const int interp)
|
||||
{
|
||||
const TextureInfo &info = kernel_data_fetch(texture_info, id);
|
||||
|
||||
if (info.use_transform_3d) {
|
||||
Transform tfm = info.transform_3d;
|
||||
P = transform_point(&tfm, P);
|
||||
}
|
||||
|
||||
float x = P.x;
|
||||
float y = P.y;
|
||||
float z = P.z;
|
||||
|
||||
uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp;
|
||||
|
||||
#ifdef WITH_NANOVDB
|
||||
if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
|
||||
const float f = NanoVDBInterpolator<float, float>::interp_3d(info, x, y, z, interpolation);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
|
||||
return NanoVDBInterpolator<packed_float3, float4>::interp_3d(info, x, y, z, interpolation);
|
||||
}
|
||||
if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN) {
|
||||
const float f = NanoVDBInterpolator<nanovdb::FpN, float>::interp_3d(
|
||||
info, x, y, z, interpolation);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) {
|
||||
const float f = NanoVDBInterpolator<nanovdb::Fp16, float>::interp_3d(
|
||||
info, x, y, z, interpolation);
|
||||
return make_float4(f, f, f, 1.0f);
|
||||
}
|
||||
#else
|
||||
if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
|
||||
info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 ||
|
||||
info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN ||
|
||||
info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16)
|
||||
{
|
||||
return make_float4(
|
||||
TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
|
||||
}
|
||||
#endif
|
||||
else {
|
||||
x *= info.width;
|
||||
y *= info.height;
|
||||
z *= info.depth;
|
||||
}
|
||||
|
||||
if (interpolation == INTERPOLATION_CLOSEST) {
|
||||
/* Closest interpolation. */
|
||||
int ix, iy, iz;
|
||||
svm_image_texture_frac(x, &ix);
|
||||
svm_image_texture_frac(y, &iy);
|
||||
svm_image_texture_frac(z, &iz);
|
||||
|
||||
return svm_image_texture_read_3d(id, ix, iy, iz);
|
||||
}
|
||||
if (interpolation == INTERPOLATION_LINEAR) {
|
||||
/* Trilinear interpolation. */
|
||||
int ix, iy, iz;
|
||||
float tx = svm_image_texture_frac(x - 0.5f, &ix);
|
||||
float ty = svm_image_texture_frac(y - 0.5f, &iy);
|
||||
float tz = svm_image_texture_frac(z - 0.5f, &iz);
|
||||
|
||||
float4 r;
|
||||
r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz);
|
||||
r += (1.0f - tz) * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz);
|
||||
r += (1.0f - tz) * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz);
|
||||
r += (1.0f - tz) * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz);
|
||||
|
||||
r += tz * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz + 1);
|
||||
r += tz * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz + 1);
|
||||
r += tz * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz + 1);
|
||||
r += tz * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz + 1);
|
||||
return r;
|
||||
}
|
||||
/* Tri-cubic interpolation. */
|
||||
int ix, iy, iz;
|
||||
float tx = svm_image_texture_frac(x - 0.5f, &ix);
|
||||
float ty = svm_image_texture_frac(y - 0.5f, &iy);
|
||||
float tz = svm_image_texture_frac(z - 0.5f, &iz);
|
||||
|
||||
float u[4], v[4], w[4];
|
||||
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
|
||||
SET_CUBIC_SPLINE_WEIGHTS(w, tz);
|
||||
|
||||
float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
|
||||
for (int z = 0; z < 4; z++) {
|
||||
for (int y = 0; y < 4; y++) {
|
||||
for (int x = 0; x < 4; x++) {
|
||||
float weight = u[x] * v[y] * w[z];
|
||||
r += weight * svm_image_texture_read_3d(id, ix + x - 1, iy + y - 1, iz + z - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#undef SET_CUBIC_SPLINE_WEIGHTS
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
Reference in New Issue
Block a user