Files
test/intern/cycles/kernel/film/write.h
Lukas Stockner 0dc4754da4 Cycles: Move OptiX OSL Camera kernel into its own PTX module
On the one hand, this improves initialization time since we don't need to
load/compile the full OSL module with all the shading logic if we're only
using a custom camera with SVM shading.

On the other hand, it also fixes a bug I noticed while preparing test scenes:
The AO and Bevel nodes don't work when using custom cameras with SVM on OptiX.

The issue there is that those two are handled by the SHADE_SURFACE_RAYTRACE
kernel, but since that one has intersection logic, we use the OptiX-specific
kernel even if OSL shading is disabled.
However, with the previous unified OSL module, this would mean loading
SHADE_SURFACE_RAYTRACE from kernel_osl.cu, which has `#undef __SVM__` and
therefore doesn't handle them correctly.

With this change, we'll use the kernels from kernel_shader_raytrace.cu in that
case, which do support SVM nodes just fine.

Disk usage of the new kernel_optix_osl_camera.ptx.zst file is 30KB, so this
also doesn't blow up the kernel disk size (and kernel_optix_osl.ptx.zst is
probably smaller by that amount now).

Since it seems that we can mix modules just fine, I suspect that we could
split the modules properly (intersection, SVM shading with raytracing,
OSL shading, OSL camera), instead of the current approach where modules
essentially correspond to feature set tiers and each includes the previous
one's kernels as well - but that's a separate refactor.

Pull Request: https://projects.blender.org/blender/blender/pulls/138021
2025-04-28 12:49:35 +02:00

135 lines
3.9 KiB
C

/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
#include "kernel/globals.h"
#include "kernel/integrator/state.h"
#include "kernel/util/colorspace.h"
#include "util/atomic.h"
/* When building for a GPU kernel, enable atomic accumulation: the film_write_pass_*() functions
 * in this file check __ATOMIC_PASS_WRITE__ and use atomic_add_and_fetch_float() instead of a
 * plain `+=` when it is defined. */
#ifdef __KERNEL_GPU__
# define __ATOMIC_PASS_WRITE__
#endif
CCL_NAMESPACE_BEGIN
/* Get pointer to the start of a pixel's data in the render buffer.
 *
 * The pixel index is read from the `path` part of the integrator state and multiplied by the
 * film's pass stride to obtain the offset (counted in floats) from `render_buffer`. */
ccl_device_forceinline ccl_global float *film_pass_pixel_render_buffer(
    KernelGlobals kg, ConstIntegratorState state, ccl_global float *ccl_restrict render_buffer)
{
  const uint32_t pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
  /* Widen the index to 64 bits before multiplying, so the product is computed as uint64_t. */
  const uint64_t pixel_offset = (uint64_t)pixel_index * kernel_data.film.pass_stride;
  return &render_buffer[pixel_offset];
}
/* Get pointer to the start of a pixel's data in the render buffer, for a shadow path.
 *
 * Same computation as for regular paths, except that the pixel index is read from the
 * `shadow_path` part of the integrator shadow state. */
ccl_device_forceinline ccl_global float *film_pass_pixel_render_buffer_shadow(
    KernelGlobals kg,
    ConstIntegratorShadowState state,
    ccl_global float *ccl_restrict render_buffer)
{
  const uint32_t pixel_index = INTEGRATOR_STATE(state, shadow_path, render_pixel_index);
  /* Widen the index to 64 bits before multiplying, so the product is computed as uint64_t. */
  const uint64_t pixel_offset = (uint64_t)pixel_index * kernel_data.film.pass_stride;
  return &render_buffer[pixel_offset];
}
/* Accumulate in passes. */
/* Add `value` to the float stored at `buffer`.
 *
 * With __ATOMIC_PASS_WRITE__ defined the addition goes through atomic_add_and_fetch_float(),
 * otherwise it is a plain in-place add. */
ccl_device_inline void film_write_pass_float(ccl_global float *ccl_restrict buffer,
                                             const float value)
{
#ifndef __ATOMIC_PASS_WRITE__
  buffer[0] += value;
#else
  atomic_add_and_fetch_float(buffer, value);
#endif
}
/* Add the x, y and z components of `value` to the three consecutive floats at `buffer`.
 *
 * With __ATOMIC_PASS_WRITE__ defined each component is added with its own
 * atomic_add_and_fetch_float() call, so the three updates are atomic individually rather than
 * as a group. Otherwise plain in-place adds are used. */
ccl_device_inline void film_write_pass_float3(ccl_global float *ccl_restrict buffer,
                                              const float3 value)
{
#ifndef __ATOMIC_PASS_WRITE__
  buffer[0] += value.x;
  buffer[1] += value.y;
  buffer[2] += value.z;
#else
  atomic_add_and_fetch_float(&buffer[0], value.x);
  atomic_add_and_fetch_float(&buffer[1], value.y);
  atomic_add_and_fetch_float(&buffer[2], value.z);
#endif
}
/* Accumulate a spectrum into a three-float pass: the value is converted with spectrum_to_rgb()
 * and the result is handed to film_write_pass_float3(). */
ccl_device_inline void film_write_pass_spectrum(ccl_global float *ccl_restrict buffer,
                                                Spectrum value)
{
  const float3 rgb = spectrum_to_rgb(value);
  film_write_pass_float3(buffer, rgb);
}
/* Add the x, y, z and w components of `value` to the four consecutive floats at `buffer`.
 *
 * With __ATOMIC_PASS_WRITE__ defined each component is added with its own
 * atomic_add_and_fetch_float() call, so the four updates are atomic individually rather than
 * as a group. Otherwise plain in-place adds are used. */
ccl_device_inline void film_write_pass_float4(ccl_global float *ccl_restrict buffer,
                                              const float4 value)
{
#ifndef __ATOMIC_PASS_WRITE__
  buffer[0] += value.x;
  buffer[1] += value.y;
  buffer[2] += value.z;
  buffer[3] += value.w;
#else
  atomic_add_and_fetch_float(&buffer[0], value.x);
  atomic_add_and_fetch_float(&buffer[1], value.y);
  atomic_add_and_fetch_float(&buffer[2], value.z);
  atomic_add_and_fetch_float(&buffer[3], value.w);
#endif
}
/* Overwrite for passes that only write on sample 0. This assumes only a single thread will write
* to this pixel and no atomics are needed. */
/* Store `value` into the float at `buffer`, replacing whatever was there. This is a plain
 * (non-atomic) store. */
ccl_device_inline void film_overwrite_pass_float(ccl_global float *ccl_restrict buffer,
                                                 const float value)
{
  buffer[0] = value;
}
/* Store the x, y and z components of `value` into the three consecutive floats at `buffer`,
 * replacing the previous contents. These are plain (non-atomic) stores. */
ccl_device_inline void film_overwrite_pass_float3(ccl_global float *ccl_restrict buffer,
                                                  const float3 value)
{
  *(buffer + 0) = value.x;
  *(buffer + 1) = value.y;
  *(buffer + 2) = value.z;
}
/* Read back from passes. */
/* Return the single float stored at `buffer`. */
ccl_device_inline float kernel_read_pass_float(const ccl_global float *ccl_restrict buffer)
{
  return buffer[0];
}
/* Return the three consecutive floats at `buffer` packed into a float3 (x, y, z order).
 *
 * The buffer is only read, so the pointer is const-qualified, matching
 * kernel_read_pass_float(). Callers holding a non-const pointer are unaffected. */
ccl_device_inline float3 kernel_read_pass_float3(const ccl_global float *ccl_restrict buffer)
{
  return make_float3(buffer[0], buffer[1], buffer[2]);
}
/* Return the four consecutive floats at `buffer` packed into a float4 (x, y, z, w order).
 *
 * The buffer is only read, so the pointer is const-qualified, matching
 * kernel_read_pass_float(). Callers holding a non-const pointer are unaffected. */
ccl_device_inline float4 kernel_read_pass_float4(const ccl_global float *ccl_restrict buffer)
{
  return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]);
}
CCL_NAMESPACE_END