diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp
index 415e76186cd..bc8481789e6 100644
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -155,6 +155,8 @@ const char *device_kernel_as_string(DeviceKernel kernel)
       return "filter_color_preprocess";
     case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS:
       return "filter_color_postprocess";
+    case DEVICE_KERNEL_FILTER_COLOR_FLIP_Y:
+      return "filter_color_flip_y";
 
     /* Volume Scattering Probability Guiding. */
     case DEVICE_KERNEL_VOLUME_GUIDING_FILTER_X:
diff --git a/intern/cycles/integrator/denoiser_gpu.cpp b/intern/cycles/integrator/denoiser_gpu.cpp
index 49bb72b2262..d1cbef2aa80 100644
--- a/intern/cycles/integrator/denoiser_gpu.cpp
+++ b/intern/cycles/integrator/denoiser_gpu.cpp
@@ -157,7 +157,8 @@ bool DenoiserGPU::denoise_filter_guiding_preprocess(const DenoiseContext &contex
                                    &buffer_params.height,
                                    &context.num_samples);
 
-  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args) &&
+         denoise_filter_guiding_flip_y(context);
 }
 
 DenoiserGPU::DenoiseContext::DenoiseContext(Device *device, const DenoiseTask &task)
@@ -234,6 +235,10 @@ DenoiserGPU::DenoiseContext::DenoiseContext(Device *device, const DenoiseTask &t
 bool DenoiserGPU::denoise_filter_color_postprocess(const DenoiseContext &context,
                                                    const DenoisePass &pass)
 {
+  if (!denoise_filter_color_flip_y(context, pass)) {
+    return false;
+  }
+
   const BufferParams &buffer_params = context.buffer_params;
 
   const int work_size = buffer_params.width * buffer_params.height;
@@ -265,6 +270,13 @@ bool DenoiserGPU::denoise_filter_color_preprocess(const DenoiseContext &context,
     return true;
   }
 
+  /* NOTE(review): this flip is skipped when the early return above is taken, while the flip in
+   * denoise_filter_color_postprocess() is unconditional. Confirm such passes are not flipped
+   * only once. */
+  if (!denoise_filter_color_flip_y(context, pass)) {
+    return false;
+  }
+
   const BufferParams &buffer_params = context.buffer_params;
 
   const int work_size = buffer_params.width * buffer_params.height;
@@ -282,6 +294,95 @@ bool DenoiserGPU::denoise_filter_color_preprocess(const DenoiseContext &context,
   return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
 }
 
+/* Mirror the denoised color pass of the render buffer vertically, in place.
+ *
+ * Only enqueued for the OptiX denoiser when temporally stable denoising is disabled; in any other
+ * configuration this is a no-op which reports success. Enqueueing the same flip a second time
+ * restores the original row order. */
+bool DenoiserGPU::denoise_filter_color_flip_y(const DenoiseContext &context,
+                                              const DenoisePass &pass)
+{
+  if (context.denoise_params.type != DENOISER_OPTIX || context.denoise_params.temporally_stable) {
+    /* Flipping the image is used to improve result quality with the OptiX denoiser.
+     * It is not used with other denoisers or with temporally stable denoising, so just skip
+     * this step. */
+    return true;
+  }
+
+  const BufferParams &buffer_params = context.buffer_params;
+
+  if (buffer_params.height < 2) {
+    /* A single row is its own mirror image: nothing to swap, and no empty launch to enqueue. */
+    return true;
+  }
+
+  /* One work item per pixel in the first `height / 2` rows: each swaps its pixel with the
+   * vertically mirrored one. With an odd height the middle row stays in place. */
+  const int work_size = buffer_params.width * (buffer_params.height / 2);
+
+  const DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
+                                   &buffer_params.full_x,
+                                   &buffer_params.full_y,
+                                   &buffer_params.width,
+                                   &buffer_params.height,
+                                   &buffer_params.offset,
+                                   &buffer_params.stride,
+                                   &buffer_params.pass_stride,
+                                   &pass.denoised_offset);
+
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_COLOR_FLIP_Y, work_size, args);
+}
+
+/* Mirror the albedo and normal guiding passes vertically, in place.
+ *
+ * Guiding passes which are PASS_UNUSED are skipped. Enqueued under the same conditions as
+ * denoise_filter_color_flip_y(). */
+bool DenoiserGPU::denoise_filter_guiding_flip_y(const DenoiseContext &context)
+{
+  if (context.denoise_params.type != DENOISER_OPTIX || context.denoise_params.temporally_stable) {
+    /* Flipping the image is used to improve result quality with the OptiX denoiser.
+     * It is not used with other denoisers or with temporally stable denoising, so just skip
+     * this step. */
+    return true;
+  }
+
+  const BufferParams &buffer_params = context.buffer_params;
+
+  if (buffer_params.height < 2) {
+    /* A single row is its own mirror image: nothing to swap, and no empty launch to enqueue. */
+    return true;
+  }
+
+  /* Zero is passed for the kernel's `full_x`, `full_y` and `offset` arguments, so pixels are
+   * addressed from the start of the guiding buffer. */
+  const int guiding_offset = 0;
+
+  const int work_size = buffer_params.width * (buffer_params.height / 2);
+
+  const int guiding_passes[] = {context.guiding_params.pass_albedo,
+                                context.guiding_params.pass_normal};
+  for (const int guiding_pass : guiding_passes) {
+    if (guiding_pass == PASS_UNUSED) {
+      continue;
+    }
+
+    const DeviceKernelArguments args(&context.guiding_params.device_pointer,
+                                     &guiding_offset,
+                                     &guiding_offset,
+                                     &buffer_params.width,
+                                     &buffer_params.height,
+                                     &guiding_offset,
+                                     &context.guiding_params.stride,
+                                     &context.guiding_params.pass_stride,
+                                     &guiding_pass);
+
+    if (!denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_COLOR_FLIP_Y, work_size, args)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 bool DenoiserGPU::denoise_filter_guiding_set_fake_albedo(const DenoiseContext &context)
 {
   const BufferParams &buffer_params = context.buffer_params;
diff --git a/intern/cycles/integrator/denoiser_gpu.h b/intern/cycles/integrator/denoiser_gpu.h
index db52ea16d18..5bcf03057b0 100644
--- a/intern/cycles/integrator/denoiser_gpu.h
+++ b/intern/cycles/integrator/denoiser_gpu.h
@@ -67,6 +67,8 @@ class DenoiserGPU : public Denoiser {
    * denoiser result to the render buffer.
*/ bool denoise_filter_color_preprocess(const DenoiseContext &context, const DenoisePass &pass); bool denoise_filter_color_postprocess(const DenoiseContext &context, const DenoisePass &pass); + bool denoise_filter_color_flip_y(const DenoiseContext &context, const DenoisePass &pass); + bool denoise_filter_guiding_flip_y(const DenoiseContext &context); bool denoise_filter_guiding_set_fake_albedo(const DenoiseContext &context); /* Read guiding passes from the render buffers, preprocess them in a way which is expected by diff --git a/intern/cycles/integrator/denoiser_optix.cpp b/intern/cycles/integrator/denoiser_optix.cpp index b64e209cd28..bfe3883c427 100644 --- a/intern/cycles/integrator/denoiser_optix.cpp +++ b/intern/cycles/integrator/denoiser_optix.cpp @@ -14,194 +14,6 @@ CCL_NAMESPACE_BEGIN -# if OPTIX_ABI_VERSION >= 60 -using ::optixUtilDenoiserInvokeTiled; -# else -// A minimal copy of functionality `optix_denoiser_tiling.h` which allows to fix integer overflow -// issues without bumping SDK or driver requirement. -// -// The original code is Copyright NVIDIA Corporation, BSD-3-Clause. -static OptixResult optixUtilDenoiserSplitImage(const OptixImage2D &input, - const OptixImage2D &output, - unsigned int overlapWindowSizeInPixels, - unsigned int tileWidth, - unsigned int tileHeight, - std::vector &tiles) -{ - if (tileWidth == 0 || tileHeight == 0) { - return OPTIX_ERROR_INVALID_VALUE; - } - - unsigned int inPixelStride = optixUtilGetPixelStride(input); - unsigned int outPixelStride = optixUtilGetPixelStride(output); - - int inp_w = std::min(tileWidth + 2 * overlapWindowSizeInPixels, input.width); - int inp_h = std::min(tileHeight + 2 * overlapWindowSizeInPixels, input.height); - int inp_y = 0, copied_y = 0; - - do { - int inputOffsetY = inp_y == 0 ? 0 : - std::max((int)overlapWindowSizeInPixels, - inp_h - ((int)input.height - inp_y)); - int copy_y = inp_y == 0 ? 
std::min(input.height, tileHeight + overlapWindowSizeInPixels) : - std::min(tileHeight, input.height - copied_y); - - int inp_x = 0, copied_x = 0; - do { - int inputOffsetX = inp_x == 0 ? 0 : - std::max((int)overlapWindowSizeInPixels, - inp_w - ((int)input.width - inp_x)); - int copy_x = inp_x == 0 ? std::min(input.width, tileWidth + overlapWindowSizeInPixels) : - std::min(tileWidth, input.width - copied_x); - - OptixUtilDenoiserImageTile tile; - tile.input.data = input.data + (size_t)(inp_y - inputOffsetY) * input.rowStrideInBytes + - +(size_t)(inp_x - inputOffsetX) * inPixelStride; - tile.input.width = inp_w; - tile.input.height = inp_h; - tile.input.rowStrideInBytes = input.rowStrideInBytes; - tile.input.pixelStrideInBytes = input.pixelStrideInBytes; - tile.input.format = input.format; - - tile.output.data = output.data + (size_t)inp_y * output.rowStrideInBytes + - (size_t)inp_x * outPixelStride; - tile.output.width = copy_x; - tile.output.height = copy_y; - tile.output.rowStrideInBytes = output.rowStrideInBytes; - tile.output.pixelStrideInBytes = output.pixelStrideInBytes; - tile.output.format = output.format; - - tile.inputOffsetX = inputOffsetX; - tile.inputOffsetY = inputOffsetY; - tiles.push_back(tile); - - inp_x += inp_x == 0 ? tileWidth + overlapWindowSizeInPixels : tileWidth; - copied_x += copy_x; - } while (inp_x < static_cast(input.width)); - - inp_y += inp_y == 0 ? 
tileHeight + overlapWindowSizeInPixels : tileHeight; - copied_y += copy_y; - } while (inp_y < static_cast(input.height)); - - return OPTIX_SUCCESS; -} - -static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser, - CUstream stream, - const OptixDenoiserParams *params, - CUdeviceptr denoiserState, - const size_t denoiserStateSizeInBytes, - const OptixDenoiserGuideLayer *guideLayer, - const OptixDenoiserLayer *layers, - unsigned int numLayers, - CUdeviceptr scratch, - const size_t scratchSizeInBytes, - unsigned int overlapWindowSizeInPixels, - unsigned int tileWidth, - unsigned int tileHeight) -{ - if (!guideLayer || !layers) { - return OPTIX_ERROR_INVALID_VALUE; - } - - std::vector> tiles(numLayers); - std::vector> prevTiles(numLayers); - for (unsigned int l = 0; l < numLayers; l++) { - if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(layers[l].input, - layers[l].output, - overlapWindowSizeInPixels, - tileWidth, - tileHeight, - tiles[l])) - return res; - - if (layers[l].previousOutput.data) { - OptixImage2D dummyOutput = layers[l].previousOutput; - if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(layers[l].previousOutput, - dummyOutput, - overlapWindowSizeInPixels, - tileWidth, - tileHeight, - prevTiles[l])) - return res; - } - } - - std::vector albedoTiles; - if (guideLayer->albedo.data) { - OptixImage2D dummyOutput = guideLayer->albedo; - if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(guideLayer->albedo, - dummyOutput, - overlapWindowSizeInPixels, - tileWidth, - tileHeight, - albedoTiles)) - return res; - } - - std::vector normalTiles; - if (guideLayer->normal.data) { - OptixImage2D dummyOutput = guideLayer->normal; - if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(guideLayer->normal, - dummyOutput, - overlapWindowSizeInPixels, - tileWidth, - tileHeight, - normalTiles)) - return res; - } - std::vector flowTiles; - if (guideLayer->flow.data) { - OptixImage2D dummyOutput = guideLayer->flow; - if (const 
OptixResult res = ccl::optixUtilDenoiserSplitImage(guideLayer->flow, - dummyOutput, - overlapWindowSizeInPixels, - tileWidth, - tileHeight, - flowTiles)) - return res; - } - - for (size_t t = 0; t < tiles[0].size(); t++) { - std::vector tlayers; - for (unsigned int l = 0; l < numLayers; l++) { - OptixDenoiserLayer layer = {}; - layer.input = (tiles[l])[t].input; - layer.output = (tiles[l])[t].output; - if (layers[l].previousOutput.data) { - layer.previousOutput = (prevTiles[l])[t].input; - } - tlayers.push_back(layer); - } - - OptixDenoiserGuideLayer gl = {}; - if (guideLayer->albedo.data) { - gl.albedo = albedoTiles[t].input; - } - if (guideLayer->normal.data) { - gl.normal = normalTiles[t].input; - } - if (guideLayer->flow.data) { - gl.flow = flowTiles[t].input; - } - if (const OptixResult res = optixDenoiserInvoke(denoiser, - stream, - params, - denoiserState, - denoiserStateSizeInBytes, - &gl, - &tlayers[0], - numLayers, - (tiles[0])[t].inputOffsetX, - (tiles[0])[t].inputOffsetY, - scratch, - scratchSizeInBytes)) - return res; - } - return OPTIX_SUCCESS; -} -# endif - OptiXDenoiser::OptiXDenoiser(Device *denoiser_device, const DenoiseParams ¶ms) : DenoiserGPU(denoiser_device, params), state_(denoiser_device, "__denoiser_state", true) { @@ -424,7 +236,7 @@ bool OptiXDenoiser::denoise_run(const DenoiseContext &context, const DenoisePass OptixDenoiserParams params = {}; /* All parameters are disabled/zero. 
*/ optix_device_assert(denoiser_device_, - ccl::optixUtilDenoiserInvokeTiled( + optixUtilDenoiserInvokeTiled( optix_denoiser_, static_cast(denoiser_queue_.get())->stream(), ¶ms, diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 356dcf639c5..f237258925d 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -1055,12 +1055,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (guiding_pass_albedo != PASS_UNUSED) { kernel_assert(render_pass_denoising_albedo != PASS_UNUSED); - const ccl_global float *aledo_in = buffer + render_pass_denoising_albedo; + const ccl_global float *albedo_in = buffer + render_pass_denoising_albedo; ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo; - albedo_out[0] = aledo_in[0] * pixel_scale; - albedo_out[1] = aledo_in[1] * pixel_scale; - albedo_out[2] = aledo_in[2] * pixel_scale; + albedo_out[0] = albedo_in[0] * pixel_scale; + albedo_out[1] = albedo_in[1] * pixel_scale; + albedo_out[2] = albedo_in[2] * pixel_scale; } /* Normal pass. 
*/
@@ -1177,6 +1177,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 ccl_gpu_kernel_postfix
 
+/* Mirror one 3-channel pass vertically, in place.
+ *
+ * Each work item in the first `height / 2` rows swaps its pixel with the pixel in the mirrored row
+ * of the same column. With an odd height the middle row is left untouched. */
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+    ccl_gpu_kernel_signature(filter_color_flip_y,
+                             ccl_global float *render_buffer,
+                             const int full_x,
+                             const int full_y,
+                             const int width,
+                             const int height,
+                             const int offset,
+                             const int stride,
+                             const int pass_stride,
+                             const int pass_denoised)
+{
+  const int work_index = ccl_gpu_global_id_x();
+  const int y = work_index / width;
+  const int x = work_index - y * width;
+
+  if (x >= width || y >= height / 2) {
+    return;
+  }
+
+  const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride + pass_denoised;
+
+  /* Distance in floats to the same column of the mirrored row. Computed in 64 bits, like the
+   * pixel offset above, so that it does not overflow `int` for large buffers. */
+  const uint64_t flipped_offset = uint64_t(height - 1 - y * 2) * stride * pass_stride;
+  ccl_global float *buffer_flipped = buffer + flipped_offset;
+
+  float3 temp;
+  temp.x = buffer[0];
+  temp.y = buffer[1];
+  temp.z = buffer[2];
+  buffer[0] = buffer_flipped[0];
+  buffer[1] = buffer_flipped[1];
+  buffer[2] = buffer_flipped[2];
+  buffer_flipped[0] = temp.x;
+  buffer_flipped[1] = temp.y;
+  buffer_flipped[2] = temp.z;
+}
+ccl_gpu_kernel_postfix
+
 /* --------------------------------------------------------------------
  * Shadow catcher.
*/ diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index f094ecd35a5..fbf2d0940c3 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -673,6 +673,10 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_postprocess); break; } + case DEVICE_KERNEL_FILTER_COLOR_FLIP_Y: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_flip_y); + break; + } case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: { oneapi_call( kg, cgh, global_size, local_size, args, oneapi_kernel_cryptomatte_postprocess); diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index 0672414d6f5..e2aa26e90e5 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -1904,6 +1904,7 @@ enum DeviceKernel : int { DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, + DEVICE_KERNEL_FILTER_COLOR_FLIP_Y, DEVICE_KERNEL_VOLUME_GUIDING_FILTER_X, DEVICE_KERNEL_VOLUME_GUIDING_FILTER_Y,