Compositor: Optimize pixel access for image inputs

This patch adds compile-time optimizations where the operation inputs
are guaranteed to be non-single values. Pixel load methods now take an
optional template parameter CouldBeSingleValue, which is false by default.
If the input is not guaranteed to be non-single — that is, if it could be
a single value — the parameter needs to be set to true.

Gives up to 3x improvement in some nodes.
This commit is contained in:
Omar Emara
2024-12-17 08:58:13 +02:00
parent d1c9637470
commit 9ed0ce44e4
18 changed files with 179 additions and 76 deletions

View File

@@ -389,23 +389,25 @@ class Result {
/* Returns a reference to the allocate integer data. */
int *integer_texture() const;
/* Gets the single value stored in the result. Asserts if the template type doesn't match the
* result's type. */
/* Gets the single value stored in the result. Assumes the result stores a value of the given
* template type. */
template<typename T> T get_single_value() const;
/* Loads the pixel at the given texel coordinates. If the result is a single value result, then
* that single value is returned for all texel coordinates. Asserts if the template type doesn't
* match the result's type. */
template<typename T> T load_pixel(const int2 &texel) const;
/* Loads the pixel at the given texel coordinates. Assumes the result stores a value of the given
* template type. If the CouldBeSingleValue template argument is true and the result is a single
* value result, then that single value is returned for all texel coordinates. */
template<typename T, bool CouldBeSingleValue = false> T load_pixel(const int2 &texel) const;
/* Identical to load_pixel but with extended boundary condition. */
template<typename T> T load_pixel_extended(const int2 &texel) const;
template<typename T, bool CouldBeSingleValue = false>
T load_pixel_extended(const int2 &texel) const;
/* Identical to load_pixel but with a fallback value for out of bound access. */
template<typename T> T load_pixel_fallback(const int2 &texel, const T &fallback) const;
template<typename T, bool CouldBeSingleValue = false>
T load_pixel_fallback(const int2 &texel, const T &fallback) const;
/* Identical to load_pixel but with zero boundary condition. */
template<typename T> T load_pixel_zero(const int2 &texel) const;
template<typename T, bool CouldBeSingleValue = false> T load_pixel_zero(const int2 &texel) const;
/* Similar to load_pixel, but can load a result whose type is not known at compile time. If the
* number of channels in the result are less than 4, then the rest of the returned float4 will
@@ -413,8 +415,8 @@ class Result {
* texelFetch function in GLSL works. */
float4 load_pixel_generic_type(const int2 &texel) const;
/* Stores the given pixel value in the pixel at the given texel coordinates. Asserts if the
* template type doesn't match the result's type. */
/* Stores the given pixel value in the pixel at the given texel coordinates. Assumes the result
* stores a value of the given template type. */
template<typename T> void store_pixel(const int2 &texel, const T &pixel_value);
/* Similar to store_pixel, but can write to a result whose types is not known at compile time.
@@ -564,10 +566,15 @@ template<typename T> inline T Result::get_single_value() const
}
}
template<typename T> inline T Result::load_pixel(const int2 &texel) const
template<typename T, bool CouldBeSingleValue> inline T Result::load_pixel(const int2 &texel) const
{
if (is_single_value_) {
return this->get_single_value<T>();
if constexpr (CouldBeSingleValue) {
if (is_single_value_) {
return this->get_single_value<T>();
}
}
else {
BLI_assert(!this->is_single_value());
}
if constexpr (std::is_scalar_v<T>) {
@@ -578,10 +585,16 @@ template<typename T> inline T Result::load_pixel(const int2 &texel) const
}
}
template<typename T> inline T Result::load_pixel_extended(const int2 &texel) const
template<typename T, bool CouldBeSingleValue>
inline T Result::load_pixel_extended(const int2 &texel) const
{
if (is_single_value_) {
return this->get_single_value<T>();
if constexpr (CouldBeSingleValue) {
if (is_single_value_) {
return this->get_single_value<T>();
}
}
else {
BLI_assert(!this->is_single_value());
}
const int2 clamped_texel = math::clamp(texel, int2(0), domain_.size - int2(1));
@@ -593,11 +606,16 @@ template<typename T> inline T Result::load_pixel_extended(const int2 &texel) con
}
}
template<typename T>
template<typename T, bool CouldBeSingleValue>
inline T Result::load_pixel_fallback(const int2 &texel, const T &fallback) const
{
if (is_single_value_) {
return this->get_single_value<T>();
if constexpr (CouldBeSingleValue) {
if (is_single_value_) {
return this->get_single_value<T>();
}
}
else {
BLI_assert(!this->is_single_value());
}
if (texel.x < 0 || texel.y < 0 || texel.x >= domain_.size.x || texel.y >= domain_.size.y) {
@@ -612,9 +630,10 @@ inline T Result::load_pixel_fallback(const int2 &texel, const T &fallback) const
}
}
template<typename T> inline T Result::load_pixel_zero(const int2 &texel) const
template<typename T, bool CouldBeSingleValue>
inline T Result::load_pixel_zero(const int2 &texel) const
{
return this->load_pixel_fallback(texel, T(0));
return this->load_pixel_fallback<T, CouldBeSingleValue>(texel, T(0));
}
inline float4 Result::load_pixel_generic_type(const int2 &texel) const

View File

@@ -13,6 +13,7 @@
#include "GPU_shader.hh"
#include "COM_algorithm_symmetric_separable_blur.hh"
#include "COM_node_operation.hh"
#include "COM_utilities.hh"
@@ -68,6 +69,18 @@ class BilateralBlurOperation : public NodeOperation {
return;
}
/* If the determinator is a single value, then the node essentially becomes a box blur. */
const Result &determinator_image = get_input("Determinator");
if (determinator_image.is_single_value()) {
Result &output_image = get_result("Image");
symmetric_separable_blur(this->context(),
input_image,
output_image,
float2(this->get_blur_radius()),
R_FILTER_BOX);
return;
}
if (this->context().use_gpu()) {
this->execute_gpu();
}

View File

@@ -159,7 +159,7 @@ class BokehBlurOperation : public NodeOperation {
parallel_for(domain.size, [&](const int2 texel) {
/* The mask input is treated as a boolean. If it is zero, then no blurring happens for this
* pixel. Otherwise, the pixel is blurred normally and the mask value is irrelevant. */
float mask = mask_image.load_pixel<float>(texel);
float mask = mask_image.load_pixel<float, true>(texel);
if (mask == 0.0f) {
output.store_pixel(texel, input.load_pixel<float4>(texel));
return;
@@ -276,7 +276,7 @@ class BokehBlurOperation : public NodeOperation {
parallel_for(domain.size, [&](const int2 texel) {
/* The mask input is treated as a boolean. If it is zero, then no blurring happens for this
* pixel. Otherwise, the pixel is blurred normally and the mask value is irrelevant. */
float mask = mask_image.load_pixel<float>(texel);
float mask = mask_image.load_pixel<float, true>(texel);
if (mask == 0.0f) {
output.store_pixel(texel, input.load_pixel<float4>(texel));
return;

View File

@@ -99,8 +99,8 @@ static void box_mask(const Result &base_mask,
uv = float2x2(float2(cos_angle, -sin_angle), float2(sin_angle, cos_angle)) * uv;
bool is_inside = math::abs(uv.x) < size.x && math::abs(uv.y) < size.y;
float base_mask_value = base_mask.load_pixel<float>(texel);
float value = value_mask.load_pixel<float>(texel);
float base_mask_value = base_mask.load_pixel<float, true>(texel);
float value = value_mask.load_pixel<float, true>(texel);
float output_mask_value = 0.0f;
if constexpr (MaskType == CMP_NODE_MASKTYPE_ADD) {

View File

@@ -135,7 +135,8 @@ class CompositeOperation : public NodeOperation {
if (output_texel.x > bounds.max.x || output_texel.y > bounds.max.y) {
return;
}
output.store_pixel(texel + bounds.min, float4(image.load_pixel<float4>(texel).xyz(), 1.0f));
output.store_pixel(texel + bounds.min,
float4(image.load_pixel<float4, true>(texel).xyz(), 1.0f));
});
}
@@ -243,9 +244,9 @@ class CompositeOperation : public NodeOperation {
if (output_texel.x > bounds.max.x || output_texel.y > bounds.max.y) {
return;
}
output.store_pixel(
texel + bounds.min,
float4(image.load_pixel<float4>(texel).xyz(), alpha.load_pixel<float>(texel)));
output.store_pixel(texel + bounds.min,
float4(image.load_pixel<float4, true>(texel).xyz(),
alpha.load_pixel<float, true>(texel)));
});
}

View File

@@ -217,7 +217,7 @@ class DefocusOperation : public NodeOperation {
};
parallel_for(domain.size, [&](const int2 texel) {
float center_radius = math::max(0.0f, radius.load_pixel<float>(texel));
float center_radius = math::max(0.0f, radius.load_pixel<float, true>(texel));
/* Go over the window of the given search radius and accumulate the colors multiplied by
* their respective weights as well as the weights themselves, but only if both the radius of
@@ -228,7 +228,7 @@ class DefocusOperation : public NodeOperation {
for (int y = -search_radius; y <= search_radius; y++) {
for (int x = -search_radius; x <= search_radius; x++) {
float candidate_radius = math::max(
0.0f, radius.load_pixel_extended<float>(texel + int2(x, y)));
0.0f, radius.load_pixel_extended<float, true>(texel + int2(x, y)));
/* Skip accumulation if either the x or y distances of the candidate pixel are larger
* than either the center or candidate pixel radius. Note that the max and min functions

View File

@@ -172,7 +172,7 @@ class DespeckleOperation : public NodeOperation {
}
/* We need to despeckle, so write the mean accumulated color. */
float factor = factor_image.load_pixel<float>(texel);
float factor = factor_image.load_pixel<float, true>(texel);
float4 mean_color = accumulated_color / accumulated_weight;
output.store_pixel(texel, math::interpolate(center_color, mean_color, factor));
});

View File

@@ -133,10 +133,10 @@ class DisplaceOperation : public NodeOperation {
/* Note that the input displacement is in pixel space, so divide by the input size to
* transform it into the normalized sampler space. */
float2 scale = float2(x_scale.load_pixel_extended<float>(texel),
y_scale.load_pixel_extended<float>(texel));
float2 displacement = input_displacement.load_pixel_extended<float4>(texel).xy() * scale /
float2(size);
float2 scale = float2(x_scale.load_pixel_extended<float, true>(texel),
y_scale.load_pixel_extended<float, true>(texel));
float2 displacement = input_displacement.load_pixel_extended<float4, true>(texel).xy() *
scale / float2(size);
return coordinates - displacement;
};

View File

@@ -97,8 +97,8 @@ static void ellipse_mask(const Result &base_mask,
uv = float2x2(float2(cos_angle, -sin_angle), float2(sin_angle, cos_angle)) * uv;
bool is_inside = math::length(uv / radius) < 1.0f;
float base_mask_value = base_mask.load_pixel<float>(texel);
float value = value_mask.load_pixel<float>(texel);
float base_mask_value = base_mask.load_pixel<float, true>(texel);
float value = value_mask.load_pixel<float, true>(texel);
float output_mask_value = 0.0f;
if constexpr (MaskType == CMP_NODE_MASKTYPE_ADD) {

View File

@@ -49,6 +49,13 @@ class FilterOperation : public NodeOperation {
void execute() override
{
Result &input_image = get_input("Image");
if (input_image.is_single_value()) {
Result &output_image = get_result("Image");
input_image.pass_through(output_image);
return;
}
if (this->context().use_gpu()) {
this->execute_gpu();
}
@@ -125,7 +132,8 @@ class FilterOperation : public NodeOperation {
/* Mix the channel-wise magnitude with the original color at the center of the kernel using
* the input factor. */
float4 color = input.load_pixel<float4>(texel);
magnitude = math::interpolate(color.xyz(), magnitude, factor.load_pixel<float>(texel));
magnitude = math::interpolate(
color.xyz(), magnitude, factor.load_pixel<float, true>(texel));
/* Store the channel-wise magnitude with the original alpha of the input. */
output.store_pixel(texel, float4(magnitude, color.w));
@@ -143,7 +151,7 @@ class FilterOperation : public NodeOperation {
/* Mix with the original color at the center of the kernel using the input factor. */
color = math::interpolate(
input.load_pixel<float4>(texel), color, factor.load_pixel<float>(texel));
input.load_pixel<float4>(texel), color, factor.load_pixel<float, true>(texel));
/* Store the color making sure it is not negative. */
output.store_pixel(texel, math::max(color, float4(0.0f)));

View File

@@ -90,6 +90,23 @@ class KeyingOperation : public NodeOperation {
void execute() override
{
Result &input_image = get_result("Image");
Result &output_image = get_result("Image");
Result &output_matte = get_result("Matte");
Result &output_edges = get_result("Edges");
if (input_image.is_single_value()) {
if (output_image.should_compute()) {
input_image.pass_through(output_image);
}
if (output_matte.should_compute()) {
output_matte.allocate_invalid();
}
if (output_edges.should_compute()) {
output_edges.allocate_invalid();
}
return;
}
Result blurred_input = compute_blurred_input();
Result matte = compute_matte(blurred_input);
@@ -99,8 +116,6 @@ class KeyingOperation : public NodeOperation {
Result tweaked_matte = compute_tweaked_matte(matte);
matte.release();
Result &output_image = get_result("Image");
Result &output_matte = get_result("Matte");
if (output_image.should_compute() || output_matte.should_compute()) {
Result blurred_matte = compute_blurred_matte(tweaked_matte);
tweaked_matte.release();
@@ -343,7 +358,7 @@ class KeyingOperation : public NodeOperation {
return;
}
float4 key_color = key.load_pixel<float4>(texel);
float4 key_color = key.load_pixel<float4, true>(texel);
int3 key_saturation_indices = compute_saturation_indices(key_color.xyz());
float input_saturation = compute_saturation(input_color, key_saturation_indices);
float key_saturation = compute_saturation(key_color, key_saturation_indices);
@@ -635,7 +650,7 @@ class KeyingOperation : public NodeOperation {
};
parallel_for(input.domain().size, [&](const int2 texel) {
float4 key_color = key.load_pixel<float4>(texel);
float4 key_color = key.load_pixel<float4, true>(texel);
float4 color = input.load_pixel<float4>(texel);
float matte = matte_image.load_pixel<float>(texel);

View File

@@ -247,7 +247,7 @@ class ConvertKuwaharaOperation : public NodeOperation {
const int2 size)
{
parallel_for(size, [&](const int2 texel) {
int radius = math::max(0, int(size_input.load_pixel<float>(texel)));
int radius = math::max(0, int(size_input.load_pixel<float, true>(texel)));
float4 mean_of_squared_color_of_quadrants[4] = {
float4(0.0f), float4(0.0f), float4(0.0f), float4(0.0f)};
@@ -434,7 +434,7 @@ class ConvertKuwaharaOperation : public NodeOperation {
float eigenvalue_difference = first_eigenvalue - second_eigenvalue;
float anisotropy = eigenvalue_sum > 0.0f ? eigenvalue_difference / eigenvalue_sum : 0.0f;
float radius = math::max(0.0f, size.load_pixel<float>(texel));
float radius = math::max(0.0f, size.load_pixel<float, true>(texel));
if (radius == 0.0f) {
output.store_pixel(texel, input.load_pixel<float4>(texel));
return;

View File

@@ -118,6 +118,12 @@ class MapUVOperation : public NodeOperation {
void execute_cpu()
{
const Result &input_uv = get_input("UV");
if (input_uv.is_single_value()) {
this->execute_single_cpu();
return;
}
if (this->get_nearest_neighbour()) {
this->execute_cpu_nearest();
}
@@ -126,6 +132,29 @@ class MapUVOperation : public NodeOperation {
}
}
void execute_single_cpu()
{
const Result &input_uv = get_input("UV");
const Result &input_image = get_input("Image");
float2 uv_coordinates = input_uv.get_single_value<float4>().xy();
float4 sampled_color = input_image.sample_nearest_zero(uv_coordinates);
/* The UV input is assumed to contain an alpha channel as its third channel, since the
* UV coordinates might be defined in only a subset area of the UV texture as mentioned.
* In that case, the alpha is typically opaque at the subset area and transparent
* everywhere else, and alpha pre-multiplication is then performed. This format of having
* an alpha channel in the UV coordinates is the format used by UV passes in render
* engines, hence the mentioned logic. */
float alpha = input_uv.get_single_value<float4>().z;
float4 result = sampled_color * alpha;
Result &output = get_result("Image");
output.allocate_single_value();
output.set_color_value(result);
}
void execute_cpu_nearest()
{
const Result &input_image = get_input("Image");
@@ -140,7 +169,7 @@ class MapUVOperation : public NodeOperation {
float4 sampled_color = input_image.sample_nearest_zero(uv_coordinates);
/* The UV texture is assumed to contain an alpha channel as its third channel, since the
/* The UV input is assumed to contain an alpha channel as its third channel, since the
* UV coordinates might be defined in only a subset area of the UV texture as mentioned.
* In that case, the alpha is typically opaque at the subset area and transparent
* everywhere else, and alpha pre-multiplication is then performed. This format of having
@@ -216,7 +245,7 @@ class MapUVOperation : public NodeOperation {
float gradient_attenuation = math::max(
0.0f, 1.0f - gradient_attenuation_factor * gradient_magnitude);
/* The UV texture is assumed to contain an alpha channel as its third channel, since the
/* The UV input is assumed to contain an alpha channel as its third channel, since the
* UV coordinates might be defined in only a subset area of the UV texture as mentioned.
* In that case, the alpha is typically opaque at the subset area and transparent
* everywhere else, and alpha pre-multiplication is then performed. This format of having

View File

@@ -166,7 +166,8 @@ class ScaleOperation : public NodeOperation {
float2 coordinates = (float2(texel) + float2(0.5f)) / float2(size);
float2 center = float2(0.5f);
float2 scale = float2(x_scale.load_pixel<float>(texel), y_scale.load_pixel<float>(texel));
float2 scale = float2(x_scale.load_pixel<float, true>(texel),
y_scale.load_pixel<float, true>(texel));
float2 scaled_coordinates = center +
(coordinates - center) / math::max(scale, float2(0.0001f));

View File

@@ -110,15 +110,17 @@ class SplitOperation : public NodeOperation {
if (is_horizontal) {
parallel_for(domain.size, [&](const int2 texel) {
output_image.store_pixel(texel,
split_pixel <= texel.x ? first_image.load_pixel<float4>(texel) :
second_image.load_pixel<float4>(texel));
split_pixel <= texel.x ?
first_image.load_pixel<float4, true>(texel) :
second_image.load_pixel<float4, true>(texel));
});
}
else {
parallel_for(domain.size, [&](const int2 texel) {
output_image.store_pixel(texel,
split_pixel <= texel.y ? first_image.load_pixel<float4>(texel) :
second_image.load_pixel<float4>(texel));
split_pixel <= texel.y ?
first_image.load_pixel<float4, true>(texel) :
second_image.load_pixel<float4, true>(texel));
});
}
}

View File

@@ -104,10 +104,17 @@ static float2 max_velocity_approximate(const float2 &a,
* Each of the previous and next velocities are reduces independently. */
static Result compute_max_tile_velocity_cpu(Context &context, const Result &velocity_image)
{
if (velocity_image.is_single_value()) {
Result output = context.create_result(ResultType::Vector);
output.allocate_single_value();
output.set_vector_value(velocity_image.get_single_value<float4>());
return output;
}
const int2 tile_size = int2(MOTION_BLUR_TILE_SIZE);
const int2 velocity_size = velocity_image.domain().size;
const int2 tiles_count = math::divide_ceil(velocity_size, tile_size);
Result output = context.create_result(ResultType::Color);
Result output = context.create_result(ResultType::Vector);
output.allocate_texture(Domain(tiles_count));
parallel_for(tiles_count, [&](const int2 texel) {
@@ -189,8 +196,15 @@ static Result dilate_max_velocity_cpu(Context &context,
const Result &max_tile_velocity,
const float shutter_speed)
{
if (max_tile_velocity.is_single_value()) {
Result output = context.create_result(ResultType::Vector);
output.allocate_single_value();
output.set_vector_value(max_tile_velocity.get_single_value<float4>());
return output;
}
const int2 size = max_tile_velocity.domain().size;
Result output = context.create_result(ResultType::Color);
Result output = context.create_result(ResultType::Vector);
output.allocate_texture(Domain(size));
parallel_for(size, [&](const int2 texel) { output.store_pixel(texel, float4(0.0f)); });
@@ -421,8 +435,8 @@ static void motion_blur_cpu(const Result &input_image,
float2 uv = (float2(texel) + 0.5f) / float2(size);
/* Data of the center pixel of the gather (target). */
float center_depth = input_depth.load_pixel<float>(texel);
float4 center_motion = float4(input_velocity.load_pixel<float4>(texel)) *
float center_depth = input_depth.load_pixel<float, true>(texel);
float4 center_motion = float4(input_velocity.load_pixel<float4, true>(texel)) *
float4(float2(shutter_speed), float2(-shutter_speed));
float4 center_color = input_image.load_pixel<float4>(texel);
@@ -434,7 +448,7 @@ static void motion_blur_cpu(const Result &input_image,
/* No need to multiply by the shutter speed and invert the next velocities since this was
* already done in dilate_max_velocity. */
float4 max_motion = max_velocity.load_pixel<float4>(tile);
float4 max_motion = max_velocity.load_pixel<float4, true>(tile);
Accumulator accum;
accum.weight = float3(0.0f, 0.0f, 1.0f);
@@ -536,7 +550,7 @@ class VectorBlurOperation : public NodeOperation {
Result &input = get_input("Speed");
input.bind_as_texture(shader, "input_tx");
Result output = context().create_result(ResultType::Color);
Result output = context().create_result(ResultType::Vector);
const int2 tiles_count = math::divide_ceil(input.domain().size, int2(32));
output.allocate_texture(Domain(tiles_count));
output.bind_as_image(shader, "output_img");

View File

@@ -153,7 +153,8 @@ class ViewerOperation : public NodeOperation {
if (output_texel.x > bounds.max.x || output_texel.y > bounds.max.y) {
return;
}
output.store_pixel(texel + bounds.min, float4(image.load_pixel<float4>(texel).xyz(), 1.0f));
output.store_pixel(texel + bounds.min,
float4(image.load_pixel<float4, true>(texel).xyz(), 1.0f));
});
}
@@ -265,9 +266,9 @@ class ViewerOperation : public NodeOperation {
if (output_texel.x > bounds.max.x || output_texel.y > bounds.max.y) {
return;
}
output.store_pixel(
texel + bounds.min,
float4(image.load_pixel<float4>(texel).xyz(), alpha.load_pixel<float>(texel)));
output.store_pixel(texel + bounds.min,
float4(image.load_pixel<float4, true>(texel).xyz(),
alpha.load_pixel<float, true>(texel)));
});
}

View File

@@ -172,10 +172,10 @@ class ZCombineOperation : public NodeOperation {
if (combined.should_compute()) {
combined.allocate_texture(domain);
parallel_for(domain.size, [&](const int2 texel) {
float4 first_color = first.load_pixel<float4>(texel);
float4 second_color = second.load_pixel<float4>(texel);
float first_z_value = first_z.load_pixel<float>(texel);
float second_z_value = second_z.load_pixel<float>(texel);
float4 first_color = first.load_pixel<float4, true>(texel);
float4 second_color = second.load_pixel<float4, true>(texel);
float first_z_value = first_z.load_pixel<float, true>(texel);
float second_z_value = second_z.load_pixel<float, true>(texel);
/* Choose the closer pixel as the foreground, that is, the pixel with the lower z value. If
* Use Alpha is disabled, return the foreground, otherwise, mix between the foreground and
@@ -195,8 +195,8 @@ class ZCombineOperation : public NodeOperation {
if (combined_z_output.should_compute()) {
combined_z_output.allocate_texture(domain);
parallel_for(domain.size, [&](const int2 texel) {
float first_z_value = first_z.load_pixel<float>(texel);
float second_z_value = second_z.load_pixel<float>(texel);
float first_z_value = first_z.load_pixel<float, true>(texel);
float second_z_value = second_z.load_pixel<float, true>(texel);
float combined_z = math::min(first_z_value, second_z_value);
combined_z_output.store_pixel(texel, combined_z);
});
@@ -273,8 +273,8 @@ class ZCombineOperation : public NodeOperation {
if (combined.should_compute()) {
combined.allocate_texture(domain);
parallel_for(domain.size, [&](const int2 texel) {
float4 first_color = first.load_pixel<float4>(texel);
float4 second_color = second.load_pixel<float4>(texel);
float4 first_color = first.load_pixel<float4, true>(texel);
float4 second_color = second.load_pixel<float4, true>(texel);
float mask_value = mask.load_pixel<float>(texel);
/* Choose the closer pixel as the foreground, that is, the masked pixel with the lower z
@@ -295,8 +295,8 @@ class ZCombineOperation : public NodeOperation {
if (combined_z_output.should_compute()) {
combined_z_output.allocate_texture(domain);
parallel_for(domain.size, [&](const int2 texel) {
float first_z_value = first_z.load_pixel<float>(texel);
float second_z_value = second_z.load_pixel<float>(texel);
float first_z_value = first_z.load_pixel<float, true>(texel);
float second_z_value = second_z.load_pixel<float, true>(texel);
float combined_z = math::min(first_z_value, second_z_value);
combined_z_output.store_pixel(texel, combined_z);
});
@@ -347,8 +347,8 @@ class ZCombineOperation : public NodeOperation {
mask.allocate_texture(domain);
parallel_for(domain.size, [&](const int2 texel) {
float first_z_value = first_z.load_pixel<float>(texel);
float second_z_value = second_z.load_pixel<float>(texel);
float first_z_value = first_z.load_pixel<float, true>(texel);
float second_z_value = second_z.load_pixel<float, true>(texel);
float z_combine_factor = float(first_z_value < second_z_value);
mask.store_pixel(texel, z_combine_factor);
});