Compositor: Support host allocation for GPU compositing

This patch allows allocating results on the host even if the context
uses GPU. It also adds support for uploading the host result into a GPU
allocated result. This is done to allow using results for storing data
that gets computed on the CPU but ends up on the GPU, like some of the
cached resources used by the compositor. Those resources are refactored
accordingly in this patch as well.

Pull Request: https://projects.blender.org/blender/blender/pulls/141745
This commit is contained in:
Omar Emara
2025-07-11 07:55:11 +02:00
committed by Omar Emara
parent 6a2a999173
commit c53a839631
12 changed files with 130 additions and 129 deletions

View File

@@ -5,8 +5,7 @@
#pragma once
#include <cstdint>
#include <type_traits>
#include <utility>
#include <optional>
#include <variant>
#include "BLI_assert.h"
@@ -166,6 +165,9 @@ class Result {
* fourth channel during processing. */
static eGPUTextureFormat gpu_texture_format(ResultType type, ResultPrecision precision);
/* Returns the GPU data format that corresponds to the given result type. */
static eGPUDataFormat gpu_data_format(const ResultType type);
/* Returns the GPU texture format that corresponds to the given one, but whose precision is the
* given precision. */
static eGPUTextureFormat gpu_texture_format(eGPUTextureFormat format, ResultPrecision precision);
@@ -198,18 +200,17 @@ class Result {
* created by uploading data from CPU. */
eGPUTextureFormat get_gpu_texture_format() const;
/* Identical to gpu_data_format but assumes the result's type. */
eGPUDataFormat get_gpu_data_format() const;
/* Declare the result to be a texture result, allocate a texture of an appropriate type with
* the size of the given domain, and set the domain of the result to the given domain.
*
* If from_pool is true, the texture will be allocated from the texture pool of the context,
* otherwise, a new texture will be allocated. Pooling should not be used for persistent
* results that might span more than one evaluation, like cached resources. While pooling should
* be used for most other cases where the result will be allocated then later released in the
* same evaluation.
*
* If the context of the result uses GPU, then GPU allocation will be done, otherwise, CPU
* allocation will be done. */
void allocate_texture(Domain domain, bool from_pool = true);
* See the allocate_data method for more information on the from_pool and storage_type
* parameters. */
void allocate_texture(const Domain domain,
const bool from_pool = true,
const std::optional<ResultStorageType> storage_type = std::nullopt);
/* Declare the result to be a single value result, allocate a texture of an appropriate type with
* size 1x1 from the texture pool, and set the domain to be an identity domain. The value is zero
@@ -220,6 +221,11 @@ class Result {
* can't be computed and are considered invalid. */
void allocate_invalid();
/* Creates and allocates a new result that matches the type and precision of this result and
* uploads the CPU data that exists in this result. The result is assumed to be allocated on the
* CPU. See the allocate_data method for more information on the from_pool parameter. */
Result upload_to_gpu(const bool from_pool);
/* Bind the GPU texture of the result to the texture image unit with the given name in the
* currently bound given shader. This also inserts a memory barrier for texture fetches to ensure
* any prior writes to the texture are reflected before reading from it. */
@@ -434,9 +440,20 @@ class Result {
const float2 &y_gradient) const;
private:
/* Allocates the image data for the given size, either on the GPU or CPU based on the result's
* context. See the allocate_texture method for information about the from_pool argument. */
void allocate_data(int2 size, bool from_pool);
/* Allocates the image data for the given size.
*
* The data is allocated on the CPU or GPU depending on the given storage_type. A nullopt may be
* passed to storage_type, in which case, the data will be allocated on the device of the
* result's context as specified by context.use_gpu().
*
* If from_pool is true, GPU textures will be allocated from the texture pool of the context,
* otherwise, a new texture will be allocated. Pooling should not be used for persistent results
* that might span more than one evaluation, like cached resources. While pooling should be used
* for most other cases where the result will be allocated then later released in the same
* evaluation. */
void allocate_data(const int2 size,
const bool from_pool = true,
const std::optional<ResultStorageType> storage_type = std::nullopt);
/* Same as get_pixel_index but can be used when the type of the result is not known at compile
* time. */

View File

@@ -48,9 +48,6 @@ bool operator==(const CachedMaskKey &a, const CachedMaskKey &b);
* A cached resource that computes and caches a result containing the result of evaluating the
* given mask ID on a space that spans the given size, parameterized by the given parameters. */
class CachedMask : public CachedResource {
private:
Array<float> evaluated_mask_;
public:
Result result;

View File

@@ -7,7 +7,6 @@
#include <cstdint>
#include <memory>
#include "BLI_array.hh"
#include "BLI_map.hh"
#include "BLI_math_vector_types.hh"
@@ -52,9 +51,6 @@ bool operator==(const DistortionGridKey &a, const DistortionGridKey &b);
* applying the camera distortion of a given movie clip tracking camera. See the constructor for
* more information. */
class DistortionGrid : public CachedResource {
private:
Array<float2> distortion_grid_;
public:
Result result;

View File

@@ -7,7 +7,6 @@
#include <cstdint>
#include <memory>
#include "BLI_array.hh"
#include "BLI_map.hh"
#include "COM_cached_resource.hh"
@@ -42,10 +41,6 @@ bool operator==(const MorphologicalDistanceFeatherWeightsKey &a,
* functions are all even functions. Consequently, only the positive half of the filter is computed
* and the shader takes that into consideration. */
class MorphologicalDistanceFeatherWeights : public CachedResource {
private:
Array<float> weights_;
Array<float> falloffs_;
public:
Result weights_result;
Result falloffs_result;

View File

@@ -7,7 +7,6 @@
#include <cstdint>
#include <memory>
#include "BLI_array.hh"
#include "BLI_map.hh"
#include "BLI_math_vector_types.hh"
@@ -41,9 +40,6 @@ bool operator==(const SymmetricBlurWeightsKey &a, const SymmetricBlurWeightsKey
* evaluated on the normalized distance to the center. Consequently, only the upper right quadrant
* is computed and the user takes that into consideration. */
class SymmetricBlurWeights : public CachedResource {
private:
Array<float> weights_;
public:
Result result;

View File

@@ -7,7 +7,6 @@
#include <cstdint>
#include <memory>
#include "BLI_array.hh"
#include "BLI_map.hh"
#include "COM_cached_resource.hh"
@@ -46,9 +45,6 @@ bool operator==(const SymmetricSeparableBlurWeightsKey &a,
* \{ */
class SymmetricSeparableBlurWeights : public CachedResource {
private:
Array<float> weights_;
public:
Result result;

View File

@@ -5,12 +5,9 @@
#include <cstdint>
#include <memory>
#include "BLI_array.hh"
#include "BLI_hash.hh"
#include "BLI_math_vector_types.hh"
#include "GPU_texture.hh"
#include "BKE_lib_id.hh"
#include "BKE_mask.h"
@@ -114,7 +111,7 @@ CachedMask::CachedMask(Context &context,
Vector<MaskRasterHandle *> handles = get_mask_raster_handles(
mask, size, frame, use_feather, motion_blur_samples, motion_blur_shutter);
evaluated_mask_ = Array<float>(size.x * size.y);
this->result.allocate_texture(size, false, ResultStorageType::CPU);
parallel_for(size, [&](const int2 texel) {
/* Compute the coordinates in the [0, 1] range and add 0.5 to evaluate the mask at the
* center of pixels. */
@@ -126,7 +123,7 @@ CachedMask::CachedMask(Context &context,
for (MaskRasterHandle *handle : handles) {
mask_value += BKE_maskrasterize_handle_sample(handle, coordinates);
}
evaluated_mask_[texel.y * size.x + texel.x] = mask_value / handles.size();
this->result.store_pixel(texel, mask_value / handles.size());
});
for (MaskRasterHandle *handle : handles) {
@@ -134,14 +131,9 @@ CachedMask::CachedMask(Context &context,
}
if (context.use_gpu()) {
this->result.allocate_texture(Domain(size), false);
GPU_texture_update(this->result, GPU_DATA_FLOAT, evaluated_mask_.data());
/* CPU-side data no longer needed, so free it. */
evaluated_mask_ = Array<float>();
}
else {
this->result.wrap_external(evaluated_mask_.data(), size);
const Result gpu_result = this->result.upload_to_gpu(false);
this->result.release();
this->result = gpu_result;
}
}

View File

@@ -5,7 +5,6 @@
#include <cstdint>
#include <memory>
#include "BLI_array.hh"
#include "BLI_hash.hh"
#include "BLI_math_vector_types.hh"
@@ -13,8 +12,6 @@
#include "DNA_movieclip_types.h"
#include "DNA_tracking_types.h"
#include "GPU_texture.hh"
#include "BKE_movieclip.h"
#include "BKE_tracking.h"
@@ -83,7 +80,8 @@ DistortionGrid::DistortionGrid(
/* Extend the size by the deltas of the bounds. */
const int2 extended_size = size + int2(right_delta + left_delta, bottom_delta + top_delta);
distortion_grid_ = Array<float2>(int64_t(extended_size.x) * extended_size.y);
this->result.allocate_texture(extended_size, false, ResultStorageType::CPU);
parallel_for(extended_size, [&](const int2 texel) {
/* The tracking distortion functions expect the coordinates to be in the space of the image
* where the tracking camera was calibrated. So we first remap the coordinates into that space,
@@ -105,21 +103,15 @@ DistortionGrid::DistortionGrid(
/* Note that we should remap the coordinates back into the original size by dividing by the
* calibration size and multiplying by the size, however, we skip the latter to store the
* coordinates in normalized form, since this is what the shader expects. */
distortion_grid_[texel.y * int64_t(extended_size.x) + texel.x] = coordinates /
float2(calibration_size);
this->result.store_pixel(texel, coordinates / float2(calibration_size));
});
BKE_tracking_distortion_free(distortion);
if (context.use_gpu()) {
this->result.allocate_texture(Domain(extended_size), false);
GPU_texture_update(this->result, GPU_DATA_FLOAT, distortion_grid_.data());
/* CPU-side data no longer needed, so free it. */
distortion_grid_ = Array<float2>();
}
else {
this->result.wrap_external(&distortion_grid_[0].x, extended_size);
const Result gpu_result = this->result.upload_to_gpu(false);
this->result.release();
this->result = gpu_result;
}
}

View File

@@ -6,7 +6,6 @@
#include <cstdint>
#include <memory>
#include "BLI_array.hh"
#include "BLI_hash.hh"
#include "BLI_index_range.hh"
@@ -14,8 +13,6 @@
#include "DNA_scene_types.h"
#include "GPU_texture.hh"
#include "COM_context.hh"
#include "COM_morphological_distance_feather_weights.hh"
#include "COM_result.hh"
@@ -57,18 +54,12 @@ MorphologicalDistanceFeatherWeights::MorphologicalDistanceFeatherWeights(Context
this->compute_distance_falloffs(type, radius);
if (context.use_gpu()) {
this->weights_result.allocate_texture(Domain(int2(weights_.size(), 1)), false);
this->falloffs_result.allocate_texture(Domain(int2(falloffs_.size(), 1)), false);
GPU_texture_update(this->weights_result, GPU_DATA_FLOAT, weights_.data());
GPU_texture_update(this->falloffs_result, GPU_DATA_FLOAT, falloffs_.data());
/* CPU-side data no longer needed, so free it. */
weights_ = Array<float>();
falloffs_ = Array<float>();
}
else {
this->weights_result.wrap_external(weights_.data(), int2(weights_.size(), 1));
this->falloffs_result.wrap_external(falloffs_.data(), int2(falloffs_.size(), 1));
const Result weights_gpu_result = this->weights_result.upload_to_gpu(false);
const Result falloffs_gpu_result = this->falloffs_result.upload_to_gpu(false);
this->weights_result.release();
this->falloffs_result.release();
this->weights_result = weights_gpu_result;
this->falloffs_result = falloffs_gpu_result;
}
}
@@ -84,28 +75,29 @@ void MorphologicalDistanceFeatherWeights::compute_weights(int radius)
* compute half of it and no doubling happens. We add 1 to make sure the filter size is always
* odd and there is a center weight. */
const int size = radius + 1;
weights_ = Array<float>(size);
this->weights_result.allocate_texture(Domain(int2(size, 1)), false, ResultStorageType::CPU);
float sum = 0.0f;
/* First, compute the center weight. */
const float center_weight = RE_filter_value(R_FILTER_GAUSS, 0.0f);
weights_[0] = center_weight;
this->weights_result.store_pixel(int2(0, 0), center_weight);
sum += center_weight;
/* Second, compute the other weights in the positive direction, making sure to add double the
* weight to the sum of weights because the filter is symmetric and we only loop over half of
* it. Skip the center weight already computed by dropping the front index. */
const float scale = radius > 0.0f ? 1.0f / radius : 0.0f;
for (const int i : weights_.index_range().drop_front(1)) {
for (const int i : IndexRange(size).drop_front(1)) {
const float weight = RE_filter_value(R_FILTER_GAUSS, i * scale);
weights_[i] = weight;
this->weights_result.store_pixel(int2(i, 0), weight);
sum += weight * 2.0f;
}
/* Finally, normalize the weights. */
for (const int i : weights_.index_range()) {
weights_[i] /= sum;
for (const int i : IndexRange(size)) {
const int2 texel = int2(i, 0);
this->weights_result.store_pixel(texel, this->weights_result.load_pixel<float>(texel) / sum);
}
}
@@ -140,13 +132,13 @@ void MorphologicalDistanceFeatherWeights::compute_distance_falloffs(int type, in
* symmetric, we only compute half of them and no doubling happens. We add 1 to make sure the
* falloffs size is always odd and there is a center falloff. */
const int size = radius + 1;
falloffs_ = Array<float>(size);
this->falloffs_result.allocate_texture(Domain(int2(size, 1)), false, ResultStorageType::CPU);
/* Compute the distance falloffs in the positive direction only, because the falloffs are
* symmetric. */
const float scale = radius > 0.0f ? 1.0f / radius : 0.0f;
for (const int i : falloffs_.index_range()) {
falloffs_[i] = compute_distance_falloff(type, i * scale);
for (const int i : IndexRange(size)) {
this->falloffs_result.store_pixel(int2(i, 0), compute_distance_falloff(type, i * scale));
}
}

View File

@@ -5,7 +5,6 @@
#include <cstdint>
#include <memory>
#include "BLI_array.hh"
#include "BLI_hash.hh"
#include "BLI_index_range.hh"
#include "BLI_math_vector.hh"
@@ -13,8 +12,6 @@
#include "RE_pipeline.h"
#include "GPU_texture.hh"
#include "COM_context.hh"
#include "COM_result.hh"
#include "COM_symmetric_blur_weights.hh"
@@ -52,13 +49,13 @@ SymmetricBlurWeights::SymmetricBlurWeights(Context &context, int type, float2 ra
* filter size is always odd and there is a center weight. */
const float2 scale = math::safe_divide(float2(1.0f), radius);
const int2 size = int2(math::ceil(radius)) + int2(1);
weights_ = Array<float>(size.x * size.y);
this->result.allocate_texture(size, false, ResultStorageType::CPU);
float sum = 0.0f;
/* First, compute the center weight. */
const float center_weight = RE_filter_value(type, 0.0f);
weights_[0] = center_weight;
this->result.store_pixel(int2(0, 0), center_weight);
sum += center_weight;
/* Then, compute the weights along the positive x axis, making sure to add double the weight to
@@ -66,7 +63,7 @@ SymmetricBlurWeights::SymmetricBlurWeights(Context &context, int type, float2 ra
* of the x axis. Skip the center weight already computed by dropping the front index. */
for (const int x : IndexRange(size.x).drop_front(1)) {
const float weight = RE_filter_value(type, x * scale.x);
weights_[x] = weight;
this->result.store_pixel(int2(x, 0), weight);
sum += weight * 2.0f;
}
@@ -75,7 +72,7 @@ SymmetricBlurWeights::SymmetricBlurWeights(Context &context, int type, float2 ra
* of the y axis. Skip the center weight already computed by dropping the front index. */
for (const int y : IndexRange(size.y).drop_front(1)) {
const float weight = RE_filter_value(type, y * scale.y);
weights_[size.x * y] = weight;
this->result.store_pixel(int2(0, y), weight);
sum += weight * 2.0f;
}
@@ -86,7 +83,7 @@ SymmetricBlurWeights::SymmetricBlurWeights(Context &context, int type, float2 ra
for (const int y : IndexRange(size.y).drop_front(1)) {
for (const int x : IndexRange(size.x).drop_front(1)) {
const float weight = RE_filter_value(type, math::length(float2(x, y) * scale));
weights_[size.x * y + x] = weight;
this->result.store_pixel(int2(x, y), weight);
sum += weight * 4.0f;
}
}
@@ -94,19 +91,15 @@ SymmetricBlurWeights::SymmetricBlurWeights(Context &context, int type, float2 ra
/* Finally, normalize the weights. */
for (const int y : IndexRange(size.y)) {
for (const int x : IndexRange(size.x)) {
weights_[size.x * y + x] /= sum;
const int2 texel = int2(x, y);
this->result.store_pixel(texel, this->result.load_pixel<float>(texel) / sum);
}
}
if (context.use_gpu()) {
this->result.allocate_texture(Domain(size), false);
GPU_texture_update(this->result, GPU_DATA_FLOAT, weights_.data());
/* CPU-side data no longer needed, so free it. */
weights_ = Array<float>();
}
else {
this->result.wrap_external(weights_.data(), size);
const Result gpu_result = this->result.upload_to_gpu(false);
this->result.release();
this->result = gpu_result;
}
}

View File

@@ -5,15 +5,12 @@
#include <cstdint>
#include <memory>
#include "BLI_array.hh"
#include "BLI_hash.hh"
#include "BLI_index_range.hh"
#include "BLI_math_base.hh"
#include "RE_pipeline.h"
#include "GPU_texture.hh"
#include "COM_context.hh"
#include "COM_result.hh"
#include "COM_symmetric_separable_blur_weights.hh"
@@ -53,39 +50,35 @@ SymmetricSeparableBlurWeights::SymmetricSeparableBlurWeights(Context &context,
* compute half of it and no doubling happens. We add 1 to make sure the filter size is always
* odd and there is a center weight. */
const int size = math::ceil(radius) + 1;
weights_ = Array<float>(size);
this->result.allocate_texture(Domain(int2(size, 1)), false, ResultStorageType::CPU);
float sum = 0.0f;
/* First, compute the center weight. */
const float center_weight = RE_filter_value(type, 0.0f);
weights_[0] = center_weight;
this->result.store_pixel(int2(0, 0), center_weight);
sum += center_weight;
/* Second, compute the other weights in the positive direction, making sure to add double the
* weight to the sum of weights because the filter is symmetric and we only loop over half of
* it. Skip the center weight already computed by dropping the front index. */
const float scale = radius > 0.0f ? 1.0f / radius : 0.0f;
for (const int i : weights_.index_range().drop_front(1)) {
for (const int i : IndexRange(size).drop_front(1)) {
const float weight = RE_filter_value(type, i * scale);
weights_[i] = weight;
this->result.store_pixel(int2(i, 0), weight);
sum += weight * 2.0f;
}
/* Finally, normalize the weights. */
for (const int i : weights_.index_range()) {
weights_[i] /= sum;
for (const int i : IndexRange(size)) {
const int2 texel = int2(i, 0);
this->result.store_pixel(texel, this->result.load_pixel<float>(texel) / sum);
}
if (context.use_gpu()) {
this->result.allocate_texture(Domain(int2(size, 1)), false);
GPU_texture_update(this->result, GPU_DATA_FLOAT, weights_.data());
/* CPU-side data no longer needed, so free it. */
weights_ = Array<float>();
}
else {
this->result.wrap_external(weights_.data(), int2(size, 1));
const Result gpu_result = this->result.upload_to_gpu(false);
this->result.release();
this->result = gpu_result;
}
}

View File

@@ -3,6 +3,7 @@
* SPDX-License-Identifier: GPL-2.0-or-later */
#include <cstdint>
#include <optional>
#include <variant>
#include "MEM_guardedalloc.h"
@@ -92,6 +93,25 @@ eGPUTextureFormat Result::gpu_texture_format(ResultType type, ResultPrecision pr
return GPU_RGBA32F;
}
/* Maps a result type to the GPU data format used when updating a GPU texture with its data.
 * Integer and boolean types map to GPU_DATA_INT, all floating-point based types map to
 * GPU_DATA_FLOAT. */
eGPUDataFormat Result::gpu_data_format(ResultType type)
{
  switch (type) {
    case ResultType::Int:
    case ResultType::Int2:
    case ResultType::Bool:
      return GPU_DATA_INT;
    case ResultType::Float:
    case ResultType::Float2:
    case ResultType::Float3:
    case ResultType::Float4:
    case ResultType::Color:
      return GPU_DATA_FLOAT;
  }

  /* Only reachable if the type holds a value that is not listed above. Assert in that case and
   * fall back to the float format. */
  BLI_assert_unreachable();
  return GPU_DATA_FLOAT;
}
eGPUTextureFormat Result::gpu_texture_format(eGPUTextureFormat format, ResultPrecision precision)
{
switch (precision) {
@@ -306,13 +326,20 @@ eGPUTextureFormat Result::get_gpu_texture_format() const
return Result::gpu_texture_format(type_, precision_);
}
void Result::allocate_texture(Domain domain, bool from_pool)
/* Convenience wrapper around the static gpu_data_format that passes the type of this result. */
eGPUDataFormat Result::get_gpu_data_format() const
{
  const eGPUDataFormat data_format = Result::gpu_data_format(type_);
  return data_format;
}
/* Marks the result as a non single value result, allocates its data for the size of the given
 * domain by forwarding from_pool and storage_type to allocate_data, then stores the given domain
 * as the domain of the result. */
void Result::allocate_texture(const Domain domain,
const bool from_pool,
const std::optional<ResultStorageType> storage_type)
{
/* Make sure we are not allocating a result that should not be computed. */
BLI_assert(this->should_compute());
is_single_value_ = false;
this->allocate_data(domain.size, from_pool);
this->allocate_data(domain.size, from_pool, storage_type);
domain_ = domain;
}
@@ -362,6 +389,18 @@ void Result::allocate_invalid()
this->allocate_single_value();
}
/* Builds a new GPU result with the same type and precision as this result, allocates it using the
 * size of this result's domain, and fills it with this result's CPU data. This result is not
 * released here, so releasing it is left to the caller. */
Result Result::upload_to_gpu(const bool from_pool)
{
  /* Uploading is only valid for an allocated result whose data is stored on the CPU. */
  BLI_assert(storage_type_ == ResultStorageType::CPU);
  BLI_assert(this->is_allocated());

  Result gpu_result(*context_, this->type(), this->precision());
  gpu_result.allocate_texture(this->domain().size, from_pool, ResultStorageType::GPU);

  /* NOTE(review): update_single_value_data states that Float3 results are stored in 4-component
   * textures on the GPU. Confirm that the CPU data layout of Float3 results matches what
   * GPU_texture_update expects before uploading results of that type through this method. */
  GPU_texture_update(gpu_result, this->get_gpu_data_format(), this->cpu_data().data());
  return gpu_result;
}
void Result::bind_as_texture(GPUShader *shader, const char *texture_name) const
{
BLI_assert(storage_type_ == ResultStorageType::GPU);
@@ -663,7 +702,11 @@ void Result::update_single_value_data()
case ResultType::Float2:
case ResultType::Float4:
case ResultType::Color:
GPU_texture_update(this->gpu_texture(), GPU_DATA_FLOAT, this->single_value().get());
case ResultType::Int:
case ResultType::Int2:
case ResultType::Bool:
GPU_texture_update(
this->gpu_texture(), this->get_gpu_data_format(), this->single_value().get());
break;
case ResultType::Float3: {
/* Float3 results are stored in 4-component textures due to hardware limitations. So
@@ -672,11 +715,6 @@ void Result::update_single_value_data()
GPU_texture_update(this->gpu_texture(), GPU_DATA_FLOAT, vector_value);
break;
}
case ResultType::Int:
case ResultType::Int2:
case ResultType::Bool:
GPU_texture_update(this->gpu_texture(), GPU_DATA_INT, this->single_value().get());
break;
}
break;
case ResultStorageType::CPU:
@@ -685,11 +723,15 @@ void Result::update_single_value_data()
}
}
void Result::allocate_data(int2 size, bool from_pool)
void Result::allocate_data(const int2 size,
const bool from_pool,
const std::optional<ResultStorageType> storage_type)
{
BLI_assert(!this->is_allocated());
if (context_->use_gpu()) {
const bool use_gpu = storage_type.has_value() ? storage_type.value() == ResultStorageType::GPU :
context_->use_gpu();
if (use_gpu) {
storage_type_ = ResultStorageType::GPU;
is_from_pool_ = from_pool;