ImBuf: multi-thread IMB_byte_from_float / IMB_float_from_byte

Both were largely or completely single threaded.

They are used in various places, but testing their usage in VSE
compositor modifier branch (!139634), applying a default "do nothing"
compositor modifier on a 1080p image (on Ryzen 5950X):
51.4ms -> 12.2ms

Details about IMB_byte_from_float:
- No longer allocate a full new float buffer, instead do all work in
  a local small (32KB size, half of typical L1 cache) job-local buffer.
- Previous code was doing un-premultiply + OCIO + premultiply
  + un-premultiply again. That is pointless; just do
  un-premultiply once.

Details about IMB_float_from_byte / IMB_float_from_byte_ex:
- Remove incorrect code around"allocate float buffer outside of image
  buffer" since it was not actually true to begin with.
- Inside threaded part, do color space conversion and premultiply at
  once per-scanline, so that data stays in CPU caches more.

Pull Request: https://projects.blender.org/blender/blender/pulls/145716
This commit is contained in:
Aras Pranckevicius
2025-09-05 18:59:18 +02:00
committed by Aras Pranckevicius
parent 68e5851615
commit 44b7d7592d

View File

@@ -7,6 +7,7 @@
* \ingroup imbuf
*/
#include "BLI_array.hh"
#include "BLI_rect.h"
#include "BLI_task.hh"
@@ -603,14 +604,16 @@ void IMB_buffer_byte_from_byte(uchar *rect_to,
void IMB_byte_from_float(ImBuf *ibuf)
{
/* verify we have a float buffer */
using namespace blender;
/* Nothing to do if there's no float buffer */
if (ibuf->float_buffer.data == nullptr) {
return;
}
/* create byte rect if it didn't exist yet */
/* Allocate byte buffer if needed. */
if (ibuf->byte_buffer.data == nullptr) {
if (IMB_alloc_byte_pixels(ibuf, false) == 0) {
if (!IMB_alloc_byte_pixels(ibuf, false)) {
return;
}
}
@@ -623,33 +626,49 @@ void IMB_byte_from_float(ImBuf *ibuf)
IMB_colormanagement_role_colorspace_name_get(
COLOR_ROLE_DEFAULT_BYTE) :
ibuf->byte_buffer.colorspace->name().c_str();
float *buffer = static_cast<float *>(MEM_dupallocN(ibuf->float_buffer.data));
/* first make float buffer in byte space */
const bool predivide = IMB_alpha_affects_rgb(ibuf);
IMB_colormanagement_transform_float(
buffer, ibuf->x, ibuf->y, ibuf->channels, from_colorspace, to_colorspace, predivide);
/* convert from float's premul alpha to byte's straight alpha */
if (IMB_alpha_affects_rgb(ibuf)) {
IMB_unpremultiply_rect_float(buffer, ibuf->channels, ibuf->x, ibuf->y);
ColormanageProcessor *processor = STREQ(from_colorspace, to_colorspace) ?
nullptr :
IMB_colormanagement_colorspace_processor_new(
from_colorspace, to_colorspace);
if (processor && IMB_colormanagement_processor_is_noop(processor)) {
IMB_colormanagement_processor_free(processor);
processor = nullptr;
}
/* convert float to byte */
IMB_buffer_byte_from_float(ibuf->byte_buffer.data,
buffer,
ibuf->channels,
ibuf->dither,
IB_PROFILE_SRGB,
IB_PROFILE_SRGB,
false,
ibuf->x,
ibuf->y,
ibuf->x,
ibuf->x);
MEM_freeN(buffer);
/* At 4 floats per pixel, this is 32KB of data, and fits into typical CPU L1 cache. */
static constexpr int grain_size = 2048;
threading::parallel_for(
IndexRange(IMB_get_pixel_count(ibuf)), grain_size, [&](const IndexRange range) {
/* Copy chunk of source float pixels into a local buffer. */
Array<float, grain_size * 4> buffer(range.size() * ibuf->channels);
buffer.as_mutable_span().copy_from(
Span(ibuf->float_buffer.data + range.first() * ibuf->channels, buffer.size()));
/* Unpremultiply alpha if needed. */
if (predivide) {
IMB_unpremultiply_rect_float(buffer.data(), ibuf->channels, range.size(), 1);
}
/* Convert to byte color space if needed. */
if (processor) {
IMB_colormanagement_processor_apply(
processor, buffer.data(), range.size(), 1, ibuf->channels, false);
}
/* Convert to bytes. */
IMB_buffer_byte_from_float(ibuf->byte_buffer.data + range.first() * 4,
buffer.data(),
ibuf->channels,
ibuf->dither,
IB_PROFILE_SRGB,
IB_PROFILE_SRGB,
false,
range.size(),
1,
ibuf->x,
ibuf->x);
});
if (processor != nullptr) {
IMB_colormanagement_processor_free(processor);
}
/* ensure user flag is reset */
ibuf->userflags &= ~IB_RECT_INVALID;
@@ -657,6 +676,8 @@ void IMB_byte_from_float(ImBuf *ibuf)
void IMB_float_from_byte_ex(ImBuf *dst, const ImBuf *src, const rcti *region_to_update)
{
using namespace blender;
BLI_assert_msg(dst->float_buffer.data != nullptr,
"Destination buffer should have a float buffer assigned.");
BLI_assert_msg(src->byte_buffer.data != nullptr,
@@ -673,64 +694,53 @@ void IMB_float_from_byte_ex(ImBuf *dst, const ImBuf *src, const rcti *region_to_
BLI_assert_msg(region_to_update->ymax <= dst->y,
"Region to update should be clipped to the given buffers.");
float *rect_float = dst->float_buffer.data;
rect_float += (region_to_update->xmin + region_to_update->ymin * dst->x) * 4;
uchar *rect = src->byte_buffer.data;
rect += (region_to_update->xmin + region_to_update->ymin * dst->x) * 4;
const int region_width = BLI_rcti_size_x(region_to_update);
const int region_height = BLI_rcti_size_y(region_to_update);
const bool premultiply_alpha = IMB_alpha_affects_rgb(src);
/* Convert byte buffer to float buffer without color or alpha conversion. */
IMB_buffer_float_from_byte(rect_float,
rect,
IB_PROFILE_SRGB,
IB_PROFILE_SRGB,
false,
region_width,
region_height,
src->x,
dst->x);
threading::parallel_for(
IndexRange(region_to_update->ymin, region_height), 64, [&](const IndexRange y_range) {
const uchar *src_ptr = src->byte_buffer.data;
src_ptr += (region_to_update->xmin + y_range.first() * dst->x) * 4;
float *dst_ptr = dst->float_buffer.data;
dst_ptr += (region_to_update->xmin + y_range.first() * dst->x) * 4;
/* Perform color space conversion from rect color space to linear. */
float *float_ptr = rect_float;
for (int i = 0; i < region_height; i++) {
IMB_colormanagement_colorspace_to_scene_linear(
float_ptr, region_width, 1, dst->channels, src->byte_buffer.colorspace, false);
float_ptr += 4 * dst->x;
}
/* Convert byte -> float without color or alpha conversions. */
IMB_buffer_float_from_byte(dst_ptr,
src_ptr,
IB_PROFILE_SRGB,
IB_PROFILE_SRGB,
false,
region_width,
y_range.size(),
src->x,
dst->x);
/* Perform alpha conversion. */
if (IMB_alpha_affects_rgb(src)) {
float_ptr = rect_float;
for (int i = 0; i < region_height; i++) {
IMB_premultiply_rect_float(float_ptr, dst->channels, region_width, 1);
float_ptr += 4 * dst->x;
}
}
/* Convert to scene linear color space, and premultiply alpha if needed. */
float *dst_ptr_line = dst_ptr;
for ([[maybe_unused]] const int64_t y : y_range) {
IMB_colormanagement_colorspace_to_scene_linear(
dst_ptr_line, region_width, 1, dst->channels, src->byte_buffer.colorspace, false);
if (premultiply_alpha) {
IMB_premultiply_rect_float(dst_ptr_line, dst->channels, region_width, 1);
}
dst_ptr_line += 4 * dst->x;
}
});
}
void IMB_float_from_byte(ImBuf *ibuf)
{
/* verify if we byte and float buffers */
/* Nothing to do if there's no byte buffer. */
if (ibuf->byte_buffer.data == nullptr) {
return;
}
/* allocate float buffer outside of image buffer,
* so work-in-progress color space conversion doesn't
* interfere with other parts of blender
*/
float *rect_float = ibuf->float_buffer.data;
if (rect_float == nullptr) {
rect_float = MEM_calloc_arrayN<float>(4 * IMB_get_pixel_count(ibuf), "IMB_float_from_byte");
if (rect_float == nullptr) {
/* Allocate float buffer if needed. */
if (ibuf->float_buffer.data == nullptr) {
if (!IMB_alloc_float_pixels(ibuf, 4, false)) {
return;
}
ibuf->channels = 4;
IMB_assign_float_buffer(ibuf, rect_float, IB_TAKE_OWNERSHIP);
}
rcti region_to_update;