From 44b7d7592ddca832ffa93edaafd1ff45b334162a Mon Sep 17 00:00:00 2001 From: Aras Pranckevicius Date: Fri, 5 Sep 2025 18:59:18 +0200 Subject: [PATCH] ImBuf: multi-thread IMB_byte_from_float / IMB_float_from_byte Both were largely or completely single threaded. They are used in various places, but testing their usage in VSE compositor modifier branch (!139634), applying a default "do nothing" compositor modifier on a 1080p image (on Ryzen 5950X): 51.4ms -> 12.2ms Details about IMB_byte_from_float: - No longer allocate a full new float buffer, instead do all work in a local small (32KB size, half of typical L1 cache) job-local buffer. - Previous code was doing un-premultiply + OCIO + premultiply + un-premultiply again. That is pointless; just do un-premultiply once. Details about IMB_float_from_byte / IMB_float_from_byte_ex: - Remove incorrect code around"allocate float buffer outside of image buffer" since it was not actually true to begin with. - Inside threaded part, do color space conversion and premultiply at once per-scanline, so that data stays in CPU caches more. Pull Request: https://projects.blender.org/blender/blender/pulls/145716 --- source/blender/imbuf/intern/conversion.cc | 150 ++++++++++++---------- 1 file changed, 80 insertions(+), 70 deletions(-) diff --git a/source/blender/imbuf/intern/conversion.cc b/source/blender/imbuf/intern/conversion.cc index c0b24bc98aa..d14a70df980 100644 --- a/source/blender/imbuf/intern/conversion.cc +++ b/source/blender/imbuf/intern/conversion.cc @@ -7,6 +7,7 @@ * \ingroup imbuf */ +#include "BLI_array.hh" #include "BLI_rect.h" #include "BLI_task.hh" @@ -603,14 +604,16 @@ void IMB_buffer_byte_from_byte(uchar *rect_to, void IMB_byte_from_float(ImBuf *ibuf) { - /* verify we have a float buffer */ + using namespace blender; + + /* Nothing to do if there's no float buffer */ if (ibuf->float_buffer.data == nullptr) { return; } - /* create byte rect if it didn't exist yet */ + /* Allocate byte buffer if needed. */ if (ibuf->byte_buffer.data == nullptr) { - if (IMB_alloc_byte_pixels(ibuf, false) == 0) { + if (!IMB_alloc_byte_pixels(ibuf, false)) { return; } } @@ -623,33 +626,49 @@ void IMB_byte_from_float(ImBuf *ibuf) IMB_colormanagement_role_colorspace_name_get( COLOR_ROLE_DEFAULT_BYTE) : ibuf->byte_buffer.colorspace->name().c_str(); - - float *buffer = static_cast(MEM_dupallocN(ibuf->float_buffer.data)); - - /* first make float buffer in byte space */ const bool predivide = IMB_alpha_affects_rgb(ibuf); - IMB_colormanagement_transform_float( - buffer, ibuf->x, ibuf->y, ibuf->channels, from_colorspace, to_colorspace, predivide); - - /* convert from float's premul alpha to byte's straight alpha */ - if (IMB_alpha_affects_rgb(ibuf)) { - IMB_unpremultiply_rect_float(buffer, ibuf->channels, ibuf->x, ibuf->y); + ColormanageProcessor *processor = STREQ(from_colorspace, to_colorspace) ? + nullptr : + IMB_colormanagement_colorspace_processor_new( + from_colorspace, to_colorspace); + if (processor && IMB_colormanagement_processor_is_noop(processor)) { + IMB_colormanagement_processor_free(processor); + processor = nullptr; } - /* convert float to byte */ - IMB_buffer_byte_from_float(ibuf->byte_buffer.data, - buffer, - ibuf->channels, - ibuf->dither, - IB_PROFILE_SRGB, - IB_PROFILE_SRGB, - false, - ibuf->x, - ibuf->y, - ibuf->x, - ibuf->x); - - MEM_freeN(buffer); + /* At 4 floats per pixel, this is 32KB of data, and fits into typical CPU L1 cache. */ + static constexpr int grain_size = 2048; + threading::parallel_for( + IndexRange(IMB_get_pixel_count(ibuf)), grain_size, [&](const IndexRange range) { + /* Copy chunk of source float pixels into a local buffer. */ + Array buffer(range.size() * ibuf->channels); + buffer.as_mutable_span().copy_from( + Span(ibuf->float_buffer.data + range.first() * ibuf->channels, buffer.size())); + /* Unpremultiply alpha if needed. */ + if (predivide) { + IMB_unpremultiply_rect_float(buffer.data(), ibuf->channels, range.size(), 1); + } + /* Convert to byte color space if needed. */ + if (processor) { + IMB_colormanagement_processor_apply( + processor, buffer.data(), range.size(), 1, ibuf->channels, false); + } + /* Convert to bytes. */ + IMB_buffer_byte_from_float(ibuf->byte_buffer.data + range.first() * 4, + buffer.data(), + ibuf->channels, + ibuf->dither, + IB_PROFILE_SRGB, + IB_PROFILE_SRGB, + false, + range.size(), + 1, + ibuf->x, + ibuf->x); + }); + if (processor != nullptr) { + IMB_colormanagement_processor_free(processor); + } /* ensure user flag is reset */ ibuf->userflags &= ~IB_RECT_INVALID; @@ -657,6 +676,8 @@ void IMB_byte_from_float(ImBuf *ibuf) void IMB_float_from_byte_ex(ImBuf *dst, const ImBuf *src, const rcti *region_to_update) { + using namespace blender; + BLI_assert_msg(dst->float_buffer.data != nullptr, "Destination buffer should have a float buffer assigned."); BLI_assert_msg(src->byte_buffer.data != nullptr, @@ -673,64 +694,53 @@ void IMB_float_from_byte_ex(ImBuf *dst, const ImBuf *src, const rcti *region_to_ BLI_assert_msg(region_to_update->ymax <= dst->y, "Region to update should be clipped to the given buffers."); - float *rect_float = dst->float_buffer.data; - rect_float += (region_to_update->xmin + region_to_update->ymin * dst->x) * 4; - uchar *rect = src->byte_buffer.data; - rect += (region_to_update->xmin + region_to_update->ymin * dst->x) * 4; const int region_width = BLI_rcti_size_x(region_to_update); const int region_height = BLI_rcti_size_y(region_to_update); + const bool premultiply_alpha = IMB_alpha_affects_rgb(src); - /* Convert byte buffer to float buffer without color or alpha conversion. */ - IMB_buffer_float_from_byte(rect_float, - rect, - IB_PROFILE_SRGB, - IB_PROFILE_SRGB, - false, - region_width, - region_height, - src->x, - dst->x); + threading::parallel_for( + IndexRange(region_to_update->ymin, region_height), 64, [&](const IndexRange y_range) { + const uchar *src_ptr = src->byte_buffer.data; + src_ptr += (region_to_update->xmin + y_range.first() * dst->x) * 4; + float *dst_ptr = dst->float_buffer.data; + dst_ptr += (region_to_update->xmin + y_range.first() * dst->x) * 4; - /* Perform color space conversion from rect color space to linear. */ - float *float_ptr = rect_float; - for (int i = 0; i < region_height; i++) { - IMB_colormanagement_colorspace_to_scene_linear( - float_ptr, region_width, 1, dst->channels, src->byte_buffer.colorspace, false); - float_ptr += 4 * dst->x; - } + /* Convert byte -> float without color or alpha conversions. */ + IMB_buffer_float_from_byte(dst_ptr, + src_ptr, + IB_PROFILE_SRGB, + IB_PROFILE_SRGB, + false, + region_width, + y_range.size(), + src->x, + dst->x); - /* Perform alpha conversion. */ - if (IMB_alpha_affects_rgb(src)) { - float_ptr = rect_float; - for (int i = 0; i < region_height; i++) { - IMB_premultiply_rect_float(float_ptr, dst->channels, region_width, 1); - float_ptr += 4 * dst->x; - } - } + /* Convert to scene linear color space, and premultiply alpha if needed. */ + float *dst_ptr_line = dst_ptr; + for ([[maybe_unused]] const int64_t y : y_range) { + IMB_colormanagement_colorspace_to_scene_linear( + dst_ptr_line, region_width, 1, dst->channels, src->byte_buffer.colorspace, false); + if (premultiply_alpha) { + IMB_premultiply_rect_float(dst_ptr_line, dst->channels, region_width, 1); + } + dst_ptr_line += 4 * dst->x; + } + }); } void IMB_float_from_byte(ImBuf *ibuf) { - /* verify if we byte and float buffers */ + /* Nothing to do if there's no byte buffer. */ if (ibuf->byte_buffer.data == nullptr) { return; } - /* allocate float buffer outside of image buffer, - * so work-in-progress color space conversion doesn't - * interfere with other parts of blender - */ - float *rect_float = ibuf->float_buffer.data; - if (rect_float == nullptr) { - rect_float = MEM_calloc_arrayN(4 * IMB_get_pixel_count(ibuf), "IMB_float_from_byte"); - - if (rect_float == nullptr) { + /* Allocate float buffer if needed. */ + if (ibuf->float_buffer.data == nullptr) { + if (!IMB_alloc_float_pixels(ibuf, 4, false)) { return; } - - ibuf->channels = 4; - - IMB_assign_float_buffer(ibuf, rect_float, IB_TAKE_OWNERSHIP); } rcti region_to_update;