From 44b7d7592ddca832ffa93edaafd1ff45b334162a Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius <aras@nesnausk.org>
Date: Fri, 5 Sep 2025 18:59:18 +0200
Subject: [PATCH] ImBuf: multi-thread IMB_byte_from_float / IMB_float_from_byte

Both were largely or completely single threaded.

They are used in various places, but testing their usage in VSE
compositor modifier branch (!139634), applying a default "do nothing"
compositor modifier on a 1080p image (on Ryzen 5950X):
51.4ms -> 12.2ms

Details about IMB_byte_from_float:
- No longer allocate a full new float buffer, instead do all work in
  a local small (32KB size, half of typical L1 cache) job-local buffer.
- Previous code was doing un-premultiply + OCIO + premultiply
  + un-premultiply again. That is pointless; just do
  un-premultiply once.

Details about IMB_float_from_byte / IMB_float_from_byte_ex:
- Remove incorrect code around"allocate float buffer outside of image
  buffer" since it was not actually true to begin with.
- Inside threaded part, do color space conversion and premultiply at
  once per-scanline, so that data stays in CPU caches more.

Pull Request: https://projects.blender.org/blender/blender/pulls/145716
---
 source/blender/imbuf/intern/conversion.cc | 150 ++++++++++++----------
 1 file changed, 80 insertions(+), 70 deletions(-)
diff --git a/source/blender/imbuf/intern/conversion.cc b/source/blender/imbuf/intern/conversion.cc
index c0b24bc98aa..d14a70df980 100644
--- a/source/blender/imbuf/intern/conversion.cc
+++ b/source/blender/imbuf/intern/conversion.cc
@@ -7,6 +7,7 @@
  * \ingroup imbuf
  */
 
+#include "BLI_array.hh"
 #include "BLI_rect.h"
 #include "BLI_task.hh"
 
@@ -603,14 +604,16 @@ void IMB_buffer_byte_from_byte(uchar *rect_to,
 
 void IMB_byte_from_float(ImBuf *ibuf)
 {
-  /* verify we have a float buffer */
+  using namespace blender;
+
+  /* Nothing to do if there's no float buffer */
   if (ibuf->float_buffer.data == nullptr) {
     return;
   }
 
-  /* create byte rect if it didn't exist yet */
+  /* Allocate byte buffer if needed. */
   if (ibuf->byte_buffer.data == nullptr) {
-    if (IMB_alloc_byte_pixels(ibuf, false) == 0) {
+    if (!IMB_alloc_byte_pixels(ibuf, false)) {
       return;
     }
   }
@@ -623,33 +626,49 @@ void IMB_byte_from_float(ImBuf *ibuf)
                                   IMB_colormanagement_role_colorspace_name_get(
                                       COLOR_ROLE_DEFAULT_BYTE) :
                                   ibuf->byte_buffer.colorspace->name().c_str();
-
-  float *buffer = static_cast<float *>(MEM_dupallocN(ibuf->float_buffer.data));
-
-  /* first make float buffer in byte space */
   const bool predivide = IMB_alpha_affects_rgb(ibuf);
-  IMB_colormanagement_transform_float(
-      buffer, ibuf->x, ibuf->y, ibuf->channels, from_colorspace, to_colorspace, predivide);
-
-  /* convert from float's premul alpha to byte's straight alpha */
-  if (IMB_alpha_affects_rgb(ibuf)) {
-    IMB_unpremultiply_rect_float(buffer, ibuf->channels, ibuf->x, ibuf->y);
+  ColormanageProcessor *processor = STREQ(from_colorspace, to_colorspace) ?
+                                        nullptr :
+                                        IMB_colormanagement_colorspace_processor_new(
+                                            from_colorspace, to_colorspace);
+  if (processor && IMB_colormanagement_processor_is_noop(processor)) {
+    IMB_colormanagement_processor_free(processor);
+    processor = nullptr;
   }
 
-  /* convert float to byte */
-  IMB_buffer_byte_from_float(ibuf->byte_buffer.data,
-                             buffer,
-                             ibuf->channels,
-                             ibuf->dither,
-                             IB_PROFILE_SRGB,
-                             IB_PROFILE_SRGB,
-                             false,
-                             ibuf->x,
-                             ibuf->y,
-                             ibuf->x,
-                             ibuf->x);
-
-  MEM_freeN(buffer);
+  /* At 4 floats per pixel, this is 32KB of data, and fits into typical CPU L1 cache. */
+  static constexpr int grain_size = 2048;
+  threading::parallel_for(
+      IndexRange(IMB_get_pixel_count(ibuf)), grain_size, [&](const IndexRange range) {
+        /* Copy chunk of source float pixels into a local buffer. */
+        Array<float, grain_size * 4> buffer(range.size() * ibuf->channels);
+        buffer.as_mutable_span().copy_from(
+            Span(ibuf->float_buffer.data + range.first() * ibuf->channels, buffer.size()));
+        /* Unpremultiply alpha if needed. */
+        if (predivide) {
+          IMB_unpremultiply_rect_float(buffer.data(), ibuf->channels, range.size(), 1);
+        }
+        /* Convert to byte color space if needed. */
+        if (processor) {
+          IMB_colormanagement_processor_apply(
+              processor, buffer.data(), range.size(), 1, ibuf->channels, false);
+        }
+        /* Convert to bytes. */
+        IMB_buffer_byte_from_float(ibuf->byte_buffer.data + range.first() * 4,
+                                   buffer.data(),
+                                   ibuf->channels,
+                                   ibuf->dither,
+                                   IB_PROFILE_SRGB,
+                                   IB_PROFILE_SRGB,
+                                   false,
+                                   range.size(),
+                                   1,
+                                   ibuf->x,
+                                   ibuf->x);
+      });
+  if (processor != nullptr) {
+    IMB_colormanagement_processor_free(processor);
+  }
 
   /* ensure user flag is reset */
   ibuf->userflags &= ~IB_RECT_INVALID;
@@ -657,6 +676,8 @@ void IMB_byte_from_float(ImBuf *ibuf)
 
 void IMB_float_from_byte_ex(ImBuf *dst, const ImBuf *src, const rcti *region_to_update)
 {
+  using namespace blender;
+
   BLI_assert_msg(dst->float_buffer.data != nullptr,
                  "Destination buffer should have a float buffer assigned.");
   BLI_assert_msg(src->byte_buffer.data != nullptr,
@@ -673,64 +694,53 @@ void IMB_float_from_byte_ex(ImBuf *dst, const ImBuf *src, const rcti *region_to_
   BLI_assert_msg(region_to_update->ymax <= dst->y,
                  "Region to update should be clipped to the given buffers.");
 
-  float *rect_float = dst->float_buffer.data;
-  rect_float += (region_to_update->xmin + region_to_update->ymin * dst->x) * 4;
-  uchar *rect = src->byte_buffer.data;
-  rect += (region_to_update->xmin + region_to_update->ymin * dst->x) * 4;
   const int region_width = BLI_rcti_size_x(region_to_update);
   const int region_height = BLI_rcti_size_y(region_to_update);
+  const bool premultiply_alpha = IMB_alpha_affects_rgb(src);
 
-  /* Convert byte buffer to float buffer without color or alpha conversion. */
-  IMB_buffer_float_from_byte(rect_float,
-                             rect,
-                             IB_PROFILE_SRGB,
-                             IB_PROFILE_SRGB,
-                             false,
-                             region_width,
-                             region_height,
-                             src->x,
-                             dst->x);
+  threading::parallel_for(
+      IndexRange(region_to_update->ymin, region_height), 64, [&](const IndexRange y_range) {
+        const uchar *src_ptr = src->byte_buffer.data;
+        src_ptr += (region_to_update->xmin + y_range.first() * dst->x) * 4;
+        float *dst_ptr = dst->float_buffer.data;
+        dst_ptr += (region_to_update->xmin + y_range.first() * dst->x) * 4;
 
-  /* Perform color space conversion from rect color space to linear. */
-  float *float_ptr = rect_float;
-  for (int i = 0; i < region_height; i++) {
-    IMB_colormanagement_colorspace_to_scene_linear(
-        float_ptr, region_width, 1, dst->channels, src->byte_buffer.colorspace, false);
-    float_ptr += 4 * dst->x;
-  }
+        /* Convert byte -> float without color or alpha conversions. */
+        IMB_buffer_float_from_byte(dst_ptr,
+                                   src_ptr,
+                                   IB_PROFILE_SRGB,
+                                   IB_PROFILE_SRGB,
+                                   false,
+                                   region_width,
+                                   y_range.size(),
+                                   src->x,
+                                   dst->x);
 
-  /* Perform alpha conversion. */
-  if (IMB_alpha_affects_rgb(src)) {
-    float_ptr = rect_float;
-    for (int i = 0; i < region_height; i++) {
-      IMB_premultiply_rect_float(float_ptr, dst->channels, region_width, 1);
-      float_ptr += 4 * dst->x;
-    }
-  }
+        /* Convert to scene linear color space, and premultiply alpha if needed. */
+        float *dst_ptr_line = dst_ptr;
+        for ([[maybe_unused]] const int64_t y : y_range) {
+          IMB_colormanagement_colorspace_to_scene_linear(
+              dst_ptr_line, region_width, 1, dst->channels, src->byte_buffer.colorspace, false);
+          if (premultiply_alpha) {
+            IMB_premultiply_rect_float(dst_ptr_line, dst->channels, region_width, 1);
+          }
+          dst_ptr_line += 4 * dst->x;
+        }
+      });
 }
 
 void IMB_float_from_byte(ImBuf *ibuf)
 {
-  /* verify if we byte and float buffers */
+  /* Nothing to do if there's no byte buffer. */
   if (ibuf->byte_buffer.data == nullptr) {
     return;
   }
 
-  /* allocate float buffer outside of image buffer,
-   * so work-in-progress color space conversion doesn't
-   * interfere with other parts of blender
-   */
-  float *rect_float = ibuf->float_buffer.data;
-  if (rect_float == nullptr) {
-    rect_float = MEM_calloc_arrayN<float>(4 * IMB_get_pixel_count(ibuf), "IMB_float_from_byte");
-
-    if (rect_float == nullptr) {
+  /* Allocate float buffer if needed. */
+  if (ibuf->float_buffer.data == nullptr) {
+    if (!IMB_alloc_float_pixels(ibuf, 4, false)) {
       return;
     }
-
-    ibuf->channels = 4;
-
-    IMB_assign_float_buffer(ibuf, rect_float, IB_TAKE_OWNERSHIP);
   }
 
   rcti region_to_update;