IMB: Speedups, fixes and cleanups to various image scaling functions

API: merged IMB_scalefastImBuf, IMB_scaleImBuf, IMB_scaleImBuf_threaded
into one function IMB_scale with enum IMBScaleFilter {Nearest, Bilinear, Box}
and bool "threaded" param.

Performance:
- Box filtering (née IMB_scaleImBuf) can be multi-threaded now.
- Nearest filtering (née IMB_scalefastImBuf) can be multi-threaded now.
  Also fixes a performance regression on float images caused by the fix in #126234.
- Bilinear filtering (née IMB_scaleImBuf_threaded) is several times faster now.

Correctness:
- Nearest and Box filtering: no longer lose half of the edge pixels when
  scaling up.
- Box: fixed garbage results (and possible out of bounds reads) for non-4
  channel float images.
- Bilinear: no longer shifts image when scaling up.
- Bilinear: properly filters when scaling down by 2x2.

Test coverage:
- Add gtest coverage for various IMB_scale modes.
- Add an IMB_performance_test performance test, run manually.

More details, images and performance numbers in PR.

Pull Request: https://projects.blender.org/blender/blender/pulls/126390
This commit is contained in:
Aras Pranckevicius
2024-08-19 16:50:05 +02:00
committed by Aras Pranckevicius
parent be0d2e19b5
commit 6d93bf6b44
26 changed files with 918 additions and 1333 deletions

View File

@@ -122,7 +122,7 @@ ImBuf *BKE_icon_geom_rasterize(const Icon_Geom *geom, const uint size_x, const u
BLI_bitmap_draw_2d_tri_v2i(UNPACK3(data.pt), tri_fill_smooth, &data);
}
}
IMB_scaleImBuf(ibuf, size_x, size_y);
IMB_scale(ibuf, size_x, size_y, IMBScaleFilter::Box, false);
return ibuf;
}

View File

@@ -799,7 +799,7 @@ bool BKE_image_scale(Image *image, int width, int height, ImageUser *iuser)
ibuf = BKE_image_acquire_ibuf(image, iuser, &lock);
if (ibuf) {
IMB_scaleImBuf(ibuf, width, height);
IMB_scale(ibuf, width, height, IMBScaleFilter::Box, false);
BKE_image_mark_dirty(image, ibuf);
}
@@ -4923,7 +4923,7 @@ ImBuf *BKE_image_preview(Image *ima, const short max_size, short *r_width, short
BKE_image_release_ibuf(ima, image_ibuf, lock);
/* Resize. */
IMB_scaleImBuf(preview, scale * image_ibuf->x, scale * image_ibuf->y);
IMB_scale(preview, scale * image_ibuf->x, scale * image_ibuf->y, IMBScaleFilter::Box, false);
IMB_rect_from_float(preview);
return preview;

View File

@@ -656,7 +656,7 @@ static ImBuf *update_do_scale(const uchar *rect,
/* Scale pixels. */
ImBuf *ibuf = IMB_allocFromBuffer(rect, rect_float, part_w, part_h, 4);
IMB_scaleImBuf(ibuf, *w, *h);
IMB_scale(ibuf, *w, *h, IMBScaleFilter::Box, false);
return ibuf;
}

View File

@@ -1047,7 +1047,7 @@ static ImBuf *get_undistorted_ibuf(MovieClip *clip, MovieDistortion *distortion,
undistibuf = BKE_tracking_undistort_frame(&clip->tracking, ibuf, ibuf->x, ibuf->y, 0.0f);
}
IMB_scaleImBuf(undistibuf, ibuf->x, ibuf->y);
IMB_scale(undistibuf, ibuf->x, ibuf->y, IMBScaleFilter::Box, false);
return undistibuf;
}
@@ -1771,13 +1771,7 @@ static void movieclip_build_proxy_ibuf(
recty = ibuf->y * size / 100.0f;
scaleibuf = IMB_dupImBuf(ibuf);
if (threaded) {
IMB_scaleImBuf_threaded(scaleibuf, short(rectx), short(recty));
}
else {
IMB_scaleImBuf(scaleibuf, short(rectx), short(recty));
}
IMB_scale(scaleibuf, rectx, recty, IMBScaleFilter::Bilinear, threaded);
quality = clip->proxy.quality;
scaleibuf->ftype = IMB_FTYPE_JPG;

View File

@@ -407,7 +407,7 @@ void BKE_previewimg_ensure(PreviewImage *prv, const int size)
icon_w = icon_h = ICON_RENDER_DEFAULT_HEIGHT;
}
IMB_scaleImBuf(thumb, icon_w, icon_h);
IMB_scale(thumb, icon_w, icon_h, IMBScaleFilter::Box, false);
prv->w[ICON_SIZE_ICON] = icon_w;
prv->h[ICON_SIZE_ICON] = icon_h;
prv->rect[ICON_SIZE_ICON] = (uint *)MEM_dupallocN(thumb->byte_buffer.data);

View File

@@ -741,7 +741,11 @@ static ImBuf *accessor_get_ibuf(TrackingImageAccessor *accessor,
if (final_ibuf == orig_ibuf) {
final_ibuf = IMB_dupImBuf(orig_ibuf);
}
IMB_scaleImBuf(final_ibuf, orig_ibuf->x / (1 << downscale), orig_ibuf->y / (1 << downscale));
IMB_scale(final_ibuf,
orig_ibuf->x / (1 << downscale),
orig_ibuf->y / (1 << downscale),
IMBScaleFilter::Box,
false);
}
/* Apply possible transformation. */
if (transform != nullptr) {

View File

@@ -350,7 +350,7 @@ void ui_draw_but_IMAGE(ARegion * /*region*/,
if (w != ibuf->x || h != ibuf->y) {
/* We scale the bitmap, rather than have OGL do a worse job. */
IMB_scaleImBuf(ibuf, w, h);
IMB_scale(ibuf, w, h, IMBScaleFilter::Box, false);
}
float col[4] = {1.0f, 1.0f, 1.0f, 1.0f};

View File

@@ -799,7 +799,7 @@ static void icon_verify_datatoc(IconImage *iimg)
iimg->datatoc_rect, iimg->datatoc_size, IB_rect, nullptr, "<matcap icon>");
/* w and h were set on initialize */
if (bbuf->x != iimg->h && bbuf->y != iimg->w) {
IMB_scaleImBuf(bbuf, iimg->w, iimg->h);
IMB_scale(bbuf, iimg->w, iimg->h, IMBScaleFilter::Box, false);
}
iimg->rect = IMB_steal_byte_buffer(bbuf);

View File

@@ -1664,7 +1664,7 @@ static void ui_tooltip_from_clip(MovieClip &clip, uiTooltipData &data)
if (ibuf) {
/* Resize. */
float scale = float(200.0f * UI_SCALE_FAC) / float(std::max(ibuf->x, ibuf->y));
IMB_scaleImBuf(ibuf, scale * ibuf->x, scale * ibuf->y);
IMB_scale(ibuf, scale * ibuf->x, scale * ibuf->y, IMBScaleFilter::Box, false);
IMB_rect_from_float(ibuf);
uiTooltipImage image_data;

View File

@@ -1370,7 +1370,7 @@ static void icon_copy_rect(ImBuf *ibuf, uint w, uint h, uint *rect)
dx = (w - ex) / 2;
dy = (h - ey) / 2;
IMB_scalefastImBuf(ima, ex, ey);
IMB_scale(ima, ex, ey, IMBScaleFilter::Nearest, false);
/* if needed, convert to 32 bits */
if (ima->byte_buffer.data == nullptr) {

View File

@@ -3242,7 +3242,7 @@ static int image_scale_exec(bContext *C, wmOperator *op)
ED_image_undo_push_begin_with_image(op->type->name, ima, ibuf, &iuser);
ibuf->userflags |= IB_DISPLAY_BUFFER_INVALID;
IMB_scaleImBuf(ibuf, size[0], size[1]);
IMB_scale(ibuf, size[0], size[1], IMBScaleFilter::Box, false);
BKE_image_mark_dirty(ima, ibuf);
BKE_image_release_ibuf(ima, ibuf, nullptr);

View File

@@ -355,7 +355,7 @@ void Canvas::loadMap(const char *iFileName, const char *iMapName, uint iNbLevels
ImBuf *scaledImg;
if ((qimg->x != width()) || (qimg->y != height())) {
scaledImg = IMB_dupImBuf(qimg);
IMB_scaleImBuf(scaledImg, width(), height());
IMB_scale(scaledImg, width(), height(), IMBScaleFilter::Box, false);
}
// deal with color image

View File

@@ -185,7 +185,9 @@ blender_add_lib(bf_imbuf "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
if(WITH_GTESTS)
set(TEST_SRC
intern/transform_test.cc
tests/IMB_scaling_test.cc
tests/IMB_transform_test.cc
)
blender_add_test_suite_lib(imbuf "${TEST_SRC}" "${INC}" "${INC_SYS}" "${LIB}")
add_subdirectory(tests/performance)
endif()

View File

@@ -368,7 +368,6 @@ void IMB_free_anim(ImBufAnim *anim);
#define FILTER_MASK_MARGIN 1
#define FILTER_MASK_USED 2
void IMB_filter(ImBuf *ibuf);
void IMB_mask_filter_extend(char *mask, int width, int height);
void IMB_mask_clear(ImBuf *ibuf, const char *mask, int val);
/**
@@ -392,17 +391,27 @@ void IMB_filtery(ImBuf *ibuf);
ImBuf *IMB_onehalf(ImBuf *ibuf1);
/**
* Return true if \a ibuf is modified.
*/
bool IMB_scaleImBuf(ImBuf *ibuf, unsigned int newx, unsigned int newy);
/** Interpolation filter used by `IMB_scale`. */
enum class IMBScaleFilter {
/** No filtering (point sampling): each result pixel copies one source pixel.
 * This is fastest but lowest quality. */
Nearest,
/** Bilinear filter: each pixel in the result image interpolates between 2x2
 * pixels of the source image. */
Bilinear,
/** Box filter. Behaves exactly like Bilinear when scaling up; gives better
 * results when scaling down by more than 2x (averages all covered source
 * pixels instead of sampling only 2x2). */
Box,
};
/**
 * Scale/resize image to new dimensions, in place.
 *
 * Both the byte and the float buffer of \a ibuf are scaled when present
 * (see #alloc_scale_dst_buffers in the implementation).
 *
 * \param ibuf: Image to resize; its `x`/`y` dimensions are updated on success.
 * \param newx, newy: New image size in pixels.
 * \param filter: Interpolation filter to use, see #IMBScaleFilter.
 * \param threaded: When true (the default), the work may be split across
 * multiple threads.
 * Return true if \a ibuf is modified.
 */
bool IMB_scale(ImBuf *ibuf,
unsigned int newx,
unsigned int newy,
IMBScaleFilter filter,
bool threaded = true);
bool IMB_saveiff(ImBuf *ibuf, const char *filepath, int flags);

View File

@@ -1,4 +1,5 @@
/* SPDX-FileCopyrightText: 2001-2002 NaN Holding BV. All rights reserved.
* SPDX-FileCopyrightText: 2024 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
@@ -8,6 +9,8 @@
#include <cmath>
#include "BLI_math_vector.hh"
#include "BLI_task.hh"
#include "BLI_utildefines.h"
#include "MEM_guardedalloc.h"
@@ -18,6 +21,11 @@
#include "BLI_sys_types.h" /* for intptr_t support */
using blender::float2;
using blender::float3;
using blender::float4;
using blender::uchar4;
static void imb_half_x_no_alloc(ImBuf *ibuf2, ImBuf *ibuf1)
{
uchar *p1, *_p1, *dest;
@@ -331,1358 +339,462 @@ ImBuf *IMB_onehalf(ImBuf *ibuf1)
return ibuf2;
}
/* q_scale_linear_interpolation helper functions */
static void enlarge_picture_byte(
uchar *src, uchar *dst, int src_width, int src_height, int dst_width, int dst_height)
static void alloc_scale_dst_buffers(
const ImBuf *ibuf, uint newx, uint newy, uchar4 **r_dst_byte, float **r_dst_float)
{
double ratiox = double(dst_width - 1.0) / double(src_width - 1.001);
double ratioy = double(dst_height - 1.0) / double(src_height - 1.001);
uintptr_t x_src, dx_src, x_dst;
uintptr_t y_src, dy_src, y_dst;
dx_src = 65536.0 / ratiox;
dy_src = 65536.0 / ratioy;
y_src = 0;
for (y_dst = 0; y_dst < dst_height; y_dst++) {
uchar *line1 = src + (y_src >> 16) * 4 * src_width;
uchar *line2 = line1 + 4 * src_width;
uintptr_t weight1y = 65536 - (y_src & 0xffff);
uintptr_t weight2y = 65536 - weight1y;
if ((y_src >> 16) == src_height - 1) {
line2 = line1;
*r_dst_byte = nullptr;
if (ibuf->byte_buffer.data != nullptr) {
*r_dst_byte = static_cast<uchar4 *>(
MEM_mallocN(sizeof(uchar4) * newx * newy, "scale_buf_byte"));
if (*r_dst_byte == nullptr) {
return;
}
x_src = 0;
for (x_dst = 0; x_dst < dst_width; x_dst++) {
uintptr_t weight1x = 65536 - (x_src & 0xffff);
uintptr_t weight2x = 65536 - weight1x;
ulong x = (x_src >> 16) * 4;
*dst++ = ((((line1[x] * weight1y) >> 16) * weight1x) >> 16) +
((((line2[x] * weight2y) >> 16) * weight1x) >> 16) +
((((line1[4 + x] * weight1y) >> 16) * weight2x) >> 16) +
((((line2[4 + x] * weight2y) >> 16) * weight2x) >> 16);
*dst++ = ((((line1[x + 1] * weight1y) >> 16) * weight1x) >> 16) +
((((line2[x + 1] * weight2y) >> 16) * weight1x) >> 16) +
((((line1[4 + x + 1] * weight1y) >> 16) * weight2x) >> 16) +
((((line2[4 + x + 1] * weight2y) >> 16) * weight2x) >> 16);
*dst++ = ((((line1[x + 2] * weight1y) >> 16) * weight1x) >> 16) +
((((line2[x + 2] * weight2y) >> 16) * weight1x) >> 16) +
((((line1[4 + x + 2] * weight1y) >> 16) * weight2x) >> 16) +
((((line2[4 + x + 2] * weight2y) >> 16) * weight2x) >> 16);
*dst++ = ((((line1[x + 3] * weight1y) >> 16) * weight1x) >> 16) +
((((line2[x + 3] * weight2y) >> 16) * weight1x) >> 16) +
((((line1[4 + x + 3] * weight1y) >> 16) * weight2x) >> 16) +
((((line2[4 + x + 3] * weight2y) >> 16) * weight2x) >> 16);
x_src += dx_src;
}
*r_dst_float = nullptr;
if (ibuf->float_buffer.data != nullptr) {
*r_dst_float = static_cast<float *>(
MEM_mallocN(sizeof(float) * ibuf->channels * newx * newy, "scale_buf_float"));
if (*r_dst_float == nullptr) {
if (*r_dst_byte) {
MEM_freeN(*r_dst_byte);
}
return;
}
y_src += dy_src;
}
}
struct scale_outpix_byte {
uintptr_t r;
uintptr_t g;
uintptr_t b;
uintptr_t a;
uintptr_t weight;
};
static void shrink_picture_byte(
uchar *src, uchar *dst, int src_width, int src_height, int dst_width, int dst_height)
static inline float4 load_pixel(const uchar4 *ptr)
{
double ratiox = double(dst_width) / double(src_width);
double ratioy = double(dst_height) / double(src_height);
uintptr_t x_src, dx_dst, x_dst;
uintptr_t y_src, dy_dst, y_dst;
intptr_t y_counter;
uchar *dst_begin = dst;
scale_outpix_byte *dst_line1 = nullptr;
scale_outpix_byte *dst_line2 = nullptr;
dst_line1 = (scale_outpix_byte *)MEM_callocN((dst_width + 1) * sizeof(scale_outpix_byte),
"shrink_picture_byte 1");
dst_line2 = (scale_outpix_byte *)MEM_callocN((dst_width + 1) * sizeof(scale_outpix_byte),
"shrink_picture_byte 2");
dx_dst = 65536.0 * ratiox;
dy_dst = 65536.0 * ratioy;
y_dst = 0;
y_counter = 65536;
for (y_src = 0; y_src < src_height; y_src++) {
uchar *line = src + y_src * 4 * src_width;
uintptr_t weight1y = 65535 - (y_dst & 0xffff);
uintptr_t weight2y = 65535 - weight1y;
x_dst = 0;
for (x_src = 0; x_src < src_width; x_src++) {
uintptr_t weight1x = 65535 - (x_dst & 0xffff);
uintptr_t weight2x = 65535 - weight1x;
uintptr_t x = x_dst >> 16;
uintptr_t w;
w = (weight1y * weight1x) >> 16;
/* Ensure correct rounding, without this you get ugly banding,
* or too low color values (ton). */
dst_line1[x].r += (line[0] * w + 32767) >> 16;
dst_line1[x].g += (line[1] * w + 32767) >> 16;
dst_line1[x].b += (line[2] * w + 32767) >> 16;
dst_line1[x].a += (line[3] * w + 32767) >> 16;
dst_line1[x].weight += w;
w = (weight2y * weight1x) >> 16;
dst_line2[x].r += (line[0] * w + 32767) >> 16;
dst_line2[x].g += (line[1] * w + 32767) >> 16;
dst_line2[x].b += (line[2] * w + 32767) >> 16;
dst_line2[x].a += (line[3] * w + 32767) >> 16;
dst_line2[x].weight += w;
w = (weight1y * weight2x) >> 16;
dst_line1[x + 1].r += (line[0] * w + 32767) >> 16;
dst_line1[x + 1].g += (line[1] * w + 32767) >> 16;
dst_line1[x + 1].b += (line[2] * w + 32767) >> 16;
dst_line1[x + 1].a += (line[3] * w + 32767) >> 16;
dst_line1[x + 1].weight += w;
w = (weight2y * weight2x) >> 16;
dst_line2[x + 1].r += (line[0] * w + 32767) >> 16;
dst_line2[x + 1].g += (line[1] * w + 32767) >> 16;
dst_line2[x + 1].b += (line[2] * w + 32767) >> 16;
dst_line2[x + 1].a += (line[3] * w + 32767) >> 16;
dst_line2[x + 1].weight += w;
x_dst += dx_dst;
line += 4;
}
y_dst += dy_dst;
y_counter -= dy_dst;
if (y_counter < 0) {
int val;
uintptr_t x;
scale_outpix_byte *temp;
y_counter += 65536;
for (x = 0; x < dst_width; x++) {
uintptr_t f = 0x80000000UL / dst_line1[x].weight;
*dst++ = (val = (dst_line1[x].r * f) >> 15) > 255 ? 255 : val;
*dst++ = (val = (dst_line1[x].g * f) >> 15) > 255 ? 255 : val;
*dst++ = (val = (dst_line1[x].b * f) >> 15) > 255 ? 255 : val;
*dst++ = (val = (dst_line1[x].a * f) >> 15) > 255 ? 255 : val;
}
memset(dst_line1, 0, dst_width * sizeof(scale_outpix_byte));
temp = dst_line1;
dst_line1 = dst_line2;
dst_line2 = temp;
}
}
if (dst - dst_begin < dst_width * dst_height * 4) {
int val;
uintptr_t x;
for (x = 0; x < dst_width; x++) {
uintptr_t f = 0x80000000UL / dst_line1[x].weight;
*dst++ = (val = (dst_line1[x].r * f) >> 15) > 255 ? 255 : val;
*dst++ = (val = (dst_line1[x].g * f) >> 15) > 255 ? 255 : val;
*dst++ = (val = (dst_line1[x].b * f) >> 15) > 255 ? 255 : val;
*dst++ = (val = (dst_line1[x].a * f) >> 15) > 255 ? 255 : val;
}
}
MEM_freeN(dst_line1);
MEM_freeN(dst_line2);
return float4(ptr[0]);
}
static inline float4 load_pixel(const float *ptr)
{
return float4(ptr[0]);
}
static inline float4 load_pixel(const float2 *ptr)
{
return float4(ptr[0]);
}
static inline float4 load_pixel(const float3 *ptr)
{
return float4(ptr[0]);
}
static inline float4 load_pixel(const float4 *ptr)
{
return float4(ptr[0]);
}
static inline void store_pixel(float4 pix, uchar4 *ptr)
{
*ptr = uchar4(blender::math::round(pix));
}
static inline void store_pixel(float4 pix, float *ptr)
{
*ptr = pix.x;
}
static inline void store_pixel(float4 pix, float2 *ptr)
{
memcpy(ptr, &pix, sizeof(*ptr));
}
static inline void store_pixel(float4 pix, float3 *ptr)
{
memcpy(ptr, &pix, sizeof(*ptr));
}
static inline void store_pixel(float4 pix, float4 *ptr)
{
*ptr = pix;
}
static void q_scale_byte(
uchar *in, uchar *out, int in_width, int in_height, int dst_width, int dst_height)
{
if (dst_width > in_width && dst_height > in_height) {
enlarge_picture_byte(in, out, in_width, in_height, dst_width, dst_height);
}
else if (dst_width < in_width && dst_height < in_height) {
shrink_picture_byte(in, out, in_width, in_height, dst_width, dst_height);
}
}
struct ScaleDownX {
template<typename T>
static void op(const T *src, T *dst, int ibufx, int ibufy, int newx, int /*newy*/, bool threaded)
{
using namespace blender;
const float add = (ibufx - 0.01f) / newx;
const float inv_add = 1.0f / add;
const int grain_size = threaded ? 32 : ibufy;
threading::parallel_for(IndexRange(ibufy), grain_size, [&](IndexRange range) {
for (const int y : range) {
const T *src_ptr = src + y * ibufx;
T *dst_ptr = dst + y * newx;
float sample = 0.0f;
float4 val(0.0f);
for (int x = 0; x < newx; x++) {
float4 nval = -val * sample;
sample += add;
while (sample >= 1.0f) {
sample -= 1.0f;
nval += load_pixel(src_ptr);
src_ptr++;
}
val = load_pixel(src_ptr);
src_ptr++;
float4 pix = (nval + sample * val) * inv_add;
store_pixel(pix, dst_ptr);
dst_ptr++;
static void enlarge_picture_float(
float *src, float *dst, int src_width, int src_height, int dst_width, int dst_height)
{
double ratiox = double(dst_width - 1.0) / double(src_width - 1.001);
double ratioy = double(dst_height - 1.0) / double(src_height - 1.001);
uintptr_t x_dst;
uintptr_t y_dst;
double x_src, dx_src;
double y_src, dy_src;
dx_src = 1.0 / ratiox;
dy_src = 1.0 / ratioy;
y_src = 0;
for (y_dst = 0; y_dst < dst_height; y_dst++) {
float *line1 = src + int(y_src) * 4 * src_width;
const float *line2 = line1 + 4 * src_width;
const float weight1y = float(1.0 - (y_src - int(y_src)));
const float weight2y = 1.0f - weight1y;
if (int(y_src) == src_height - 1) {
line2 = line1;
}
x_src = 0;
for (x_dst = 0; x_dst < dst_width; x_dst++) {
const float weight1x = float(1.0 - (x_src - int(x_src)));
const float weight2x = float(1.0f - weight1x);
const float w11 = weight1y * weight1x;
const float w21 = weight2y * weight1x;
const float w12 = weight1y * weight2x;
const float w22 = weight2y * weight2x;
uintptr_t x = int(x_src) * 4;
*dst++ = line1[x] * w11 + line2[x] * w21 + line1[4 + x] * w12 + line2[4 + x] * w22;
*dst++ = line1[x + 1] * w11 + line2[x + 1] * w21 + line1[4 + x + 1] * w12 +
line2[4 + x + 1] * w22;
*dst++ = line1[x + 2] * w11 + line2[x + 2] * w21 + line1[4 + x + 2] * w12 +
line2[4 + x + 2] * w22;
*dst++ = line1[x + 3] * w11 + line2[x + 3] * w21 + line1[4 + x + 3] * w12 +
line2[4 + x + 3] * w22;
x_src += dx_src;
}
y_src += dy_src;
}
}
struct scale_outpix_float {
float r;
float g;
float b;
float a;
float weight;
};
static void shrink_picture_float(
const float *src, float *dst, int src_width, int src_height, int dst_width, int dst_height)
{
double ratiox = double(dst_width) / double(src_width);
double ratioy = double(dst_height) / double(src_height);
uintptr_t x_src;
uintptr_t y_src;
float dx_dst, x_dst;
float dy_dst, y_dst;
float y_counter;
const float *dst_begin = dst;
scale_outpix_float *dst_line1;
scale_outpix_float *dst_line2;
dst_line1 = (scale_outpix_float *)MEM_callocN((dst_width + 1) * sizeof(scale_outpix_float),
"shrink_picture_float 1");
dst_line2 = (scale_outpix_float *)MEM_callocN((dst_width + 1) * sizeof(scale_outpix_float),
"shrink_picture_float 2");
dx_dst = ratiox;
dy_dst = ratioy;
y_dst = 0;
y_counter = 1.0;
for (y_src = 0; y_src < src_height; y_src++) {
const float *line = src + y_src * 4 * src_width;
uintptr_t weight1y = 1.0f - (y_dst - int(y_dst));
uintptr_t weight2y = 1.0f - weight1y;
x_dst = 0;
for (x_src = 0; x_src < src_width; x_src++) {
uintptr_t weight1x = 1.0f - (x_dst - int(x_dst));
uintptr_t weight2x = 1.0f - weight1x;
uintptr_t x = int(x_dst);
float w;
w = weight1y * weight1x;
dst_line1[x].r += line[0] * w;
dst_line1[x].g += line[1] * w;
dst_line1[x].b += line[2] * w;
dst_line1[x].a += line[3] * w;
dst_line1[x].weight += w;
w = weight2y * weight1x;
dst_line2[x].r += line[0] * w;
dst_line2[x].g += line[1] * w;
dst_line2[x].b += line[2] * w;
dst_line2[x].a += line[3] * w;
dst_line2[x].weight += w;
w = weight1y * weight2x;
dst_line1[x + 1].r += line[0] * w;
dst_line1[x + 1].g += line[1] * w;
dst_line1[x + 1].b += line[2] * w;
dst_line1[x + 1].a += line[3] * w;
dst_line1[x + 1].weight += w;
w = weight2y * weight2x;
dst_line2[x + 1].r += line[0] * w;
dst_line2[x + 1].g += line[1] * w;
dst_line2[x + 1].b += line[2] * w;
dst_line2[x + 1].a += line[3] * w;
dst_line2[x + 1].weight += w;
x_dst += dx_dst;
line += 4;
}
y_dst += dy_dst;
y_counter -= dy_dst;
if (y_counter < 0) {
uintptr_t x;
scale_outpix_float *temp;
y_counter += 1.0f;
for (x = 0; x < dst_width; x++) {
float f = 1.0f / dst_line1[x].weight;
*dst++ = dst_line1[x].r * f;
*dst++ = dst_line1[x].g * f;
*dst++ = dst_line1[x].b * f;
*dst++ = dst_line1[x].a * f;
}
memset(dst_line1, 0, dst_width * sizeof(scale_outpix_float));
temp = dst_line1;
dst_line1 = dst_line2;
dst_line2 = temp;
}
}
if (dst - dst_begin < dst_width * dst_height * 4) {
uintptr_t x;
for (x = 0; x < dst_width; x++) {
float f = 1.0f / dst_line1[x].weight;
*dst++ = dst_line1[x].r * f;
*dst++ = dst_line1[x].g * f;
*dst++ = dst_line1[x].b * f;
*dst++ = dst_line1[x].a * f;
}
}
MEM_freeN(dst_line1);
MEM_freeN(dst_line2);
}
static void q_scale_float(
float *in, float *out, int in_width, int in_height, int dst_width, int dst_height)
{
if (dst_width > in_width && dst_height > in_height) {
enlarge_picture_float(in, out, in_width, in_height, dst_width, dst_height);
}
else if (dst_width < in_width && dst_height < in_height) {
shrink_picture_float(in, out, in_width, in_height, dst_width, dst_height);
}
}
/**
* q_scale_linear_interpolation (derived from `ppmqscale`, http://libdv.sf.net)
*
* q stands for quick _and_ quality :)
*
* only handles common cases when we either
*
* scale both, x and y or
* shrink both, x and y
*
* but that is pretty fast:
* - does only blit once instead of two passes like the old code
* (fewer cache misses)
* - uses fixed point integer arithmetic for byte buffers
* - doesn't branch in tight loops
*
* Should be comparable in speed to the ImBuf ..._fast functions at least
* for byte-buffers.
*
* NOTE: disabled, due to unacceptable inaccuracy and quality loss, see bug #18609 (ton)
*/
static bool q_scale_linear_interpolation(ImBuf *ibuf, int newx, int newy)
{
if ((newx >= ibuf->x && newy <= ibuf->y) || (newx <= ibuf->x && newy >= ibuf->y)) {
return false;
}
if (ibuf->byte_buffer.data) {
uchar *newrect = static_cast<uchar *>(MEM_mallocN(sizeof(int) * newx * newy, "q_scale rect"));
q_scale_byte(ibuf->byte_buffer.data, newrect, ibuf->x, ibuf->y, newx, newy);
IMB_assign_byte_buffer(ibuf, newrect, IB_TAKE_OWNERSHIP);
}
if (ibuf->float_buffer.data) {
float *newrect = static_cast<float *>(
MEM_mallocN(sizeof(float[4]) * newx * newy, "q_scale rectfloat"));
q_scale_float(ibuf->float_buffer.data, newrect, ibuf->x, ibuf->y, newx, newy);
IMB_assign_float_buffer(ibuf, newrect, IB_TAKE_OWNERSHIP);
}
ibuf->x = newx;
ibuf->y = newy;
return true;
}
static ImBuf *scaledownx(ImBuf *ibuf, int newx)
{
const bool do_rect = (ibuf->byte_buffer.data != nullptr);
const bool do_float = (ibuf->float_buffer.data != nullptr);
const size_t rect_size = IMB_get_rect_len(ibuf) * 4;
uchar *rect, *_newrect, *newrect;
float *rectf, *_newrectf, *newrectf;
float sample, add, val[4], nval[4], valf[4], nvalf[4];
int x, y;
rectf = _newrectf = newrectf = nullptr;
rect = _newrect = newrect = nullptr;
nval[0] = nval[1] = nval[2] = nval[3] = 0.0f;
nvalf[0] = nvalf[1] = nvalf[2] = nvalf[3] = 0.0f;
if (!do_rect && !do_float) {
return ibuf;
}
if (do_rect) {
_newrect = static_cast<uchar *>(MEM_mallocN(sizeof(uchar[4]) * newx * ibuf->y, "scaledownx"));
if (_newrect == nullptr) {
return ibuf;
}
}
if (do_float) {
_newrectf = static_cast<float *>(
MEM_mallocN(sizeof(float[4]) * newx * ibuf->y, "scaledownxf"));
if (_newrectf == nullptr) {
if (_newrect) {
MEM_freeN(_newrect);
}
return ibuf;
}
}
add = (ibuf->x - 0.01) / newx;
if (do_rect) {
rect = ibuf->byte_buffer.data;
newrect = _newrect;
}
if (do_float) {
rectf = ibuf->float_buffer.data;
newrectf = _newrectf;
}
for (y = ibuf->y; y > 0; y--) {
sample = 0.0f;
val[0] = val[1] = val[2] = val[3] = 0.0f;
valf[0] = valf[1] = valf[2] = valf[3] = 0.0f;
for (x = newx; x > 0; x--) {
if (do_rect) {
nval[0] = -val[0] * sample;
nval[1] = -val[1] * sample;
nval[2] = -val[2] * sample;
nval[3] = -val[3] * sample;
}
if (do_float) {
nvalf[0] = -valf[0] * sample;
nvalf[1] = -valf[1] * sample;
nvalf[2] = -valf[2] * sample;
nvalf[3] = -valf[3] * sample;
}
sample += add;
while (sample >= 1.0f) {
sample -= 1.0f;
if (do_rect) {
nval[0] += rect[0];
nval[1] += rect[1];
nval[2] += rect[2];
nval[3] += rect[3];
rect += 4;
}
if (do_float) {
nvalf[0] += rectf[0];
nvalf[1] += rectf[1];
nvalf[2] += rectf[2];
nvalf[3] += rectf[3];
rectf += 4;
}
}
if (do_rect) {
val[0] = rect[0];
val[1] = rect[1];
val[2] = rect[2];
val[3] = rect[3];
rect += 4;
newrect[0] = roundf((nval[0] + sample * val[0]) / add);
newrect[1] = roundf((nval[1] + sample * val[1]) / add);
newrect[2] = roundf((nval[2] + sample * val[2]) / add);
newrect[3] = roundf((nval[3] + sample * val[3]) / add);
newrect += 4;
}
if (do_float) {
valf[0] = rectf[0];
valf[1] = rectf[1];
valf[2] = rectf[2];
valf[3] = rectf[3];
rectf += 4;
newrectf[0] = ((nvalf[0] + sample * valf[0]) / add);
newrectf[1] = ((nvalf[1] + sample * valf[1]) / add);
newrectf[2] = ((nvalf[2] + sample * valf[2]) / add);
newrectf[3] = ((nvalf[3] + sample * valf[3]) / add);
newrectf += 4;
}
sample -= 1.0f;
}
}
if (do_rect) {
// printf("%ld %ld\n", (uchar *)rect - ibuf->byte_buffer.data, rect_size);
BLI_assert((uchar *)rect - ibuf->byte_buffer.data == rect_size); /* see bug #26502. */
imb_freerectImBuf(ibuf);
IMB_assign_byte_buffer(ibuf, _newrect, IB_TAKE_OWNERSHIP);
}
if (do_float) {
// printf("%ld %ld\n", rectf - ibuf->float_buffer.data, rect_size);
BLI_assert((rectf - ibuf->float_buffer.data) == rect_size); /* see bug #26502. */
imb_freerectfloatImBuf(ibuf);
IMB_assign_float_buffer(ibuf, _newrectf, IB_TAKE_OWNERSHIP);
}
(void)rect_size; /* UNUSED in release builds */
ibuf->x = newx;
return ibuf;
}
static ImBuf *scaledowny(ImBuf *ibuf, int newy)
{
const bool do_rect = (ibuf->byte_buffer.data != nullptr);
const bool do_float = (ibuf->float_buffer.data != nullptr);
const size_t rect_size = IMB_get_rect_len(ibuf) * 4;
uchar *rect, *_newrect, *newrect;
float *rectf, *_newrectf, *newrectf;
float sample, add, val[4], nval[4], valf[4], nvalf[4];
int x, y, skipx;
rectf = _newrectf = newrectf = nullptr;
rect = _newrect = newrect = nullptr;
nval[0] = nval[1] = nval[2] = nval[3] = 0.0f;
nvalf[0] = nvalf[1] = nvalf[2] = nvalf[3] = 0.0f;
if (!do_rect && !do_float) {
return ibuf;
}
if (do_rect) {
_newrect = static_cast<uchar *>(MEM_mallocN(sizeof(uchar[4]) * newy * ibuf->x, "scaledowny"));
if (_newrect == nullptr) {
return ibuf;
}
}
if (do_float) {
_newrectf = static_cast<float *>(
MEM_mallocN(sizeof(float[4]) * newy * ibuf->x, "scaledownyf"));
if (_newrectf == nullptr) {
if (_newrect) {
MEM_freeN(_newrect);
}
return ibuf;
}
}
add = (ibuf->y - 0.01) / newy;
skipx = 4 * ibuf->x;
for (x = skipx - 4; x >= 0; x -= 4) {
if (do_rect) {
rect = ibuf->byte_buffer.data + x;
newrect = _newrect + x;
}
if (do_float) {
rectf = ibuf->float_buffer.data + x;
newrectf = _newrectf + x;
}
sample = 0.0f;
val[0] = val[1] = val[2] = val[3] = 0.0f;
valf[0] = valf[1] = valf[2] = valf[3] = 0.0f;
for (y = newy; y > 0; y--) {
if (do_rect) {
nval[0] = -val[0] * sample;
nval[1] = -val[1] * sample;
nval[2] = -val[2] * sample;
nval[3] = -val[3] * sample;
}
if (do_float) {
nvalf[0] = -valf[0] * sample;
nvalf[1] = -valf[1] * sample;
nvalf[2] = -valf[2] * sample;
nvalf[3] = -valf[3] * sample;
}
sample += add;
while (sample >= 1.0f) {
sample -= 1.0f;
if (do_rect) {
nval[0] += rect[0];
nval[1] += rect[1];
nval[2] += rect[2];
nval[3] += rect[3];
rect += skipx;
}
if (do_float) {
nvalf[0] += rectf[0];
nvalf[1] += rectf[1];
nvalf[2] += rectf[2];
nvalf[3] += rectf[3];
rectf += skipx;
}
}
if (do_rect) {
val[0] = rect[0];
val[1] = rect[1];
val[2] = rect[2];
val[3] = rect[3];
rect += skipx;
newrect[0] = roundf((nval[0] + sample * val[0]) / add);
newrect[1] = roundf((nval[1] + sample * val[1]) / add);
newrect[2] = roundf((nval[2] + sample * val[2]) / add);
newrect[3] = roundf((nval[3] + sample * val[3]) / add);
newrect += skipx;
}
if (do_float) {
valf[0] = rectf[0];
valf[1] = rectf[1];
valf[2] = rectf[2];
valf[3] = rectf[3];
rectf += skipx;
newrectf[0] = ((nvalf[0] + sample * valf[0]) / add);
newrectf[1] = ((nvalf[1] + sample * valf[1]) / add);
newrectf[2] = ((nvalf[2] + sample * valf[2]) / add);
newrectf[3] = ((nvalf[3] + sample * valf[3]) / add);
newrectf += skipx;
}
sample -= 1.0f;
}
}
if (do_rect) {
// printf("%ld %ld\n", (uchar *)rect - byte_buffer.data, rect_size);
BLI_assert((uchar *)rect - ibuf->byte_buffer.data == rect_size); /* see bug #26502. */
imb_freerectImBuf(ibuf);
IMB_assign_byte_buffer(ibuf, _newrect, IB_TAKE_OWNERSHIP);
}
if (do_float) {
// printf("%ld %ld\n", rectf - ibuf->float_buffer.data, rect_size);
BLI_assert((rectf - ibuf->float_buffer.data) == rect_size); /* see bug #26502. */
imb_freerectfloatImBuf(ibuf);
IMB_assign_float_buffer(ibuf, _newrectf, IB_TAKE_OWNERSHIP);
}
(void)rect_size; /* UNUSED in release builds */
ibuf->y = newy;
return ibuf;
}
static ImBuf *scaleupx(ImBuf *ibuf, int newx)
{
uchar *rect, *_newrect = nullptr, *newrect;
float *rectf, *_newrectf = nullptr, *newrectf;
int x, y;
bool do_rect = false, do_float = false;
if (ibuf == nullptr) {
return nullptr;
}
if (ibuf->byte_buffer.data == nullptr && ibuf->float_buffer.data == nullptr) {
return ibuf;
}
if (ibuf->byte_buffer.data) {
do_rect = true;
_newrect = static_cast<uchar *>(MEM_mallocN(newx * ibuf->y * sizeof(int), "scaleupx"));
if (_newrect == nullptr) {
return ibuf;
}
}
if (ibuf->float_buffer.data) {
do_float = true;
_newrectf = static_cast<float *>(MEM_mallocN(sizeof(float[4]) * newx * ibuf->y, "scaleupxf"));
if (_newrectf == nullptr) {
if (_newrect) {
MEM_freeN(_newrect);
}
return ibuf;
}
}
rect = ibuf->byte_buffer.data;
rectf = ibuf->float_buffer.data;
newrect = _newrect;
newrectf = _newrectf;
/* Special case, copy all columns, needed since the scaling logic assumes there is at least
* two rows to interpolate between causing out of bounds read for 1px images, see #70356. */
if (UNLIKELY(ibuf->x == 1)) {
if (do_rect) {
for (y = ibuf->y; y > 0; y--) {
for (x = newx; x > 0; x--) {
memcpy(newrect, rect, sizeof(char[4]));
newrect += 4;
}
rect += 4;
}
}
if (do_float) {
for (y = ibuf->y; y > 0; y--) {
for (x = newx; x > 0; x--) {
memcpy(newrectf, rectf, sizeof(float[4]));
newrectf += 4;
}
rectf += 4;
}
}
}
else {
const float add = (ibuf->x - 1.001) / (newx - 1.0);
float sample;
float val_a, nval_a, diff_a;
float val_b, nval_b, diff_b;
float val_g, nval_g, diff_g;
float val_r, nval_r, diff_r;
float val_af, nval_af, diff_af;
float val_bf, nval_bf, diff_bf;
float val_gf, nval_gf, diff_gf;
float val_rf, nval_rf, diff_rf;
val_a = nval_a = diff_a = val_b = nval_b = diff_b = 0;
val_g = nval_g = diff_g = val_r = nval_r = diff_r = 0;
val_af = nval_af = diff_af = val_bf = nval_bf = diff_bf = 0;
val_gf = nval_gf = diff_gf = val_rf = nval_rf = diff_rf = 0;
for (y = ibuf->y; y > 0; y--) {
sample = 0;
if (do_rect) {
val_a = rect[0];
nval_a = rect[4];
diff_a = nval_a - val_a;
val_a += 0.5f;
val_b = rect[1];
nval_b = rect[5];
diff_b = nval_b - val_b;
val_b += 0.5f;
val_g = rect[2];
nval_g = rect[6];
diff_g = nval_g - val_g;
val_g += 0.5f;
val_r = rect[3];
nval_r = rect[7];
diff_r = nval_r - val_r;
val_r += 0.5f;
rect += 8;
}
if (do_float) {
val_af = rectf[0];
nval_af = rectf[4];
diff_af = nval_af - val_af;
val_bf = rectf[1];
nval_bf = rectf[5];
diff_bf = nval_bf - val_bf;
val_gf = rectf[2];
nval_gf = rectf[6];
diff_gf = nval_gf - val_gf;
val_rf = rectf[3];
nval_rf = rectf[7];
diff_rf = nval_rf - val_rf;
rectf += 8;
}
for (x = newx; x > 0; x--) {
if (sample >= 1.0f) {
sample -= 1.0f;
}
}
});
}
};
if (do_rect) {
val_a = nval_a;
nval_a = rect[0];
diff_a = nval_a - val_a;
val_a += 0.5f;
struct ScaleDownY {
template<typename T>
static void op(const T *src, T *dst, int ibufx, int ibufy, int /*newx*/, int newy, bool threaded)
{
using namespace blender;
const float add = (ibufy - 0.01f) / newy;
const float inv_add = 1.0f / add;
val_b = nval_b;
nval_b = rect[1];
diff_b = nval_b - val_b;
val_b += 0.5f;
const int grain_size = threaded ? 32 : ibufx;
threading::parallel_for(IndexRange(ibufx), grain_size, [&](IndexRange range) {
for (const int x : range) {
const T *src_ptr = src + x;
T *dst_ptr = dst + x;
float sample = 0.0f;
float4 val(0.0f);
val_g = nval_g;
nval_g = rect[2];
diff_g = nval_g - val_g;
val_g += 0.5f;
val_r = nval_r;
nval_r = rect[3];
diff_r = nval_r - val_r;
val_r += 0.5f;
rect += 4;
for (int y = 0; y < newy; y++) {
float4 nval = -val * sample;
sample += add;
while (sample >= 1.0f) {
sample -= 1.0f;
nval += load_pixel(src_ptr);
src_ptr += ibufx;
}
if (do_float) {
val_af = nval_af;
nval_af = rectf[0];
diff_af = nval_af - val_af;
val_bf = nval_bf;
nval_bf = rectf[1];
diff_bf = nval_bf - val_bf;
val = load_pixel(src_ptr);
src_ptr += ibufx;
val_gf = nval_gf;
nval_gf = rectf[2];
diff_gf = nval_gf - val_gf;
float4 pix = (nval + sample * val) * inv_add;
store_pixel(pix, dst_ptr);
dst_ptr += ibufx;
val_rf = nval_rf;
nval_rf = rectf[3];
diff_rf = nval_rf - val_rf;
rectf += 4;
}
}
if (do_rect) {
newrect[0] = val_a + sample * diff_a;
newrect[1] = val_b + sample * diff_b;
newrect[2] = val_g + sample * diff_g;
newrect[3] = val_r + sample * diff_r;
newrect += 4;
}
if (do_float) {
newrectf[0] = val_af + sample * diff_af;
newrectf[1] = val_bf + sample * diff_bf;
newrectf[2] = val_gf + sample * diff_gf;
newrectf[3] = val_rf + sample * diff_rf;
newrectf += 4;
}
sample += add;
}
}
}
if (do_rect) {
imb_freerectImBuf(ibuf);
IMB_assign_byte_buffer(ibuf, _newrect, IB_TAKE_OWNERSHIP);
}
if (do_float) {
imb_freerectfloatImBuf(ibuf);
IMB_assign_float_buffer(ibuf, _newrectf, IB_TAKE_OWNERSHIP);
}
ibuf->x = newx;
return ibuf;
}
static ImBuf *scaleupy(ImBuf *ibuf, int newy)
{
uchar *rect, *_newrect = nullptr, *newrect;
float *rectf, *_newrectf = nullptr, *newrectf;
int x, y, skipx;
bool do_rect = false, do_float = false;
if (ibuf == nullptr) {
return nullptr;
}
if (ibuf->byte_buffer.data == nullptr && ibuf->float_buffer.data == nullptr) {
return ibuf;
}
if (ibuf->byte_buffer.data) {
do_rect = true;
_newrect = static_cast<uchar *>(MEM_mallocN(ibuf->x * newy * sizeof(int), "scaleupy"));
if (_newrect == nullptr) {
return ibuf;
}
}
if (ibuf->float_buffer.data) {
do_float = true;
_newrectf = static_cast<float *>(MEM_mallocN(sizeof(float[4]) * ibuf->x * newy, "scaleupyf"));
if (_newrectf == nullptr) {
if (_newrect) {
MEM_freeN(_newrect);
}
return ibuf;
}
}
rect = ibuf->byte_buffer.data;
rectf = ibuf->float_buffer.data;
newrect = _newrect;
newrectf = _newrectf;
skipx = 4 * ibuf->x;
/* Special case, copy all rows, needed since the scaling logic assumes there is at least
* two rows to interpolate between causing out of bounds read for 1px images, see #70356. */
if (UNLIKELY(ibuf->y == 1)) {
if (do_rect) {
for (y = newy; y > 0; y--) {
memcpy(newrect, rect, sizeof(char) * skipx);
newrect += skipx;
}
}
if (do_float) {
for (y = newy; y > 0; y--) {
memcpy(newrectf, rectf, sizeof(float) * skipx);
newrectf += skipx;
}
}
}
else {
const float add = (ibuf->y - 1.001) / (newy - 1.0);
float sample;
float val_a, nval_a, diff_a;
float val_b, nval_b, diff_b;
float val_g, nval_g, diff_g;
float val_r, nval_r, diff_r;
float val_af, nval_af, diff_af;
float val_bf, nval_bf, diff_bf;
float val_gf, nval_gf, diff_gf;
float val_rf, nval_rf, diff_rf;
val_a = nval_a = diff_a = val_b = nval_b = diff_b = 0;
val_g = nval_g = diff_g = val_r = nval_r = diff_r = 0;
val_af = nval_af = diff_af = val_bf = nval_bf = diff_bf = 0;
val_gf = nval_gf = diff_gf = val_rf = nval_rf = diff_rf = 0;
for (x = ibuf->x; x > 0; x--) {
sample = 0;
if (do_rect) {
rect = ibuf->byte_buffer.data + 4 * (x - 1);
newrect = _newrect + 4 * (x - 1);
val_a = rect[0];
nval_a = rect[skipx];
diff_a = nval_a - val_a;
val_a += 0.5f;
val_b = rect[1];
nval_b = rect[skipx + 1];
diff_b = nval_b - val_b;
val_b += 0.5f;
val_g = rect[2];
nval_g = rect[skipx + 2];
diff_g = nval_g - val_g;
val_g += 0.5f;
val_r = rect[3];
nval_r = rect[skipx + 3];
diff_r = nval_r - val_r;
val_r += 0.5f;
rect += 2 * skipx;
}
if (do_float) {
rectf = ibuf->float_buffer.data + 4 * (x - 1);
newrectf = _newrectf + 4 * (x - 1);
val_af = rectf[0];
nval_af = rectf[skipx];
diff_af = nval_af - val_af;
val_bf = rectf[1];
nval_bf = rectf[skipx + 1];
diff_bf = nval_bf - val_bf;
val_gf = rectf[2];
nval_gf = rectf[skipx + 2];
diff_gf = nval_gf - val_gf;
val_rf = rectf[3];
nval_rf = rectf[skipx + 3];
diff_rf = nval_rf - val_rf;
rectf += 2 * skipx;
}
for (y = newy; y > 0; y--) {
if (sample >= 1.0f) {
sample -= 1.0f;
}
}
});
}
};
if (do_rect) {
val_a = nval_a;
nval_a = rect[0];
diff_a = nval_a - val_a;
val_a += 0.5f;
val_b = nval_b;
nval_b = rect[1];
diff_b = nval_b - val_b;
val_b += 0.5f;
val_g = nval_g;
nval_g = rect[2];
diff_g = nval_g - val_g;
val_g += 0.5f;
val_r = nval_r;
nval_r = rect[3];
diff_r = nval_r - val_r;
val_r += 0.5f;
rect += skipx;
}
if (do_float) {
val_af = nval_af;
nval_af = rectf[0];
diff_af = nval_af - val_af;
val_bf = nval_bf;
nval_bf = rectf[1];
diff_bf = nval_bf - val_bf;
val_gf = nval_gf;
nval_gf = rectf[2];
diff_gf = nval_gf - val_gf;
val_rf = nval_rf;
nval_rf = rectf[3];
diff_rf = nval_rf - val_rf;
rectf += skipx;
/**
 * Linear filter for scaling up in X: interpolates between two neighboring
 * source pixels, sampling at pixel centers. The garbled legacy `scaleupx`
 * statements that were interleaved inside this struct are removed.
 */
struct ScaleUpX {
  template<typename T>
  static void op(const T *src, T *dst, int ibufx, int ibufy, int newx, int /*newy*/, bool threaded)
  {
    using namespace blender;
    const float add = (ibufx - 0.001f) / newx;
    /* Special case: source is 1px wide (see #70356). */
    if (UNLIKELY(ibufx == 1)) {
      for (int y = ibufy; y > 0; y--) {
        for (int x = newx; x > 0; x--) {
          *dst = *src;
          dst++;
        }
        src++;
      }
    }
    else {
      /* Rows are independent of each other; parallelize over rows. */
      const int grain_size = threaded ? 32 : ibufy;
      threading::parallel_for(IndexRange(ibufy), grain_size, [&](IndexRange range) {
        for (const int y : range) {
          /* Offset by half a pixel so sampling happens at pixel centers. */
          float sample = -0.5f + add * 0.5f;
          int counter = 0;
          const T *src_ptr = src + y * ibufx;
          T *dst_ptr = dst + y * newx;
          float4 val = load_pixel(src_ptr);
          float4 nval = load_pixel(src_ptr + 1);
          float4 diff = nval - val;
          src_ptr += 2;
          counter += 2;
          for (int x = 0; x < newx; x++) {
            if (sample >= 1.0f) {
              sample -= 1.0f;
              val = nval;
              nval = load_pixel(src_ptr);
              diff = nval - val;
              /* Clamp the read position at the last source column. */
              if (counter + 1 < ibufx) {
                src_ptr++;
                counter++;
              }
            }
            float4 pix = val + blender::math::max(sample, 0.0f) * diff;
            store_pixel(pix, dst_ptr);
            dst_ptr++;
            sample += add;
          }
        }
      });
    }
  }
};
/**
 * Linear filter for scaling up in Y: interpolates between two neighboring
 * source rows, sampling at pixel centers. Stray legacy lines
 * (`ibuf->y = newy; return ibuf;` and buffer-swap residue) that referenced
 * out-of-scope variables are removed.
 */
struct ScaleUpY {
  template<typename T>
  static void op(const T *src, T *dst, int ibufx, int ibufy, int /*newx*/, int newy, bool threaded)
  {
    using namespace blender;
    const float add = (ibufy - 0.001f) / newy;
    /* Special case: source is 1px high (see #70356). */
    if (UNLIKELY(ibufy == 1)) {
      for (int y = newy; y > 0; y--) {
        memcpy(dst, src, sizeof(T) * ibufx);
        dst += ibufx;
      }
    }
    else {
      /* Columns are independent of each other; parallelize over columns. */
      const int grain_size = threaded ? 32 : ibufx;
      threading::parallel_for(IndexRange(ibufx), grain_size, [&](IndexRange range) {
        for (const int x : range) {
          /* Offset by half a pixel so sampling happens at pixel centers. */
          float sample = -0.5f + add * 0.5f;
          int counter = 0;
          const T *src_ptr = src + x;
          T *dst_ptr = dst + x;
          float4 val = load_pixel(src_ptr);
          float4 nval = load_pixel(src_ptr + ibufx);
          float4 diff = nval - val;
          src_ptr += ibufx * 2;
          counter += 2;
          for (int y = 0; y < newy; y++) {
            if (sample >= 1.0f) {
              sample -= 1.0f;
              val = nval;
              nval = load_pixel(src_ptr);
              diff = nval - val;
              /* Clamp the read position at the last source row. */
              if (counter + 1 < ibufy) {
                src_ptr += ibufx;
                ++counter;
              }
            }
            float4 pix = val + blender::math::max(sample, 0.0f) * diff;
            store_pixel(pix, dst_ptr);
            dst_ptr += ibufx;
            sample += add;
          }
        }
      });
    }
  }
};
template<typename T>
static void instantiate_pixel_op(T & /*op*/,
const ImBuf *ibuf,
int newx,
int newy,
uchar4 *dst_byte,
float *dst_float,
bool threaded)
{
if (dst_byte != nullptr) {
const uchar4 *src = (const uchar4 *)ibuf->byte_buffer.data;
T::op(src, dst_byte, ibuf->x, ibuf->y, newx, newy, threaded);
}
if (dst_float != nullptr) {
if (ibuf->channels == 1) {
T::op(ibuf->float_buffer.data, dst_float, ibuf->x, ibuf->y, newx, newy, threaded);
}
else if (ibuf->channels == 2) {
const float2 *src = (const float2 *)ibuf->float_buffer.data;
T::op(src, (float2 *)dst_float, ibuf->x, ibuf->y, newx, newy, threaded);
}
else if (ibuf->channels == 3) {
const float3 *src = (const float3 *)ibuf->float_buffer.data;
T::op(src, (float3 *)dst_float, ibuf->x, ibuf->y, newx, newy, threaded);
}
else if (ibuf->channels == 4) {
const float4 *src = (const float4 *)ibuf->float_buffer.data;
T::op(src, (float4 *)dst_float, ibuf->x, ibuf->y, newx, newy, threaded);
}
}
}
bool IMB_scaleImBuf(ImBuf *ibuf, uint newx, uint newy)
static void scale_down_x_func(
const ImBuf *ibuf, int newx, int newy, uchar4 *dst_byte, float *dst_float, bool threaded)
{
BLI_assert_msg(newx > 0 && newy > 0, "Images must be at least 1 on both dimensions!");
if (ibuf == nullptr) {
return false;
}
if (ibuf->byte_buffer.data == nullptr && ibuf->float_buffer.data == nullptr) {
return false;
}
if (newx == ibuf->x && newy == ibuf->y) {
return false;
}
/* try to scale common cases in a fast way */
/* disabled, quality loss is unacceptable, see report #18609 (ton) */
if (false && q_scale_linear_interpolation(ibuf, newx, newy)) {
return true;
}
if (newx && (newx < ibuf->x)) {
scaledownx(ibuf, newx);
}
if (newy && (newy < ibuf->y)) {
scaledowny(ibuf, newy);
}
if (newx && (newx > ibuf->x)) {
scaleupx(ibuf, newx);
}
if (newy && (newy > ibuf->y)) {
scaleupy(ibuf, newy);
}
return true;
ScaleDownX op;
instantiate_pixel_op(op, ibuf, newx, newy, dst_byte, dst_float, threaded);
}
bool IMB_scalefastImBuf(ImBuf *ibuf, uint newx, uint newy)
static void scale_down_y_func(
const ImBuf *ibuf, int newx, int newy, uchar4 *dst_byte, float *dst_float, bool threaded)
{
BLI_assert_msg(newx > 0 && newy > 0, "Images must be at least 1 on both dimensions!");
ScaleDownY op;
instantiate_pixel_op(op, ibuf, newx, newy, dst_byte, dst_float, threaded);
}
/**
 * Apply linear up-scaling in X: dispatch #ScaleUpX over the image's pixel
 * storage types. Stray legacy variable declarations from the removed
 * `IMB_scalefastImBuf` that preceded this function are deleted.
 */
static void scale_up_x_func(
    const ImBuf *ibuf, int newx, int newy, uchar4 *dst_byte, float *dst_float, bool threaded)
{
  ScaleUpX op;
  instantiate_pixel_op(op, ibuf, newx, newy, dst_byte, dst_float, threaded);
}
/**
 * Apply linear up-scaling in Y: dispatch #ScaleUpY over the image's pixel
 * storage types. Stray legacy null-assignments from the removed
 * `IMB_scalefastImBuf` that preceded this function are deleted.
 */
static void scale_up_y_func(
    const ImBuf *ibuf, int newx, int newy, uchar4 *dst_byte, float *dst_float, bool threaded)
{
  ScaleUpY op;
  instantiate_pixel_op(op, ibuf, newx, newy, dst_byte, dst_float, threaded);
}
if (ibuf == nullptr) {
return false;
}
if (ibuf->byte_buffer.data) {
do_rect = true;
}
if (ibuf->float_buffer.data) {
do_float = true;
}
if (do_rect == false && do_float == false) {
return false;
using ScaleFunction = void (*)(
const ImBuf *ibuf, int newx, int newy, uchar4 *dst_byte, float *dst_float, bool threaded);
static void scale_with_function(ImBuf *ibuf, int newx, int newy, ScaleFunction func, bool threaded)
{
/* Allocate destination buffers. */
uchar4 *dst_byte = nullptr;
float *dst_float = nullptr;
alloc_scale_dst_buffers(ibuf, newx, newy, &dst_byte, &dst_float);
if (dst_byte == nullptr && dst_float == nullptr) {
return;
}
if (newx == ibuf->x && newy == ibuf->y) {
return false;
}
/* Do actual processing. */
func(ibuf, newx, newy, dst_byte, dst_float, threaded);
if (do_rect) {
_newrect = static_cast<uint *>(MEM_mallocN(newx * newy * sizeof(int), "scalefastimbuf"));
if (_newrect == nullptr) {
return false;
}
newrect = _newrect;
}
if (do_float) {
_newrectf = static_cast<float *>(
MEM_mallocN(sizeof(float) * ibuf->channels * newx * newy, "scalefastimbuf f"));
if (_newrectf == nullptr) {
if (_newrect) {
MEM_freeN(_newrect);
}
return false;
}
newrectf = _newrectf;
}
stepx = round(65536.0 * (ibuf->x - 1.0) / (newx - 1.0));
stepy = round(65536.0 * (ibuf->y - 1.0) / (newy - 1.0));
ofsy = 32768;
for (y = newy; y > 0; y--, ofsy += stepy) {
if (do_rect) {
rect = (uint *)ibuf->byte_buffer.data;
rect += (ofsy >> 16) * ibuf->x;
ofsx = 32768;
for (x = newx; x > 0; x--, ofsx += stepx) {
*newrect++ = rect[ofsx >> 16];
}
}
if (do_float) {
rectf = ibuf->float_buffer.data;
rectf += size_t(ofsy >> 16) * ibuf->x * ibuf->channels;
ofsx = 32768;
for (x = newx; x > 0; x--, ofsx += stepx) {
float *pixel = &rectf[size_t(ofsx >> 16) * ibuf->channels];
for (int c = 0; c < ibuf->channels; ++c) {
*newrectf++ = pixel[c];
}
}
}
}
if (do_rect) {
/* Modify image to point to new destination. */
if (dst_byte != nullptr) {
imb_freerectImBuf(ibuf);
IMB_assign_byte_buffer(ibuf, reinterpret_cast<uint8_t *>(_newrect), IB_TAKE_OWNERSHIP);
IMB_assign_byte_buffer(ibuf, reinterpret_cast<uint8_t *>(dst_byte), IB_TAKE_OWNERSHIP);
}
if (do_float) {
if (dst_float != nullptr) {
imb_freerectfloatImBuf(ibuf);
IMB_assign_float_buffer(ibuf, reinterpret_cast<float *>(_newrectf), IB_TAKE_OWNERSHIP);
IMB_assign_float_buffer(ibuf, dst_float, IB_TAKE_OWNERSHIP);
}
ibuf->x = newx;
ibuf->y = newy;
return true;
}
/* ******** threaded scaling ******** */
struct ScaleTreadInitData {
ImBuf *ibuf;
uint newx;
uint newy;
uchar *byte_buffer;
float *float_buffer;
};
struct ScaleThreadData {
ImBuf *ibuf;
uint newx;
uint newy;
int start_line;
int tot_line;
uchar *byte_buffer;
float *float_buffer;
};
static void scale_thread_init(void *data_v, int start_line, int tot_line, void *init_data_v)
static void imb_scale_box(ImBuf *ibuf, uint newx, uint newy, bool threaded)
{
ScaleThreadData *data = (ScaleThreadData *)data_v;
ScaleTreadInitData *init_data = (ScaleTreadInitData *)init_data_v;
data->ibuf = init_data->ibuf;
data->newx = init_data->newx;
data->newy = init_data->newy;
data->start_line = start_line;
data->tot_line = tot_line;
data->byte_buffer = init_data->byte_buffer;
data->float_buffer = init_data->float_buffer;
if (newx != 0 && (newx < ibuf->x)) {
scale_with_function(ibuf, newx, ibuf->y, scale_down_x_func, threaded);
}
if (newy != 0 && (newy < ibuf->y)) {
scale_with_function(ibuf, ibuf->x, newy, scale_down_y_func, threaded);
}
if (newx != 0 && (newx > ibuf->x)) {
scale_with_function(ibuf, newx, ibuf->y, scale_up_x_func, threaded);
}
if (newy != 0 && (newy > ibuf->y)) {
scale_with_function(ibuf, ibuf->x, newy, scale_up_y_func, threaded);
}
}
static void *do_scale_thread(void *data_v)
template<typename T>
static void scale_nearest(
const T *src, T *dst, int ibufx, int ibufy, int newx, int newy, blender::IndexRange y_range)
{
/* Nearest sample scaling. Step through pixels in fixed point coordinates. */
constexpr int FRAC_BITS = 16;
int64_t stepx = ((int64_t(ibufx) << FRAC_BITS) + newx / 2) / newx;
int64_t stepy = ((int64_t(ibufy) << FRAC_BITS) + newy / 2) / newy;
int64_t posy = y_range.first() * stepy;
dst += y_range.first() * newx;
for (const int y : y_range) {
UNUSED_VARS(y);
const T *row = src + (posy >> FRAC_BITS) * ibufx;
int64_t posx = 0;
for (int x = 0; x < newx; x++, posx += stepx) {
*dst = row[posx >> FRAC_BITS];
dst++;
}
posy += stepy;
}
}
static void scale_nearest_func(
const ImBuf *ibuf, int newx, int newy, uchar4 *dst_byte, float *dst_float, bool threaded)
{
using namespace blender;
const int grain_size = threaded ? 64 : newy;
threading::parallel_for(IndexRange(newy), grain_size, [&](IndexRange y_range) {
/* Byte pixels. */
if (dst_byte != nullptr) {
const uchar4 *src = (const uchar4 *)ibuf->byte_buffer.data;
scale_nearest(src, dst_byte, ibuf->x, ibuf->y, newx, newy, y_range);
}
/* Float pixels. */
if (dst_float != nullptr) {
if (ibuf->channels == 1) {
scale_nearest(ibuf->float_buffer.data, dst_float, ibuf->x, ibuf->y, newx, newy, y_range);
}
else if (ibuf->channels == 2) {
const float2 *src = (const float2 *)ibuf->float_buffer.data;
scale_nearest(src, (float2 *)dst_float, ibuf->x, ibuf->y, newx, newy, y_range);
}
else if (ibuf->channels == 3) {
const float3 *src = (const float3 *)ibuf->float_buffer.data;
scale_nearest(src, (float3 *)dst_float, ibuf->x, ibuf->y, newx, newy, y_range);
}
else if (ibuf->channels == 4) {
const float4 *src = (const float4 *)ibuf->float_buffer.data;
scale_nearest(src, (float4 *)dst_float, ibuf->x, ibuf->y, newx, newy, y_range);
}
}
});
}
static void scale_bilinear_func(
const ImBuf *ibuf, int newx, int newy, uchar4 *dst_byte, float *dst_float, bool threaded)
{
using namespace blender;
using namespace blender::imbuf;
ScaleThreadData *data = (ScaleThreadData *)data_v;
ImBuf *ibuf = data->ibuf;
int i;
float factor_x = float(ibuf->x) / data->newx;
float factor_y = float(ibuf->y) / data->newy;
for (i = 0; i < data->tot_line; i++) {
int y = data->start_line + i;
int x;
const int grain_size = threaded ? 32 : newy;
threading::parallel_for(IndexRange(newy), grain_size, [&](IndexRange y_range) {
float factor_x = float(ibuf->x) / newx;
float factor_y = float(ibuf->y) / newy;
for (x = 0; x < data->newx; x++) {
float u = float(x) * factor_x;
float v = float(y) * factor_y;
int offset = y * data->newx + x;
if (data->byte_buffer) {
interpolate_bilinear_border_byte(ibuf, data->byte_buffer + 4 * offset, u, v);
}
if (data->float_buffer) {
float *pixel = data->float_buffer + ibuf->channels * offset;
blender::math::interpolate_bilinear_border_fl(
ibuf->float_buffer.data, pixel, ibuf->x, ibuf->y, ibuf->channels, u, v);
for (const int y : y_range) {
float v = (float(y) + 0.5f) * factor_y - 0.5f;
for (int x = 0; x < newx; x++) {
float u = (float(x) + 0.5f) * factor_x - 0.5f;
int64_t offset = int64_t(y) * newx + x;
if (dst_byte) {
interpolate_bilinear_byte(ibuf, (uchar *)(dst_byte + offset), u, v);
}
if (dst_float) {
float *pixel = dst_float + ibuf->channels * offset;
math::interpolate_bilinear_fl(
ibuf->float_buffer.data, pixel, ibuf->x, ibuf->y, ibuf->channels, u, v);
}
}
}
}
return nullptr;
});
}
void IMB_scaleImBuf_threaded(ImBuf *ibuf, uint newx, uint newy)
bool IMB_scale(
ImBuf *ibuf, unsigned int newx, unsigned int newy, IMBScaleFilter filter, bool threaded)
{
BLI_assert_msg(newx > 0 && newy > 0, "Images must be at least 1 on both dimensions!");
ScaleTreadInitData init_data = {nullptr};
/* prepare initialization data */
init_data.ibuf = ibuf;
init_data.newx = newx;
init_data.newy = newy;
if (ibuf->byte_buffer.data) {
init_data.byte_buffer = static_cast<uchar *>(
MEM_mallocN(4 * newx * newy * sizeof(char), "threaded scale byte buffer"));
if (ibuf == nullptr) {
return false;
}
if (newx == ibuf->x && newy == ibuf->y) {
return false;
}
if (ibuf->float_buffer.data) {
init_data.float_buffer = static_cast<float *>(
MEM_mallocN(ibuf->channels * newx * newy * sizeof(float), "threaded scale float buffer"));
if (filter == IMBScaleFilter::Nearest) {
scale_with_function(ibuf, newx, newy, scale_nearest_func, threaded);
}
/* actual scaling threads */
IMB_processor_apply_threaded(
newy, sizeof(ScaleThreadData), &init_data, scale_thread_init, do_scale_thread);
/* alter image buffer */
ibuf->x = newx;
ibuf->y = newy;
if (ibuf->byte_buffer.data) {
imb_freerectImBuf(ibuf);
IMB_assign_byte_buffer(ibuf, init_data.byte_buffer, IB_TAKE_OWNERSHIP);
else if (filter == IMBScaleFilter::Bilinear) {
scale_with_function(ibuf, newx, newy, scale_bilinear_func, threaded);
}
if (ibuf->float_buffer.data) {
imb_freerectfloatImBuf(ibuf);
IMB_assign_float_buffer(ibuf, init_data.float_buffer, IB_TAKE_OWNERSHIP);
else if (filter == IMBScaleFilter::Box) {
imb_scale_box(ibuf, newx, newy, threaded);
}
else {
BLI_assert_unreachable();
return false;
}
return true;
}

View File

@@ -573,7 +573,7 @@ static void imb_stereo3d_squeeze_ImBuf(ImBuf *ibuf,
return;
}
IMB_scaleImBuf_threaded(ibuf, x, y);
IMB_scale(ibuf, x, y, IMBScaleFilter::Bilinear);
}
static void imb_stereo3d_unsqueeze_ImBuf(ImBuf *ibuf,
@@ -589,7 +589,7 @@ static void imb_stereo3d_unsqueeze_ImBuf(ImBuf *ibuf,
return;
}
IMB_scaleImBuf_threaded(ibuf, x, y);
IMB_scale(ibuf, x, y, IMBScaleFilter::Bilinear);
}
static void imb_stereo3d_squeeze_rectf(
@@ -621,7 +621,7 @@ static void imb_stereo3d_squeeze_rectf(
width,
width);
IMB_scaleImBuf_threaded(ibuf, x, y);
IMB_scale(ibuf, x, y, IMBScaleFilter::Bilinear);
memcpy(rectf, ibuf->float_buffer.data, x * y * sizeof(float[4]));
IMB_freeImBuf(ibuf);
}
@@ -654,7 +654,7 @@ static void imb_stereo3d_squeeze_rect(
width,
width);
IMB_scaleImBuf_threaded(ibuf, x, y);
IMB_scale(ibuf, x, y, IMBScaleFilter::Bilinear);
memcpy(rect, ibuf->byte_buffer.data, x * y * sizeof(uint));
IMB_freeImBuf(ibuf);
}

View File

@@ -424,7 +424,7 @@ static ImBuf *thumb_create_ex(const char *file_path,
}
imb_freerectfloatImBuf(img);
}
IMB_scaleImBuf(img, ex, ey);
IMB_scale(img, ex, ey, IMBScaleFilter::Box, false);
}
}
SNPRINTF(desc, "Thumbnail for %s", uri);

View File

@@ -206,7 +206,7 @@ static void *imb_gpu_get_data(const ImBuf *ibuf,
const float *rect_float = (is_float_rect) ? (float *)data_rect : nullptr;
ImBuf *scale_ibuf = IMB_allocFromBuffer(rect, rect_float, ibuf->x, ibuf->y, 4);
IMB_scaleImBuf(scale_ibuf, UNPACK2(rescale_size));
IMB_scale(scale_ibuf, UNPACK2(rescale_size), IMBScaleFilter::Box, false);
if (freedata) {
MEM_freeN(data_rect);

View File

@@ -0,0 +1,294 @@
/* SPDX-FileCopyrightText: 2024 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "testing/testing.h"
#include "BLI_math_vector_types.hh"
#include "IMB_imbuf.hh"
namespace blender::imbuf::tests {
/* Build a 6x2 byte image laid out as three 2x2 blocks: nearest filtering
 * should pick one corner pixel of each block, bilinear filtering the block
 * average. */
static ImBuf *create_6x2_test_image()
{
  ImBuf *img = IMB_allocImBuf(6, 2, 32, IB_rect);
  uchar4 *pixels = reinterpret_cast<uchar4 *>(img->byte_buffer.data);
  /* Block 0 (columns 0-1). */
  pixels[0] = uchar4(0, 0, 0, 255);
  pixels[1] = uchar4(255, 0, 0, 255);
  pixels[6] = uchar4(255, 255, 0, 255);
  pixels[7] = uchar4(255, 255, 255, 255);
  /* Block 1 (columns 2-3): constant RGB, varying alpha. */
  pixels[2] = uchar4(133, 55, 31, 13);
  pixels[3] = uchar4(133, 55, 31, 15);
  pixels[8] = uchar4(133, 55, 31, 17);
  pixels[9] = uchar4(133, 55, 31, 19);
  /* Block 2 (columns 4-5). */
  pixels[4] = uchar4(50, 200, 0, 255);
  pixels[5] = uchar4(55, 0, 32, 254);
  pixels[10] = uchar4(56, 0, 64, 253);
  pixels[11] = uchar4(57, 0, 96, 252);
  return img;
}
/* Build a 6x2 float image with `channels` channels; each value is a simple
 * deterministic gradient of position and channel index. */
static ImBuf *create_6x2_test_image_fl(int channels)
{
  ImBuf *img = IMB_allocImBuf(6, 2, 32, IB_rectfloat);
  img->channels = channels;
  float *dst = img->float_buffer.data;
  const int64_t pixel_count = int64_t(img->x) * img->y;
  for (int64_t px = 0; px < pixel_count; px++) {
    const int x = int(px % img->x);
    const int y = int(px / img->x);
    for (int ch = 0; ch < channels; ch++) {
      *dst++ = x * 1.25f + y * 0.5f + ch * 0.125f;
    }
  }
  return img;
}
/* Scale the 6x2 test image (byte, or float with `float_channels`) down to
 * 3x1. `threaded` selects the threaded bilinear path, otherwise `nearest`
 * picks between nearest and box filtering. */
static ImBuf *scale_2x_smaller(bool nearest, bool threaded, int float_channels = 0)
{
  ImBuf *img = float_channels > 0 ? create_6x2_test_image_fl(float_channels) :
                                    create_6x2_test_image();
  const int ww = 3;
  const int hh = 1;
  IMBScaleFilter filter = IMBScaleFilter::Box;
  if (threaded) {
    filter = IMBScaleFilter::Bilinear;
  }
  else if (nearest) {
    filter = IMBScaleFilter::Nearest;
  }
  IMB_scale(img, ww, hh, filter, threaded);
  return img;
}
/* Scale the 6x2 test image (byte, or float with `float_channels`) down to a
 * single pixel. `threaded` selects the threaded bilinear path, otherwise
 * `nearest` picks between nearest and box filtering. */
static ImBuf *scale_to_1x1(bool nearest, bool threaded, int float_channels = 0)
{
  ImBuf *img = float_channels > 0 ? create_6x2_test_image_fl(float_channels) :
                                    create_6x2_test_image();
  const int ww = 1;
  const int hh = 1;
  IMBScaleFilter filter = IMBScaleFilter::Box;
  if (threaded) {
    filter = IMBScaleFilter::Bilinear;
  }
  else if (nearest) {
    filter = IMBScaleFilter::Nearest;
  }
  IMB_scale(img, ww, hh, filter, threaded);
  return img;
}
/* Scale the 6x2 test image (byte, or float with `float_channels`) up to 9x7,
 * a non-integer factor on both axes. `threaded` selects the threaded
 * bilinear path, otherwise `nearest` picks between nearest and box. */
static ImBuf *scale_fractional_larger(bool nearest, bool threaded, int float_channels = 0)
{
  ImBuf *img = float_channels > 0 ? create_6x2_test_image_fl(float_channels) :
                                    create_6x2_test_image();
  const int ww = 9;
  const int hh = 7;
  IMBScaleFilter filter = IMBScaleFilter::Box;
  if (threaded) {
    filter = IMBScaleFilter::Bilinear;
  }
  else if (nearest) {
    filter = IMBScaleFilter::Nearest;
  }
  IMB_scale(img, ww, hh, filter, threaded);
  return img;
}
TEST(imbuf_scaling, nearest_2x_smaller)
{
  /* Nearest 2x downscale: expect one corner pixel of each 2x2 source block. */
  ImBuf *res = scale_2x_smaller(true, false);
  const uchar4 *got = reinterpret_cast<uchar4 *>(res->byte_buffer.data);
  EXPECT_EQ(uint4(got[0]), uint4(0, 0, 0, 255));
  EXPECT_EQ(uint4(got[1]), uint4(133, 55, 31, 13));
  EXPECT_EQ(uint4(got[2]), uint4(50, 200, 0, 255));
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, threaded_2x_smaller)
{
  /* Threaded (bilinear) 2x downscale: expect the average of each 2x2 block. */
  ImBuf *res = scale_2x_smaller(false, true);
  const uchar4 *got = reinterpret_cast<uchar4 *>(res->byte_buffer.data);
  EXPECT_EQ(uint4(got[0]), uint4(191, 128, 64, 255));
  EXPECT_EQ(uint4(got[1]), uint4(133, 55, 31, 16));
  EXPECT_EQ(uint4(got[2]), uint4(55, 50, 48, 254));
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, bilinear_2x_smaller)
{
  /* Single-threaded bilinear 2x downscale of byte pixels. */
  ImBuf *res = scale_2x_smaller(false, false);
  const uchar4 *got = reinterpret_cast<uchar4 *>(res->byte_buffer.data);
  /* Note: IMB_transform results in (191, 128, 64, 255), <same>,
   * (55, 50, 48, 254) i.e. different rounding. */
  EXPECT_EQ(uint4(got[0]), uint4(191, 127, 63, 255));
  EXPECT_EQ(uint4(got[1]), uint4(133, 55, 31, 16));
  EXPECT_EQ(uint4(got[2]), uint4(55, 50, 48, 253));
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, nearest_to_1x1)
{
  /* Nearest to 1x1 picks a single source pixel. */
  ImBuf *res = scale_to_1x1(true, false);
  const uchar4 *got = reinterpret_cast<uchar4 *>(res->byte_buffer.data);
  EXPECT_EQ(uint4(got[0]), uint4(0, 0, 0, 255));
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, threaded_to_1x1)
{
  /* Threaded (bilinear) reduction of the whole image to one pixel. */
  ImBuf *res = scale_to_1x1(false, true);
  const uchar4 *got = reinterpret_cast<uchar4 *>(res->byte_buffer.data);
  EXPECT_EQ(uint4(got[0]), uint4(133, 55, 31, 16));
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, bilinear_to_1x1)
{
  /* Single-threaded bilinear reduction of the whole image to one pixel. */
  ImBuf *res = scale_to_1x1(false, false);
  const uchar4 *got = reinterpret_cast<uchar4 *>(res->byte_buffer.data);
  EXPECT_EQ(uint4(got[0]), uint4(126, 78, 47, 174));
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, nearest_fractional_larger)
{
  /* Nearest upscale by a fractional factor: spot-check replicated pixels,
   * including corners (edge pixels must not be lost). */
  ImBuf *res = scale_fractional_larger(true, false);
  const uchar4 *got = reinterpret_cast<uchar4 *>(res->byte_buffer.data);
  EXPECT_EQ(uint4(got[0 + 0 * res->x]), uint4(0, 0, 0, 255));
  EXPECT_EQ(uint4(got[1 + 0 * res->x]), uint4(0, 0, 0, 255));
  EXPECT_EQ(uint4(got[7 + 0 * res->x]), uint4(50, 200, 0, 255));
  EXPECT_EQ(uint4(got[2 + 2 * res->x]), uint4(255, 0, 0, 255));
  EXPECT_EQ(uint4(got[3 + 2 * res->x]), uint4(133, 55, 31, 13));
  EXPECT_EQ(uint4(got[8 + 6 * res->x]), uint4(57, 0, 96, 252));
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, bilinear_fractional_larger)
{
  /* Bilinear upscale by a fractional factor: spot-check interpolated pixels. */
  ImBuf *res = scale_fractional_larger(false, false);
  const uchar4 *got = reinterpret_cast<uchar4 *>(res->byte_buffer.data);
  EXPECT_EQ(uint4(got[0 + 0 * res->x]), uint4(0, 0, 0, 255));
  EXPECT_EQ(uint4(got[1 + 0 * res->x]), uint4(127, 0, 0, 255));
  EXPECT_EQ(uint4(got[7 + 0 * res->x]), uint4(52, 100, 16, 255));
  EXPECT_EQ(uint4(got[2 + 2 * res->x]), uint4(235, 55, 51, 215));
  EXPECT_EQ(uint4(got[3 + 2 * res->x]), uint4(153, 55, 35, 54));
  EXPECT_EQ(uint4(got[8 + 6 * res->x]), uint4(37, 0, 62, 162));
  IMB_freeImBuf(res);
}
static constexpr float EPS = 0.0001f;
TEST(imbuf_scaling, nearest_2x_smaller_fl1)
{
  /* Nearest 2x downscale on a 1-channel float image. */
  ImBuf *res = scale_2x_smaller(true, false, 1);
  const float *got = res->float_buffer.data;
  EXPECT_NEAR(got[0], 0.0f, EPS);
  EXPECT_NEAR(got[1], 2.5f, EPS);
  EXPECT_NEAR(got[2], 5.0f, EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, nearest_2x_smaller_fl2)
{
  /* Nearest 2x downscale on a 2-channel float image. */
  ImBuf *res = scale_2x_smaller(true, false, 2);
  const float2 *got = reinterpret_cast<float2 *>(res->float_buffer.data);
  EXPECT_V2_NEAR(got[0], float2(0.0f, 0.125f), EPS);
  EXPECT_V2_NEAR(got[1], float2(2.5f, 2.625f), EPS);
  EXPECT_V2_NEAR(got[2], float2(5.0f, 5.125f), EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, nearest_2x_smaller_fl3)
{
  /* Nearest 2x downscale on a 3-channel float image. */
  ImBuf *res = scale_2x_smaller(true, false, 3);
  const float3 *got = reinterpret_cast<float3 *>(res->float_buffer.data);
  EXPECT_V3_NEAR(got[0], float3(0.0f, 0.125f, 0.25f), EPS);
  EXPECT_V3_NEAR(got[1], float3(2.5f, 2.625f, 2.75f), EPS);
  EXPECT_V3_NEAR(got[2], float3(5.0f, 5.125f, 5.25f), EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, nearest_2x_smaller_fl4)
{
  /* Nearest 2x downscale on a 4-channel float image. */
  ImBuf *res = scale_2x_smaller(true, false, 4);
  const float4 *got = reinterpret_cast<float4 *>(res->float_buffer.data);
  EXPECT_V4_NEAR(got[0], float4(0.0f, 0.125f, 0.25f, 0.375f), EPS);
  EXPECT_V4_NEAR(got[1], float4(2.5f, 2.625f, 2.75f, 2.875f), EPS);
  EXPECT_V4_NEAR(got[2], float4(5.0f, 5.125f, 5.25f, 5.375f), EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, nearest_to_1x1_fl3)
{
  /* Nearest reduction of a 3-channel float image to one pixel. */
  ImBuf *res = scale_to_1x1(true, false, 3);
  const float3 *got = reinterpret_cast<float3 *>(res->float_buffer.data);
  EXPECT_V3_NEAR(got[0], float3(0, 0.125f, 0.25f), EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, threaded_to_1x1_fl3)
{
  /* Threaded (bilinear) reduction of a 3-channel float image to one pixel. */
  ImBuf *res = scale_to_1x1(false, true, 3);
  const float3 *got = reinterpret_cast<float3 *>(res->float_buffer.data);
  EXPECT_V3_NEAR(got[0], float3(3.375f, 3.5f, 3.625f), EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, bilinear_to_1x1_fl3)
{
  /* Single-threaded bilinear reduction of a 3-channel float image to 1x1. */
  ImBuf *res = scale_to_1x1(false, false, 3);
  const float3 *got = reinterpret_cast<float3 *>(res->float_buffer.data);
  EXPECT_V3_NEAR(got[0], float3(3.36853f, 3.49353f, 3.61853f), EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, bilinear_2x_smaller_fl3)
{
  /* Single-threaded bilinear 2x downscale on a 3-channel float image. */
  ImBuf *res = scale_2x_smaller(false, false, 3);
  const float3 *got = reinterpret_cast<float3 *>(res->float_buffer.data);
  EXPECT_V3_NEAR(got[0], float3(0.87270f, 0.99770f, 1.12270f), EPS);
  EXPECT_V3_NEAR(got[1], float3(3.36853f, 3.49353f, 3.61853f), EPS);
  EXPECT_V3_NEAR(got[2], float3(5.86435f, 5.98935f, 6.11435f), EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, bilinear_2x_smaller_fl4)
{
  /* Single-threaded bilinear 2x downscale on a 4-channel float image. */
  ImBuf *res = scale_2x_smaller(false, false, 4);
  const float4 *got = reinterpret_cast<float4 *>(res->float_buffer.data);
  EXPECT_V4_NEAR(got[0], float4(0.87270f, 0.99770f, 1.12270f, 1.24770f), EPS);
  EXPECT_V4_NEAR(got[1], float4(3.36853f, 3.49353f, 3.61853f, 3.74353f), EPS);
  EXPECT_V4_NEAR(got[2], float4(5.86435f, 5.98935f, 6.11435f, 6.23935f), EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, threaded_2x_smaller_fl3)
{
  /* Threaded (bilinear) 2x downscale on a 3-channel float image. */
  ImBuf *res = scale_2x_smaller(false, true, 3);
  const float3 *got = reinterpret_cast<float3 *>(res->float_buffer.data);
  EXPECT_V3_NEAR(got[0], float3(0.875f, 1.0f, 1.125f), EPS);
  EXPECT_V3_NEAR(got[1], float3(3.375f, 3.5f, 3.625f), EPS);
  EXPECT_V3_NEAR(got[2], float3(5.875f, 6.0f, 6.125f), EPS);
  IMB_freeImBuf(res);
}
TEST(imbuf_scaling, threaded_2x_smaller_fl4)
{
  /* Threaded (bilinear) 2x downscale on a 4-channel float image. */
  ImBuf *res = scale_2x_smaller(false, true, 4);
  const float4 *got = reinterpret_cast<float4 *>(res->float_buffer.data);
  EXPECT_V4_NEAR(got[0], float4(0.875f, 1.0f, 1.125f, 1.25f), EPS);
  EXPECT_V4_NEAR(got[1], float4(3.375f, 3.5f, 3.625f, 3.75f), EPS);
  EXPECT_V4_NEAR(got[2], float4(5.875f, 6.0f, 6.125f, 6.25f), EPS);
  IMB_freeImBuf(res);
}
} // namespace blender::imbuf::tests

View File

@@ -0,0 +1,24 @@
# SPDX-FileCopyrightText: 2024 Blender Authors
#
# SPDX-License-Identifier: GPL-2.0-or-later

# Manually-run performance test executable for the IMB image scaling code.

set(INC
  ../..
)

set(INC_SYS
)

set(LIB
  PRIVATE bf_blenlib
  PRIVATE bf_imbuf
)

set(SRC
  IMB_scaling_performance_test.cc
)

# NOTE(review): registered as "IMB_performance" but linked below as
# "IMB_performance_test" -- presumably the helper macro appends a "_test"
# suffix to the target name; confirm against the macro definition.
blender_add_test_performance_executable(IMB_performance "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")

if(WITH_BUILDINFO)
  target_link_libraries(IMB_performance_test PRIVATE buildinfoobj)
endif()

View File

@@ -0,0 +1,142 @@
/* SPDX-FileCopyrightText: 2024 Blender Authors
*
* SPDX-License-Identifier: Apache-2.0 */
#include "testing/testing.h"
#include "IMB_imbuf.hh"
#include "BLI_math_base.hh"
#include "BLI_math_matrix.hh"
#include "BLI_timeit.hh"
using namespace blender;
static constexpr int SRC_X = 5123;
static constexpr int SRC_Y = 4091;
static constexpr int DST_SMALLER_X = (int)(SRC_X * 0.21f);
static constexpr int DST_SMALLER_Y = (int)(SRC_Y * 0.67f);
static constexpr int DST_LARGER_X = (int)(SRC_X * 1.19f);
static constexpr int DST_LARGER_Y = (int)(SRC_Y * 2.13f);
/* Create a SRC_X x SRC_Y 4-channel source image (byte or float) filled with
 * deterministic patterns derived from the pixel index. */
static ImBuf *create_src_image(bool use_float)
{
  ImBuf *img = IMB_allocImBuf(SRC_X, SRC_Y, 32, use_float ? IB_rectfloat : IB_rect);
  const int pixel_count = img->x * img->y;
  if (use_float) {
    float *pix = img->float_buffer.data;
    for (int i = 0; i < pixel_count; i++, pix += 4) {
      pix[0] = i * 0.1f;
      pix[1] = i * 2.1f;
      pix[2] = i * 0.01f;
      pix[3] = math::mod(i * 0.03f, 2.0f);
    }
  }
  else {
    uchar *pix = img->byte_buffer.data;
    for (int i = 0; i < pixel_count; i++, pix += 4) {
      pix[0] = i & 0xFF;
      pix[1] = (i * 3) & 0xFF;
      pix[2] = (i + 12345) & 0xFF;
      pix[3] = (i / 4) & 0xFF;
    }
  }
  return img;
}
/* Scale `src` to the given size by routing through IMB_transform with the
 * requested filter; frees the input and replaces it with the result. */
static void imb_scale_via_transform(ImBuf *&src,
                                    int width,
                                    int height,
                                    eIMBInterpolationFilterMode filter)
{
  ImBuf *dst = IMB_allocImBuf(width, height, src->planes, src->flags);
  const float4 scale(float(src->x) / dst->x, float(src->y) / dst->y, 1.0f, 1.0f);
  float4x4 matrix = math::from_scale<float4x4>(scale);
  IMB_transform(src, dst, IMB_TRANSFORM_MODE_REGULAR, filter, matrix.ptr(), nullptr);
  IMB_freeImBuf(src);
  src = dst;
}
static void imb_xform_nearest(ImBuf *&src, int width, int height)
{
imb_scale_via_transform(src, width, height, IMB_FILTER_NEAREST);
}
static void imb_xform_bilinear(ImBuf *&src, int width, int height)
{
imb_scale_via_transform(src, width, height, IMB_FILTER_BILINEAR);
}
static void imb_xform_box(ImBuf *&src, int width, int height)
{
imb_scale_via_transform(src,
width,
height,
width < src->x && height < src->y ? IMB_FILTER_BOX :
IMB_FILTER_BILINEAR);
}
static void imb_scale_nearest_st(ImBuf *&src, int width, int height)
{
IMB_scale(src, width, height, IMBScaleFilter::Nearest, false);
}
static void imb_scale_nearest(ImBuf *&src, int width, int height)
{
IMB_scale(src, width, height, IMBScaleFilter::Nearest, true);
}
static void imb_scale_bilinear_st(ImBuf *&src, int width, int height)
{
IMB_scale(src, width, height, IMBScaleFilter::Bilinear, false);
}
static void imb_scale_bilinear(ImBuf *&src, int width, int height)
{
IMB_scale(src, width, height, IMBScaleFilter::Bilinear, true);
}
static void imb_scale_box_st(ImBuf *&src, int width, int height)
{
IMB_scale(src, width, height, IMBScaleFilter::Box, false);
}
static void imb_scale_box(ImBuf *&src, int width, int height)
{
IMB_scale(src, width, height, IMBScaleFilter::Box, true);
}
/* Create a fresh test image and time a fixed sequence of four resizes with
 * the given scaling function: enlarge, back to original size, shrink, and
 * enlarge again. Timing is printed via SCOPED_TIMER under `name`. */
static void scale_perf_impl(const char *name,
                            bool use_float,
                            void (*func)(ImBuf *&src, int width, int height))
{
  ImBuf *img = create_src_image(use_float);
  {
    SCOPED_TIMER(name);
    const int sizes[4][2] = {
        {DST_LARGER_X, DST_LARGER_Y},
        {SRC_X, SRC_Y},
        {DST_SMALLER_X, DST_SMALLER_Y},
        {DST_LARGER_X, DST_LARGER_Y},
    };
    for (const auto &size : sizes) {
      func(img, size[0], size[1]);
    }
  }
  IMB_freeImBuf(img);
}
static void test_scaling_perf(bool use_float)
{
scale_perf_impl("scale_neare_s", use_float, imb_scale_nearest_st);
scale_perf_impl("scale_neare_m", use_float, imb_scale_nearest);
scale_perf_impl("xform_neare_m", use_float, imb_xform_nearest);
scale_perf_impl("scale_bilin_s", use_float, imb_scale_bilinear_st);
scale_perf_impl("scale_bilin_m", use_float, imb_scale_bilinear);
scale_perf_impl("xform_bilin_m", use_float, imb_xform_bilinear);
scale_perf_impl("scale_boxfl_s", use_float, imb_scale_box_st);
scale_perf_impl("scale_boxfl_m", use_float, imb_scale_box);
scale_perf_impl("xform_boxfl_m", use_float, imb_xform_box);
}
/* Timing run over byte (uchar) images; results are printed, nothing is asserted. */
TEST(imbuf_scaling, scaling_perf_byte)
{
  test_scaling_perf(false);
}
/* Timing run over float images; results are printed, nothing is asserted. */
TEST(imbuf_scaling, scaling_perf_float)
{
  test_scaling_perf(true);
}

View File

@@ -117,10 +117,10 @@ static PyObject *py_imbuf_resize(Py_ImBuf *self, PyObject *args, PyObject *kw)
}
if (method.value_found == FAST) {
IMB_scalefastImBuf(self->ibuf, UNPACK2(size));
IMB_scale(self->ibuf, UNPACK2(size), IMBScaleFilter::Nearest, false);
}
else if (method.value_found == BILINEAR) {
IMB_scaleImBuf(self->ibuf, UNPACK2(size));
IMB_scale(self->ibuf, UNPACK2(size), IMBScaleFilter::Box, false);
}
else {
BLI_assert_unreachable();

View File

@@ -287,7 +287,7 @@ static void seq_proxy_build_frame(const SeqRenderData *context,
ibuf = IMB_dupImBuf(ibuf_tmp);
IMB_metadata_copy(ibuf, ibuf_tmp);
IMB_freeImBuf(ibuf_tmp);
IMB_scalefastImBuf(ibuf, short(rectx), short(recty));
IMB_scale(ibuf, rectx, recty, IMBScaleFilter::Nearest, false);
}
else {
ibuf = ibuf_tmp;

View File

@@ -1774,7 +1774,7 @@ static ImBuf *blend_file_thumb_from_screenshot(bContext *C, BlendThumbnail **r_t
}
/* File-system thumbnail image can be 256x256. */
IMB_scaleImBuf(ibuf, ex * 2, ey * 2);
IMB_scale(ibuf, ex * 2, ey * 2, IMBScaleFilter::Box, false);
/* Save metadata for quick access. */
char version_st[10] = {0};
@@ -1784,7 +1784,7 @@ static ImBuf *blend_file_thumb_from_screenshot(bContext *C, BlendThumbnail **r_t
/* Thumbnail inside blend should be 128x128. */
ImBuf *thumb_ibuf = IMB_dupImBuf(ibuf);
IMB_scaleImBuf(thumb_ibuf, ex, ey);
IMB_scale(thumb_ibuf, ex, ey, IMBScaleFilter::Box, false);
BlendThumbnail *thumb = BKE_main_thumbnail_from_imbuf(nullptr, thumb_ibuf);
IMB_freeImBuf(thumb_ibuf);
@@ -1899,11 +1899,15 @@ static ImBuf *blend_file_thumb_from_camera(const bContext *C,
IMB_metadata_set_field(ibuf->metadata, "Thumb::Blender::Version", version_st);
/* BLEN_THUMB_SIZE is size of thumbnail inside blend file: 128x128. */
IMB_scaleImBuf(thumb_ibuf, BLEN_THUMB_SIZE, BLEN_THUMB_SIZE);
IMB_scale(thumb_ibuf, BLEN_THUMB_SIZE, BLEN_THUMB_SIZE, IMBScaleFilter::Box, false);
thumb = BKE_main_thumbnail_from_imbuf(nullptr, thumb_ibuf);
IMB_freeImBuf(thumb_ibuf);
/* Thumbnail saved to file-system should be 256x256. */
IMB_scaleImBuf(ibuf, PREVIEW_RENDER_LARGE_HEIGHT, PREVIEW_RENDER_LARGE_HEIGHT);
IMB_scale(ibuf,
PREVIEW_RENDER_LARGE_HEIGHT,
PREVIEW_RENDER_LARGE_HEIGHT,
IMBScaleFilter::Box,
false);
}
else {
/* '*r_thumb' needs to stay nullptr to prevent a bad thumbnail from being handled. */

View File

@@ -158,7 +158,7 @@ static ImBuf *wm_block_splash_image(int width, int *r_height)
ibuf->planes = 32; /* The image might not have an alpha channel. */
height = (width * ibuf->y) / ibuf->x;
if (width != ibuf->x || height != ibuf->y) {
IMB_scaleImBuf(ibuf, width, height);
IMB_scale(ibuf, width, height, IMBScaleFilter::Box, false);
}
wm_block_splash_image_roundcorners_add(ibuf);