VSE: Multi-threaded video proxy downscaling
When building proxies at lower than 100% resolution, the video frame downscaling step was single-threaded, as found via #127956. Make it use the same threaded sws_scale machinery that the usual video decoding/encoding uses. Video encoding/decoding was only using it for RGB<->YUV conversions, so source and destination sizes were always matching; here, however, it needs to support different source and destination sizes. Time taken to rebuild a 50% proxy for a 4K-resolution, 1440-frame (1-minute) video file, on a Ryzen 5950X (Win10/VS2022): - Blender 4.2: 20.1 sec, CPU usage 30-40%. - Blender 4.3 main: 13.1 sec (the ffmpeg build has been fixed to use SIMD), CPU usage still 30-40% though. - This PR: 8.3 sec, CPU usage ~95%. Pull Request: https://projects.blender.org/blender/blender/pulls/128054
This commit is contained in:
Committed by: Aras Pranckevicius
Parent commit: e44fc3df06
Commit: 64feb05089
@@ -36,13 +36,13 @@ enum {
|
||||
};
|
||||
|
||||
struct AVFrame;
|
||||
struct ImageFormatData;
|
||||
struct ImBuf;
|
||||
struct RenderData;
|
||||
struct ReportList;
|
||||
struct Scene;
|
||||
struct SwsContext;
|
||||
|
||||
struct ImBuf;
|
||||
|
||||
bool BKE_ffmpeg_start(void *context_v,
|
||||
const Scene *scene,
|
||||
RenderData *rd,
|
||||
@@ -79,8 +79,13 @@ void BKE_ffmpeg_exit();
|
||||
* to release it. Internally the contexts are coming from the context
|
||||
* pool/cache.
|
||||
*/
|
||||
SwsContext *BKE_ffmpeg_sws_get_context(
|
||||
int width, int height, int av_src_format, int av_dst_format, int sws_flags);
|
||||
SwsContext *BKE_ffmpeg_sws_get_context(int src_width,
|
||||
int src_height,
|
||||
int av_src_format,
|
||||
int dst_width,
|
||||
int dst_height,
|
||||
int av_dst_format,
|
||||
int sws_flags);
|
||||
void BKE_ffmpeg_sws_release_context(SwsContext *ctx);
|
||||
|
||||
void BKE_ffmpeg_sws_scale_frame(SwsContext *ctx, AVFrame *dst, const AVFrame *src);
|
||||
|
||||
@@ -64,7 +64,8 @@ struct StampData;
|
||||
constexpr int64_t swscale_cache_max_entries = 32;
|
||||
|
||||
struct SwscaleContext {
|
||||
int width = 0, height = 0;
|
||||
int src_width = 0, src_height = 0;
|
||||
int dst_width = 0, dst_height = 0;
|
||||
AVPixelFormat src_format = AV_PIX_FMT_NONE, dst_format = AV_PIX_FMT_NONE;
|
||||
int flags = 0;
|
||||
|
||||
@@ -700,8 +701,13 @@ static const AVCodec *get_av1_encoder(
|
||||
return codec;
|
||||
}
|
||||
|
||||
static SwsContext *sws_create_context(
|
||||
int width, int height, int av_src_format, int av_dst_format, int sws_flags)
|
||||
static SwsContext *sws_create_context(int src_width,
|
||||
int src_height,
|
||||
int av_src_format,
|
||||
int dst_width,
|
||||
int dst_height,
|
||||
int av_dst_format,
|
||||
int sws_flags)
|
||||
{
|
||||
# if defined(FFMPEG_SWSCALE_THREADING)
|
||||
/* sws_getContext does not allow passing flags that ask for multi-threaded
|
||||
@@ -710,11 +716,11 @@ static SwsContext *sws_create_context(
|
||||
if (c == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
av_opt_set_int(c, "srcw", width, 0);
|
||||
av_opt_set_int(c, "srch", height, 0);
|
||||
av_opt_set_int(c, "srcw", src_width, 0);
|
||||
av_opt_set_int(c, "srch", src_height, 0);
|
||||
av_opt_set_int(c, "src_format", av_src_format, 0);
|
||||
av_opt_set_int(c, "dstw", width, 0);
|
||||
av_opt_set_int(c, "dsth", height, 0);
|
||||
av_opt_set_int(c, "dstw", dst_width, 0);
|
||||
av_opt_set_int(c, "dsth", dst_height, 0);
|
||||
av_opt_set_int(c, "dst_format", av_dst_format, 0);
|
||||
av_opt_set_int(c, "sws_flags", sws_flags, 0);
|
||||
av_opt_set_int(c, "threads", BLI_system_thread_count(), 0);
|
||||
@@ -724,11 +730,11 @@ static SwsContext *sws_create_context(
|
||||
return nullptr;
|
||||
}
|
||||
# else
|
||||
SwsContext *c = sws_getContext(width,
|
||||
height,
|
||||
SwsContext *c = sws_getContext(src_width,
|
||||
src_height,
|
||||
AVPixelFormat(av_src_format),
|
||||
width,
|
||||
height,
|
||||
dst_width,
|
||||
dst_height,
|
||||
AVPixelFormat(av_dst_format),
|
||||
sws_flags,
|
||||
nullptr,
|
||||
@@ -783,8 +789,13 @@ static void maintain_swscale_cache_size()
|
||||
}
|
||||
}
|
||||
|
||||
SwsContext *BKE_ffmpeg_sws_get_context(
|
||||
int width, int height, int av_src_format, int av_dst_format, int sws_flags)
|
||||
SwsContext *BKE_ffmpeg_sws_get_context(int src_width,
|
||||
int src_height,
|
||||
int av_src_format,
|
||||
int dst_width,
|
||||
int dst_height,
|
||||
int av_dst_format,
|
||||
int sws_flags)
|
||||
{
|
||||
BLI_mutex_lock(&swscale_cache_lock);
|
||||
|
||||
@@ -795,7 +806,8 @@ SwsContext *BKE_ffmpeg_sws_get_context(
|
||||
/* Search for unused context that has suitable parameters. */
|
||||
SwsContext *ctx = nullptr;
|
||||
for (SwscaleContext &c : *swscale_cache) {
|
||||
if (!c.is_used && c.width == width && c.height == height && c.src_format == av_src_format &&
|
||||
if (!c.is_used && c.src_width == src_width && c.src_height == src_height &&
|
||||
c.src_format == av_src_format && c.dst_width == dst_width && c.dst_height == dst_height &&
|
||||
c.dst_format == av_dst_format && c.flags == sws_flags)
|
||||
{
|
||||
ctx = c.context;
|
||||
@@ -807,10 +819,13 @@ SwsContext *BKE_ffmpeg_sws_get_context(
|
||||
}
|
||||
if (ctx == nullptr) {
|
||||
/* No free matching context in cache: create a new one. */
|
||||
ctx = sws_create_context(width, height, av_src_format, av_dst_format, sws_flags);
|
||||
ctx = sws_create_context(
|
||||
src_width, src_height, av_src_format, dst_width, dst_height, av_dst_format, sws_flags);
|
||||
SwscaleContext c;
|
||||
c.width = width;
|
||||
c.height = height;
|
||||
c.src_width = src_width;
|
||||
c.src_height = src_height;
|
||||
c.dst_width = dst_width;
|
||||
c.dst_height = dst_height;
|
||||
c.src_format = AVPixelFormat(av_src_format);
|
||||
c.dst_format = AVPixelFormat(av_dst_format);
|
||||
c.flags = sws_flags;
|
||||
@@ -1108,7 +1123,7 @@ static AVStream *alloc_video_stream(FFMpegContext *context,
|
||||
/* Output pixel format is different, allocate frame for conversion. */
|
||||
context->img_convert_frame = alloc_picture(AV_PIX_FMT_RGBA, c->width, c->height);
|
||||
context->img_convert_ctx = BKE_ffmpeg_sws_get_context(
|
||||
c->width, c->height, AV_PIX_FMT_RGBA, c->pix_fmt, SWS_BICUBIC);
|
||||
c->width, c->height, AV_PIX_FMT_RGBA, c->width, c->height, c->pix_fmt, SWS_BICUBIC);
|
||||
}
|
||||
|
||||
avcodec_parameters_from_context(st->codecpar, c);
|
||||
|
||||
@@ -418,6 +418,8 @@ static int startffmpeg(ImBufAnim *anim)
|
||||
anim->img_convert_ctx = BKE_ffmpeg_sws_get_context(anim->x,
|
||||
anim->y,
|
||||
anim->pCodecCtx->pix_fmt,
|
||||
anim->x,
|
||||
anim->y,
|
||||
AV_PIX_FMT_RGBA,
|
||||
SWS_BILINEAR | SWS_PRINT_INFO |
|
||||
SWS_FULL_CHR_H_INT);
|
||||
|
||||
@@ -27,6 +27,8 @@
|
||||
# include "BLI_winstuff.h"
|
||||
#endif
|
||||
|
||||
#include "BKE_writeffmpeg.hh"
|
||||
|
||||
#include "IMB_anim.hh"
|
||||
#include "IMB_imbuf.hh"
|
||||
#include "IMB_indexer.hh"
|
||||
@@ -35,6 +37,7 @@
|
||||
#ifdef WITH_FFMPEG
|
||||
extern "C" {
|
||||
# include "ffmpeg_compat.h"
|
||||
# include <libavutil/cpu.h>
|
||||
# include <libavutil/imgutils.h>
|
||||
}
|
||||
#endif
|
||||
@@ -595,32 +598,20 @@ static proxy_output_ctx *alloc_proxy_output_ffmpeg(
|
||||
if (st->codecpar->width != width || st->codecpar->height != height ||
|
||||
st->codecpar->format != rv->c->pix_fmt)
|
||||
{
|
||||
const size_t align = av_cpu_max_align();
|
||||
rv->frame = av_frame_alloc();
|
||||
|
||||
av_image_fill_arrays(rv->frame->data,
|
||||
rv->frame->linesize,
|
||||
static_cast<const uint8_t *>(MEM_mallocN(
|
||||
av_image_get_buffer_size(rv->c->pix_fmt, width, height, 1),
|
||||
"alloc proxy output frame")),
|
||||
rv->c->pix_fmt,
|
||||
width,
|
||||
height,
|
||||
1);
|
||||
|
||||
rv->frame->format = rv->c->pix_fmt;
|
||||
rv->frame->width = width;
|
||||
rv->frame->height = height;
|
||||
av_frame_get_buffer(rv->frame, align);
|
||||
|
||||
rv->sws_ctx = sws_getContext(st->codecpar->width,
|
||||
rv->orig_height,
|
||||
AVPixelFormat(st->codecpar->format),
|
||||
width,
|
||||
height,
|
||||
rv->c->pix_fmt,
|
||||
SWS_FAST_BILINEAR | SWS_PRINT_INFO,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr);
|
||||
rv->sws_ctx = BKE_ffmpeg_sws_get_context(st->codecpar->width,
|
||||
rv->orig_height,
|
||||
AVPixelFormat(st->codecpar->format),
|
||||
width,
|
||||
height,
|
||||
rv->c->pix_fmt,
|
||||
SWS_FAST_BILINEAR);
|
||||
}
|
||||
|
||||
ret = avformat_write_header(rv->of, nullptr);
|
||||
@@ -655,13 +646,7 @@ static void add_to_proxy_output_ffmpeg(proxy_output_ctx *ctx, AVFrame *frame)
|
||||
if (ctx->sws_ctx && frame &&
|
||||
(frame->data[0] || frame->data[1] || frame->data[2] || frame->data[3]))
|
||||
{
|
||||
sws_scale(ctx->sws_ctx,
|
||||
(const uint8_t *const *)frame->data,
|
||||
frame->linesize,
|
||||
0,
|
||||
ctx->orig_height,
|
||||
ctx->frame->data,
|
||||
ctx->frame->linesize);
|
||||
BKE_ffmpeg_sws_scale_frame(ctx->sws_ctx, ctx->frame, frame);
|
||||
}
|
||||
|
||||
frame = ctx->sws_ctx ? (frame ? ctx->frame : nullptr) : frame;
|
||||
@@ -752,10 +737,11 @@ static void free_proxy_output_ffmpeg(proxy_output_ctx *ctx, int rollback)
|
||||
avformat_free_context(ctx->of);
|
||||
|
||||
if (ctx->sws_ctx) {
|
||||
sws_freeContext(ctx->sws_ctx);
|
||||
|
||||
MEM_freeN(ctx->frame->data[0]);
|
||||
av_free(ctx->frame);
|
||||
BKE_ffmpeg_sws_release_context(ctx->sws_ctx);
|
||||
ctx->sws_ctx = nullptr;
|
||||
}
|
||||
if (ctx->frame) {
|
||||
av_frame_free(&ctx->frame);
|
||||
}
|
||||
|
||||
get_proxy_filepath(ctx->anim, ctx->proxy_size, filepath_tmp, true);
|
||||
|
||||
Reference in New Issue
Block a user