Fix #125446: Video decoding artifacts on AVX512 CPU with some video widths

When a video width is a multiple of 8 but not 16 (i.e. line size stride of RGBA frame is multiple of 32 bytes but not 64 bytes), then on an AVX512 capable CPU the decoded video has a block of "wrapped" artifacts on the left side. This started happening in 4.1 with 4ef5d9f60f, where the vertical flip is done together with YUV->RGB conversion. The logic in there checks whether ffmpeg frame line size (which might include necessary SIMD alignment/padding) matches packed imbuf line stride. However, ffmpeg arguably has a bug, where `av_frame_get_buffer` always uses 32 byte alignment, instead of 64 byte when on AVX512 code path. Report has been made to ffmpeg upstream: https://trac.ffmpeg.org/ticket/11116 and in the meantime, explicitly pass `av_cpu_max_align` to `av_frame_get_buffer`. Pull Request: https://projects.blender.org/blender/blender/pulls/125578
2024-07-29 14:58:09 +02:00
parent e0cdfb8566
commit b17734598d
1 changed files with 10 additions and 21 deletions
--- a/source/blender/imbuf/intern/anim_movie.cc
+++ b/source/blender/imbuf/intern/anim_movie.cc
@@ -43,6 +43,7 @@
 extern "C" {
 #  include <libavcodec/avcodec.h>
 #  include <libavformat/avformat.h>
+#  include <libavutil/cpu.h>
 #  include <libavutil/imgutils.h>
 #  include <libavutil/rational.h>
 #  include <libswscale/swscale.h>
@@ -368,7 +369,13 @@ static int startffmpeg(ImBufAnim *anim)
  anim->pFrameRGB->width = anim->x;
  anim->pFrameRGB->height = anim->y;

-  if (av_frame_get_buffer(anim->pFrameRGB, 0) < 0) {
+  /* Note: even if av_frame_get_buffer suggests to pass 0 for alignment,
+   * as of ffmpeg 6.1/7.0 it does not use correct alignment for AVX512
+   * CPU (frame.c get_video_buffer ends up always using 32 alignment,
+   * whereas it should have used 64). Reported upstream:
+   * https://trac.ffmpeg.org/ticket/11116 */
+  const size_t align = av_cpu_max_align();
+  if (av_frame_get_buffer(anim->pFrameRGB, align) < 0) {
    fprintf(stderr, "Could not allocate frame data.\n");
    avcodec_free_context(&anim->pCodecCtx);
    avformat_close_input(&anim->pFormatCtx);
@@ -1115,25 +1122,6 @@ static ImBuf *ffmpeg_fetchibuf(ImBufAnim *anim, int position, IMB_Timecode_Type
  anim->x = anim->pCodecCtx->width;
  anim->y = anim->pCodecCtx->height;

-  /* Certain versions of FFmpeg have a bug in libswscale which ends up in crash
-   * when destination buffer is not properly aligned. For example, this happens
-   * in FFmpeg 4.3.1. It got fixed later on, but for compatibility reasons is
-   * still best to avoid crash.
-   *
-   * This is achieved by using a separate allocation call rather than relying on
-   * IMB_allocImBuf() to do so since the IMB_allocImBuf() is not guaranteed
-   * to perform aligned allocation.
-   *
-   * In theory this could give better performance, since SIMD operations on
-   * aligned data are usually faster.
-   *
-   * Note that even though sometimes vertical flip is required it does not
-   * affect on alignment of data passed to sws_scale because if the X dimension
-   * is not 32 byte aligned special intermediate buffer is allocated.
-   *
-   * The issue was reported to FFmpeg under ticket #8747 in the FFmpeg tracker
-   * and is fixed in the newer versions than 4.3.1. */
-
  const AVPixFmtDescriptor *pix_fmt_descriptor = av_pix_fmt_desc_get(anim->pCodecCtx->pix_fmt);

  int planes = R_IMF_PLANES_RGBA;
@@ -1144,8 +1132,9 @@ static ImBuf *ffmpeg_fetchibuf(ImBufAnim *anim, int position, IMB_Timecode_Type
  ImBuf *cur_frame_final = IMB_allocImBuf(anim->x, anim->y, planes, 0);

  /* Allocate the storage explicitly to ensure the memory is aligned. */
+  const size_t align = av_cpu_max_align();
  uint8_t *buffer_data = static_cast<uint8_t *>(
-      MEM_mallocN_aligned(size_t(4) * anim->x * anim->y, 32, "ffmpeg ibuf"));
+      MEM_mallocN_aligned(size_t(4) * anim->x * anim->y, align, "ffmpeg ibuf"));
  IMB_assign_byte_buffer(cur_frame_final, buffer_data, IB_TAKE_OWNERSHIP);

  cur_frame_final->byte_buffer.colorspace = colormanage_colorspace_get_named(anim->colorspace);