From 7245262de896c63231b6921cab40378edd5794be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cle=CC=81ment=20Foucault?= <foucault.clem@gmail.com>
Date: Wed, 28 May 2025 21:08:38 +0200
Subject: [PATCH] GPU: Metal: Add `--profile-gpu` support for CPU timing

GPU-side timings are a bit too complex
to implement for now.

As we are improving shader loading, having the
CPU timings is already helpful.

Note that `Map<size_t, int>` does not compile
on Clang.

This exposes the `--profile-gpu` option on
all backends, as the Vulkan backend should follow
shortly.

Pull Request: https://projects.blender.org/blender/blender/pulls/139551
---
 .../blender/gpu/intern/gpu_profile_report.hh  | 20 +++++-
 source/blender/gpu/metal/mtl_context.hh       | 18 ++++++
 source/blender/gpu/metal/mtl_context.mm       |  4 ++
 source/blender/gpu/metal/mtl_debug.mm         | 63 +++++++++++++++++++
 source/creator/creator_args.cc                |  6 +-
 5 files changed, 106 insertions(+), 5 deletions(-)
diff --git a/source/blender/gpu/intern/gpu_profile_report.hh b/source/blender/gpu/intern/gpu_profile_report.hh
index c69ce5f3d29..5bfec4eef08 100644
--- a/source/blender/gpu/intern/gpu_profile_report.hh
+++ b/source/blender/gpu/intern/gpu_profile_report.hh
@@ -2,6 +2,8 @@
  *
  * SPDX-License-Identifier: GPL-2.0-or-later */
 
+#pragma once
+
 #include "BLI_map.hh"
 #include "BLI_mutex.hh"
 #include "BLI_string_ref.hh"
@@ -19,7 +21,7 @@ class ProfileReport {
  private:
   std::fstream _report;
   Mutex _mutex;
-  Map<size_t, int> _thread_ids;
+  Map<uint64_t, int> _thread_ids;
 
   ProfileReport()
   {
@@ -69,6 +71,22 @@ class ProfileReport {
         (cpu_end - cpu_start) / uint64_t(1000),
         thread_id);
   }
+
+  /* Append a CPU-only duration event named `name` to the report. Both the start time
+   * and the duration are divided by 1000 before being written. */
+  void add_group_cpu(StringRefNull name, uint64_t cpu_start, uint64_t cpu_end)
+  {
+    std::scoped_lock lock(_mutex);
+
+    size_t hash = std::hash<std::thread::id>()(std::this_thread::get_id());
+    const int tid = _thread_ids.lookup_or_add(hash, _thread_ids.size());
+    const uint64_t timestamp = cpu_start / uint64_t(1000);
+    const uint64_t duration = (cpu_end - cpu_start) / uint64_t(1000);
+
+    _report << fmt::format(",\n"
+                           R"({{"name":"{}","ph":"X","ts":{},"dur":{},"pid":2,"tid":{}}})",
+                           name.c_str(), timestamp, duration, tid);
+  }
 };
 
 }  // namespace blender::gpu
diff --git a/source/blender/gpu/metal/mtl_context.hh b/source/blender/gpu/metal/mtl_context.hh
index 958ab493459..8b859196e0e 100644
--- a/source/blender/gpu/metal/mtl_context.hh
+++ b/source/blender/gpu/metal/mtl_context.hh
@@ -33,6 +33,7 @@
 #include <Cocoa/Cocoa.h>
 #include <Metal/Metal.h>
 #include <QuartzCore/QuartzCore.h>
+#include <chrono>
 #include <mutex>
 
 @class CAMetalLayer;
@@ -776,6 +777,23 @@ class MTLContext : public Context {
   GPUVertFormat dummy_vertformat_[GPU_SAMPLER_TYPE_MAX];
   VertBuf *dummy_verts_[GPU_SAMPLER_TYPE_MAX] = {nullptr};
 
+  /* Debug scope timings. Adapted from GLContext::TimeQuery.
+   * Only supports CPU timings for now. */
+  struct ScopeTimings {
+    using Clock = std::chrono::steady_clock;
+    using TimePoint = Clock::time_point;
+    using Nanoseconds = std::chrono::nanoseconds;
+
+    static TimePoint epoch; /* Reference point; reported times are relative to it. */
+
+    std::string name; /* Name passed to `debug_group_begin()`. */
+    bool finished; /* Set by the `debug_group_end()` call that closes this scope. */
+    TimePoint cpu_start, cpu_end; /* CPU time at scope begin / end. */
+  };
+  Vector<ScopeTimings> scope_timings;
+
+  void process_frame_timings();
+
  public:
   /* GPUContext interface. */
   MTLContext(void *ghost_window, void *ghost_context);
diff --git a/source/blender/gpu/metal/mtl_context.mm b/source/blender/gpu/metal/mtl_context.mm
index e3c9bed7fad..afcf061d4dd 100644
--- a/source/blender/gpu/metal/mtl_context.mm
+++ b/source/blender/gpu/metal/mtl_context.mm
@@ -374,6 +374,8 @@ MTLContext::~MTLContext()
   if (this->device) {
     [this->device release];
   }
+
+  this->process_frame_timings();
 }
 
 void MTLContext::begin_frame()
@@ -396,6 +398,8 @@ void MTLContext::end_frame()
 
   /* Increment frame counter. */
   is_inside_frame_ = false;
+
+  this->process_frame_timings();
 }
 
 void MTLContext::check_error(const char * /*info*/)
diff --git a/source/blender/gpu/metal/mtl_debug.mm b/source/blender/gpu/metal/mtl_debug.mm
index c9b929c770d..dc95e9c4518 100644
--- a/source/blender/gpu/metal/mtl_debug.mm
+++ b/source/blender/gpu/metal/mtl_debug.mm
@@ -23,6 +23,8 @@
 
 #include "CLG_log.h"
 
+#include "gpu_profile_report.hh"
+
 #include <utility>
 
 namespace blender::gpu::debug {
@@ -50,6 +52,17 @@ void MTLContext::debug_group_begin(const char *name, int index)
   if (G.debug & G_DEBUG_GPU) {
     this->main_command_buffer.push_debug_group(name, index);
   }
+
+  if (!G.profile_gpu) {
+    return;
+  }
+
+  ScopeTimings timings = {};
+  timings.name = name;
+  timings.finished = false;
+  timings.cpu_start = ScopeTimings::Clock::now();
+
+  scope_timings.append(timings);
 }
 
 void MTLContext::debug_group_end()
@@ -57,6 +70,56 @@ void MTLContext::debug_group_end()
   if (G.debug & G_DEBUG_GPU) {
     this->main_command_buffer.pop_debug_group();
   }
+
+  if (!G.profile_gpu) {
+    return;
+  }
+
+  for (int i = scope_timings.size() - 1; i >= 0; i--) {
+    ScopeTimings &query = scope_timings[i];
+    if (!query.finished) {
+      query.finished = true;
+      query.cpu_end = ScopeTimings::Clock::now();
+      break;
+    }
+    if (i == 0) {
+      std::cout << "Profile GPU error: Extra GPU_debug_group_end() call.\n";
+    }
+  }
+}
+
+MTLContext::ScopeTimings::TimePoint MTLContext::ScopeTimings::epoch =
+    MTLContext::ScopeTimings::Clock::now(); /* Evaluated once, during static initialization. */
+
+/**
+ * Write the CPU timings of all recorded debug scopes to the profile report and
+ * clear them. Does nothing unless `G.profile_gpu` is set. If any scope is still
+ * open, nothing is reported and the recorded scopes are kept.
+ */
+void MTLContext::process_frame_timings()
+{
+  if (!G.profile_gpu || scope_timings.is_empty()) {
+    return;
+  }
+
+  /* Every scope must be closed before reporting: an open scope has no valid
+   * `cpu_end`, so emitting it would produce a bogus (wrapped) duration. Check
+   * all scopes, since an open outer scope can precede closed nested ones. */
+  for (const ScopeTimings &query : scope_timings) {
+    if (!query.finished) {
+      std::cout << "Profile GPU error: Missing GPU_debug_group_end() call\n";
+      return;
+    }
+  }
+
+  /* Report timestamps in nanoseconds relative to `ScopeTimings::epoch`. */
+  for (const ScopeTimings &query : scope_timings) {
+    const ScopeTimings::Nanoseconds begin = query.cpu_start - ScopeTimings::epoch;
+    const ScopeTimings::Nanoseconds end = query.cpu_end - ScopeTimings::epoch;
+    ProfileReport::get().add_group_cpu(query.name, begin.count(), end.count());
+  }
+
+  scope_timings.clear();
+}
 
 bool MTLContext::debug_capture_begin(const char * /*title*/)
diff --git a/source/creator/creator_args.cc b/source/creator/creator_args.cc
index 3cea195ec97..45d4e9ff352 100644
--- a/source/creator/creator_args.cc
+++ b/source/creator/creator_args.cc
@@ -784,8 +784,8 @@ static void print_help(bArgs *ba, bool all)
   BLI_args_print_arg_doc(ba, "--gpu-backend");
 #  ifdef WITH_OPENGL_BACKEND
   BLI_args_print_arg_doc(ba, "--gpu-compilation-subprocesses");
-  BLI_args_print_arg_doc(ba, "--profile-gpu");
 #  endif
+  BLI_args_print_arg_doc(ba, "--profile-gpu");
 
   PRINT("\n");
   PRINT("Misc Options:\n");
@@ -2512,7 +2512,6 @@ static int arg_handle_addons_set(int argc, const char **argv, void *data)
   return 0;
 }
 
-#  ifdef WITH_OPENGL_BACKEND
 static const char arg_handle_profile_gpu_set_doc[] =
     "\n"
     "\tEnable CPU & GPU performance profiling for GPU debug groups\n"
@@ -2522,7 +2521,6 @@ static int arg_handle_profile_gpu_set(int /*argc*/, const char ** /*argv*/, void
   G.profile_gpu = true;
   return 0;
 }
-#  endif
 
 /**
  * Implementation for #arg_handle_load_last_file, also used by `--open-last`.
@@ -2694,8 +2692,8 @@ void main_args_setup(bContext *C, bArgs *ba, bool all)
                "--gpu-compilation-subprocesses",
                CB(arg_handle_gpu_compilation_subprocesses_set),
                nullptr);
-  BLI_args_add(ba, nullptr, "--profile-gpu", CB(arg_handle_profile_gpu_set), nullptr);
 #  endif
+  BLI_args_add(ba, nullptr, "--profile-gpu", CB(arg_handle_profile_gpu_set), nullptr);
 
   /* Pass: Background Mode & Settings
    *