GPU: Metal: Add --profile-gpu support for CPU timing

GPU-side timing is a bit too complex to implement for now, so only CPU timings are recorded. As we are improving shader loading, having the CPU timings is already helpful. Note that `Map<size_t, int>` does not compile on Clang, so the thread-id map now uses `uint64_t` keys. This exposes the `--profile-gpu` option on all backends, as the Vulkan backend should follow shortly. Pull Request: https://projects.blender.org/blender/blender/pulls/139551
2025-05-28 21:08:38 +02:00
parent 5106c4e655
commit 7245262de8
5 changed files with 106 additions and 5 deletions
--- a/source/blender/gpu/intern/gpu_profile_report.hh
+++ b/source/blender/gpu/intern/gpu_profile_report.hh
@@ -2,6 +2,8 @@
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

+#pragma once
+
 #include "BLI_map.hh"
 #include "BLI_mutex.hh"
 #include "BLI_string_ref.hh"
@@ -19,7 +21,7 @@ class ProfileReport {
 private:
  std::fstream _report;
  Mutex _mutex;
-  Map<size_t, int> _thread_ids;
+  Map<uint64_t, int> _thread_ids;

  ProfileReport()
  {
@@ -69,6 +71,22 @@ class ProfileReport {
        (cpu_end - cpu_start) / uint64_t(1000),
        thread_id);
  }
+
+  void add_group_cpu(StringRefNull name, uint64_t cpu_start, uint64_t cpu_end)
+  {
+    std::scoped_lock lock(_mutex);
+    /* Threads get small ids: a hash seen for the first time takes the current map size. */
+    size_t thread_hash = std::hash<std::thread::id>()(std::this_thread::get_id());
+    int thread_id = _thread_ids.lookup_or_add(thread_hash, _thread_ids.size());
+    /* Append one "ph":"X" event; "ts" is the start and "dur" the elapsed time, both / 1000. */
+    _report << fmt::format(
+        ",\n"
+        R"({{"name":"{}","ph":"X","ts":{},"dur":{},"pid":2,"tid":{}}})",
+        name.c_str(),
+        cpu_start / uint64_t(1000),
+        (cpu_end - cpu_start) / uint64_t(1000),
+        thread_id);
+  }
 };

 }  // namespace blender::gpu
--- a/source/blender/gpu/metal/mtl_context.hh
+++ b/source/blender/gpu/metal/mtl_context.hh
@@ -33,6 +33,7 @@
 #include <Cocoa/Cocoa.h>
 #include <Metal/Metal.h>
 #include <QuartzCore/QuartzCore.h>
+#include <chrono>
 #include <mutex>

@class CAMetalLayer;
@@ -776,6 +777,23 @@ class MTLContext : public Context {
  GPUVertFormat dummy_vertformat_[GPU_SAMPLER_TYPE_MAX];
  VertBuf *dummy_verts_[GPU_SAMPLER_TYPE_MAX] = {nullptr};

+  /* Debug scope timings. Adapted from GLContext::TimeQuery.
+   * Only supports CPU timings for now. */
+  struct ScopeTimings {
+    using Clock = std::chrono::steady_clock;
+    using TimePoint = Clock::time_point;
+    using Nanoseconds = std::chrono::nanoseconds;
+    /* Reference time point, initialized with Clock::now() in mtl_debug.mm. */
+    static TimePoint epoch;
+    /* One record per debug group: its name, whether its end was seen, and CPU begin/end times. */
+    std::string name;
+    bool finished;
+    TimePoint cpu_start, cpu_end;
+  };
+  Vector<ScopeTimings> scope_timings;
+
+  void process_frame_timings();
+
 public:
  /* GPUContext interface. */
  MTLContext(void *ghost_window, void *ghost_context);
--- a/source/blender/gpu/metal/mtl_context.mm
+++ b/source/blender/gpu/metal/mtl_context.mm
@@ -374,6 +374,8 @@ MTLContext::~MTLContext()
  if (this->device) {
    [this->device release];
  }
+
+  this->process_frame_timings();
 }

 void MTLContext::begin_frame()
@@ -396,6 +398,8 @@ void MTLContext::end_frame()

  /* Increment frame counter. */
  is_inside_frame_ = false;
+
+  this->process_frame_timings();
 }

 void MTLContext::check_error(const char * /*info*/)
--- a/source/blender/gpu/metal/mtl_debug.mm
+++ b/source/blender/gpu/metal/mtl_debug.mm
@@ -23,6 +23,8 @@

 #include "CLG_log.h"

+#include "gpu_profile_report.hh"
+
 #include <utility>

 namespace blender::gpu::debug {
@@ -50,6 +52,17 @@ void MTLContext::debug_group_begin(const char *name, int index)
  if (G.debug & G_DEBUG_GPU) {
    this->main_command_buffer.push_debug_group(name, index);
  }
+
+  if (!G.profile_gpu) {
+    return;
+  }
+
+  ScopeTimings timings = {};
+  timings.name = name;
+  timings.finished = false;
+  timings.cpu_start = ScopeTimings::Clock::now();
+
+  scope_timings.append(timings);
 }

 void MTLContext::debug_group_end()
@@ -57,6 +70,56 @@ void MTLContext::debug_group_end()
  if (G.debug & G_DEBUG_GPU) {
    this->main_command_buffer.pop_debug_group();
  }
+
+  if (!G.profile_gpu) {
+    return;
+  }
+
+  for (int i = scope_timings.size() - 1; i >= 0; i--) {
+    ScopeTimings &query = scope_timings[i];
+    if (!query.finished) {
+      query.finished = true;
+      query.cpu_end = ScopeTimings::Clock::now();
+      break;
+    }
+    if (i == 0) {
+      std::cout << "Profile GPU error: Extra GPU_debug_group_end() call.\n";
+    }
+  }
+}
+
+MTLContext::ScopeTimings::TimePoint MTLContext::ScopeTimings::epoch =
+    MTLContext::ScopeTimings::Clock::now(); /* Reported times are relative to this point. */
+
+void MTLContext::process_frame_timings()
+{
+  if (!G.profile_gpu) {
+    return;
+  }
+  /* Flush the recorded debug scopes to the profile report (CPU timings only). */
+  Vector<ScopeTimings> &queries = scope_timings;
+
+  bool frame_is_valid = !queries.is_empty();
+  /* NOTE(review): the loop breaks after one iteration, so only the last scope is checked. */
+  for (int i = queries.size() - 1; i >= 0; i--) {
+    if (!queries[i].finished) {
+      frame_is_valid = false;
+      std::cout << "Profile GPU error: Missing GPU_debug_group_end() call\n";
+    }
+    break;
+  }
+  /* NOTE(review): an empty or invalid list returns here without being reported or cleared. */
+  if (!frame_is_valid) {
+    return;
+  }
+  /* Report each scope's begin/end as nanosecond counts relative to ScopeTimings::epoch. */
+  for (ScopeTimings &query : queries) {
+    ScopeTimings::Nanoseconds begin = query.cpu_start - ScopeTimings::epoch;
+    ScopeTimings::Nanoseconds end = query.cpu_end - ScopeTimings::epoch;
+    ProfileReport::get().add_group_cpu(query.name, begin.count(), end.count());
+  }
+
+  queries.clear();
 }

 bool MTLContext::debug_capture_begin(const char * /*title*/)
--- a/source/creator/creator_args.cc
+++ b/source/creator/creator_args.cc
@@ -784,8 +784,8 @@ static void print_help(bArgs *ba, bool all)
  BLI_args_print_arg_doc(ba, "--gpu-backend");
 #  ifdef WITH_OPENGL_BACKEND
  BLI_args_print_arg_doc(ba, "--gpu-compilation-subprocesses");
-  BLI_args_print_arg_doc(ba, "--profile-gpu");
 #  endif
+  BLI_args_print_arg_doc(ba, "--profile-gpu");

  PRINT("\n");
  PRINT("Misc Options:\n");
@@ -2512,7 +2512,6 @@ static int arg_handle_addons_set(int argc, const char **argv, void *data)
  return 0;
 }

-#  ifdef WITH_OPENGL_BACKEND
 static const char arg_handle_profile_gpu_set_doc[] =
    "\n"
    "\tEnable CPU & GPU performance profiling for GPU debug groups\n"
@@ -2522,7 +2521,6 @@ static int arg_handle_profile_gpu_set(int /*argc*/, const char ** /*argv*/, void
  G.profile_gpu = true;
  return 0;
 }
-#  endif

 /**
 * Implementation for #arg_handle_load_last_file, also used by `--open-last`.
@@ -2694,8 +2692,8 @@ void main_args_setup(bContext *C, bArgs *ba, bool all)
               "--gpu-compilation-subprocesses",
               CB(arg_handle_gpu_compilation_subprocesses_set),
               nullptr);
-  BLI_args_add(ba, nullptr, "--profile-gpu", CB(arg_handle_profile_gpu_set), nullptr);
 #  endif
+  BLI_args_add(ba, nullptr, "--profile-gpu", CB(arg_handle_profile_gpu_set), nullptr);

  /* Pass: Background Mode & Settings
   *