Cycles: Add Render Time pass

This implements a basic render time pass, using hardware counters to minimize the impact on render time. x86-64 uses the RDTSCP instruction to read the TSC, while ARM64 reads the cntvct_el0 register. In theory the TSC is not always reliable (e.g. on old CPUs it was tied to the current clock rate), but on reasonably recent CPU models it should be fine. On any other architecture, it falls back to `std::chrono::steady_clock`, which should still be very fast. The output is in milliseconds of CPU-time per pixel. Pull Request: https://projects.blender.org/blender/blender/pulls/125933
2025-09-22 21:54:08 +02:00
parent 12cdfb5856
commit 78147b5db7
14 changed files with 162 additions and 11 deletions
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -219,6 +219,8 @@ def list_render_passes(scene, srl):
    # Debug passes.
    if crl.pass_debug_sample_count:
        yield ("Debug Sample Count", "X", 'VALUE')
+    if crl.pass_render_time:
+        yield ("Render Time", "X", 'VALUE')

    # Cryptomatte passes.
    # NOTE: Name channels are lowercase RGBA so that compression rules check in OpenEXR DWA code
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1480,6 +1480,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
        default=False,
        update=update_render_passes,
    )
+    pass_render_time: BoolProperty(
+        name="Render Time",
+        description="Pass containing an estimate for how long each pixel took to render",
+        default=False,
+        update=update_render_passes,
+    )
    use_pass_volume_direct: BoolProperty(
        name="Volume Direct",
        description="Deliver direct volumetric scattering pass",
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1033,6 +1033,7 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel):

        col = layout.column(heading="Debug", align=True)
        col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count")
+        col.prop(cycles_view_layer, "pass_render_time", text="Render Time")

        layout.prop(view_layer, "pass_alpha_threshold")

--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -711,6 +711,7 @@ static bool get_known_pass_type(BL::RenderPass &b_pass, PassType &type, PassMode

  MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER, false);
  MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT, false);
+  MAP_PASS("Render Time", PASS_RENDER_TIME, false);

  MAP_PASS("Guiding Color", PASS_GUIDING_COLOR, false);
  MAP_PASS("Guiding Probability", PASS_GUIDING_PROBABILITY, false);
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -229,11 +229,10 @@ void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
                                            const BufferParams &buffer_params,
                                            const Destination &destination) const
 {
+  const PassType type = pass_access_info_.type;
  const PassMode mode = pass_access_info_.mode;
-  const PassInfo &pass_info = Pass::get_info(pass_access_info_.type,
-                                             mode,
-                                             pass_access_info_.include_albedo,
-                                             pass_access_info_.is_lightgroup);
+  const PassInfo &pass_info = Pass::get_info(
+      type, mode, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);

  kfilm_convert->pass_offset = pass_access_info_.offset;
  kfilm_convert->pass_stride = buffer_params.pass_stride;
@@ -262,11 +261,15 @@ void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
  /* Background is not denoised, so always use noisy pass. */
  kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND);

-  if (pass_info.use_filter) {
-    kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f;
+  /* If we have a sample count pass, we must perform the division in the kernel instead
+   * (unless the sample count pass is the one being read). */
+  const bool divide_by_samples = (type == PASS_SAMPLE_COUNT) ||
+                                 (kfilm_convert->pass_sample_count == PASS_UNUSED);
+  if (pass_info.use_filter && divide_by_samples) {
+    kfilm_convert->scale = num_samples_ != 0 ? pass_info.scale / num_samples_ : 0.0f;
  }
  else {
-    kfilm_convert->scale = 1.0f;
+    kfilm_convert->scale = pass_info.scale;
  }

  if (pass_info.use_exposure) {
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -20,6 +20,7 @@
 #include "session/buffers.h"

 #include "util/tbb.h"
+#include "util/time.h"

 CCL_NAMESPACE_BEGIN

@@ -128,6 +129,8 @@ void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kern
  KernelWorkTile sample_work_tile = work_tile;
  float *render_buffer = buffers_->buffer.data();

+  fast_timer render_timer;
+
  for (int sample = 0; sample < samples_num; ++sample) {
    if (is_cancel_requested()) {
      break;
@@ -173,6 +176,15 @@ void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kern
        kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
      }
    }
+
+    if (kernel_globals->data.film.pass_render_time != PASS_UNUSED) {
+      uint64_t time;
+      if (render_timer.lap(time)) {
+        ccl_global float *buffer = render_buffer + (uint64_t)state->path.render_pixel_index *
+                                                       kernel_globals->data.film.pass_stride;
+        *(buffer + kernel_globals->data.film.pass_render_time) += float(time);
+      }
+    }
    ++sample_work_tile.start_sample;
  }
 }
--- a/intern/cycles/kernel/data_template.h
+++ b/intern/cycles/kernel/data_template.h
@@ -116,6 +116,7 @@ KERNEL_STRUCT_MEMBER(film, float, pass_alpha_threshold)
 KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher)
 KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_sample_count)
 KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_matte)
+KERNEL_STRUCT_MEMBER(film, int, pass_render_time)
 /* Cryptomatte. */
 KERNEL_STRUCT_MEMBER(film, int, cryptomatte_passes)
 KERNEL_STRUCT_MEMBER(film, int, cryptomatte_depth)
--- a/intern/cycles/kernel/film/read.h
+++ b/intern/cycles/kernel/film/read.h
@@ -37,10 +37,10 @@ ccl_device_inline float film_get_scale(const ccl_global KernelFilmConvert *ccl_r
  if (kfilm_convert->pass_use_filter) {
    const uint sample_count = *(
        (const ccl_global uint *)(buffer + kfilm_convert->pass_sample_count));
-    return 1.0f / sample_count;
+    return kfilm_convert->scale / sample_count;
  }

-  return 1.0f;
+  return kfilm_convert->scale;
 }

 ccl_device_inline float film_get_scale_exposure(const ccl_global KernelFilmConvert *ccl_restrict
@@ -81,10 +81,10 @@ ccl_device_inline bool film_get_scale_and_scale_exposure(
  }

  if (kfilm_convert->pass_use_filter) {
-    *scale = 1.0f / sample_count;
+    *scale = kfilm_convert->scale / sample_count;
  }
  else {
-    *scale = 1.0f;
+    *scale = kfilm_convert->scale;
  }

  if (kfilm_convert->pass_use_exposure) {
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -540,6 +540,7 @@ enum PassType {
  PASS_DENOISING_ALBEDO,
  PASS_DENOISING_DEPTH,
  PASS_DENOISING_PREVIOUS,
+  PASS_RENDER_TIME,

  /* PASS_SHADOW_CATCHER accumulates contribution of shadow catcher object which is not affected by
   * any other object. The pass accessor will divide the combined pass by the shadow catcher. The
--- a/intern/cycles/scene/film.cpp
+++ b/intern/cycles/scene/film.cpp
@@ -199,6 +199,7 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
  kfilm->pass_denoising_albedo = PASS_UNUSED;
  kfilm->pass_denoising_depth = PASS_UNUSED;
  kfilm->pass_sample_count = PASS_UNUSED;
+  kfilm->pass_render_time = PASS_UNUSED;
  kfilm->pass_adaptive_aux_buffer = PASS_UNUSED;
  kfilm->pass_shadow_catcher = PASS_UNUSED;
  kfilm->pass_shadow_catcher_sample_count = PASS_UNUSED;
@@ -395,6 +396,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
      case PASS_SAMPLE_COUNT:
        kfilm->pass_sample_count = kfilm->pass_stride;
        break;
+      case PASS_RENDER_TIME:
+        kfilm->pass_render_time = kfilm->pass_stride;
+        break;

      case PASS_AOV_COLOR:
        if (!have_aov_color) {
--- a/intern/cycles/scene/pass.cpp
+++ b/intern/cycles/scene/pass.cpp
@@ -5,6 +5,7 @@
 #include "scene/pass.h"

 #include "util/log.h"
+#include "util/time.h"

 CCL_NAMESPACE_BEGIN

@@ -92,6 +93,7 @@ const NodeEnum *Pass::get_type_enum()
    pass_type_enum.insert("denoising_previous", PASS_DENOISING_PREVIOUS);
    pass_type_enum.insert("volume_majorant", PASS_VOLUME_MAJORANT);
    pass_type_enum.insert("volume_majorant_sample_count", PASS_VOLUME_MAJORANT_SAMPLE_COUNT);
+    pass_type_enum.insert("render_time", PASS_RENDER_TIME);

    pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER);
    pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT);
@@ -344,6 +346,12 @@ PassInfo Pass::get_info(const PassType type,
      pass_info.num_components = 1;
      pass_info.use_exposure = false;
      break;
+    case PASS_RENDER_TIME:
+      pass_info.num_components = 1;
+      pass_info.use_exposure = false;
+      pass_info.use_filter = false;
+      pass_info.scale = 1000.0f / float(time_fast_frequency());
+      break;

    case PASS_AOV_COLOR:
      pass_info.num_components = 4;
--- a/intern/cycles/scene/pass.h
+++ b/intern/cycles/scene/pass.h
@@ -29,6 +29,7 @@ struct PassInfo {
  bool use_filter = false;
  bool use_exposure = false;
  bool is_written = true;
+  float scale = 1.0f;
  PassType divide_type = PASS_NONE;
  PassType direct_type = PASS_NONE;
  PassType indirect_type = PASS_NONE;
--- a/intern/cycles/util/time.cpp
+++ b/intern/cycles/util/time.cpp
@@ -4,6 +4,7 @@

 #include "util/time.h"

+#include <chrono>
 #include <cstdlib>

 #if !defined(_WIN32)
@@ -17,6 +18,14 @@
 #  include "util/windows.h"
 #endif

+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
 CCL_NAMESPACE_BEGIN

 #ifdef _WIN32
@@ -64,6 +73,71 @@ void time_sleep(double t)
 }
 #endif

+#if defined(__aarch64__) || defined(_M_ARM64)
+/* Use cntvct_el0/cntfrq_el0 registers on ARM64. */
+
+uint64_t time_fast_tick(uint32_t * /*last_cpu*/)
+{
+#  if defined(ARCH_COMPILER_MSVC)
+  return _ReadStatusReg(ARM64_CNTVCT_EL0);
+#  else
+  uint64_t counter;
+  asm volatile("mrs %x0, cntvct_el0" : "=r"(counter)); /* volatile: never reuse an old read. */
+  return counter;
+#  endif
+}
+uint64_t time_fast_frequency()
+{
+#  if defined(ARCH_COMPILER_MSVC)
+  return _ReadStatusReg(ARM64_CNTFRQ_EL0);
+#  else
+  uint64_t freq;
+  asm("mrs %x0, cntfrq_el0" : "=r"(freq));
+  return freq;
+#  endif
+}
+#elif defined(__x86_64__) || defined(_M_X64)
+/* Use RDTSCP on x86-64. */
+
+uint64_t time_fast_tick(uint32_t *last_cpu)
+{
+  return __rdtscp(last_cpu);
+}
+uint64_t time_fast_frequency()
+{
+  static bool initialized = false;
+  static uint64_t frequency;
+
+  /* Unfortunately TSC does not provide an easily accessible frequency value, so roughly calibrate
+   * by sleeping a millisecond. Not ideal, but good enough for our purposes. */
+  if (!initialized) {
+    uint32_t cpu;
+    uint64_t start_tick = time_fast_tick(&cpu);
+    double start_precise = time_dt();
+    time_sleep(0.001);
+    uint64_t end_tick = time_fast_tick(&cpu);
+    double end_precise = time_dt();
+    frequency = uint64_t(double(end_tick - start_tick) / (end_precise - start_precise));
+    initialized = true;
+  }
+
+  return frequency;
+}
+#else
+/* Fall back to std::chrono::steady_clock. */
+
+uint64_t time_fast_tick(uint32_t * /*last_cpu*/)
+{
+  auto now = std::chrono::steady_clock::now();
+  auto nanoseconds = std::chrono::time_point_cast<std::chrono::nanoseconds>(now);
+  return nanoseconds.time_since_epoch().count();
+}
+uint64_t time_fast_frequency()
+{
+  return 1000000000;
+}
+#endif
+
 /* Time in format "hours:minutes:seconds.hundreds" */

 string time_human_readable_from_seconds(const double seconds)
--- a/intern/cycles/util/time.h
+++ b/intern/cycles/util/time.h
@@ -18,6 +18,16 @@ double time_dt();

 void time_sleep(const double t);

+/* Fast timer for applications where overhead is critical and some inaccuracy is acceptable.
+ *
+ * On x86, this uses RDTSCP, which also can check which CPU the code runs on, which in turn
+ * allows us to skip measurements where we moved CPU in-between (which might be invalid due
+ * to different clock states between cores and/or misleading due to OS scheduling). Therefore,
+ * we provide last_cpu to time_fast_tick, and it may set it if supported. */
+
+uint64_t time_fast_tick(uint32_t *last_cpu);
+uint64_t time_fast_frequency();
+
 /* Scoped timer. */

 class scoped_timer {
@@ -49,6 +59,33 @@ class scoped_timer {
  double time_start_;
 };

+class fast_timer {
+ public:
+  fast_timer()
+  {
+    last_cpu = 0;
+    last_value = time_fast_tick(&last_cpu);
+  }
+
+  bool lap(uint64_t &delta)
+  {
+    uint32_t new_cpu = 0;
+    uint64_t new_value = time_fast_tick(&new_cpu);
+
+    const bool cpu_consistent = new_cpu == last_cpu;
+    delta = new_value - last_value;
+
+    last_cpu = new_cpu;
+    last_value = new_value;
+
+    return cpu_consistent;
+  }
+
+ protected:
+  uint32_t last_cpu;
+  uint64_t last_value;
+};
+
 class scoped_callback_timer {
 public:
  using callback_type = std::function<void(double)>;