diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 32ad6dedb72..de33952bbfc 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -219,6 +219,8 @@ def list_render_passes(scene, srl):
     # Debug passes.
     if crl.pass_debug_sample_count:
         yield ("Debug Sample Count", "X", 'VALUE')
+    if crl.pass_render_time:
+        yield ("Render Time", "X", 'VALUE')
 
     # Cryptomatte passes.
     # NOTE: Name channels are lowercase RGBA so that compression rules check in OpenEXR DWA code
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 8d6667e60f1..a5450642245 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1480,6 +1480,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
         default=False,
         update=update_render_passes,
     )
+    pass_render_time: BoolProperty(
+        name="Render Time",
+        description="Pass containing an estimate for how long each pixel took to render",
+        default=False,
+        update=update_render_passes,
+    )
     use_pass_volume_direct: BoolProperty(
         name="Volume Direct",
         description="Deliver direct volumetric scattering pass",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 3549067a5e1..e4af0466b01 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1033,6 +1033,7 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel):
 
         col = layout.column(heading="Debug", align=True)
         col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count")
+        col.prop(cycles_view_layer, "pass_render_time", text="Render Time")
 
         layout.prop(view_layer, "pass_alpha_threshold")
 
diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp
index 8353e1f5dc2..6247e4db89c 100644
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -711,6 +711,7 @@ static bool get_known_pass_type(BL::RenderPass &b_pass, PassType &type, PassMode
 
   MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER, false);
   MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT, false);
+  MAP_PASS("Render Time", PASS_RENDER_TIME, false);
 
   MAP_PASS("Guiding Color", PASS_GUIDING_COLOR, false);
   MAP_PASS("Guiding Probability", PASS_GUIDING_PROBABILITY, false);
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
index 1b38eaf0215..ea8caeda0de 100644
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -229,11 +229,10 @@ void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
                                             const BufferParams &buffer_params,
                                             const Destination &destination) const
 {
+  const PassType type = pass_access_info_.type;
   const PassMode mode = pass_access_info_.mode;
-  const PassInfo &pass_info = Pass::get_info(pass_access_info_.type,
-                                             mode,
-                                             pass_access_info_.include_albedo,
-                                             pass_access_info_.is_lightgroup);
+  const PassInfo &pass_info = Pass::get_info(
+      type, mode, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup);
 
   kfilm_convert->pass_offset = pass_access_info_.offset;
   kfilm_convert->pass_stride = buffer_params.pass_stride;
@@ -262,11 +261,15 @@ void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
   /* Background is not denoised, so always use noisy pass. */
   kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND);
 
-  if (pass_info.use_filter) {
-    kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f;
+  /* If we have a sample count pass, we must perform the division in the kernel instead
+   * (unless the sample count pass is the one being read). */
+  const bool divide_by_samples = (type == PASS_SAMPLE_COUNT) ||
+                                 (kfilm_convert->pass_sample_count == PASS_UNUSED);
+  if (pass_info.use_filter && divide_by_samples) {
+    kfilm_convert->scale = num_samples_ != 0 ? pass_info.scale / num_samples_ : 0.0f;
   }
   else {
-    kfilm_convert->scale = 1.0f;
+    kfilm_convert->scale = pass_info.scale;
   }
 
   if (pass_info.use_exposure) {
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index 1089a846ca9..64ab12baf53 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -20,6 +20,7 @@
 #include "session/buffers.h"
 
 #include "util/tbb.h"
+#include "util/time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -128,6 +129,8 @@ void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kern
   KernelWorkTile sample_work_tile = work_tile;
   float *render_buffer = buffers_->buffer.data();
 
+  fast_timer render_timer;
+
   for (int sample = 0; sample < samples_num; ++sample) {
     if (is_cancel_requested()) {
       break;
@@ -173,6 +176,15 @@ void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kern
         kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
       }
     }
+
+    if (kernel_globals->data.film.pass_render_time != PASS_UNUSED) {
+      uint64_t time;
+      if (render_timer.lap(time)) {
+        ccl_global float *buffer = render_buffer + (uint64_t)state->path.render_pixel_index *
+                                                       kernel_globals->data.film.pass_stride;
+        *(buffer + kernel_globals->data.film.pass_render_time) += float(time);
+      }
+    }
     ++sample_work_tile.start_sample;
   }
 }
diff --git a/intern/cycles/kernel/data_template.h b/intern/cycles/kernel/data_template.h
index 07df1279ef7..74ad79f5db8 100644
--- a/intern/cycles/kernel/data_template.h
+++ b/intern/cycles/kernel/data_template.h
@@ -116,6 +116,7 @@ KERNEL_STRUCT_MEMBER(film, float, pass_alpha_threshold)
 KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher)
 KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_sample_count)
 KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_matte)
+KERNEL_STRUCT_MEMBER(film, int, pass_render_time)
 /* Cryptomatte. */
 KERNEL_STRUCT_MEMBER(film, int, cryptomatte_passes)
 KERNEL_STRUCT_MEMBER(film, int, cryptomatte_depth)
diff --git a/intern/cycles/kernel/film/read.h b/intern/cycles/kernel/film/read.h
index 75d86caabbc..ed2e01ab8d9 100644
--- a/intern/cycles/kernel/film/read.h
+++ b/intern/cycles/kernel/film/read.h
@@ -37,10 +37,10 @@ ccl_device_inline float film_get_scale(const ccl_global KernelFilmConvert *ccl_r
   if (kfilm_convert->pass_use_filter) {
     const uint sample_count = *(
         (const ccl_global uint *)(buffer + kfilm_convert->pass_sample_count));
-    return 1.0f / sample_count;
+    return kfilm_convert->scale / sample_count;
   }
 
-  return 1.0f;
+  return kfilm_convert->scale;
 }
 
 ccl_device_inline float film_get_scale_exposure(const ccl_global KernelFilmConvert *ccl_restrict
@@ -81,10 +81,10 @@ ccl_device_inline bool film_get_scale_and_scale_exposure(
   }
 
   if (kfilm_convert->pass_use_filter) {
-    *scale = 1.0f / sample_count;
+    *scale = kfilm_convert->scale / sample_count;
   }
   else {
-    *scale = 1.0f;
+    *scale = kfilm_convert->scale;
   }
 
   if (kfilm_convert->pass_use_exposure) {
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index e2aa26e90e5..55de43cedb5 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -540,6 +540,7 @@ enum PassType {
   PASS_DENOISING_ALBEDO,
   PASS_DENOISING_DEPTH,
   PASS_DENOISING_PREVIOUS,
+  PASS_RENDER_TIME,
 
   /* PASS_SHADOW_CATCHER accumulates contribution of shadow catcher object which is not affected by
    * any other object. The pass accessor will divide the combined pass by the shadow catcher. The
diff --git a/intern/cycles/scene/film.cpp b/intern/cycles/scene/film.cpp
index a30bad2620e..dde98762e05 100644
--- a/intern/cycles/scene/film.cpp
+++ b/intern/cycles/scene/film.cpp
@@ -199,6 +199,7 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
   kfilm->pass_denoising_albedo = PASS_UNUSED;
   kfilm->pass_denoising_depth = PASS_UNUSED;
   kfilm->pass_sample_count = PASS_UNUSED;
+  kfilm->pass_render_time = PASS_UNUSED;
   kfilm->pass_adaptive_aux_buffer = PASS_UNUSED;
   kfilm->pass_shadow_catcher = PASS_UNUSED;
   kfilm->pass_shadow_catcher_sample_count = PASS_UNUSED;
@@ -395,6 +396,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
       case PASS_SAMPLE_COUNT:
         kfilm->pass_sample_count = kfilm->pass_stride;
         break;
+      case PASS_RENDER_TIME:
+        kfilm->pass_render_time = kfilm->pass_stride;
+        break;
 
       case PASS_AOV_COLOR:
         if (!have_aov_color) {
diff --git a/intern/cycles/scene/pass.cpp b/intern/cycles/scene/pass.cpp
index 780b1ae0cb6..1078df82d37 100644
--- a/intern/cycles/scene/pass.cpp
+++ b/intern/cycles/scene/pass.cpp
@@ -5,6 +5,7 @@
 #include "scene/pass.h"
 
 #include "util/log.h"
+#include "util/time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -92,6 +93,7 @@ const NodeEnum *Pass::get_type_enum()
     pass_type_enum.insert("denoising_previous", PASS_DENOISING_PREVIOUS);
     pass_type_enum.insert("volume_majorant", PASS_VOLUME_MAJORANT);
     pass_type_enum.insert("volume_majorant_sample_count", PASS_VOLUME_MAJORANT_SAMPLE_COUNT);
+    pass_type_enum.insert("render_time", PASS_RENDER_TIME);
 
     pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER);
     pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT);
@@ -344,6 +346,12 @@ PassInfo Pass::get_info(const PassType type,
       pass_info.num_components = 1;
       pass_info.use_exposure = false;
       break;
+    case PASS_RENDER_TIME:
+      pass_info.num_components = 1;
+      pass_info.use_exposure = false;
+      pass_info.use_filter = false;
+      pass_info.scale = 1000.0f / float(time_fast_frequency());
+      break;
 
     case PASS_AOV_COLOR:
       pass_info.num_components = 4;
diff --git a/intern/cycles/scene/pass.h b/intern/cycles/scene/pass.h
index cfca18f6565..46f0a1169d0 100644
--- a/intern/cycles/scene/pass.h
+++ b/intern/cycles/scene/pass.h
@@ -29,6 +29,7 @@ struct PassInfo {
   bool use_filter = false;
   bool use_exposure = false;
   bool is_written = true;
+  float scale = 1.0f;
   PassType divide_type = PASS_NONE;
   PassType direct_type = PASS_NONE;
   PassType indirect_type = PASS_NONE;
diff --git a/intern/cycles/util/time.cpp b/intern/cycles/util/time.cpp
index 685a8b82143..6b951010ff8 100644
--- a/intern/cycles/util/time.cpp
+++ b/intern/cycles/util/time.cpp
@@ -4,6 +4,7 @@
 
 #include "util/time.h"
 
+#include <chrono>
 #include <cstdlib>
 
 #if !defined(_WIN32)
@@ -17,6 +18,14 @@
 #  include "util/windows.h"
 #endif
 
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 #ifdef _WIN32
@@ -64,6 +73,71 @@ void time_sleep(double t)
 }
 #endif
 
+#if defined(__aarch64__) || defined(_M_ARM64)
+/* Use cntvct_el0/cntfrq_el0 registers on ARM64. */
+
+uint64_t time_fast_tick(uint32_t * /*last_cpu*/)
+{
+#  if defined(ARCH_COMPILER_MSVC)
+  return _ReadStatusReg(ARM64_CNTVCT_EL0);
+#  else
+  uint64_t counter;
+  asm volatile("mrs %x0, cntvct_el0" : "=r"(counter)); /* Volatile: re-read on every call. */
+  return counter;
+#  endif
+}
+uint64_t time_fast_frequency()
+{
+#  if defined(ARCH_COMPILER_MSVC)
+  return _ReadStatusReg(ARM64_CNTFRQ_EL0);
+#  else
+  uint64_t freq; /* Ticks per second of the counter read by time_fast_tick(). */
+  asm("mrs %x0, cntfrq_el0" : "=r"(freq));
+  return freq;
+#  endif
+}
+#elif defined(__x86_64__) || defined(_M_X64)
+/* Use RDTSCP on x86-64. */
+
+uint64_t time_fast_tick(uint32_t *last_cpu)
+{
+  return __rdtscp(last_cpu); /* Returns the TSC and also writes a CPU identifier to last_cpu. */
+}
+uint64_t time_fast_frequency()
+{
+  /* Unfortunately TSC does not provide an easily accessible frequency value, so roughly calibrate
+   * by sleeping a millisecond. Not ideal, but good enough for our purposes.
+   *
+   * The calibration is the initializer of a function-local static, so it runs exactly once
+   * and concurrent first calls wait for it instead of racing on a hand-rolled flag. */
+  static const uint64_t frequency = []() -> uint64_t {
+    uint32_t cpu;
+    const uint64_t start_tick = time_fast_tick(&cpu);
+    const double start_precise = time_dt();
+    time_sleep(0.001);
+    const uint64_t end_tick = time_fast_tick(&cpu);
+    const double end_precise = time_dt();
+    /* Elapsed ticks over elapsed time_dt() time gives the tick frequency. */
+    return uint64_t(double(end_tick - start_tick) / (end_precise - start_precise));
+  }();
+
+  return frequency;
+}
+#else
+/* Fall back to std::chrono::steady_clock. */
+
+uint64_t time_fast_tick(uint32_t * /*last_cpu*/)
+{
+  auto now = std::chrono::steady_clock::now();
+  auto nanoseconds = std::chrono::time_point_cast<std::chrono::nanoseconds>(now);
+  return nanoseconds.time_since_epoch().count(); /* Nanoseconds since the clock's epoch. */
+}
+uint64_t time_fast_frequency()
+{
+  return 1000000000; /* time_fast_tick() counts nanoseconds here: 1e9 ticks per second. */
+}
+#endif
+
 /* Time in format "hours:minutes:seconds.hundreds" */
 
 string time_human_readable_from_seconds(const double seconds)
diff --git a/intern/cycles/util/time.h b/intern/cycles/util/time.h
index 04e06832e13..b07d6620c5e 100644
--- a/intern/cycles/util/time.h
+++ b/intern/cycles/util/time.h
@@ -18,6 +18,16 @@ double time_dt();
 
 void time_sleep(const double t);
 
+/* Fast timer for applications where overhead is critical and some inaccuracy is acceptable.
+ *
+ * On x86, this uses RDTSCP, which also can check which CPU the code runs on, which in turn
+ * allows us to skip measurements where we moved CPU in-between (which might be invalid due
+ * to different clock states between cores and/or misleading due to OS scheduling). Therefore,
+ * we provide last_cpu to time_fast_tick, and it may set it if supported. */
+
+uint64_t time_fast_tick(uint32_t *last_cpu);
+uint64_t time_fast_frequency();
+
 /* Scoped timer. */
 
 class scoped_timer {
@@ -49,6 +59,33 @@ class scoped_timer {
   double time_start_;
 };
 
+/* Lap timer on top of time_fast_tick(). Deltas are raw ticks, see time_fast_frequency(). */
+class fast_timer {
+ public:
+  fast_timer() : last_cpu(0)
+  {
+    last_value = time_fast_tick(&last_cpu);
+  }
+
+  /* Store the ticks elapsed since the previous lap (or construction) in `delta` and start a
+   * new lap. Returns false if the reported CPU changed in-between, so the delta is suspect. */
+  bool lap(uint64_t &delta)
+  {
+    uint32_t current_cpu = 0;
+    const uint64_t current_value = time_fast_tick(&current_cpu);
+
+    delta = current_value - last_value;
+    const bool same_cpu = (current_cpu == last_cpu);
+    last_cpu = current_cpu;
+    last_value = current_value;
+    return same_cpu;
+  }
+
+ protected:
+  uint32_t last_cpu;
+  uint64_t last_value;
+};
+
 class scoped_callback_timer {
  public:
   using callback_type = std::function<void(double)>;