diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 32ad6dedb72..de33952bbfc 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -219,6 +219,8 @@ def list_render_passes(scene, srl): # Debug passes. if crl.pass_debug_sample_count: yield ("Debug Sample Count", "X", 'VALUE') + if crl.pass_render_time: + yield ("Render Time", "X", 'VALUE') # Cryptomatte passes. # NOTE: Name channels are lowercase RGBA so that compression rules check in OpenEXR DWA code diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 8d6667e60f1..a5450642245 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1480,6 +1480,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): default=False, update=update_render_passes, ) + pass_render_time: BoolProperty( + name="Render Time", + description="Pass containing an estimate for how long each pixel took to render", + default=False, + update=update_render_passes, + ) use_pass_volume_direct: BoolProperty( name="Volume Direct", description="Deliver direct volumetric scattering pass", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 3549067a5e1..e4af0466b01 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -1033,6 +1033,7 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel): col = layout.column(heading="Debug", align=True) col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count") + col.prop(cycles_view_layer, "pass_render_time", text="Render Time") layout.prop(view_layer, "pass_alpha_threshold") diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp index 8353e1f5dc2..6247e4db89c 100644 --- a/intern/cycles/blender/sync.cpp +++ b/intern/cycles/blender/sync.cpp @@ -711,6 +711,7 @@ static bool get_known_pass_type(BL::RenderPass 
&b_pass, PassType &type, PassMode MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER, false); MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT, false); + MAP_PASS("Render Time", PASS_RENDER_TIME, false); MAP_PASS("Guiding Color", PASS_GUIDING_COLOR, false); MAP_PASS("Guiding Probability", PASS_GUIDING_PROBABILITY, false); diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp index 1b38eaf0215..ea8caeda0de 100644 --- a/intern/cycles/integrator/pass_accessor.cpp +++ b/intern/cycles/integrator/pass_accessor.cpp @@ -229,11 +229,10 @@ void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert, const BufferParams &buffer_params, const Destination &destination) const { + const PassType type = pass_access_info_.type; const PassMode mode = pass_access_info_.mode; - const PassInfo &pass_info = Pass::get_info(pass_access_info_.type, - mode, - pass_access_info_.include_albedo, - pass_access_info_.is_lightgroup); + const PassInfo &pass_info = Pass::get_info( + type, mode, pass_access_info_.include_albedo, pass_access_info_.is_lightgroup); kfilm_convert->pass_offset = pass_access_info_.offset; kfilm_convert->pass_stride = buffer_params.pass_stride; @@ -262,11 +261,15 @@ void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert, /* Background is not denoised, so always use noisy pass. */ kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND); - if (pass_info.use_filter) { - kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f; + /* If we have a sample count pass, we must perform the division in the kernel instead + * (unless the sample count pass is the one being read). */ + const bool divide_by_samples = (type == PASS_SAMPLE_COUNT) || + (kfilm_convert->pass_sample_count == PASS_UNUSED); + if (pass_info.use_filter && divide_by_samples) { + kfilm_convert->scale = num_samples_ != 0 ? 
pass_info.scale / num_samples_ : 0.0f; } else { - kfilm_convert->scale = 1.0f; + kfilm_convert->scale = pass_info.scale; } if (pass_info.use_exposure) { diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp index 1089a846ca9..64ab12baf53 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.cpp +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -20,6 +20,7 @@ #include "session/buffers.h" #include "util/tbb.h" +#include "util/time.h" CCL_NAMESPACE_BEGIN @@ -128,6 +129,8 @@ void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kern KernelWorkTile sample_work_tile = work_tile; float *render_buffer = buffers_->buffer.data(); + fast_timer render_timer; + for (int sample = 0; sample < samples_num; ++sample) { if (is_cancel_requested()) { break; @@ -173,6 +176,15 @@ void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kern kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer); } } + + if (kernel_globals->data.film.pass_render_time != PASS_UNUSED) { + uint64_t time; + if (render_timer.lap(time)) { + ccl_global float *buffer = render_buffer + (uint64_t)state->path.render_pixel_index * + kernel_globals->data.film.pass_stride; + *(buffer + kernel_globals->data.film.pass_render_time) += float(time); + } + } ++sample_work_tile.start_sample; } } diff --git a/intern/cycles/kernel/data_template.h b/intern/cycles/kernel/data_template.h index 07df1279ef7..74ad79f5db8 100644 --- a/intern/cycles/kernel/data_template.h +++ b/intern/cycles/kernel/data_template.h @@ -116,6 +116,7 @@ KERNEL_STRUCT_MEMBER(film, float, pass_alpha_threshold) KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher) KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_sample_count) KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_matte) +KERNEL_STRUCT_MEMBER(film, int, pass_render_time) /* Cryptomatte. 
*/ KERNEL_STRUCT_MEMBER(film, int, cryptomatte_passes) KERNEL_STRUCT_MEMBER(film, int, cryptomatte_depth) diff --git a/intern/cycles/kernel/film/read.h b/intern/cycles/kernel/film/read.h index 75d86caabbc..ed2e01ab8d9 100644 --- a/intern/cycles/kernel/film/read.h +++ b/intern/cycles/kernel/film/read.h @@ -37,10 +37,10 @@ ccl_device_inline float film_get_scale(const ccl_global KernelFilmConvert *ccl_r if (kfilm_convert->pass_use_filter) { const uint sample_count = *( (const ccl_global uint *)(buffer + kfilm_convert->pass_sample_count)); - return 1.0f / sample_count; + return kfilm_convert->scale / sample_count; } - return 1.0f; + return kfilm_convert->scale; } ccl_device_inline float film_get_scale_exposure(const ccl_global KernelFilmConvert *ccl_restrict @@ -81,10 +81,10 @@ ccl_device_inline bool film_get_scale_and_scale_exposure( } if (kfilm_convert->pass_use_filter) { - *scale = 1.0f / sample_count; + *scale = kfilm_convert->scale / sample_count; } else { - *scale = 1.0f; + *scale = kfilm_convert->scale; } if (kfilm_convert->pass_use_exposure) { diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index e2aa26e90e5..55de43cedb5 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -540,6 +540,7 @@ enum PassType { PASS_DENOISING_ALBEDO, PASS_DENOISING_DEPTH, PASS_DENOISING_PREVIOUS, + PASS_RENDER_TIME, /* PASS_SHADOW_CATCHER accumulates contribution of shadow catcher object which is not affected by * any other object. The pass accessor will divide the combined pass by the shadow catcher. 
The diff --git a/intern/cycles/scene/film.cpp b/intern/cycles/scene/film.cpp index a30bad2620e..dde98762e05 100644 --- a/intern/cycles/scene/film.cpp +++ b/intern/cycles/scene/film.cpp @@ -199,6 +199,7 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_denoising_albedo = PASS_UNUSED; kfilm->pass_denoising_depth = PASS_UNUSED; kfilm->pass_sample_count = PASS_UNUSED; + kfilm->pass_render_time = PASS_UNUSED; kfilm->pass_adaptive_aux_buffer = PASS_UNUSED; kfilm->pass_shadow_catcher = PASS_UNUSED; kfilm->pass_shadow_catcher_sample_count = PASS_UNUSED; @@ -395,6 +396,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) case PASS_SAMPLE_COUNT: kfilm->pass_sample_count = kfilm->pass_stride; break; + case PASS_RENDER_TIME: + kfilm->pass_render_time = kfilm->pass_stride; + break; case PASS_AOV_COLOR: if (!have_aov_color) { diff --git a/intern/cycles/scene/pass.cpp b/intern/cycles/scene/pass.cpp index 780b1ae0cb6..1078df82d37 100644 --- a/intern/cycles/scene/pass.cpp +++ b/intern/cycles/scene/pass.cpp @@ -5,6 +5,7 @@ #include "scene/pass.h" #include "util/log.h" +#include "util/time.h" CCL_NAMESPACE_BEGIN @@ -92,6 +93,7 @@ const NodeEnum *Pass::get_type_enum() pass_type_enum.insert("denoising_previous", PASS_DENOISING_PREVIOUS); pass_type_enum.insert("volume_majorant", PASS_VOLUME_MAJORANT); pass_type_enum.insert("volume_majorant_sample_count", PASS_VOLUME_MAJORANT_SAMPLE_COUNT); + pass_type_enum.insert("render_time", PASS_RENDER_TIME); pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER); pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT); @@ -344,6 +346,12 @@ PassInfo Pass::get_info(const PassType type, pass_info.num_components = 1; pass_info.use_exposure = false; break; + case PASS_RENDER_TIME: + pass_info.num_components = 1; + pass_info.use_exposure = false; + pass_info.use_filter = false; + pass_info.scale = 1000.0f / float(time_fast_frequency()); + break; case 
PASS_AOV_COLOR:
       pass_info.num_components = 4;
diff --git a/intern/cycles/scene/pass.h b/intern/cycles/scene/pass.h
index cfca18f6565..46f0a1169d0 100644
--- a/intern/cycles/scene/pass.h
+++ b/intern/cycles/scene/pass.h
@@ -29,6 +29,7 @@ struct PassInfo {
   bool use_filter = false;
   bool use_exposure = false;
   bool is_written = true;
+  float scale = 1.0f;
   PassType divide_type = PASS_NONE;
   PassType direct_type = PASS_NONE;
   PassType indirect_type = PASS_NONE;
diff --git a/intern/cycles/util/time.cpp b/intern/cycles/util/time.cpp
index 685a8b82143..6b951010ff8 100644
--- a/intern/cycles/util/time.cpp
+++ b/intern/cycles/util/time.cpp
@@ -4,6 +4,7 @@
 
 #include "util/time.h"
 
+#include <chrono>
 #include
 
 #if !defined(_WIN32)
@@ -17,6 +18,14 @@
 #  include "util/windows.h"
 #endif
 
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 #ifdef _WIN32
@@ -64,6 +73,76 @@ void time_sleep(double t)
 }
 #endif
 
+#if defined(__aarch64__) || defined(_M_ARM64)
+/* Use cntvct_el0/cntfrq_el0 registers on ARM64.
+ *
+ * NOTE(review): confirm that ARCH_COMPILER_MSVC is actually defined for MSVC builds of this
+ * file, otherwise the `_ReadStatusReg` path below is never selected. */
+
+uint64_t time_fast_tick(uint32_t * /*last_cpu*/)
+{
+#  if defined(ARCH_COMPILER_MSVC)
+  return _ReadStatusReg(ARM64_CNTVCT_EL0);
+#  else
+  uint64_t counter;
+  /* `volatile` is required: the statement has no inputs, so without it the compiler is free to
+   * merge or hoist repeated counter reads. */
+  asm volatile("mrs %x0, cntvct_el0" : "=r"(counter));
+  return counter;
+#  endif
+}
+uint64_t time_fast_frequency()
+{
+#  if defined(ARCH_COMPILER_MSVC)
+  return _ReadStatusReg(ARM64_CNTFRQ_EL0);
+#  else
+  uint64_t freq;
+  asm("mrs %x0, cntfrq_el0" : "=r"(freq));
+  return freq;
+#  endif
+}
+#elif defined(__x86_64__) || defined(_M_X64)
+/* Use RDTSCP on x86-64. */
+
+uint64_t time_fast_tick(uint32_t *last_cpu)
+{
+  return __rdtscp(last_cpu);
+}
+uint64_t time_fast_frequency()
+{
+  /* Unfortunately TSC does not provide an easily accessible frequency value, so roughly
+   * calibrate by sleeping a millisecond. Not ideal, but good enough for our purposes.
+   *
+   * The calibration is the initializer of a function-local static, which C++11 guarantees to
+   * run exactly once even when several threads call this function concurrently. */
+  static const uint64_t frequency = []() -> uint64_t {
+    uint32_t cpu;
+    const uint64_t start_tick = time_fast_tick(&cpu);
+    const double start_precise = time_dt();
+    time_sleep(0.001);
+    const uint64_t end_tick = time_fast_tick(&cpu);
+    const double end_precise = time_dt();
+    return uint64_t(double(end_tick - start_tick) / (end_precise - start_precise));
+  }();
+
+  return frequency;
+}
+#else
+/* Fall back to std::chrono::steady_clock. */
+
+uint64_t time_fast_tick(uint32_t * /*last_cpu*/)
+{
+  auto now = std::chrono::steady_clock::now();
+  auto nanoseconds = std::chrono::time_point_cast<std::chrono::nanoseconds>(now);
+  return nanoseconds.time_since_epoch().count();
+}
+uint64_t time_fast_frequency()
+{
+  /* steady_clock ticks are reported in nanoseconds above. */
+  return 1000000000;
+}
+#endif
+
 /* Time in format "hours:minutes:seconds.hundreds" */
 
 string time_human_readable_from_seconds(const double seconds)
diff --git a/intern/cycles/util/time.h b/intern/cycles/util/time.h
index 04e06832e13..b07d6620c5e 100644
--- a/intern/cycles/util/time.h
+++ b/intern/cycles/util/time.h
@@ -18,6 +18,16 @@ double time_dt();
 
 void time_sleep(const double t);
 
+/* Fast timer for applications where overhead is critical and some inaccuracy is acceptable.
+ *
+ * On x86, this uses RDTSCP, which also can check which CPU the code runs on, which in turn
+ * allows us to skip measurements where we moved CPU in-between (which might be invalid due
+ * to different clock states between cores and/or misleading due to OS scheduling). Therefore,
+ * we provide last_cpu to time_fast_tick, and it may set it if supported. */
+
+uint64_t time_fast_tick(uint32_t *last_cpu);
+uint64_t time_fast_frequency();
+
 /* Scoped timer.
*/
 
 class scoped_timer {
@@ -49,6 +59,33 @@ class scoped_timer {
   double time_start_;
 };
 
+/* Lap timer built on time_fast_tick(); lap() reports raw tick deltas and whether the CPU index
+ * reported by the tick source stayed the same across the lap. */
+class fast_timer {
+ public:
+  fast_timer() : last_cpu(0), last_value(time_fast_tick(&last_cpu)) {}
+
+  /* Store the ticks elapsed since the previous lap (or construction) in `delta` and restart the
+   * lap. Returns false when the reported CPU index changed in-between. */
+  bool lap(uint64_t &delta)
+  {
+    uint32_t cpu_now = 0;
+    const uint64_t ticks_now = time_fast_tick(&cpu_now);
+
+    delta = ticks_now - last_value;
+    const bool same_cpu = (cpu_now == last_cpu);
+
+    last_value = ticks_now;
+    last_cpu = cpu_now;
+
+    return same_cpu;
+  }
+
+ protected:
+  uint32_t last_cpu;
+  uint64_t last_value;
+};
+
 class scoped_callback_timer {
  public:
   using callback_type = std::function;