From 8dce2a422bc66a0900d8e68aee2d12f070e54d34 Mon Sep 17 00:00:00 2001 From: Jeroen Bakker Date: Mon, 26 Feb 2024 16:19:26 +0100 Subject: [PATCH] EEVEE-Next: Specialization Constants for Film Accumulation On lower-end hardware the film accumulation performs poorly, sometimes taking up to 10ms. This PR improves the performance somewhat by adding specialization constants for the render passes that are actually needed for rendering, the number of samples, and whether reprojection is enabled. `enabled_categories`: Based on the enabled render passes, the outer loops that handle the specific render passes are enabled/disabled. This improves the performance as no memory will be reserved for branches that are never accessed. `samples_len` & `use_reprojection`: GPU compilers tend to optimize texture fetches by hoisting them to the outer loop. This is only possible when the inner loop can be unrolled. In the case of the film accumulation the inner loop couldn't be unrolled. Adding a specialization constant allows the inner loop to be unrolled. On old or low-end devices the improvement is around 40%. On newer devices the improvement is 50+%. Performance of this shader is similar to that of Godot. 
| GPU | Before | New | |----------------------|--------|-------| | NVIDIA GTX 760 | 3.5ms | 2.4ms | | GFX1036 (RDNA2 iGPU) | 9.9ms | 6.2ms | | AMD Radeon Pro W7500 | 2.1ms | 0.9ms | Pull Request: https://projects.blender.org/blender/blender/pulls/118385 --- .../draw/engines/eevee_next/eevee_film.cc | 42 ++++++++-- .../draw/engines/eevee_next/eevee_film.hh | 2 + .../engines/eevee_next/eevee_shader_shared.hh | 20 +++-- .../eevee_next/shaders/eevee_film_lib.glsl | 82 ++++++++++--------- .../shaders/infos/eevee_film_info.hh | 3 + source/blender/gpu/GPU_shader_shared_utils.h | 1 + 6 files changed, 93 insertions(+), 57 deletions(-) diff --git a/source/blender/draw/engines/eevee_next/eevee_film.cc b/source/blender/draw/engines/eevee_next/eevee_film.cc index 31cdf24beeb..ad2283134ab 100644 --- a/source/blender/draw/engines/eevee_next/eevee_film.cc +++ b/source/blender/draw/engines/eevee_next/eevee_film.cc @@ -77,6 +77,10 @@ void Film::init_aovs() hash = BLI_hash_string(aov->name); index++; } + + if (!aovs.is_empty()) { + enabled_categories_ |= PASS_CATEGORY_AOV; + } } float *Film::read_aov(ViewLayerAOV *aov) @@ -209,6 +213,7 @@ void Film::init(const int2 &extent, const rcti *output_rect) Scene &scene = *inst_.scene; SceneEEVEE &scene_eevee = scene.eevee; + enabled_categories_ = PassCategory(0); init_aovs(); { @@ -285,10 +290,18 @@ void Film::init(const int2 &extent, const rcti *output_rect) const eViewLayerEEVEEPassType color_passes_3 = EEVEE_RENDER_PASS_TRANSPARENT; data_.exposure_scale = pow2f(scene.view_settings.exposure); - data_.has_data = (enabled_passes_ & data_passes) != 0; - data_.any_render_pass_1 = (enabled_passes_ & color_passes_1) != 0; - data_.any_render_pass_2 = (enabled_passes_ & color_passes_2) != 0; - data_.any_render_pass_3 = (enabled_passes_ & color_passes_3) != 0; + if (enabled_passes_ & data_passes) { + enabled_categories_ |= PASS_CATEGORY_DATA; + } + if (enabled_passes_ & color_passes_1) { + enabled_categories_ |= PASS_CATEGORY_COLOR_1; + } + if 
(enabled_passes_ & color_passes_2) { + enabled_categories_ |= PASS_CATEGORY_COLOR_2; + } + if (enabled_passes_ & color_passes_3) { + enabled_categories_ |= PASS_CATEGORY_COLOR_3; + } } { /* Set pass offsets. */ @@ -358,6 +371,13 @@ void Film::init(const int2 &extent, const rcti *output_rect) data_.cryptomatte_object_id = cryptomatte_index_get(EEVEE_RENDER_PASS_CRYPTOMATTE_OBJECT); data_.cryptomatte_asset_id = cryptomatte_index_get(EEVEE_RENDER_PASS_CRYPTOMATTE_ASSET); data_.cryptomatte_material_id = cryptomatte_index_get(EEVEE_RENDER_PASS_CRYPTOMATTE_MATERIAL); + + if ((enabled_passes_ & + (EEVEE_RENDER_PASS_CRYPTOMATTE_ASSET | EEVEE_RENDER_PASS_CRYPTOMATTE_MATERIAL | + EEVEE_RENDER_PASS_CRYPTOMATTE_OBJECT)) != 0) + { + enabled_categories_ |= PASS_CATEGORY_CRYPTOMATTE; + } } { int2 weight_extent = inst_.camera.is_panoramic() ? data_.extent : int2(data_.scaling_factor); @@ -390,7 +410,7 @@ void Film::init(const int2 &extent, const rcti *output_rect) if (reset > 0) { data_.use_history = 0; - data_.use_reprojection = 0; + use_reprojection_ = false; /* Avoid NaN in uninitialized texture memory making history blending dangerous. */ color_accum_tx_.clear(float4(0.0f)); @@ -423,9 +443,13 @@ void Film::sync() * Still bind previous step to avoid undefined behavior. */ eVelocityStep step_next = inst_.is_viewport() ? 
STEP_PREVIOUS : STEP_NEXT; + GPUShader *sh = inst_.shaders.static_shader_get(shader); accumulate_ps_.init(); + accumulate_ps_.specialize_constant(sh, "enabled_categories", int(enabled_categories_)); + accumulate_ps_.specialize_constant(sh, "samples_len", &data_.samples_len); + accumulate_ps_.specialize_constant(sh, "use_reprojection", &use_reprojection_); accumulate_ps_.state_set(DRW_STATE_WRITE_COLOR | DRW_STATE_WRITE_DEPTH | DRW_STATE_DEPTH_ALWAYS); - accumulate_ps_.shader_set(inst_.shaders.static_shader_get(shader)); + accumulate_ps_.shader_set(sh); accumulate_ps_.bind_resources(inst_.uniform_data); accumulate_ps_.bind_ubo("camera_prev", &(*velocity.camera_steps[STEP_PREVIOUS])); accumulate_ps_.bind_ubo("camera_curr", &(*velocity.camera_steps[STEP_CURRENT])); @@ -475,11 +499,11 @@ void Film::sync() void Film::end_sync() { - data_.use_reprojection = inst_.sampling.interactive_mode(); + use_reprojection_ = inst_.sampling.interactive_mode(); /* Just bypass the reprojection and reset the accumulation. */ if (inst_.is_viewport() && force_disable_reprojection_ && inst_.sampling.is_reset()) { - data_.use_reprojection = false; + use_reprojection_ = false; data_.use_history = false; } @@ -511,7 +535,7 @@ float2 Film::pixel_jitter_get() const eViewLayerEEVEEPassType Film::enabled_passes_get() const { - if (inst_.is_viewport() && data_.use_reprojection) { + if (inst_.is_viewport() && use_reprojection_) { /* Enable motion vector rendering but not the accumulation buffer. 
*/ return enabled_passes_ | EEVEE_RENDER_PASS_VECTOR; } diff --git a/source/blender/draw/engines/eevee_next/eevee_film.hh b/source/blender/draw/engines/eevee_next/eevee_film.hh index 03232c69f27..f8d563e846b 100644 --- a/source/blender/draw/engines/eevee_next/eevee_film.hh +++ b/source/blender/draw/engines/eevee_next/eevee_film.hh @@ -79,6 +79,8 @@ class Film { int2 display_extent; eViewLayerEEVEEPassType enabled_passes_ = eViewLayerEEVEEPassType(0); + PassCategory enabled_categories_ = PassCategory(0); + bool use_reprojection_ = false; public: Film(Instance &inst, FilmData &data) : inst_(inst), data_(data){}; diff --git a/source/blender/draw/engines/eevee_next/eevee_shader_shared.hh b/source/blender/draw/engines/eevee_next/eevee_shader_shared.hh index 4f3404579e3..a1ada9bf533 100644 --- a/source/blender/draw/engines/eevee_next/eevee_shader_shared.hh +++ b/source/blender/draw/engines/eevee_next/eevee_shader_shared.hh @@ -30,7 +30,6 @@ using namespace draw; constexpr GPUSamplerState no_filter = GPUSamplerState::default_sampler(); constexpr GPUSamplerState with_filter = {GPU_SAMPLER_FILTERING_LINEAR}; - #endif #define UBO_MIN_MAX_SUPPORTED_SIZE 1 << 14 @@ -240,6 +239,16 @@ enum ePassStorageType : uint32_t { PASS_STORAGE_CRYPTOMATTE = 2u, }; +enum PassCategory : uint32_t { + PASS_CATEGORY_DATA = 1u << 0, + PASS_CATEGORY_COLOR_1 = 1u << 1, + PASS_CATEGORY_COLOR_2 = 1u << 2, + PASS_CATEGORY_COLOR_3 = 1u << 3, + PASS_CATEGORY_AOV = 1u << 4, + PASS_CATEGORY_CRYPTOMATTE = 1u << 5, +}; +ENUM_OPERATORS(PassCategory, PASS_CATEGORY_CRYPTOMATTE) + struct FilmSample { int2 texel; float weight; @@ -269,14 +278,6 @@ struct FilmData { float2 extent_inv; /** Is true if history is valid and can be sampled. Bypass history to resets accumulation. */ bool1 use_history; - /** Is true if combined buffer is valid and can be re-projected to reduce variance. */ - bool1 use_reprojection; - /** Is true if accumulation of non-filtered passes is needed. 
*/ - bool1 has_data; - /** Is true if accumulation of filtered passes is needed. */ - bool1 any_render_pass_1; - bool1 any_render_pass_2; - bool1 any_render_pass_3; /** Controlled by user in lookdev mode or by render settings. */ float background_opacity; /** Output counts per type. */ @@ -326,6 +327,7 @@ struct FilmData { int samples_len; /** Sum of the weights of all samples in the sample table. */ float samples_weight_total; + int _pad1; FilmSample samples[FILM_PRECOMP_SAMPLE_MAX]; }; BLI_STATIC_ASSERT_ALIGN(FilmData, 16) diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_film_lib.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_film_lib.glsl index d6c350a4f33..0b3160ef447 100644 --- a/source/blender/draw/engines/eevee_next/shaders/eevee_film_lib.glsl +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_film_lib.glsl @@ -180,7 +180,7 @@ void film_cryptomatte_layer_accum_and_store( } /* x = hash, y = accumulated weight. Only keep track of 4 highest weighted samples. */ vec2 crypto_samples[4] = vec2[4](vec2(0.0), vec2(0.0), vec2(0.0), vec2(0.0)); - for (int i = 0; i < uniform_buf.film.samples_len; i++) { + for (int i = 0; i < samples_len; i++) { FilmSample src = film_sample_get(i, texel_film); film_sample_cryptomatte_accum(src, layer_component, cryptomatte_tx, crypto_samples); } @@ -201,7 +201,7 @@ float film_distance_load(ivec2 texel) /* Repeat texture coordinates as the weight can be optimized to a small portion of the film. */ texel = texel % imageSize(in_weight_img).xy; - if (!uniform_buf.film.use_history || uniform_buf.film.use_reprojection) { + if (!uniform_buf.film.use_history || use_reprojection) { return 1.0e16; } return imageLoad(in_weight_img, ivec3(texel, FILM_WEIGHT_LAYER_DISTANCE)).x; @@ -212,7 +212,7 @@ float film_weight_load(ivec2 texel) /* Repeat texture coordinates as the weight can be optimized to a small portion of the film. 
*/ texel = texel % imageSize(in_weight_img).xy; - if (!uniform_buf.film.use_history || uniform_buf.film.use_reprojection) { + if (!uniform_buf.film.use_history || use_reprojection) { return 0.0; } return imageLoad(in_weight_img, ivec3(texel, FILM_WEIGHT_LAYER_ACCUMULATION)).x; @@ -456,7 +456,7 @@ void film_store_combined( /* Undo the weighting to get final spatially-filtered color. */ color_src = color / color_weight; - if (uniform_buf.film.use_reprojection) { + if (use_reprojection) { /* Interactive accumulation. Do reprojection and Temporal Anti-Aliasing. */ /* Reproject by finding where this pixel was in the previous frame. */ @@ -632,7 +632,7 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth vec4 combined_accum = vec4(0.0); FilmSample src; - for (int i = uniform_buf.film.samples_len - 1; i >= 0; i--) { + for (int i = samples_len - 1; i >= 0; i--) { src = film_sample_get(i, texel_film); film_sample_accum_combined(src, combined_accum, weight_accum); } @@ -640,13 +640,13 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth film_store_combined(dst, src.texel, combined_accum, weight_accum, out_color); } - if (uniform_buf.film.has_data) { + if (flag_test(enabled_categories, PASS_CATEGORY_DATA)) { float film_distance = film_distance_load(texel_film); /* Get sample closest to target texel. It is always sample 0. */ FilmSample film_sample = film_sample_get(0, texel_film); - if (uniform_buf.film.use_reprojection || film_sample.weight < film_distance) { + if (use_reprojection || film_sample.weight < film_distance) { float depth = texelFetch(depth_tx, film_sample.texel, 0).x; vec4 vector = velocity_resolve(vector_tx, film_sample.texel, depth); /* Transform to pixel space, matching Cycles format. 
*/ @@ -676,13 +676,13 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth } } - if (uniform_buf.film.any_render_pass_1) { + if (flag_test(enabled_categories, PASS_CATEGORY_COLOR_1)) { vec4 diffuse_light_accum = vec4(0.0); vec4 specular_light_accum = vec4(0.0); vec4 volume_light_accum = vec4(0.0); vec4 emission_accum = vec4(0.0); - for (int i = 0; i < uniform_buf.film.samples_len; i++) { + for (int i = 0; i < samples_len; i++) { FilmSample src = film_sample_get(i, texel_film); film_sample_accum(src, uniform_buf.film.diffuse_light_id, @@ -711,7 +711,7 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth film_store_color(dst, uniform_buf.film.emission_id, emission_accum, out_color); } - if (uniform_buf.film.any_render_pass_2) { + if (flag_test(enabled_categories, PASS_CATEGORY_COLOR_2)) { vec4 diffuse_color_accum = vec4(0.0); vec4 specular_color_accum = vec4(0.0); vec4 environment_accum = vec4(0.0); @@ -719,7 +719,7 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth float shadow_accum = 0.0; float ao_accum = 0.0; - for (int i = 0; i < uniform_buf.film.samples_len; i++) { + for (int i = 0; i < samples_len; i++) { FilmSample src = film_sample_get(i, texel_film); film_sample_accum(src, uniform_buf.film.diffuse_color_id, @@ -760,10 +760,10 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth film_store_value(dst, uniform_buf.film.mist_id, mist_accum, out_color); } - if (uniform_buf.film.any_render_pass_3) { + if (flag_test(enabled_categories, PASS_CATEGORY_COLOR_3)) { vec4 transparent_accum = vec4(0.0); - for (int i = 0; i < uniform_buf.film.samples_len; i++) { + for (int i = 0; i < samples_len; i++) { FilmSample src = film_sample_get(i, texel_film); film_sample_accum(src, uniform_buf.film.transparent_id, @@ -777,37 +777,41 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth film_store_color(dst, 
uniform_buf.film.transparent_id, transparent_accum, out_color); } - for (int aov = 0; aov < uniform_buf.film.aov_color_len; aov++) { - vec4 aov_accum = vec4(0.0); + if (flag_test(enabled_categories, PASS_CATEGORY_AOV)) { + for (int aov = 0; aov < uniform_buf.film.aov_color_len; aov++) { + vec4 aov_accum = vec4(0.0); - for (int i = 0; i < uniform_buf.film.samples_len; i++) { - FilmSample src = film_sample_get(i, texel_film); - film_sample_accum(src, 0, uniform_buf.render_pass.color_len + aov, rp_color_tx, aov_accum); + for (int i = 0; i < samples_len; i++) { + FilmSample src = film_sample_get(i, texel_film); + film_sample_accum(src, 0, uniform_buf.render_pass.color_len + aov, rp_color_tx, aov_accum); + } + film_store_color(dst, uniform_buf.film.aov_color_id + aov, aov_accum, out_color); + } + + for (int aov = 0; aov < uniform_buf.film.aov_value_len; aov++) { + float aov_accum = 0.0; + + for (int i = 0; i < samples_len; i++) { + FilmSample src = film_sample_get(i, texel_film); + film_sample_accum(src, 0, uniform_buf.render_pass.value_len + aov, rp_value_tx, aov_accum); + } + film_store_value(dst, uniform_buf.film.aov_value_id + aov, aov_accum, out_color); } - film_store_color(dst, uniform_buf.film.aov_color_id + aov, aov_accum, out_color); } - for (int aov = 0; aov < uniform_buf.film.aov_value_len; aov++) { - float aov_accum = 0.0; + if (flag_test(enabled_categories, PASS_CATEGORY_CRYPTOMATTE)) { + if (uniform_buf.film.cryptomatte_samples_len != 0) { + /* Cryptomatte passes cannot be cleared by a weighted store like other passes. 
*/ + if (!uniform_buf.film.use_history || use_reprojection) { + cryptomatte_clear_samples(dst); + } - for (int i = 0; i < uniform_buf.film.samples_len; i++) { - FilmSample src = film_sample_get(i, texel_film); - film_sample_accum(src, 0, uniform_buf.render_pass.value_len + aov, rp_value_tx, aov_accum); + film_cryptomatte_layer_accum_and_store( + dst, texel_film, uniform_buf.film.cryptomatte_object_id, 0, out_color); + film_cryptomatte_layer_accum_and_store( + dst, texel_film, uniform_buf.film.cryptomatte_asset_id, 1, out_color); + film_cryptomatte_layer_accum_and_store( + dst, texel_film, uniform_buf.film.cryptomatte_material_id, 2, out_color); } - film_store_value(dst, uniform_buf.film.aov_value_id + aov, aov_accum, out_color); - } - - if (uniform_buf.film.cryptomatte_samples_len != 0) { - /* Cryptomatte passes cannot be cleared by a weighted store like other passes. */ - if (!uniform_buf.film.use_history || uniform_buf.film.use_reprojection) { - cryptomatte_clear_samples(dst); - } - - film_cryptomatte_layer_accum_and_store( - dst, texel_film, uniform_buf.film.cryptomatte_object_id, 0, out_color); - film_cryptomatte_layer_accum_and_store( - dst, texel_film, uniform_buf.film.cryptomatte_asset_id, 1, out_color); - film_cryptomatte_layer_accum_and_store( - dst, texel_film, uniform_buf.film.cryptomatte_material_id, 2, out_color); } } diff --git a/source/blender/draw/engines/eevee_next/shaders/infos/eevee_film_info.hh b/source/blender/draw/engines/eevee_next/shaders/infos/eevee_film_info.hh index 7d727b894b3..b89feb74ff7 100644 --- a/source/blender/draw/engines/eevee_next/shaders/infos/eevee_film_info.hh +++ b/source/blender/draw/engines/eevee_next/shaders/infos/eevee_film_info.hh @@ -23,6 +23,9 @@ GPU_SHADER_CREATE_INFO(eevee_film) .image(5, GPU_RGBA16F, Qualifier::READ_WRITE, ImageType::FLOAT_2D_ARRAY, "color_accum_img") .image(6, GPU_R16F, Qualifier::READ_WRITE, ImageType::FLOAT_2D_ARRAY, "value_accum_img") .image(7, GPU_RGBA32F, Qualifier::READ_WRITE, 
ImageType::FLOAT_2D_ARRAY, "cryptomatte_img") + .specialization_constant(Type::INT, "enabled_categories", 0) + .specialization_constant(Type::INT, "samples_len", 0) + .specialization_constant(Type::BOOL, "use_reprojection", false) .additional_info("eevee_shared") .additional_info("eevee_global_ubo") .additional_info("eevee_velocity_camera") diff --git a/source/blender/gpu/GPU_shader_shared_utils.h b/source/blender/gpu/GPU_shader_shared_utils.h index e605b0d4f84..f7900ee48fe 100644 --- a/source/blender/gpu/GPU_shader_shared_utils.h +++ b/source/blender/gpu/GPU_shader_shared_utils.h @@ -32,6 +32,7 @@ # define BLI_STATIC_ASSERT(cond, msg) # define BLI_STATIC_ASSERT_ALIGN(type_, align_) # define BLI_STATIC_ASSERT_SIZE(type_, size_) +# define ENUM_OPERATORS(a, b) # define static # define inline # define cosf cos