EEVEE-Next: Specialization Constants for Film Accumulation
On lower-end hardware the film accumulation performs badly, sometimes taking up to 10ms. This PR improves the performance somewhat by adding specialization constants for the render passes that are actually needed for rendering, the number of samples, and whether reprojection is enabled. `enabled_categories`: Based on the enabled render passes, the outer loops that handle the specific render passes are enabled/disabled. This improves the performance as no memory will be reserved for branches that are never accessed. `samples_len` & `use_reprojection`: GPU compilers tend to optimize texture fetches by moving them to the outer loop. This is only possible when the inner loop can be unrolled. In the case of the film accumulation the inner loop couldn't be unrolled. Adding a specialization constant allows the inner loop to be unrolled. On old or low-end devices the improvement is around 40%. On newer devices the improvement is 50+%. The performance of this shader is now similar to Godot's. | GPU | Before | New | |----------------------|--------|-------| | NVIDIA GTX 760 | 3.5ms | 2.4ms | | GFX1036 (RDNA2 iGPU) | 9.9ms | 6.2ms | | AMD Radeon Pro W7500 | 2.1ms | 0.9ms | Pull Request: https://projects.blender.org/blender/blender/pulls/118385
This commit is contained in:
@@ -77,6 +77,10 @@ void Film::init_aovs()
|
||||
hash = BLI_hash_string(aov->name);
|
||||
index++;
|
||||
}
|
||||
|
||||
if (!aovs.is_empty()) {
|
||||
enabled_categories_ |= PASS_CATEGORY_AOV;
|
||||
}
|
||||
}
|
||||
|
||||
float *Film::read_aov(ViewLayerAOV *aov)
|
||||
@@ -209,6 +213,7 @@ void Film::init(const int2 &extent, const rcti *output_rect)
|
||||
Scene &scene = *inst_.scene;
|
||||
SceneEEVEE &scene_eevee = scene.eevee;
|
||||
|
||||
enabled_categories_ = PassCategory(0);
|
||||
init_aovs();
|
||||
|
||||
{
|
||||
@@ -285,10 +290,18 @@ void Film::init(const int2 &extent, const rcti *output_rect)
|
||||
const eViewLayerEEVEEPassType color_passes_3 = EEVEE_RENDER_PASS_TRANSPARENT;
|
||||
|
||||
data_.exposure_scale = pow2f(scene.view_settings.exposure);
|
||||
data_.has_data = (enabled_passes_ & data_passes) != 0;
|
||||
data_.any_render_pass_1 = (enabled_passes_ & color_passes_1) != 0;
|
||||
data_.any_render_pass_2 = (enabled_passes_ & color_passes_2) != 0;
|
||||
data_.any_render_pass_3 = (enabled_passes_ & color_passes_3) != 0;
|
||||
if (enabled_passes_ & data_passes) {
|
||||
enabled_categories_ |= PASS_CATEGORY_DATA;
|
||||
}
|
||||
if (enabled_passes_ & color_passes_1) {
|
||||
enabled_categories_ |= PASS_CATEGORY_COLOR_1;
|
||||
}
|
||||
if (enabled_passes_ & color_passes_2) {
|
||||
enabled_categories_ |= PASS_CATEGORY_COLOR_2;
|
||||
}
|
||||
if (enabled_passes_ & color_passes_3) {
|
||||
enabled_categories_ |= PASS_CATEGORY_COLOR_3;
|
||||
}
|
||||
}
|
||||
{
|
||||
/* Set pass offsets. */
|
||||
@@ -358,6 +371,13 @@ void Film::init(const int2 &extent, const rcti *output_rect)
|
||||
data_.cryptomatte_object_id = cryptomatte_index_get(EEVEE_RENDER_PASS_CRYPTOMATTE_OBJECT);
|
||||
data_.cryptomatte_asset_id = cryptomatte_index_get(EEVEE_RENDER_PASS_CRYPTOMATTE_ASSET);
|
||||
data_.cryptomatte_material_id = cryptomatte_index_get(EEVEE_RENDER_PASS_CRYPTOMATTE_MATERIAL);
|
||||
|
||||
if ((enabled_passes_ &
|
||||
(EEVEE_RENDER_PASS_CRYPTOMATTE_ASSET | EEVEE_RENDER_PASS_CRYPTOMATTE_MATERIAL |
|
||||
EEVEE_RENDER_PASS_CRYPTOMATTE_OBJECT)) != 0)
|
||||
{
|
||||
enabled_categories_ |= PASS_CATEGORY_CRYPTOMATTE;
|
||||
}
|
||||
}
|
||||
{
|
||||
int2 weight_extent = inst_.camera.is_panoramic() ? data_.extent : int2(data_.scaling_factor);
|
||||
@@ -390,7 +410,7 @@ void Film::init(const int2 &extent, const rcti *output_rect)
|
||||
|
||||
if (reset > 0) {
|
||||
data_.use_history = 0;
|
||||
data_.use_reprojection = 0;
|
||||
use_reprojection_ = false;
|
||||
|
||||
/* Avoid NaN in uninitialized texture memory making history blending dangerous. */
|
||||
color_accum_tx_.clear(float4(0.0f));
|
||||
@@ -423,9 +443,13 @@ void Film::sync()
|
||||
* Still bind previous step to avoid undefined behavior. */
|
||||
eVelocityStep step_next = inst_.is_viewport() ? STEP_PREVIOUS : STEP_NEXT;
|
||||
|
||||
GPUShader *sh = inst_.shaders.static_shader_get(shader);
|
||||
accumulate_ps_.init();
|
||||
accumulate_ps_.specialize_constant(sh, "enabled_categories", int(enabled_categories_));
|
||||
accumulate_ps_.specialize_constant(sh, "samples_len", &data_.samples_len);
|
||||
accumulate_ps_.specialize_constant(sh, "use_reprojection", &use_reprojection_);
|
||||
accumulate_ps_.state_set(DRW_STATE_WRITE_COLOR | DRW_STATE_WRITE_DEPTH | DRW_STATE_DEPTH_ALWAYS);
|
||||
accumulate_ps_.shader_set(inst_.shaders.static_shader_get(shader));
|
||||
accumulate_ps_.shader_set(sh);
|
||||
accumulate_ps_.bind_resources(inst_.uniform_data);
|
||||
accumulate_ps_.bind_ubo("camera_prev", &(*velocity.camera_steps[STEP_PREVIOUS]));
|
||||
accumulate_ps_.bind_ubo("camera_curr", &(*velocity.camera_steps[STEP_CURRENT]));
|
||||
@@ -475,11 +499,11 @@ void Film::sync()
|
||||
|
||||
void Film::end_sync()
|
||||
{
|
||||
data_.use_reprojection = inst_.sampling.interactive_mode();
|
||||
use_reprojection_ = inst_.sampling.interactive_mode();
|
||||
|
||||
/* Just bypass the reprojection and reset the accumulation. */
|
||||
if (inst_.is_viewport() && force_disable_reprojection_ && inst_.sampling.is_reset()) {
|
||||
data_.use_reprojection = false;
|
||||
use_reprojection_ = false;
|
||||
data_.use_history = false;
|
||||
}
|
||||
|
||||
@@ -511,7 +535,7 @@ float2 Film::pixel_jitter_get() const
|
||||
|
||||
eViewLayerEEVEEPassType Film::enabled_passes_get() const
|
||||
{
|
||||
if (inst_.is_viewport() && data_.use_reprojection) {
|
||||
if (inst_.is_viewport() && use_reprojection_) {
|
||||
/* Enable motion vector rendering but not the accumulation buffer. */
|
||||
return enabled_passes_ | EEVEE_RENDER_PASS_VECTOR;
|
||||
}
|
||||
|
||||
@@ -79,6 +79,8 @@ class Film {
|
||||
int2 display_extent;
|
||||
|
||||
eViewLayerEEVEEPassType enabled_passes_ = eViewLayerEEVEEPassType(0);
|
||||
PassCategory enabled_categories_ = PassCategory(0);
|
||||
bool use_reprojection_ = false;
|
||||
|
||||
public:
|
||||
Film(Instance &inst, FilmData &data) : inst_(inst), data_(data){};
|
||||
|
||||
@@ -30,7 +30,6 @@ using namespace draw;
|
||||
|
||||
constexpr GPUSamplerState no_filter = GPUSamplerState::default_sampler();
|
||||
constexpr GPUSamplerState with_filter = {GPU_SAMPLER_FILTERING_LINEAR};
|
||||
|
||||
#endif
|
||||
|
||||
#define UBO_MIN_MAX_SUPPORTED_SIZE 1 << 14
|
||||
@@ -240,6 +239,16 @@ enum ePassStorageType : uint32_t {
|
||||
PASS_STORAGE_CRYPTOMATTE = 2u,
|
||||
};
|
||||
|
||||
/**
 * Bit-flags grouping the film render passes into categories.
 *
 * `Film::init()` / `Film::init_aovs()` OR one flag into `enabled_categories_` for each group
 * that has at least one enabled pass (data passes, the three color pass groups, AOVs and
 * cryptomatte). The resulting mask is handed to the film accumulation shader as the
 * `enabled_categories` specialization constant, where each category block is guarded by
 * `flag_test(enabled_categories, PASS_CATEGORY_*)`.
 */
enum PassCategory : uint32_t {
|
||||
PASS_CATEGORY_DATA = 1u << 0,
|
||||
PASS_CATEGORY_COLOR_1 = 1u << 1,
|
||||
PASS_CATEGORY_COLOR_2 = 1u << 2,
|
||||
PASS_CATEGORY_COLOR_3 = 1u << 3,
|
||||
PASS_CATEGORY_AOV = 1u << 4,
|
||||
PASS_CATEGORY_CRYPTOMATTE = 1u << 5,
|
||||
};
|
||||
ENUM_OPERATORS(PassCategory, PASS_CATEGORY_CRYPTOMATTE)
|
||||
|
||||
struct FilmSample {
|
||||
int2 texel;
|
||||
float weight;
|
||||
@@ -269,14 +278,6 @@ struct FilmData {
|
||||
float2 extent_inv;
|
||||
/** Is true if history is valid and can be sampled. Bypass history to resets accumulation. */
|
||||
bool1 use_history;
|
||||
/** Is true if combined buffer is valid and can be re-projected to reduce variance. */
|
||||
bool1 use_reprojection;
|
||||
/** Is true if accumulation of non-filtered passes is needed. */
|
||||
bool1 has_data;
|
||||
/** Is true if accumulation of filtered passes is needed. */
|
||||
bool1 any_render_pass_1;
|
||||
bool1 any_render_pass_2;
|
||||
bool1 any_render_pass_3;
|
||||
/** Controlled by user in lookdev mode or by render settings. */
|
||||
float background_opacity;
|
||||
/** Output counts per type. */
|
||||
@@ -326,6 +327,7 @@ struct FilmData {
|
||||
int samples_len;
|
||||
/** Sum of the weights of all samples in the sample table. */
|
||||
float samples_weight_total;
|
||||
int _pad1;
|
||||
FilmSample samples[FILM_PRECOMP_SAMPLE_MAX];
|
||||
};
|
||||
BLI_STATIC_ASSERT_ALIGN(FilmData, 16)
|
||||
|
||||
@@ -180,7 +180,7 @@ void film_cryptomatte_layer_accum_and_store(
|
||||
}
|
||||
/* x = hash, y = accumulated weight. Only keep track of 4 highest weighted samples. */
|
||||
vec2 crypto_samples[4] = vec2[4](vec2(0.0), vec2(0.0), vec2(0.0), vec2(0.0));
|
||||
for (int i = 0; i < uniform_buf.film.samples_len; i++) {
|
||||
for (int i = 0; i < samples_len; i++) {
|
||||
FilmSample src = film_sample_get(i, texel_film);
|
||||
film_sample_cryptomatte_accum(src, layer_component, cryptomatte_tx, crypto_samples);
|
||||
}
|
||||
@@ -201,7 +201,7 @@ float film_distance_load(ivec2 texel)
|
||||
/* Repeat texture coordinates as the weight can be optimized to a small portion of the film. */
|
||||
texel = texel % imageSize(in_weight_img).xy;
|
||||
|
||||
if (!uniform_buf.film.use_history || uniform_buf.film.use_reprojection) {
|
||||
if (!uniform_buf.film.use_history || use_reprojection) {
|
||||
return 1.0e16;
|
||||
}
|
||||
return imageLoad(in_weight_img, ivec3(texel, FILM_WEIGHT_LAYER_DISTANCE)).x;
|
||||
@@ -212,7 +212,7 @@ float film_weight_load(ivec2 texel)
|
||||
/* Repeat texture coordinates as the weight can be optimized to a small portion of the film. */
|
||||
texel = texel % imageSize(in_weight_img).xy;
|
||||
|
||||
if (!uniform_buf.film.use_history || uniform_buf.film.use_reprojection) {
|
||||
if (!uniform_buf.film.use_history || use_reprojection) {
|
||||
return 0.0;
|
||||
}
|
||||
return imageLoad(in_weight_img, ivec3(texel, FILM_WEIGHT_LAYER_ACCUMULATION)).x;
|
||||
@@ -456,7 +456,7 @@ void film_store_combined(
|
||||
/* Undo the weighting to get final spatially-filtered color. */
|
||||
color_src = color / color_weight;
|
||||
|
||||
if (uniform_buf.film.use_reprojection) {
|
||||
if (use_reprojection) {
|
||||
/* Interactive accumulation. Do reprojection and Temporal Anti-Aliasing. */
|
||||
|
||||
/* Reproject by finding where this pixel was in the previous frame. */
|
||||
@@ -632,7 +632,7 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth
|
||||
vec4 combined_accum = vec4(0.0);
|
||||
|
||||
FilmSample src;
|
||||
for (int i = uniform_buf.film.samples_len - 1; i >= 0; i--) {
|
||||
for (int i = samples_len - 1; i >= 0; i--) {
|
||||
src = film_sample_get(i, texel_film);
|
||||
film_sample_accum_combined(src, combined_accum, weight_accum);
|
||||
}
|
||||
@@ -640,13 +640,13 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth
|
||||
film_store_combined(dst, src.texel, combined_accum, weight_accum, out_color);
|
||||
}
|
||||
|
||||
if (uniform_buf.film.has_data) {
|
||||
if (flag_test(enabled_categories, PASS_CATEGORY_DATA)) {
|
||||
float film_distance = film_distance_load(texel_film);
|
||||
|
||||
/* Get sample closest to target texel. It is always sample 0. */
|
||||
FilmSample film_sample = film_sample_get(0, texel_film);
|
||||
|
||||
if (uniform_buf.film.use_reprojection || film_sample.weight < film_distance) {
|
||||
if (use_reprojection || film_sample.weight < film_distance) {
|
||||
float depth = texelFetch(depth_tx, film_sample.texel, 0).x;
|
||||
vec4 vector = velocity_resolve(vector_tx, film_sample.texel, depth);
|
||||
/* Transform to pixel space, matching Cycles format. */
|
||||
@@ -676,13 +676,13 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth
|
||||
}
|
||||
}
|
||||
|
||||
if (uniform_buf.film.any_render_pass_1) {
|
||||
if (flag_test(enabled_categories, PASS_CATEGORY_COLOR_1)) {
|
||||
vec4 diffuse_light_accum = vec4(0.0);
|
||||
vec4 specular_light_accum = vec4(0.0);
|
||||
vec4 volume_light_accum = vec4(0.0);
|
||||
vec4 emission_accum = vec4(0.0);
|
||||
|
||||
for (int i = 0; i < uniform_buf.film.samples_len; i++) {
|
||||
for (int i = 0; i < samples_len; i++) {
|
||||
FilmSample src = film_sample_get(i, texel_film);
|
||||
film_sample_accum(src,
|
||||
uniform_buf.film.diffuse_light_id,
|
||||
@@ -711,7 +711,7 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth
|
||||
film_store_color(dst, uniform_buf.film.emission_id, emission_accum, out_color);
|
||||
}
|
||||
|
||||
if (uniform_buf.film.any_render_pass_2) {
|
||||
if (flag_test(enabled_categories, PASS_CATEGORY_COLOR_2)) {
|
||||
vec4 diffuse_color_accum = vec4(0.0);
|
||||
vec4 specular_color_accum = vec4(0.0);
|
||||
vec4 environment_accum = vec4(0.0);
|
||||
@@ -719,7 +719,7 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth
|
||||
float shadow_accum = 0.0;
|
||||
float ao_accum = 0.0;
|
||||
|
||||
for (int i = 0; i < uniform_buf.film.samples_len; i++) {
|
||||
for (int i = 0; i < samples_len; i++) {
|
||||
FilmSample src = film_sample_get(i, texel_film);
|
||||
film_sample_accum(src,
|
||||
uniform_buf.film.diffuse_color_id,
|
||||
@@ -760,10 +760,10 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth
|
||||
film_store_value(dst, uniform_buf.film.mist_id, mist_accum, out_color);
|
||||
}
|
||||
|
||||
if (uniform_buf.film.any_render_pass_3) {
|
||||
if (flag_test(enabled_categories, PASS_CATEGORY_COLOR_3)) {
|
||||
vec4 transparent_accum = vec4(0.0);
|
||||
|
||||
for (int i = 0; i < uniform_buf.film.samples_len; i++) {
|
||||
for (int i = 0; i < samples_len; i++) {
|
||||
FilmSample src = film_sample_get(i, texel_film);
|
||||
film_sample_accum(src,
|
||||
uniform_buf.film.transparent_id,
|
||||
@@ -777,37 +777,41 @@ void film_process_data(ivec2 texel_film, out vec4 out_color, out float out_depth
|
||||
film_store_color(dst, uniform_buf.film.transparent_id, transparent_accum, out_color);
|
||||
}
|
||||
|
||||
for (int aov = 0; aov < uniform_buf.film.aov_color_len; aov++) {
|
||||
vec4 aov_accum = vec4(0.0);
|
||||
if (flag_test(enabled_categories, PASS_CATEGORY_AOV)) {
|
||||
for (int aov = 0; aov < uniform_buf.film.aov_color_len; aov++) {
|
||||
vec4 aov_accum = vec4(0.0);
|
||||
|
||||
for (int i = 0; i < uniform_buf.film.samples_len; i++) {
|
||||
FilmSample src = film_sample_get(i, texel_film);
|
||||
film_sample_accum(src, 0, uniform_buf.render_pass.color_len + aov, rp_color_tx, aov_accum);
|
||||
for (int i = 0; i < samples_len; i++) {
|
||||
FilmSample src = film_sample_get(i, texel_film);
|
||||
film_sample_accum(src, 0, uniform_buf.render_pass.color_len + aov, rp_color_tx, aov_accum);
|
||||
}
|
||||
film_store_color(dst, uniform_buf.film.aov_color_id + aov, aov_accum, out_color);
|
||||
}
|
||||
|
||||
for (int aov = 0; aov < uniform_buf.film.aov_value_len; aov++) {
|
||||
float aov_accum = 0.0;
|
||||
|
||||
for (int i = 0; i < samples_len; i++) {
|
||||
FilmSample src = film_sample_get(i, texel_film);
|
||||
film_sample_accum(src, 0, uniform_buf.render_pass.value_len + aov, rp_value_tx, aov_accum);
|
||||
}
|
||||
film_store_value(dst, uniform_buf.film.aov_value_id + aov, aov_accum, out_color);
|
||||
}
|
||||
film_store_color(dst, uniform_buf.film.aov_color_id + aov, aov_accum, out_color);
|
||||
}
|
||||
|
||||
for (int aov = 0; aov < uniform_buf.film.aov_value_len; aov++) {
|
||||
float aov_accum = 0.0;
|
||||
if (flag_test(enabled_categories, PASS_CATEGORY_CRYPTOMATTE)) {
|
||||
if (uniform_buf.film.cryptomatte_samples_len != 0) {
|
||||
/* Cryptomatte passes cannot be cleared by a weighted store like other passes. */
|
||||
if (!uniform_buf.film.use_history || use_reprojection) {
|
||||
cryptomatte_clear_samples(dst);
|
||||
}
|
||||
|
||||
for (int i = 0; i < uniform_buf.film.samples_len; i++) {
|
||||
FilmSample src = film_sample_get(i, texel_film);
|
||||
film_sample_accum(src, 0, uniform_buf.render_pass.value_len + aov, rp_value_tx, aov_accum);
|
||||
film_cryptomatte_layer_accum_and_store(
|
||||
dst, texel_film, uniform_buf.film.cryptomatte_object_id, 0, out_color);
|
||||
film_cryptomatte_layer_accum_and_store(
|
||||
dst, texel_film, uniform_buf.film.cryptomatte_asset_id, 1, out_color);
|
||||
film_cryptomatte_layer_accum_and_store(
|
||||
dst, texel_film, uniform_buf.film.cryptomatte_material_id, 2, out_color);
|
||||
}
|
||||
film_store_value(dst, uniform_buf.film.aov_value_id + aov, aov_accum, out_color);
|
||||
}
|
||||
|
||||
if (uniform_buf.film.cryptomatte_samples_len != 0) {
|
||||
/* Cryptomatte passes cannot be cleared by a weighted store like other passes. */
|
||||
if (!uniform_buf.film.use_history || uniform_buf.film.use_reprojection) {
|
||||
cryptomatte_clear_samples(dst);
|
||||
}
|
||||
|
||||
film_cryptomatte_layer_accum_and_store(
|
||||
dst, texel_film, uniform_buf.film.cryptomatte_object_id, 0, out_color);
|
||||
film_cryptomatte_layer_accum_and_store(
|
||||
dst, texel_film, uniform_buf.film.cryptomatte_asset_id, 1, out_color);
|
||||
film_cryptomatte_layer_accum_and_store(
|
||||
dst, texel_film, uniform_buf.film.cryptomatte_material_id, 2, out_color);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,9 @@ GPU_SHADER_CREATE_INFO(eevee_film)
|
||||
.image(5, GPU_RGBA16F, Qualifier::READ_WRITE, ImageType::FLOAT_2D_ARRAY, "color_accum_img")
|
||||
.image(6, GPU_R16F, Qualifier::READ_WRITE, ImageType::FLOAT_2D_ARRAY, "value_accum_img")
|
||||
.image(7, GPU_RGBA32F, Qualifier::READ_WRITE, ImageType::FLOAT_2D_ARRAY, "cryptomatte_img")
|
||||
.specialization_constant(Type::INT, "enabled_categories", 0)
|
||||
.specialization_constant(Type::INT, "samples_len", 0)
|
||||
.specialization_constant(Type::BOOL, "use_reprojection", false)
|
||||
.additional_info("eevee_shared")
|
||||
.additional_info("eevee_global_ubo")
|
||||
.additional_info("eevee_velocity_camera")
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
# define BLI_STATIC_ASSERT(cond, msg)
|
||||
# define BLI_STATIC_ASSERT_ALIGN(type_, align_)
|
||||
# define BLI_STATIC_ASSERT_SIZE(type_, size_)
|
||||
# define ENUM_OPERATORS(a, b)
|
||||
# define static
|
||||
# define inline
|
||||
# define cosf cos
|
||||
|
||||
Reference in New Issue
Block a user