From 74e6d2c575d967adfd4cf21105889baa2d01642f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Foucault?= Date: Mon, 5 May 2025 13:37:51 +0200 Subject: [PATCH] GPU: Shader: Add support for basic loop unrolling through preprocessor This adds basic unrolling support for 2 syntax: - `[[gpu::unroll]]` which does full loop unrolling - `[[gpu::unroll(x)]]` which unrolls `x` iteration Nesting is supported. This change is motivated by the added cost in compilation and execution time that some loops have even if they have compile time defined iteration counts. The syntax is inspired by `GL_EXT_control_flow_attributes`. However, we might want to have our own prefix to show it is a blender specific feature and that it differs from the standard. I propose `[[gpu::unroll]]`. In the future, we could extend this to support more directives that can be expanded to backend specific extension / syntax. This would avoid readability issue an error prone copy paste of large amount of preprocessor directives. Currently, given that GL's GLSL flavor doesn't support any of these attributes, the preprocessor does some copy-pasting that does the unrolling at the source level. Note that the added `#line` allow for correct error logging. For the `[[gpu::unroll]]` syntax, the `for` declaration needs to follow a specific syntax to deduce the number of loop iteration. This variant removes the continue condition between iteration, so all iterations are evaluated. This could be modified using a special keyword. For the `[[gpu::unroll(n)]]` syntax, the usercode needs to make sure that `n` is large enough to cover all iterations as the loop is completely removed. We could add shader `assert` to make sure that there is never a remaining iteration. This behavior is usually different from what you see in other implementation as we do not keep a loop at all. Usually, compilers still keep the loop if it is not unrolled fully. But given we don't have IR, this is the best we can do. `break` and `continue` statement are forbidden at the unrolled loop scope level. Nested loop and switch can contain these keywords. This is accounted for by checks in the pre-processor. Only `for` loops are supported for now. There are no real incentive to add support for `while` given how rare it is in the shader codebase. Rel #137446 Pull Request: https://projects.blender.org/blender/blender/pulls/137444 --- .../eevee_lightprobe_sphere_remap_comp.glsl | 59 ++--- .../shaders/eevee_surf_deferred_frag.glsl | 11 +- .../shaders/workbench_effect_taa_frag.glsl | 6 +- .../gpu/glsl_preprocess/glsl_preprocess.hh | 216 +++++++++++++++++- .../blender/gpu/shaders/gpu_glsl_cpp_stubs.hh | 9 + .../gpu/tests/shader_preprocess_test.cc | 165 +++++++++++++ 6 files changed, 430 insertions(+), 36 deletions(-) diff --git a/source/blender/draw/engines/eevee/shaders/eevee_lightprobe_sphere_remap_comp.glsl b/source/blender/draw/engines/eevee/shaders/eevee_lightprobe_sphere_remap_comp.glsl index 59a2ea93e36..21a5aba1f0a 100644 --- a/source/blender/draw/engines/eevee/shaders/eevee_lightprobe_sphere_remap_comp.glsl +++ b/source/blender/draw/engines/eevee/shaders/eevee_lightprobe_sphere_remap_comp.glsl @@ -12,31 +12,6 @@ COMPUTE_SHADER_CREATE_INFO(eevee_lightprobe_sphere_remap) #include "eevee_lightprobe_sphere_mapping_lib.glsl" #include "eevee_spherical_harmonics_lib.glsl" -/* OpenGL/Intel drivers have known issues where it isn't able to compile barriers inside for loops. - * Macros are needed as driver can decide to not unroll in shaders with more complexity. */ -#define PARALLEL_SUM_INNER() \ - barrier(); \ - if (local_index < stride) { \ - local_radiance[local_index] += local_radiance[local_index + stride]; \ - } \ - stride /= 2; - -#define PARALLEL_SUM \ - { \ - uint stride = group_size / 2; \ - PARALLEL_SUM_INNER(); \ - PARALLEL_SUM_INNER(); \ - PARALLEL_SUM_INNER(); \ - PARALLEL_SUM_INNER(); \ - PARALLEL_SUM_INNER(); \ - PARALLEL_SUM_INNER(); \ - PARALLEL_SUM_INNER(); \ - PARALLEL_SUM_INNER(); \ - PARALLEL_SUM_INNER(); \ - PARALLEL_SUM_INNER(); \ - barrier(); \ - } - shared float4 local_radiance[gl_WorkGroupSize.x * gl_WorkGroupSize.y]; float triangle_solid_angle(float3 A, float3 B, float3 C) @@ -159,7 +134,16 @@ void main() if (extract_sun) { /* Parallel sum. Result is stored inside local_radiance[0]. */ local_radiance[local_index] = radiance_sun.xyzz * sample_weight; - PARALLEL_SUM + /* OpenGL/Intel drivers have known issues where it isn't able to compile barriers inside for + * loops. Unroll is needed as driver might decide to not unroll in shaders with more + * complexity. */ + [[gpu::unroll(10)]] for (uint stride = group_size / 2; stride > 0; stride /= 2) + { + barrier(); + if (local_index < stride) { + local_radiance[local_index] += local_radiance[local_index + stride]; + } + } if (gl_LocalInvocationIndex == 0u) { out_sun[work_group_index].radiance = local_radiance[0].xyz; @@ -169,7 +153,17 @@ void main() /* Reusing local_radiance for directions. */ local_radiance[local_index] = float4(normalize(direction), 1.0f) * sample_weight * length(radiance_sun.xyz); - PARALLEL_SUM + /* OpenGL/Intel drivers have known issues where it isn't able to compile barriers inside for + * loops. Unroll is needed as driver might decide to not unroll in shaders with more + * complexity. */ + [[gpu::unroll(10)]] for (uint stride = group_size / 2; stride > 0; stride /= 2) + { + barrier(); + if (local_index < stride) { + local_radiance[local_index] += local_radiance[local_index + stride]; + } + } + barrier(); if (gl_LocalInvocationIndex == 0u) { out_sun[work_group_index].direction = local_radiance[0]; @@ -180,7 +174,16 @@ void main() if (extract_sh) { /* Parallel sum. Result is stored inside local_radiance[0]. */ local_radiance[local_index] = radiance.xyzz * sample_weight; - PARALLEL_SUM + /* OpenGL/Intel drivers have known issues where it isn't able to compile barriers inside for + * loops. Unroll is needed as driver might decide to not unroll in shaders with more + * complexity. */ + [[gpu::unroll(10)]] for (uint stride = group_size / 2; stride > 0; stride /= 2) + { + barrier(); + if (local_index < stride) { + local_radiance[local_index] += local_radiance[local_index + stride]; + } + } if (gl_LocalInvocationIndex == 0u) { /* Find the middle point of the whole thread-group. Use it as light vector. diff --git a/source/blender/draw/engines/eevee/shaders/eevee_surf_deferred_frag.glsl b/source/blender/draw/engines/eevee/shaders/eevee_surf_deferred_frag.glsl index 49f4468588b..5ee69240754 100644 --- a/source/blender/draw/engines/eevee/shaders/eevee_surf_deferred_frag.glsl +++ b/source/blender/draw/engines/eevee/shaders/eevee_surf_deferred_frag.glsl @@ -121,8 +121,9 @@ void main() out_gbuf_normal = gbuf.N[0]; /* Output remaining closures using image store. */ - for (int layer = GBUF_CLOSURE_FB_LAYER_COUNT; layer < GBUFFER_DATA_MAX && layer < gbuf.data_len; - layer++) + [[gpu::unroll(6)]] for (int layer = GBUF_CLOSURE_FB_LAYER_COUNT; + layer < GBUFFER_DATA_MAX && layer < gbuf.data_len; + layer++) { /* NOTE: The image view start at layer GBUF_CLOSURE_FB_LAYER_COUNT so all destination layer is * `layer - GBUF_CLOSURE_FB_LAYER_COUNT`. */ @@ -130,9 +131,9 @@ void main() int3(out_texel, layer - GBUF_CLOSURE_FB_LAYER_COUNT), gbuf.data[layer]); } - for (int layer = GBUF_NORMAL_FB_LAYER_COUNT; - layer < GBUFFER_NORMAL_MAX && layer < gbuf.normal_len; - layer++) + [[gpu::unroll(4)]] for (int layer = GBUF_NORMAL_FB_LAYER_COUNT; + layer < GBUFFER_NORMAL_MAX && layer < gbuf.normal_len; + layer++) { /* NOTE: The image view start at layer GBUF_NORMAL_FB_LAYER_COUNT so all destination layer is * `layer - GBUF_NORMAL_FB_LAYER_COUNT`. */ diff --git a/source/blender/draw/engines/workbench/shaders/workbench_effect_taa_frag.glsl b/source/blender/draw/engines/workbench/shaders/workbench_effect_taa_frag.glsl index e45230a6156..9fae94b0667 100644 --- a/source/blender/draw/engines/workbench/shaders/workbench_effect_taa_frag.glsl +++ b/source/blender/draw/engines/workbench/shaders/workbench_effect_taa_frag.glsl @@ -13,8 +13,10 @@ void main() frag_color = float4(0.0f); int i = 0; - for (int x = -1; x <= 1; x++) { - for (int y = -1; y <= 1; y++, i++) { + [[gpu::unroll]] for (int x = -1; x <= 1; x++) + { + [[gpu::unroll]] for (int y = -1; y <= 1; y++, i++) + { float4 color = texture(color_buffer, uv + float2(x, y) * texel_size); /* Clamp infinite inputs (See #112211). */ color = clamp(color, float4(0.0f), float4(1e10f)); diff --git a/source/blender/gpu/glsl_preprocess/glsl_preprocess.hh b/source/blender/gpu/glsl_preprocess/glsl_preprocess.hh index f97217b8b71..8f65e3523c6 100644 --- a/source/blender/gpu/glsl_preprocess/glsl_preprocess.hh +++ b/source/blender/gpu/glsl_preprocess/glsl_preprocess.hh @@ -13,7 +13,6 @@ #include #include #include -#include #include namespace blender::gpu::shader { @@ -211,6 +210,7 @@ class Preprocessor { } str = preprocessor_directive_mutation(str); if (language == BLENDER_GLSL) { + str = loop_unroll(str, report_error); str = assert_processing(str, filename); static_strings_parsing(str); str = static_strings_mutation(str); @@ -329,6 +329,220 @@ class Preprocessor { }); } + std::string loop_unroll(const std::string &str, report_callback report_error) + { + if (str.find("[[gpu::unroll") == std::string::npos) { + return str; + } + + struct Loop { + /* `[[gpu::unroll]] for (int i = 0; i < 10; i++)` */ + std::string definition; + /* `{ some_computation(i); }` */ + std::string body; + /* `int i = 0` */ + std::string init_statement; + /* `i < 10` */ + std::string test_statement; + /* `i++` */ + std::string iter_statement; + /* Spaces and newline between loop start and body. */ + std::string body_prefix; + /* Spaces before the loop definition. */ + std::string indent; + /* `10` */ + int64_t iter_count; + /* Line at which the loop was defined. */ + int64_t definition_line; + /* Line at which the body starts. */ + int64_t body_line; + /* Line at which the body ends. */ + int64_t end_line; + }; + + std::vector loops; + + auto add_loop = [&](Loop &loop, + const std::smatch &match, + int64_t line, + int64_t lines_in_content) { + std::string suffix = match.suffix().str(); + loop.body = get_content_between_balanced_pair(loop.definition + suffix, '{', '}'); + loop.body = '{' + loop.body + '}'; + loop.definition_line = line - lines_in_content; + loop.body_line = line; + loop.end_line = loop.body_line + line_count(loop.body); + + /* Check that there is no unsupported keywords in the loop body. */ + if (loop.body.find(" break;") != std::string::npos || + loop.body.find(" continue;") != std::string::npos) + { + /* Expensive check. Remove other loops and switch scopes inside the unrolled loop scope and + * check again to avoid false positive. */ + std::string modified_body = loop.body; + + std::regex regex_loop(R"( (for|while|do) )"); + regex_global_search(loop.body, regex_loop, [&](const std::smatch &match) { + std::string inner_scope = get_content_between_balanced_pair(match.suffix(), '{', '}'); + replace_all(modified_body, inner_scope, ""); + }); + + /* Checks if `continue` exists, even in switch statement inside the unrolled loop scope. */ + if (modified_body.find(" continue;") != std::string::npos) { + report_error(match, "Error: Unrolled loop cannot contain \"continue\" statement."); + } + + std::regex regex_switch(R"( switch )"); + regex_global_search(loop.body, regex_switch, [&](const std::smatch &match) { + std::string inner_scope = get_content_between_balanced_pair(match.suffix(), '{', '}'); + replace_all(modified_body, inner_scope, ""); + }); + + /* Checks if `break` exists inside the unrolled loop scope. */ + if (modified_body.find(" break;") != std::string::npos) { + report_error(match, "Error: Unrolled loop cannot contain \"break\" statement."); + } + } + loops.emplace_back(loop); + }; + + /* Parse the loop syntax. */ + { + /* [[gpu::unroll]]. */ + std::regex regex(R"(( *))" + R"(\[\[gpu::unroll\]\])" + R"(\s*for\s*\()" + R"(\s*((?:uint|int)\s+(\w+)\s+=\s+(-?\d+));)" /* Init statement. */ + R"(\s*((\w+)\s+(>|<)(=?)\s+(-?\d+)))" /* Conditional statement. */ + R"(\s*(?:&&)?\s*([^;)]+)?;)" /* Extra conditional statement. */ + R"(\s*(((\w+)(\+\+|\-\-))[^\)]*))" /* Iteration statement. */ + R"(\)(\s*))"); + + int64_t line = 0; + + regex_global_search(str, regex, [&](const std::smatch &match) { + std::string counter_1 = match[3].str(); + std::string counter_2 = match[6].str(); + std::string counter_3 = match[13].str(); + + std::string content = match[0].str(); + int64_t lines_in_content = line_count(content); + + line += line_count(match.prefix().str()) + lines_in_content; + + if ((counter_1 != counter_2) || (counter_1 != counter_3)) { + report_error(match, "Error: Non matching loop counter variable."); + return; + } + + Loop loop; + + int64_t init = std::stol(match[4].str()); + int64_t end = std::stol(match[9].str()); + /* TODO(fclem): Support arbitrary strides (aka, arbitrary iter statement). */ + loop.iter_count = std::abs(end - init); + + std::string condition = match[7].str(); + if (condition.empty()) { + report_error(match, "Error: Unsupported condition in unrolled loop."); + } + + std::string equal = match[8].str(); + if (equal == "=") { + loop.iter_count += 1; + } + + std::string iter = match[14].str(); + if (iter == "++") { + if (condition == ">") { + report_error(match, "Error: Unsupported condition in unrolled loop."); + } + } + else if (iter == "--") { + if (condition == "<") { + report_error(match, "Error: Unsupported condition in unrolled loop."); + } + } + else { + report_error(match, "Error: Unsupported for loop expression. Expecting ++ or --"); + } + + loop.definition = content; + loop.indent = match[1].str(); + loop.init_statement = match[2].str(); + if (!match[10].str().empty()) { + loop.test_statement = "if (" + match[10].str() + ") "; + } + loop.iter_statement = match[11].str(); + loop.body_prefix = match[15].str(); + + add_loop(loop, match, line, lines_in_content); + }); + } + { + /* [[gpu::unroll(n)]]. */ + std::regex regex(R"(( *))" + R"(\[\[gpu::unroll\((\d+)\)\]\])" + R"(\s*for\s*\()" + R"(\s*([^;]*);)" + R"(\s*([^;]*);)" + R"(\s*([^)]*))" + R"(\)(\s*))"); + + int64_t line = 0; + + regex_global_search(str, regex, [&](const std::smatch &match) { + std::string content = match[0].str(); + + int64_t lines_in_content = line_count(content); + + line += line_count(match.prefix().str()) + lines_in_content; + + Loop loop; + loop.iter_count = std::stol(match[2].str()); + loop.definition = content; + loop.indent = match[1].str(); + loop.init_statement = match[3].str(); + loop.test_statement = "if (" + match[4].str() + ") "; + loop.iter_statement = match[5].str(); + loop.body_prefix = match[13].str(); + + add_loop(loop, match, line, lines_in_content); + }); + } + + std::string out = str; + + /* Copy paste loop iterations. */ + for (const Loop &loop : loops) { + std::string replacement = loop.indent + "{ " + loop.init_statement + ";"; + for (int64_t i = 0; i < loop.iter_count; i++) { + replacement += std::string("\n#line ") + std::to_string(loop.body_line + 1) + "\n"; + replacement += loop.indent + loop.test_statement + loop.body; + replacement += std::string("\n#line ") + std::to_string(loop.definition_line + 1) + "\n"; + replacement += loop.indent + loop.iter_statement + ";"; + if (i == loop.iter_count - 1) { + replacement += std::string("\n#line ") + std::to_string(loop.end_line + 1) + "\n"; + replacement += loop.indent + "}"; + } + } + + std::string replaced = loop.definition + loop.body; + + /* Replace all occurrences in case of recursive unrolling. */ + replace_all(out, replaced, replacement); + } + + /* Check for remaining keywords. */ + if (out.find("[[gpu::unroll") != std::string::npos) { + regex_global_search(str, std::regex(R"(\[\[gpu::unroll)"), [&](const std::smatch &match) { + report_error(match, "Error: Incompatible format for [[gpu::unroll]]."); + }); + } + + return out; + } + std::string preprocessor_directive_mutation(const std::string &str) { /* Remove unsupported directives.` */ diff --git a/source/blender/gpu/shaders/gpu_glsl_cpp_stubs.hh b/source/blender/gpu/shaders/gpu_glsl_cpp_stubs.hh index 3e5b753f86b..7560032b073 100644 --- a/source/blender/gpu/shaders/gpu_glsl_cpp_stubs.hh +++ b/source/blender/gpu/shaders/gpu_glsl_cpp_stubs.hh @@ -1070,3 +1070,12 @@ void groupMemoryBarrier() {} #define row_major row_major_is_reserved_glsl_keyword_do_not_use #include "GPU_shader_shared_utils.hh" + +#ifdef __GNUC__ +/* Avoid warnings caused by our own unroll attributes. */ +# ifdef __clang__ +# pragma GCC diagnostic ignored "-Wunknown-attributes" +# else +# pragma GCC diagnostic ignored "-Wattributes" +# endif +#endif diff --git a/source/blender/gpu/tests/shader_preprocess_test.cc b/source/blender/gpu/tests/shader_preprocess_test.cc index 2f575d09952..141b4172a49 100644 --- a/source/blender/gpu/tests/shader_preprocess_test.cc +++ b/source/blender/gpu/tests/shader_preprocess_test.cc @@ -32,4 +32,169 @@ static void test_preprocess_utilities() } GPU_TEST(preprocess_utilities); +static std::string process_test_string(std::string str, + std::string &first_error, + shader::metadata::Source *r_metadata = nullptr) +{ + using namespace shader; + Preprocessor preprocessor; + shader::metadata::Source metadata; + std::string result = preprocessor.process( + Preprocessor::SourceLanguage::BLENDER_GLSL, + str, + "test.glsl", + true, + true, + [&](const std::smatch & /*match*/, const char *err_msg) { + if (first_error.empty()) { + first_error = err_msg; + } + }, + metadata); + + if (r_metadata != nullptr) { + *r_metadata = metadata; + } + + /* Strip first line directive as they are platform dependent. */ + size_t newline = result.find('\n'); + return result.substr(newline + 1); +} + +static void test_preprocess_unroll() +{ + using namespace shader; + using namespace std; + + { + string input = R"([[gpu::unroll]] for (int i = 2; i < 4; i++, y++) { content += i; })"; + string expect = R"({ int i = 2; +#line 1 +{ content += i; } +#line 1 +i++, y++; +#line 1 +{ content += i; } +#line 1 +i++, y++; +#line 1 +})"; + string error; + string output = process_test_string(input, error); + EXPECT_EQ(output, expect); + EXPECT_EQ(error, ""); + } + { + string input = R"([[gpu::unroll]] for (int i = 2; i < 4 && i < y; i++, y++) { cont += i; })"; + string expect = R"({ int i = 2; +#line 1 +if (i < y) { cont += i; } +#line 1 +i++, y++; +#line 1 +if (i < y) { cont += i; } +#line 1 +i++, y++; +#line 1 +})"; + string error; + string output = process_test_string(input, error); + EXPECT_EQ(output, expect); + EXPECT_EQ(error, ""); + } + { + string input = R"([[gpu::unroll(2)]] for (; i < j;) { content += i; })"; + string expect = R"({ ; +#line 1 +if (i < j) { content += i; } +#line 1 +; +#line 1 +if (i < j) { content += i; } +#line 1 +; +#line 1 +})"; + string error; + string output = process_test_string(input, error); + EXPECT_EQ(output, expect); + EXPECT_EQ(error, ""); + } + { + string input = R"([[gpu::unroll(2)]] for (; i < j;) { [[gpu::unroll(2)]] for (; j < k;) {} })"; + string expect = R"({ ; +#line 1 +if (i < j) { { ; +#line 1 + if (j < k) {} +#line 1 + ; +#line 1 + if (j < k) {} +#line 1 + ; +#line 1 + } } +#line 1 +; +#line 1 +if (i < j) { { ; +#line 1 + if (j < k) {} +#line 1 + ; +#line 1 + if (j < k) {} +#line 1 + ; +#line 1 + } } +#line 1 +; +#line 1 +})"; + string error; + string output = process_test_string(input, error); + EXPECT_EQ(output, expect); + EXPECT_EQ(error, ""); + } + { + string input = R"([[gpu::unroll(2)]] for (; i < j;) { break; })"; + string error; + string output = process_test_string(input, error); + EXPECT_EQ(error, "Error: Unrolled loop cannot contain \"break\" statement."); + } + { + string input = R"([[gpu::unroll(2)]] for (; i < j;) { continue; })"; + string error; + string output = process_test_string(input, error); + EXPECT_EQ(error, "Error: Unrolled loop cannot contain \"continue\" statement."); + } + { + string input = R"([[gpu::unroll(2)]] for (; i < j;) { for (; j < k;) {break;continue;} })"; + string expect = R"({ ; +#line 1 +if (i < j) { for (; j < k;) {break;continue;} } +#line 1 +; +#line 1 +if (i < j) { for (; j < k;) {break;continue;} } +#line 1 +; +#line 1 +})"; + string error; + string output = process_test_string(input, error); + EXPECT_EQ(output, expect); + EXPECT_EQ(error, ""); + } + { + string input = R"([[gpu::unroll]] for (int i = 3; i > 2; i++) {})"; + string error; + string output = process_test_string(input, error); + EXPECT_EQ(error, "Error: Unsupported condition in unrolled loop."); + } +} +GPU_TEST(preprocess_unroll); + } // namespace blender::gpu::tests