GPU: Shader: Add support for basic loop unrolling through preprocessor

This adds basic unrolling support for 2 syntax:
- `[[gpu::unroll]]` which does full loop unrolling
- `[[gpu::unroll(x)]]` which unrolls `x` iteration

Nesting is supported.

This change is motivated by the added cost in compilation
and execution time that some loops have even if they have
compile time defined iteration counts.

The syntax is inspired by `GL_EXT_control_flow_attributes`.
However, we might want to have our own prefix to show it is
a blender specific feature and that it differs from the standard.
I propose `[[gpu::unroll]]`.

In the future, we could extend this to support more directives that
can be expanded to backend specific extension / syntax. This would
avoid readability issue an error prone copy paste of large amount
of preprocessor directives.

Currently, given that GL's GLSL flavor doesn't support
any of these attributes, the preprocessor does some copy-pasting
that does the unrolling at the source level. Note that the added
`#line` allow for correct error logging.

For the `[[gpu::unroll]]` syntax, the `for` declaration
needs to follow a specific syntax to deduce the number
of loop iteration.
This variant removes the continue condition between iteration,
so all iterations are evaluated. This could be modified
using a special keyword.

For the `[[gpu::unroll(n)]]` syntax, the usercode needs
to make sure that `n` is large enough to cover all iterations
as the loop is completely removed.
We could add shader `assert` to make sure that there is
never a remaining iteration.
This behavior is usually different from what you see in other
implementation as we do not keep a loop at all. Usually, compilers
still keep the loop if it is not unrolled fully. But given we don't
have IR, this is the best we can do.

`break` and `continue` statement are forbidden at the unrolled loop
scope level. Nested loop and switch can contain these keywords.
This is accounted for by checks in the pre-processor.

Only `for` loops are supported for now. There are no real
incentive to add support for `while` given how rare it is
in the shader codebase.

Rel #137446

Pull Request: https://projects.blender.org/blender/blender/pulls/137444
This commit is contained in:
Clément Foucault
2025-05-05 13:37:51 +02:00
committed by Clément Foucault
parent c271c3b291
commit 74e6d2c575
6 changed files with 430 additions and 36 deletions

View File

@@ -12,31 +12,6 @@ COMPUTE_SHADER_CREATE_INFO(eevee_lightprobe_sphere_remap)
#include "eevee_lightprobe_sphere_mapping_lib.glsl"
#include "eevee_spherical_harmonics_lib.glsl"
/* OpenGL/Intel drivers have known issues where it isn't able to compile barriers inside for loops.
* Macros are needed as driver can decide to not unroll in shaders with more complexity. */
#define PARALLEL_SUM_INNER() \
barrier(); \
if (local_index < stride) { \
local_radiance[local_index] += local_radiance[local_index + stride]; \
} \
stride /= 2;
#define PARALLEL_SUM \
{ \
uint stride = group_size / 2; \
PARALLEL_SUM_INNER(); \
PARALLEL_SUM_INNER(); \
PARALLEL_SUM_INNER(); \
PARALLEL_SUM_INNER(); \
PARALLEL_SUM_INNER(); \
PARALLEL_SUM_INNER(); \
PARALLEL_SUM_INNER(); \
PARALLEL_SUM_INNER(); \
PARALLEL_SUM_INNER(); \
PARALLEL_SUM_INNER(); \
barrier(); \
}
shared float4 local_radiance[gl_WorkGroupSize.x * gl_WorkGroupSize.y];
float triangle_solid_angle(float3 A, float3 B, float3 C)
@@ -159,7 +134,16 @@ void main()
if (extract_sun) {
/* Parallel sum. Result is stored inside local_radiance[0]. */
local_radiance[local_index] = radiance_sun.xyzz * sample_weight;
PARALLEL_SUM
/* OpenGL/Intel drivers have known issues where it isn't able to compile barriers inside for
* loops. Unroll is needed as driver might decide to not unroll in shaders with more
* complexity. */
[[gpu::unroll(10)]] for (uint stride = group_size / 2; stride > 0; stride /= 2)
{
barrier();
if (local_index < stride) {
local_radiance[local_index] += local_radiance[local_index + stride];
}
}
if (gl_LocalInvocationIndex == 0u) {
out_sun[work_group_index].radiance = local_radiance[0].xyz;
@@ -169,7 +153,17 @@ void main()
/* Reusing local_radiance for directions. */
local_radiance[local_index] = float4(normalize(direction), 1.0f) * sample_weight *
length(radiance_sun.xyz);
PARALLEL_SUM
/* OpenGL/Intel drivers have known issues where it isn't able to compile barriers inside for
* loops. Unroll is needed as driver might decide to not unroll in shaders with more
* complexity. */
[[gpu::unroll(10)]] for (uint stride = group_size / 2; stride > 0; stride /= 2)
{
barrier();
if (local_index < stride) {
local_radiance[local_index] += local_radiance[local_index + stride];
}
}
barrier();
if (gl_LocalInvocationIndex == 0u) {
out_sun[work_group_index].direction = local_radiance[0];
@@ -180,7 +174,16 @@ void main()
if (extract_sh) {
/* Parallel sum. Result is stored inside local_radiance[0]. */
local_radiance[local_index] = radiance.xyzz * sample_weight;
PARALLEL_SUM
/* OpenGL/Intel drivers have known issues where it isn't able to compile barriers inside for
* loops. Unroll is needed as driver might decide to not unroll in shaders with more
* complexity. */
[[gpu::unroll(10)]] for (uint stride = group_size / 2; stride > 0; stride /= 2)
{
barrier();
if (local_index < stride) {
local_radiance[local_index] += local_radiance[local_index + stride];
}
}
if (gl_LocalInvocationIndex == 0u) {
/* Find the middle point of the whole thread-group. Use it as light vector.

View File

@@ -121,8 +121,9 @@ void main()
out_gbuf_normal = gbuf.N[0];
/* Output remaining closures using image store. */
for (int layer = GBUF_CLOSURE_FB_LAYER_COUNT; layer < GBUFFER_DATA_MAX && layer < gbuf.data_len;
layer++)
[[gpu::unroll(6)]] for (int layer = GBUF_CLOSURE_FB_LAYER_COUNT;
layer < GBUFFER_DATA_MAX && layer < gbuf.data_len;
layer++)
{
/* NOTE: The image view start at layer GBUF_CLOSURE_FB_LAYER_COUNT so all destination layer is
* `layer - GBUF_CLOSURE_FB_LAYER_COUNT`. */
@@ -130,9 +131,9 @@ void main()
int3(out_texel, layer - GBUF_CLOSURE_FB_LAYER_COUNT),
gbuf.data[layer]);
}
for (int layer = GBUF_NORMAL_FB_LAYER_COUNT;
layer < GBUFFER_NORMAL_MAX && layer < gbuf.normal_len;
layer++)
[[gpu::unroll(4)]] for (int layer = GBUF_NORMAL_FB_LAYER_COUNT;
layer < GBUFFER_NORMAL_MAX && layer < gbuf.normal_len;
layer++)
{
/* NOTE: The image view start at layer GBUF_NORMAL_FB_LAYER_COUNT so all destination layer is
* `layer - GBUF_NORMAL_FB_LAYER_COUNT`. */

View File

@@ -13,8 +13,10 @@ void main()
frag_color = float4(0.0f);
int i = 0;
for (int x = -1; x <= 1; x++) {
for (int y = -1; y <= 1; y++, i++) {
[[gpu::unroll]] for (int x = -1; x <= 1; x++)
{
[[gpu::unroll]] for (int y = -1; y <= 1; y++, i++)
{
float4 color = texture(color_buffer, uv + float2(x, y) * texel_size);
/* Clamp infinite inputs (See #112211). */
color = clamp(color, float4(0.0f), float4(1e10f));

View File

@@ -13,7 +13,6 @@
#include <regex>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>
namespace blender::gpu::shader {
@@ -211,6 +210,7 @@ class Preprocessor {
}
str = preprocessor_directive_mutation(str);
if (language == BLENDER_GLSL) {
str = loop_unroll(str, report_error);
str = assert_processing(str, filename);
static_strings_parsing(str);
str = static_strings_mutation(str);
@@ -329,6 +329,220 @@ class Preprocessor {
});
}
std::string loop_unroll(const std::string &str, report_callback report_error)
{
if (str.find("[[gpu::unroll") == std::string::npos) {
return str;
}
struct Loop {
/* `[[gpu::unroll]] for (int i = 0; i < 10; i++)` */
std::string definition;
/* `{ some_computation(i); }` */
std::string body;
/* `int i = 0` */
std::string init_statement;
/* `i < 10` */
std::string test_statement;
/* `i++` */
std::string iter_statement;
/* Spaces and newline between loop start and body. */
std::string body_prefix;
/* Spaces before the loop definition. */
std::string indent;
/* `10` */
int64_t iter_count;
/* Line at which the loop was defined. */
int64_t definition_line;
/* Line at which the body starts. */
int64_t body_line;
/* Line at which the body ends. */
int64_t end_line;
};
std::vector<Loop> loops;
auto add_loop = [&](Loop &loop,
const std::smatch &match,
int64_t line,
int64_t lines_in_content) {
std::string suffix = match.suffix().str();
loop.body = get_content_between_balanced_pair(loop.definition + suffix, '{', '}');
loop.body = '{' + loop.body + '}';
loop.definition_line = line - lines_in_content;
loop.body_line = line;
loop.end_line = loop.body_line + line_count(loop.body);
/* Check that there is no unsupported keywords in the loop body. */
if (loop.body.find(" break;") != std::string::npos ||
loop.body.find(" continue;") != std::string::npos)
{
/* Expensive check. Remove other loops and switch scopes inside the unrolled loop scope and
* check again to avoid false positive. */
std::string modified_body = loop.body;
std::regex regex_loop(R"( (for|while|do) )");
regex_global_search(loop.body, regex_loop, [&](const std::smatch &match) {
std::string inner_scope = get_content_between_balanced_pair(match.suffix(), '{', '}');
replace_all(modified_body, inner_scope, "");
});
/* Checks if `continue` exists, even in switch statement inside the unrolled loop scope. */
if (modified_body.find(" continue;") != std::string::npos) {
report_error(match, "Error: Unrolled loop cannot contain \"continue\" statement.");
}
std::regex regex_switch(R"( switch )");
regex_global_search(loop.body, regex_switch, [&](const std::smatch &match) {
std::string inner_scope = get_content_between_balanced_pair(match.suffix(), '{', '}');
replace_all(modified_body, inner_scope, "");
});
/* Checks if `break` exists inside the unrolled loop scope. */
if (modified_body.find(" break;") != std::string::npos) {
report_error(match, "Error: Unrolled loop cannot contain \"break\" statement.");
}
}
loops.emplace_back(loop);
};
/* Parse the loop syntax. */
{
/* [[gpu::unroll]]. */
std::regex regex(R"(( *))"
R"(\[\[gpu::unroll\]\])"
R"(\s*for\s*\()"
R"(\s*((?:uint|int)\s+(\w+)\s+=\s+(-?\d+));)" /* Init statement. */
R"(\s*((\w+)\s+(>|<)(=?)\s+(-?\d+)))" /* Conditional statement. */
R"(\s*(?:&&)?\s*([^;)]+)?;)" /* Extra conditional statement. */
R"(\s*(((\w+)(\+\+|\-\-))[^\)]*))" /* Iteration statement. */
R"(\)(\s*))");
int64_t line = 0;
regex_global_search(str, regex, [&](const std::smatch &match) {
std::string counter_1 = match[3].str();
std::string counter_2 = match[6].str();
std::string counter_3 = match[13].str();
std::string content = match[0].str();
int64_t lines_in_content = line_count(content);
line += line_count(match.prefix().str()) + lines_in_content;
if ((counter_1 != counter_2) || (counter_1 != counter_3)) {
report_error(match, "Error: Non matching loop counter variable.");
return;
}
Loop loop;
int64_t init = std::stol(match[4].str());
int64_t end = std::stol(match[9].str());
/* TODO(fclem): Support arbitrary strides (aka, arbitrary iter statement). */
loop.iter_count = std::abs(end - init);
std::string condition = match[7].str();
if (condition.empty()) {
report_error(match, "Error: Unsupported condition in unrolled loop.");
}
std::string equal = match[8].str();
if (equal == "=") {
loop.iter_count += 1;
}
std::string iter = match[14].str();
if (iter == "++") {
if (condition == ">") {
report_error(match, "Error: Unsupported condition in unrolled loop.");
}
}
else if (iter == "--") {
if (condition == "<") {
report_error(match, "Error: Unsupported condition in unrolled loop.");
}
}
else {
report_error(match, "Error: Unsupported for loop expression. Expecting ++ or --");
}
loop.definition = content;
loop.indent = match[1].str();
loop.init_statement = match[2].str();
if (!match[10].str().empty()) {
loop.test_statement = "if (" + match[10].str() + ") ";
}
loop.iter_statement = match[11].str();
loop.body_prefix = match[15].str();
add_loop(loop, match, line, lines_in_content);
});
}
{
/* [[gpu::unroll(n)]]. */
std::regex regex(R"(( *))"
R"(\[\[gpu::unroll\((\d+)\)\]\])"
R"(\s*for\s*\()"
R"(\s*([^;]*);)"
R"(\s*([^;]*);)"
R"(\s*([^)]*))"
R"(\)(\s*))");
int64_t line = 0;
regex_global_search(str, regex, [&](const std::smatch &match) {
std::string content = match[0].str();
int64_t lines_in_content = line_count(content);
line += line_count(match.prefix().str()) + lines_in_content;
Loop loop;
loop.iter_count = std::stol(match[2].str());
loop.definition = content;
loop.indent = match[1].str();
loop.init_statement = match[3].str();
loop.test_statement = "if (" + match[4].str() + ") ";
loop.iter_statement = match[5].str();
loop.body_prefix = match[13].str();
add_loop(loop, match, line, lines_in_content);
});
}
std::string out = str;
/* Copy paste loop iterations. */
for (const Loop &loop : loops) {
std::string replacement = loop.indent + "{ " + loop.init_statement + ";";
for (int64_t i = 0; i < loop.iter_count; i++) {
replacement += std::string("\n#line ") + std::to_string(loop.body_line + 1) + "\n";
replacement += loop.indent + loop.test_statement + loop.body;
replacement += std::string("\n#line ") + std::to_string(loop.definition_line + 1) + "\n";
replacement += loop.indent + loop.iter_statement + ";";
if (i == loop.iter_count - 1) {
replacement += std::string("\n#line ") + std::to_string(loop.end_line + 1) + "\n";
replacement += loop.indent + "}";
}
}
std::string replaced = loop.definition + loop.body;
/* Replace all occurrences in case of recursive unrolling. */
replace_all(out, replaced, replacement);
}
/* Check for remaining keywords. */
if (out.find("[[gpu::unroll") != std::string::npos) {
regex_global_search(str, std::regex(R"(\[\[gpu::unroll)"), [&](const std::smatch &match) {
report_error(match, "Error: Incompatible format for [[gpu::unroll]].");
});
}
return out;
}
std::string preprocessor_directive_mutation(const std::string &str)
{
/* Remove unsupported directives.` */

View File

@@ -1070,3 +1070,12 @@ void groupMemoryBarrier() {}
#define row_major row_major_is_reserved_glsl_keyword_do_not_use
#include "GPU_shader_shared_utils.hh"
#ifdef __GNUC__
/* Avoid warnings caused by our own unroll attributes. */
# ifdef __clang__
# pragma GCC diagnostic ignored "-Wunknown-attributes"
# else
# pragma GCC diagnostic ignored "-Wattributes"
# endif
#endif

View File

@@ -32,4 +32,169 @@ static void test_preprocess_utilities()
}
GPU_TEST(preprocess_utilities);
static std::string process_test_string(std::string str,
std::string &first_error,
shader::metadata::Source *r_metadata = nullptr)
{
using namespace shader;
Preprocessor preprocessor;
shader::metadata::Source metadata;
std::string result = preprocessor.process(
Preprocessor::SourceLanguage::BLENDER_GLSL,
str,
"test.glsl",
true,
true,
[&](const std::smatch & /*match*/, const char *err_msg) {
if (first_error.empty()) {
first_error = err_msg;
}
},
metadata);
if (r_metadata != nullptr) {
*r_metadata = metadata;
}
/* Strip first line directive as they are platform dependent. */
size_t newline = result.find('\n');
return result.substr(newline + 1);
}
static void test_preprocess_unroll()
{
using namespace shader;
using namespace std;
{
string input = R"([[gpu::unroll]] for (int i = 2; i < 4; i++, y++) { content += i; })";
string expect = R"({ int i = 2;
#line 1
{ content += i; }
#line 1
i++, y++;
#line 1
{ content += i; }
#line 1
i++, y++;
#line 1
})";
string error;
string output = process_test_string(input, error);
EXPECT_EQ(output, expect);
EXPECT_EQ(error, "");
}
{
string input = R"([[gpu::unroll]] for (int i = 2; i < 4 && i < y; i++, y++) { cont += i; })";
string expect = R"({ int i = 2;
#line 1
if (i < y) { cont += i; }
#line 1
i++, y++;
#line 1
if (i < y) { cont += i; }
#line 1
i++, y++;
#line 1
})";
string error;
string output = process_test_string(input, error);
EXPECT_EQ(output, expect);
EXPECT_EQ(error, "");
}
{
string input = R"([[gpu::unroll(2)]] for (; i < j;) { content += i; })";
string expect = R"({ ;
#line 1
if (i < j) { content += i; }
#line 1
;
#line 1
if (i < j) { content += i; }
#line 1
;
#line 1
})";
string error;
string output = process_test_string(input, error);
EXPECT_EQ(output, expect);
EXPECT_EQ(error, "");
}
{
string input = R"([[gpu::unroll(2)]] for (; i < j;) { [[gpu::unroll(2)]] for (; j < k;) {} })";
string expect = R"({ ;
#line 1
if (i < j) { { ;
#line 1
if (j < k) {}
#line 1
;
#line 1
if (j < k) {}
#line 1
;
#line 1
} }
#line 1
;
#line 1
if (i < j) { { ;
#line 1
if (j < k) {}
#line 1
;
#line 1
if (j < k) {}
#line 1
;
#line 1
} }
#line 1
;
#line 1
})";
string error;
string output = process_test_string(input, error);
EXPECT_EQ(output, expect);
EXPECT_EQ(error, "");
}
{
string input = R"([[gpu::unroll(2)]] for (; i < j;) { break; })";
string error;
string output = process_test_string(input, error);
EXPECT_EQ(error, "Error: Unrolled loop cannot contain \"break\" statement.");
}
{
string input = R"([[gpu::unroll(2)]] for (; i < j;) { continue; })";
string error;
string output = process_test_string(input, error);
EXPECT_EQ(error, "Error: Unrolled loop cannot contain \"continue\" statement.");
}
{
string input = R"([[gpu::unroll(2)]] for (; i < j;) { for (; j < k;) {break;continue;} })";
string expect = R"({ ;
#line 1
if (i < j) { for (; j < k;) {break;continue;} }
#line 1
;
#line 1
if (i < j) { for (; j < k;) {break;continue;} }
#line 1
;
#line 1
})";
string error;
string output = process_test_string(input, error);
EXPECT_EQ(output, expect);
EXPECT_EQ(error, "");
}
{
string input = R"([[gpu::unroll]] for (int i = 3; i > 2; i++) {})";
string error;
string output = process_test_string(input, error);
EXPECT_EQ(error, "Error: Unsupported condition in unrolled loop.");
}
}
GPU_TEST(preprocess_unroll);
} // namespace blender::gpu::tests