Fix #119043: Compositor crashes with intricate setup

The Realtime Compositor crashes with intricate node setups. That's due to hardware limitations where shaders can't have as many output images as needed by the compositor shader operations. To fix this, we recursively split shader operations until their output count fits the hardware limitation. Pull Request: https://projects.blender.org/blender/blender/pulls/119075
2024-03-06 07:25:05 +01:00
parent 49c723e117
commit bfec649bd9
6 changed files with 88 additions and 15 deletions
--- a/source/blender/compositor/realtime_compositor/COM_compile_state.hh
+++ b/source/blender/compositor/realtime_compositor/COM_compile_state.hh
@@ -162,6 +162,11 @@ class CompileState {
   *   compiled. */
  bool should_compile_shader_compile_unit(DNode node);

+  /* Computes the number of shader operation outputs that will be added for this node in the
+   * current shader compile unit. This is essentially the number of outputs that will be added for
+   * the node in ShaderOperation::populate_results_for_node. */
+  int compute_shader_node_operation_outputs_count(DNode node);
+
 private:
  /* Compute the node domain of the given shader node. This is analogous to the
   * Operation::compute_domain method, except it is computed from the node itself as opposed to a
--- a/source/blender/compositor/realtime_compositor/COM_utilities.hh
+++ b/source/blender/compositor/realtime_compositor/COM_utilities.hh
@@ -73,6 +73,9 @@ void compute_dispatch_threads_at_least(GPUShader *shader,
 /* Returns true if a node preview needs to be computed for the give node. */
 bool is_node_preview_needed(const DNode &node);

+/* Returns the node output that will be used to generate previews. */
+DOutputSocket find_preview_output_socket(const DNode &node);
+
 /* Computes a lower resolution version of the given result and sets it as a preview for the given
 * node after applying the appropriate color management specified in the given context. */
 void compute_preview_from_result(Context &context, const DNode &node, Result &input_result);
--- a/source/blender/compositor/realtime_compositor/intern/compile_state.cc
+++ b/source/blender/compositor/realtime_compositor/intern/compile_state.cc
@@ -105,6 +105,32 @@ bool CompileState::should_compile_shader_compile_unit(DNode node)
  return false;
 }

+int CompileState::compute_shader_node_operation_outputs_count(DNode node)
+{
+  const DOutputSocket preview_output = find_preview_output_socket(node);
+
+  int outputs_count = 0;
+  for (const bNodeSocket *output : node->output_sockets()) {
+    const DOutputSocket doutput{node.context(), output};
+
+    /* If the output is used as the node preview, then an operation output will exist for it. */
+    const bool is_preview_output = doutput == preview_output;
+
+    /* If any of the nodes linked to the output are not part of the shader compile unit but are
+     * part of the execution schedule, then an operation output will exist for it. */
+    const bool is_operation_output = is_output_linked_to_node_conditioned(
+        doutput, [&](DNode node) {
+          return schedule_.contains(node) && !shader_compile_unit_.contains(node);
+        });
+
+    if (is_operation_output || is_preview_output) {
+      outputs_count += 1;
+    }
+  }
+
+  return outputs_count;
+}
+
 Domain CompileState::compute_shader_node_domain(DNode node)
 {
  /* Default to an identity domain in case no domain input was found, most likely because all
--- a/source/blender/compositor/realtime_compositor/intern/evaluator.cc
+++ b/source/blender/compositor/realtime_compositor/intern/evaluator.cc
@@ -142,6 +142,45 @@ void Evaluator::map_node_operation_inputs_to_their_results(DNode node,
 void Evaluator::compile_and_evaluate_shader_compile_unit(CompileState &compile_state)
 {
  ShaderCompileUnit &compile_unit = compile_state.get_shader_compile_unit();
+
+  /* GPUs have hardware limitations on the number of output images shaders can have, so we might
+   * have to split the compile unit into smaller units to work around this limitation. In practice,
+   * splitting will almost never happen due to the scheduling strategy we use, so the base
+   * case remains fast. */
+  int number_of_outputs = 0;
+  for (int i : compile_unit.index_range()) {
+    const DNode node = compile_unit[i];
+    number_of_outputs += compile_state.compute_shader_node_operation_outputs_count(node);
+
+    /* The GPU module currently only supports up to 8 output images in shaders, but once this
+     * limitation is lifted, we can replace that with GPU_max_images(). */
+    if (number_of_outputs <= 8) {
+      continue;
+    }
+
+    /* The number of outputs surpassed the limit, so we split the compile unit into two equal parts
+     * and recursively call this method on each of them. It might seem unexpected that we split in
+     * half as opposed to split at the node that surpassed the limit, but that is because the act
+     * of splitting might actually introduce new outputs, since links that were previously internal
+     * to the compile unit might now be external. So we can't precisely split and guarantee correct
+     * units, and we just rely on recursive splitting until units are small enough. Further, half
+     * splitting helps balance the shaders, where we don't want to have one gigantic shader and
+     * a tiny one. */
+    const int split_index = compile_unit.size() / 2;
+    const ShaderCompileUnit start_compile_unit(compile_unit.as_span().take_front(split_index));
+    const ShaderCompileUnit end_compile_unit(compile_unit.as_span().drop_front(split_index));
+
+    compile_state.get_shader_compile_unit() = start_compile_unit;
+    this->compile_and_evaluate_shader_compile_unit(compile_state);
+
+    compile_state.get_shader_compile_unit() = end_compile_unit;
+    this->compile_and_evaluate_shader_compile_unit(compile_state);
+
+    /* No need to continue, the above recursive calls will eventually exit the loop and do the
+     * actual compilation. */
+    return;
+  }
+
  const Schedule &schedule = compile_state.get_schedule();
  ShaderOperation *operation = new ShaderOperation(context_, compile_unit, schedule);

--- a/source/blender/compositor/realtime_compositor/intern/shader_operation.cc
+++ b/source/blender/compositor/realtime_compositor/intern/shader_operation.cc
@@ -270,21 +270,6 @@ void ShaderOperation::declare_operation_input(DInputSocket input_socket,
  inputs_to_linked_outputs_map_.add_new(input_identifier, output_socket);
 }

-static DOutputSocket find_preview_output_socket(const DNode &node)
-{
-  if (!is_node_preview_needed(node)) {
-    return DOutputSocket();
-  }
-
-  for (const bNodeSocket *output : node->output_sockets()) {
-    if (output->is_logically_linked()) {
-      return DOutputSocket(node.context(), output);
-    }
-  }
-
-  return DOutputSocket();
-}
-
 void ShaderOperation::populate_results_for_node(DNode node, GPUMaterial *material)
 {
  const DOutputSocket preview_output = find_preview_output_socket(node);
--- a/source/blender/compositor/realtime_compositor/intern/utilities.cc
+++ b/source/blender/compositor/realtime_compositor/intern/utilities.cc
@@ -163,6 +163,21 @@ bool is_node_preview_needed(const DNode &node)
  return true;
 }

+DOutputSocket find_preview_output_socket(const DNode &node)
+{
+  if (!is_node_preview_needed(node)) {
+    return DOutputSocket();
+  }
+
+  for (const bNodeSocket *output : node->output_sockets()) {
+    if (output->is_logically_linked()) {
+      return DOutputSocket(node.context(), output);
+    }
+  }
+
+  return DOutputSocket();
+}
+
 /* Given the size of a result, compute a lower resolution size for a preview. The greater dimension
 * will be assigned an arbitrarily chosen size of 128, while the other dimension will get the size
 * that maintains the same aspect ratio. */