Metal: EEVEE Next: Optimize Virtual shadow maps for Apple Silicon

Optimization of EEVEE Next's Virtual Shadow Maps for TBDRs.
The core of these optimizations lies in eliminating atomic
shadow atlas writes, instead utilizing tile memory to
perform depth accumulation in a secondary pass once all
geometry updates for a given shadow view have completed.

This also allows use of fast on-tile depth testing/sorting, reducing
overdraw and redundant fragment operations, and lets the tile
indirection calculations be offloaded to the vertex shader to
increase fragment storage efficiency and throughput.
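
Conceptually, the tile-copy technique maps onto three raster passes per
shadow update batch. A minimal outline (an illustrative sketch, not the
actual pass names):

// Sketch of the TILE_COPY flow on a TBDR GPU:
// 1. PASS_CLEAR:       raster one quad per updated page, resetting its
//                      on-tile depth; non-updated tiles stay at zero.
// 2. Surface pass:     rasterize shadow casters; the on-tile depth test
//                      keeps the nearest depth without atomic atlas writes.
// 3. PASS_DEPTH_STORE: read the on-tile result and store it into the
//                      shadow atlas.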

Authored by Apple: Michael Parkin-White

Co-authored-by: Michael Parkin-White <mparkinwhite@apple.com>
Co-authored-by: Clément Foucault <foucault.clem@gmail.com>
Pull Request: https://projects.blender.org/blender/blender/pulls/111283
Commit 57a3ab29cc (parent d958874f01)
Authored by Jason Fielder, 2023-10-05 19:02:39 +02:00
Committed by Clément Foucault
27 changed files with 542 additions and 94 deletions

View File

@@ -570,6 +570,8 @@ set(GLSL_SRC
engines/eevee_next/shaders/eevee_surf_forward_frag.glsl
engines/eevee_next/shaders/eevee_surf_lib.glsl
engines/eevee_next/shaders/eevee_surf_shadow_frag.glsl
engines/eevee_next/shaders/eevee_shadow_page_tile_vert.glsl
engines/eevee_next/shaders/eevee_shadow_page_tile_frag.glsl
engines/eevee_next/shaders/eevee_surf_world_frag.glsl
engines/eevee_next/shaders/eevee_surfel_cluster_build_comp.glsl
engines/eevee_next/shaders/eevee_surfel_light_comp.glsl

View File

@@ -45,8 +45,15 @@
* SHADOW_TILEMAP_RES max is 32 because of the shared bitmaps used for LOD tagging.
* It is also limited by the maximum thread group size (1024).
*/
#define SHADOW_TILEMAP_RES 32
#define SHADOW_TILEMAP_LOD 5 /* LOG2(SHADOW_TILEMAP_RES) */
#if 0
/* Useful for debugging the tile-copy version of the shadow rendering without making debugging
* tools unresponsive. */
# define SHADOW_TILEMAP_RES 4
# define SHADOW_TILEMAP_LOD 2 /* LOG2(SHADOW_TILEMAP_RES) */
#else
# define SHADOW_TILEMAP_RES 32
# define SHADOW_TILEMAP_LOD 5 /* LOG2(SHADOW_TILEMAP_RES) */
#endif
#define SHADOW_TILEMAP_LOD0_LEN ((SHADOW_TILEMAP_RES / 1) * (SHADOW_TILEMAP_RES / 1))
#define SHADOW_TILEMAP_LOD1_LEN ((SHADOW_TILEMAP_RES / 2) * (SHADOW_TILEMAP_RES / 2))
#define SHADOW_TILEMAP_LOD2_LEN ((SHADOW_TILEMAP_RES / 4) * (SHADOW_TILEMAP_RES / 4))
@@ -57,9 +64,19 @@
#define SHADOW_TILEDATA_PER_TILEMAP \
(SHADOW_TILEMAP_LOD0_LEN + SHADOW_TILEMAP_LOD1_LEN + SHADOW_TILEMAP_LOD2_LEN + \
SHADOW_TILEMAP_LOD3_LEN + SHADOW_TILEMAP_LOD4_LEN + SHADOW_TILEMAP_LOD5_LEN)
#define SHADOW_PAGE_CLEAR_GROUP_SIZE 32
#define SHADOW_PAGE_RES 256
#define SHADOW_PAGE_LOD 8 /* LOG2(SHADOW_PAGE_RES) */
#if 0
/* Useful for debugging the tile-copy version of the shadow rendering without making debugging
* tools unresponsive. */
# define SHADOW_PAGE_CLEAR_GROUP_SIZE 8
# define SHADOW_PAGE_RES 8
# define SHADOW_PAGE_LOD 3 /* LOG2(SHADOW_PAGE_RES) */
#else
# define SHADOW_PAGE_CLEAR_GROUP_SIZE 32
# define SHADOW_PAGE_RES 256
# define SHADOW_PAGE_LOD 8 /* LOG2(SHADOW_PAGE_RES) */
#endif
/* For testing only. */
// #define SHADOW_FORCE_LOD0
#define SHADOW_MAP_MAX_RES (SHADOW_PAGE_RES * SHADOW_TILEMAP_RES)
#define SHADOW_DEPTH_SCAN_GROUP_SIZE 8
#define SHADOW_AABB_TAG_GROUP_SIZE 64
@@ -76,6 +93,7 @@
#define SHADOW_PAGE_PER_LAYER (SHADOW_PAGE_PER_ROW * SHADOW_PAGE_PER_COL)
#define SHADOW_MAX_STEP 16
#define SHADOW_MAX_RAY 4
#define SHADOW_ROG_ID 0
/* Ray-tracing. */
#define RAYTRACE_GROUP_SIZE 8
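
With the default (non-debug) values above, the derived sizes work out as
follows (a standalone C++ sanity check, not part of the patch):

static_assert(256 * 32 == 8192, "SHADOW_MAP_MAX_RES: virtual shadow map side in texels");
static_assert(32 * 32 == 1024, "SHADOW_TILEMAP_LOD0_LEN: tiles in the finest LOD");
static_assert(1024 + 256 + 64 + 16 + 4 + 1 == 1365,
              "SHADOW_TILEDATA_PER_TILEMAP: tiles across all 6 LODs");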

View File

@@ -14,6 +14,8 @@
#include "eevee_pipeline.hh"
#include "eevee_shadow.hh"
#include "draw_common.hh"
namespace blender::eevee {
@@ -153,25 +155,73 @@ void WorldVolumePipeline::render(View &view)
void ShadowPipeline::sync()
{
surface_ps_.init();
surface_ps_.state_set(DRW_STATE_WRITE_DEPTH | DRW_STATE_DEPTH_LESS);
surface_ps_.bind_texture(RBUFS_UTILITY_TEX_SLOT, inst_.pipelines.utility_tx);
surface_ps_.bind_image(SHADOW_ATLAS_IMG_SLOT, inst_.shadows.atlas_tx_);
surface_ps_.bind_ssbo(SHADOW_RENDER_MAP_BUF_SLOT, &inst_.shadows.render_map_buf_);
surface_ps_.bind_ssbo(SHADOW_VIEWPORT_INDEX_BUF_SLOT, &inst_.shadows.viewport_index_buf_);
surface_ps_.bind_ssbo(SHADOW_PAGE_INFO_SLOT, &inst_.shadows.pages_infos_data_);
inst_.bind_uniform_data(&surface_ps_);
inst_.sampling.bind_resources(&surface_ps_);
render_ps_.init();
/* NOTE: The TILE_COPY technique performs a three-pass implementation: first performing the clear
* directly on tile, followed by a fast depth-only pass, then storing the on-tile results into
* the shadow atlas during a final storage pass. This takes advantage of the TBDR architecture,
* reducing overdraw and additional per-fragment calculations. */
bool shadow_update_tbdr = (ShadowModule::shadow_technique == ShadowTechnique::TILE_COPY);
if (shadow_update_tbdr) {
draw::PassMain::Sub &pass = render_ps_.sub("Shadow.TilePageClear");
pass.subpass_transition(GPU_ATTACHEMENT_WRITE, {GPU_ATTACHEMENT_WRITE});
pass.shader_set(inst_.shaders.static_shader_get(SHADOW_PAGE_TILE_CLEAR));
/* Only manually clear the depth of the updated tiles.
* This is because the depth is initialized to near depth using attachments for fast clear and
* color is cleared to far depth. This way we can save a bit of bandwidth by only clearing
* the updated tiles' depth to far depth, without touching the color attachment. */
pass.state_set(DRW_STATE_WRITE_DEPTH | DRW_STATE_DEPTH_ALWAYS);
pass.bind_ssbo("src_coord_buf", inst_.shadows.src_coord_buf_);
pass.draw_procedural_indirect(GPU_PRIM_TRIS, inst_.shadows.tile_draw_buf_);
}
{
/* Metal writes the depth value to local tile memory, which is considered a color attachment. */
DRWState state = DRW_STATE_WRITE_DEPTH | DRW_STATE_DEPTH_LESS | DRW_STATE_WRITE_COLOR;
draw::PassMain::Sub &pass = render_ps_.sub("Shadow.Surface");
pass.state_set(state);
pass.bind_texture(RBUFS_UTILITY_TEX_SLOT, inst_.pipelines.utility_tx);
pass.bind_ssbo(SHADOW_VIEWPORT_INDEX_BUF_SLOT, &inst_.shadows.viewport_index_buf_);
if (!shadow_update_tbdr) {
/* We do not need all of the shadow information when using the TBDR-optimized approach. */
pass.bind_image(SHADOW_ATLAS_IMG_SLOT, inst_.shadows.atlas_tx_);
pass.bind_ssbo(SHADOW_RENDER_MAP_BUF_SLOT, &inst_.shadows.render_map_buf_);
pass.bind_ssbo(SHADOW_PAGE_INFO_SLOT, &inst_.shadows.pages_infos_data_);
}
inst_.bind_uniform_data(&pass);
inst_.sampling.bind_resources(&pass);
surface_ps_ = &pass;
}
if (shadow_update_tbdr) {
draw::PassMain::Sub &pass = render_ps_.sub("Shadow.TilePageStore");
pass.shader_set(inst_.shaders.static_shader_get(SHADOW_PAGE_TILE_STORE));
/* The most optimal way would be to only store pixels that have been rendered to (depth > 0).
* But that requires that the destination pages in the atlas would have been already cleared
* using compute. Experiments showed that it is faster to just copy the whole tiles back.
*
* For relative perf, raster-based clear within tile update adds around 0.1ms vs 0.25ms for
* compute based clear for a simple test case. */
pass.state_set(DRW_STATE_DEPTH_ALWAYS);
/* Metal has implicit sync with Raster Order Groups. Other backends need a manual
* sub-pass transition to allow reading the framebuffer. This is a no-op on Metal. */
pass.subpass_transition(GPU_ATTACHEMENT_WRITE, {GPU_ATTACHEMENT_READ});
pass.bind_image(SHADOW_ATLAS_IMG_SLOT, inst_.shadows.atlas_tx_);
pass.bind_ssbo("dst_coord_buf", inst_.shadows.dst_coord_buf_);
pass.bind_ssbo("src_coord_buf", inst_.shadows.src_coord_buf_);
pass.draw_procedural_indirect(GPU_PRIM_TRIS, inst_.shadows.tile_draw_buf_);
}
}
PassMain::Sub *ShadowPipeline::surface_material_add(GPUMaterial *gpumat)
{
return &surface_ps_.sub(GPU_material_get_name(gpumat));
return &surface_ps_->sub(GPU_material_get_name(gpumat));
}
void ShadowPipeline::render(View &view)
{
inst_.manager->submit(surface_ps_, view);
inst_.manager->submit(render_ps_, view);
}
/** \} */

View File

@@ -107,7 +107,10 @@ class ShadowPipeline {
private:
Instance &inst_;
PassMain surface_ps_ = {"Shadow.Surface"};
/* Shadow update pass. */
PassMain render_ps_ = {"Shadow.Surface"};
/* Shadow surface render subpass. */
PassMain::Sub *surface_ps_ = nullptr;
public:
ShadowPipeline(Instance &inst) : inst_(inst){};
@@ -115,6 +118,7 @@ class ShadowPipeline {
PassMain::Sub *surface_material_add(GPUMaterial *gpumat);
void sync();
void render(View &view);
};

View File

@@ -15,6 +15,10 @@
#include "eevee_shader.hh"
#include "eevee_shadow.hh"
#include "BLI_assert.h"
namespace blender::eevee {
/* -------------------------------------------------------------------- */
@@ -228,6 +232,10 @@ const char *ShaderModule::static_shader_create_info_name_get(eShaderType shader_
return "eevee_shadow_tag_usage_surfels";
case SHADOW_TILEMAP_TAG_USAGE_TRANSPARENT:
return "eevee_shadow_tag_usage_transparent";
case SHADOW_PAGE_TILE_CLEAR:
return "eevee_shadow_page_tile_clear";
case SHADOW_PAGE_TILE_STORE:
return "eevee_shadow_page_tile_store";
case SHADOW_TILEMAP_TAG_USAGE_VOLUME:
return "eevee_shadow_tag_usage_volume";
case SUBSURFACE_CONVOLVE:
@@ -552,7 +560,18 @@ void ShaderModule::material_create_info_ammend(GPUMaterial *gpumat, GPUCodegenOu
info.additional_info("eevee_surf_depth");
break;
case MAT_PIPE_SHADOW:
info.additional_info("eevee_surf_shadow");
/* Determine the surface shadow shader depending on the update technique in use. */
switch (ShadowModule::shadow_technique) {
case ShadowTechnique::ATOMIC_RASTER: {
info.additional_info("eevee_surf_shadow_atomic");
} break;
case ShadowTechnique::TILE_COPY: {
info.additional_info("eevee_surf_shadow_tbdr");
} break;
default: {
BLI_assert_unreachable();
} break;
}
break;
case MAT_PIPE_CAPTURE:
info.additional_info("eevee_surf_capture");

View File

@@ -105,6 +105,8 @@ enum eShaderType {
SHADOW_PAGE_DEFRAG,
SHADOW_PAGE_FREE,
SHADOW_PAGE_MASK,
SHADOW_PAGE_TILE_CLEAR,
SHADOW_PAGE_TILE_STORE,
SHADOW_TILEMAP_BOUNDS,
SHADOW_TILEMAP_FINALIZE,
SHADOW_TILEMAP_INIT,

View File

@@ -934,34 +934,33 @@ enum eShadowFlag : uint32_t {
SHADOW_IS_USED = (1u << 31u)
};
/* NOTE: Trust the input to be in valid range (max is [3,3,255]).
* If it is in valid range, it should pack to 12 bits so that `shadow_tile_pack()` can use it.
* But sometimes this is used to encode invalid pages uint3(-1) and it needs to output uint(-1). */
static inline uint shadow_page_pack(uint3 page)
{
/* NOTE: Trust the input to be in valid range.
* But sometime this is used to encode invalid pages uint3(-1) and it needs to output uint(-1).
*/
return (page.x << 0u) | (page.y << 2u) | (page.z << 4u);
}
static inline uint3 shadow_page_unpack(uint data)
{
uint3 page;
/* Tweaked for SHADOW_PAGE_PER_ROW = 4. */
page.x = data & uint(SHADOW_PAGE_PER_ROW - 1);
page.y = (data >> 2u) & uint(SHADOW_PAGE_PER_COL - 1);
page.z = (data >> 4u);
BLI_STATIC_ASSERT(SHADOW_PAGE_PER_ROW <= 4 && SHADOW_PAGE_PER_COL <= 4, "Update page packing")
page.x = (data >> 0u) & 3u;
page.y = (data >> 2u) & 3u;
BLI_STATIC_ASSERT(SHADOW_MAX_PAGE <= 4096, "Update page packing")
page.z = (data >> 4u) & 255u;
return page;
}
static inline ShadowTileData shadow_tile_unpack(ShadowTileDataPacked data)
{
ShadowTileData tile;
/* Tweaked for SHADOW_MAX_PAGE = 4096. */
tile.page = shadow_page_unpack(data & uint(SHADOW_MAX_PAGE - 1));
tile.page = shadow_page_unpack(data);
/* -- 12 bits -- */
/* Tweaked for SHADOW_TILEMAP_LOD < 8. */
BLI_STATIC_ASSERT(SHADOW_TILEMAP_LOD < 8, "Update page packing")
tile.lod = (data >> 12u) & 7u;
/* -- 15 bits -- */
/* Tweaked for SHADOW_MAX_TILEMAP = 4096. */
BLI_STATIC_ASSERT(SHADOW_MAX_PAGE <= 4096, "Update page packing")
tile.cache_index = (data >> 15u) & 4095u;
/* -- 27 bits -- */
tile.is_used = (data & SHADOW_IS_USED) != 0;
@@ -974,7 +973,10 @@ static inline ShadowTileData shadow_tile_unpack(ShadowTileDataPacked data)
static inline ShadowTileDataPacked shadow_tile_pack(ShadowTileData tile)
{
uint data = shadow_page_pack(tile.page) & uint(SHADOW_MAX_PAGE - 1);
uint data;
/* NOTE: Page might be set to invalid values for tracking invalid usages.
* So we have to mask the result. */
data = shadow_page_pack(tile.page) & uint(SHADOW_MAX_PAGE - 1);
data |= (tile.lod & 7u) << 12u;
data |= (tile.cache_index & 4095u) << 15u;
data |= (tile.is_used ? uint(SHADOW_IS_USED) : 0);
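
For reference, the bit layout implied by the pack/unpack functions above
(derived from this hunk, assuming the default limits):

// ShadowTileDataPacked layout:
//   bits  0-1   page.x       (SHADOW_PAGE_PER_ROW <= 4)
//   bits  2-3   page.y       (SHADOW_PAGE_PER_COL <= 4)
//   bits  4-11  page.z       (layer; 12 bits total, SHADOW_MAX_PAGE <= 4096)
//   bits 12-14  lod          (SHADOW_TILEMAP_LOD < 8)
//   bits 15-26  cache_index  (up to 4096 cached tiles)
//   bit  31     SHADOW_IS_USED flag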

View File

@@ -19,6 +19,8 @@
namespace blender::eevee {
ShadowTechnique ShadowModule::shadow_technique = ShadowTechnique::ATOMIC_RASTER;
/* -------------------------------------------------------------------- */
/** \name Tile map
*
@@ -719,6 +721,17 @@ ShadowModule::ShadowModule(Instance &inst, ShadowSceneData &data) : inst_(inst),
void ShadowModule::init()
{
/* Determine shadow update technique and atlas format.
* NOTE(Metal): Metal utilizes a tile-optimized approach for Apple Silicon's architecture. */
const bool is_metal_backend = (GPU_backend_get_type() == GPU_BACKEND_METAL);
const bool is_tile_based_arch = (GPU_platform_architecture() == GPU_ARCHITECTURE_TBDR);
if (is_metal_backend && is_tile_based_arch) {
ShadowModule::shadow_technique = ShadowTechnique::TILE_COPY;
}
else {
ShadowModule::shadow_technique = ShadowTechnique::ATOMIC_RASTER;
}
::Scene &scene = *inst_.scene;
bool enabled = (scene.eevee.flag & SCE_EEVEE_SHADOW_ENABLED) != 0;
if (assign_if_different(enabled_, enabled)) {
@@ -758,7 +771,7 @@ void ShadowModule::init()
/* Make allocation safe. Avoids crash later on. */
if (!atlas_tx_.is_valid()) {
atlas_tx_.ensure_2d_array(atlas_type, int2(1), 1);
atlas_tx_.ensure_2d_array(ShadowModule::atlas_type, int2(1), 1);
inst_.info = "Error: Could not allocate shadow atlas. Most likely out of GPU memory.";
}
@@ -794,6 +807,8 @@ void ShadowModule::init()
/* Create different viewports to support different update region sizes. The most fitting viewport
* is then selected during the tilemap finalize stage in `viewport_select`. */
for (int i = 0; i < multi_viewports_.size(); i++) {
/** IMPORTANT: Reflect any changes in the TBDR tile vertex shader, which assumes viewport index
* 15 covers the whole framebuffer. */
int size_in_tile = min_ii(1 << i, SHADOW_TILEMAP_RES);
multi_viewports_[i][0] = 0;
multi_viewports_[i][1] = 0;
@@ -1118,6 +1133,7 @@ void ShadowModule::end_sync()
sub.bind_ssbo("pages_cached_buf", pages_cached_data_);
sub.bind_ssbo("statistics_buf", statistics_buf_.current());
sub.bind_ssbo("clear_dispatch_buf", clear_dispatch_buf_);
sub.bind_ssbo("tile_draw_buf", tile_draw_buf_);
sub.dispatch(int3(1, 1, 1));
sub.barrier(GPU_BARRIER_SHADER_STORAGE);
}
@@ -1144,7 +1160,9 @@ void ShadowModule::end_sync()
sub.bind_ssbo("view_infos_buf", &shadow_multi_view_.matrices_ubo_get());
sub.bind_ssbo("statistics_buf", statistics_buf_.current());
sub.bind_ssbo("clear_dispatch_buf", clear_dispatch_buf_);
sub.bind_ssbo("clear_list_buf", clear_list_buf_);
sub.bind_ssbo("tile_draw_buf", tile_draw_buf_);
sub.bind_ssbo("dst_coord_buf", dst_coord_buf_);
sub.bind_ssbo("src_coord_buf", src_coord_buf_);
sub.bind_ssbo("render_map_buf", render_map_buf_);
sub.bind_ssbo("viewport_index_buf", viewport_index_buf_);
sub.bind_ssbo("pages_infos_buf", pages_infos_data_);
@@ -1153,14 +1171,17 @@ void ShadowModule::end_sync()
sub.barrier(GPU_BARRIER_SHADER_STORAGE | GPU_BARRIER_UNIFORM | GPU_BARRIER_TEXTURE_FETCH |
GPU_BARRIER_SHADER_IMAGE_ACCESS);
}
{
/* NOTE: We do not need to run the clear pass when using the TBDR update variant, as tiles
* will be fully cleared as part of the shadow raster step. */
if (ShadowModule::shadow_technique != ShadowTechnique::TILE_COPY) {
/** Clear pages that need to be rendered. */
PassSimple::Sub &sub = pass.sub("RenderClear");
sub.framebuffer_set(&render_fb_);
sub.state_set(DRW_STATE_WRITE_DEPTH | DRW_STATE_DEPTH_ALWAYS);
sub.shader_set(inst_.shaders.static_shader_get(SHADOW_PAGE_CLEAR));
sub.bind_ssbo("pages_infos_buf", pages_infos_data_);
sub.bind_ssbo("clear_list_buf", clear_list_buf_);
sub.bind_ssbo("dst_coord_buf", dst_coord_buf_);
sub.bind_image("shadow_atlas_img", atlas_tx_);
sub.dispatch(clear_dispatch_buf_);
sub.barrier(GPU_BARRIER_SHADER_IMAGE_ACCESS);
@@ -1263,10 +1284,35 @@ void ShadowModule::set_view(View &view)
int2(std::exp2(usage_tag_fb_lod_)));
usage_tag_fb.ensure(usage_tag_fb_resolution_);
render_fb_.ensure(int2(SHADOW_TILEMAP_RES * shadow_page_size_));
GPU_framebuffer_bind(render_fb_);
GPU_framebuffer_multi_viewports_set(render_fb_,
reinterpret_cast<int(*)[4]>(multi_viewports_.data()));
eGPUTextureUsage usage = GPU_TEXTURE_USAGE_ATTACHMENT | GPU_TEXTURE_USAGE_MEMORYLESS;
int2 fb_size = int2(SHADOW_TILEMAP_RES * shadow_page_size_);
int fb_layers = SHADOW_VIEW_MAX;
if (shadow_technique == ShadowTechnique::ATOMIC_RASTER) {
if (GPU_backend_get_type() == GPU_BACKEND_METAL) {
/* Metal requires a memoryless attachment to create an empty framebuffer.
* Might as well make use of it. */
shadow_depth_fb_tx_.ensure_2d_array(GPU_DEPTH_COMPONENT32F, fb_size, fb_layers, usage);
shadow_depth_accum_tx_.free();
render_fb_.ensure(GPU_ATTACHMENT_TEXTURE(shadow_depth_fb_tx_));
}
else {
/* Create attachment-less framebuffer. */
shadow_depth_fb_tx_.free();
shadow_depth_accum_tx_.free();
render_fb_.ensure(fb_size);
}
}
else if (shadow_technique == ShadowTechnique::TILE_COPY) {
/* Create memoryless depth attachment for on-tile surface depth accumulation. */
shadow_depth_fb_tx_.ensure_2d_array(GPU_DEPTH_COMPONENT32F, fb_size, fb_layers, usage);
shadow_depth_accum_tx_.ensure_2d_array(GPU_R32F, fb_size, fb_layers, usage);
render_fb_.ensure(GPU_ATTACHMENT_TEXTURE(shadow_depth_fb_tx_),
GPU_ATTACHMENT_TEXTURE(shadow_depth_accum_tx_));
}
else {
BLI_assert_unreachable();
}
inst_.hiz_buffer.update();
@@ -1277,15 +1323,50 @@ void ShadowModule::set_view(View &view)
GPU_uniformbuf_clear_to_zero(shadow_multi_view_.matrices_ubo_get());
inst_.manager->submit(tilemap_setup_ps_, view);
inst_.manager->submit(tilemap_usage_ps_, view);
inst_.manager->submit(tilemap_update_ps_, view);
shadow_multi_view_.compute_procedural_bounds();
/* Isolate the shadow update into its own command buffer.
* If the parameter buffer exceeds limits, other work will not be impacted. */
bool use_flush = (shadow_technique == ShadowTechnique::TILE_COPY) &&
(GPU_backend_get_type() == GPU_BACKEND_METAL);
if (use_flush) {
GPU_flush();
}
/* TODO(fclem): Move all of this to the draw::PassMain. */
if (shadow_depth_fb_tx_.is_valid() && shadow_depth_accum_tx_.is_valid()) {
GPU_framebuffer_bind_ex(
render_fb_,
{
/* Depth is cleared to 0 for TBDR optimization. */
{GPU_LOADACTION_CLEAR, GPU_STOREACTION_DONT_CARE, {0.0f, 0.0f, 0.0f, 0.0f}},
{GPU_LOADACTION_CLEAR, GPU_STOREACTION_DONT_CARE, {1.0f, 1.0f, 1.0f, 1.0f}},
});
}
else if (shadow_depth_fb_tx_.is_valid()) {
GPU_framebuffer_bind_ex(
render_fb_,
{
{GPU_LOADACTION_CLEAR, GPU_STOREACTION_DONT_CARE, {1.0f, 1.0f, 1.0f, 1.0f}},
});
}
else {
GPU_framebuffer_bind(render_fb_);
}
GPU_framebuffer_multi_viewports_set(render_fb_,
reinterpret_cast<int(*)[4]>(multi_viewports_.data()));
inst_.pipelines.shadow.render(shadow_multi_view_);
if (use_flush) {
GPU_flush();
}
GPU_memory_barrier(GPU_BARRIER_SHADER_IMAGE_ACCESS | GPU_BARRIER_TEXTURE_FETCH);
}
DRW_stats_group_end();
@@ -1296,8 +1377,7 @@ void ShadowModule::set_view(View &view)
else {
/* This provokes a GPU/CPU sync. Avoid it if we are sure that all tile-maps will be rendered
* in a single iteration. */
bool enough_tilemap_for_single_iteration = tilemap_pool.tilemaps_data.size() <=
SHADOW_VIEW_MAX;
bool enough_tilemap_for_single_iteration = tilemap_pool.tilemaps_data.size() <= fb_layers;
if (enough_tilemap_for_single_iteration) {
tile_update_remains = false;
}

View File

@@ -15,9 +15,11 @@
#include "GPU_batch.h"
#include "eevee_camera.hh"
#include "eevee_material.hh"
#include "eevee_shader.hh"
#include "eevee_shader_shared.hh"
#include "eevee_sync.hh"
namespace blender::eevee {
@@ -52,6 +54,20 @@ constexpr static const float shadow_clipmap_scale_mat[4][4] = {{SHADOW_TILEMAP_R
{0, 0, 0.5, 0},
{0, 0, 0.5, 1}};
/* Technique used for updating the virtual shadow map contents. */
enum class ShadowTechnique {
/* Default virtual shadow map update, using a large virtual framebuffer to rasterize geometry
* with a per-fragment textureAtomicMin that performs the depth-test and indirectly stores the
* nearest depth value in the shadow atlas. */
ATOMIC_RASTER = 0,
/* Tile-architecture optimized virtual shadow map update, leveraging on-tile memory for clearing
* and depth-testing during geometry rasterization to avoid atomic operations, simplify the mesh
* depth shader and perform only a single storage operation per pixel. This technique uses a
* three-pass solution: first clearing tiles, then updating depth, and finally storing the
* results. */
TILE_COPY = 1,
};
/* -------------------------------------------------------------------- */
/** \name Tile-Map
*
@@ -180,6 +196,9 @@ class ShadowModule {
friend ShadowTileMapPool;
public:
/* Shadowing technique. */
static ShadowTechnique shadow_technique;
/** Need to be first because of destructor order. */
ShadowTileMapPool tilemap_pool;
@@ -214,10 +233,14 @@ class ShadowModule {
StorageVectorBuffer<uint, 128> curr_casters_ = {"CurrCasters"};
/** Indirect arguments for page clearing. */
DispatchIndirectBuf clear_dispatch_buf_;
/** Array containing a compact stream of tiles to clear. */
StorageArrayBuffer<uint, SHADOW_RENDER_MAP_SIZE, true> clear_list_buf_ = {"clear_list_buf"};
/** Tile to pages mapping. */
DispatchIndirectBuf clear_dispatch_buf_ = {"clear_dispatch_buf"};
/** Indirect arguments for TBDR Tile Page passes. */
DrawIndirectBuf tile_draw_buf_ = {"tile_draw_buf"};
/** A compact stream of rendered tile coordinates in the shadow atlas. */
StorageArrayBuffer<uint, SHADOW_RENDER_MAP_SIZE, true> dst_coord_buf_ = {"dst_coord_buf"};
/** A compact stream of rendered tile coordinates in the framebuffer. */
StorageArrayBuffer<uint, SHADOW_RENDER_MAP_SIZE, true> src_coord_buf_ = {"src_coord_buf"};
/** Same as dst_coord_buf_ but is not compact. More like a linear texture. */
StorageArrayBuffer<uint, SHADOW_RENDER_MAP_SIZE, true> render_map_buf_ = {"render_map_buf"};
/** View to viewport index mapping. */
StorageArrayBuffer<uint, SHADOW_VIEW_MAX, true> viewport_index_buf_ = {"viewport_index_buf"};
@@ -264,7 +287,14 @@ class ShadowModule {
/** Multi-View containing a maximum of 64 views to be rendered with the shadow pipeline. */
View shadow_multi_view_ = {"ShadowMultiView", SHADOW_VIEW_MAX, true};
/** Framebuffer with the atlas_tx attached. */
Framebuffer render_fb_;
Framebuffer render_fb_ = {"shadow_write_framebuffer"};
/* NOTE(Metal): Metal requires memoryless textures to be created to represent attachments in
* the shadow write framebuffer. These textures do not occupy any physical memory, but require a
* Texture object containing their parameters. */
Texture shadow_depth_fb_tx_ = {"shadow_depth_fb_tx_"};
Texture shadow_depth_accum_tx_ = {"shadow_depth_accum_tx_"};
/** Array of viewports to render each tile to. */
std::array<int4, 16> multi_viewports_;

View File

@@ -8,14 +8,15 @@
* Equivalent to a frame-buffer depth clear but only for pages pushed to the clear_page_buf.
*/
#pragma BLENDER_REQUIRE(common_math_lib.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_utildefines_lib.glsl)
void main()
{
uint page_packed = clear_list_buf[gl_GlobalInvocationID.z];
/* We clear the destination pixels directly for the atomicMin technique. */
uint page_packed = dst_coord_buf[gl_GlobalInvocationID.z];
uvec3 page_co = shadow_page_unpack(page_packed);
page_co.xy = page_co.xy * SHADOW_PAGE_RES + gl_GlobalInvocationID.xy;
/* Clear to FLT_MAX instead of 1 so the far plane doesn't cast shadows onto farther objects. */
imageStore(shadow_atlas_img, ivec3(page_co), uvec4(floatBitsToUint(FLT_MAX)));
imageStoreFast(shadow_atlas_img, ivec3(page_co), uvec4(floatBitsToUint(FLT_MAX)));
}

View File

@@ -129,4 +129,10 @@ void main()
clear_dispatch_buf.num_groups_x = SHADOW_PAGE_RES / SHADOW_PAGE_CLEAR_GROUP_SIZE;
clear_dispatch_buf.num_groups_y = SHADOW_PAGE_RES / SHADOW_PAGE_CLEAR_GROUP_SIZE;
clear_dispatch_buf.num_groups_z = 0;
/* Reset TBDR command indirect buffer. */
tile_draw_buf.vertex_len = 0u;
tile_draw_buf.instance_len = 1u;
tile_draw_buf.vertex_first = 0u;
tile_draw_buf.base_index = 0u;
}
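
Each updated page later adds six vertices (one quad) to this command in the
tilemap finalize shader, so the indirect draw renders exactly one quad per
page. How the command is consumed (a sketch based on the passes in this
commit):

// tile_draw_buf as a DrawCommand, reset here, filled during tilemap finalize:
//   vertex_len  += 6 per updated page  (atomicAdd in the finalize shader below)
//   instance_len = 1
// The TBDR clear/store passes then issue:
//   pass.draw_procedural_indirect(GPU_PRIM_TRIS, tile_draw_buf);
// and the tile vertex shader recovers the page with: tile_id = gl_VertexID / 6.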

View File

@@ -36,12 +36,21 @@ void main()
if (thread_active) {
int tile_offset = shadow_tile_offset(tile_co, tilemap.tiles_index, lod);
tile = shadow_tile_unpack(tiles_buf[tile_offset]);
#ifdef SHADOW_FORCE_LOD0
if (lod == 0) {
tiles_buf[tile_offset] |= SHADOW_IS_USED;
}
else {
tiles_buf[tile_offset] &= ~SHADOW_IS_USED;
}
#else
if (lod > 0 && usage_grid[tile_co.y][tile_co.x] == 4u) {
/* Remove the usage flag as this tile is completely covered by higher LOD tiles. */
tiles_buf[tile_offset] &= ~SHADOW_IS_USED;
/* Consider this tile occluding lower levels. */
tile.is_used = true;
}
#endif
/* Reset count for next level. */
usage_grid[tile_co.y / 2][tile_co.x / 2] = 0u;
}

View File

@@ -0,0 +1,64 @@
/**
* Virtual Shadow map tile shader.
*
* On TBDR, we can use a three-pass method to perform virtual shadow map updates, making
* efficient use of tile-based GPUs. Shadow updates rasterize geometry for each view in much the
* same way as a conventional shadow map render, but the standard path pays the additional
* cost of an atomic-min and store to allow for indirection into the atlas. This setup can lead to
* excessive overdraw, rasterization and increased complexity in the material depth fragment
* shader, reducing rendering performance.
*
* On a tile-based GPU, we can instead leverage on-tile depth testing to avoid atomic-min
* operations against global memory, and only write out the final depth value stored in each
* tile. Large memoryless render targets are used to create a virtual render target, where only
* the updated regions and layers are processed.
*
* First, an instance of this shader is invoked with PASS_CLEAR to clear the depth values of the
* tiles being updated to their default. Tiles which are not updated in this pass remain cleared
* to zero, saving fragment invocation costs for unused regions of the render target.
* This also allows us to skip the compute-based tile clear pass.
*
* Secondly, eevee_surf_shadow_frag is used to generate depth information which is stored
* on tile for the closest fragment. The TBDR path uses a simple variant of this shader
* which just outputs the depth, without any virtual shadow map processing on top.
*
* The third pass then runs, writing out only the highest-level final pixel to memory,
* avoiding the requirement for atomic texture operations.
*
* Output shadow atlas page indirection is calculated in the vertex shader, which generates
* a lattice of quads covering the shadow pages which are to be updated. The quads which
* belong to shadow pages not being updated in this pass are discarded.
**/
#pragma BLENDER_REQUIRE(eevee_shadow_tilemap_lib.glsl)
#if defined(PASS_CLEAR)
void main()
{
/* The tile clear pass writes out to the tile attachment to ensure raster order groups are
* satisfied, guaranteeing the clear happens first, as it is first in submission order. */
out_tile_depth = FLT_MAX;
}
#elif defined(PASS_DEPTH_STORE)
void main()
{
/* For the store pass, we read the depth result from tile memory. */
uint u_depth = floatBitsToUint(in_tile_depth);
/* Quantization bias. Equivalent to nextafter in C without all the safety. 1 is not enough. */
u_depth += 2;
/* Write result to atlas. */
# ifdef GPU_METAL
/* NOTE: Use the fastest possible write function without any parameter wrapping or conversion. */
shadow_atlas_img.texture->write(u_depth, ushort2(out_texel_xy), out_page_z);
# else
imageStore(shadow_atlas_img, ivec3(out_texel_xy, out_page_z), uvec4(u_depth));
# endif
}
#endif

View File

@@ -0,0 +1,38 @@
/**
* Virtual Shadow map tile shader.
*
* See fragment shader for more infos.
*/
#pragma BLENDER_REQUIRE(gpu_shader_utildefines_lib.glsl)
#pragma BLENDER_REQUIRE(eevee_shadow_tilemap_lib.glsl)
void main()
{
int tile_id = gl_VertexID / 6;
int vertex_id = gl_VertexID % 6;
/* Generate a quad from two triangles with the same winding.
* This way they can be merged on some hardware. */
int v = (vertex_id > 2) ? (3 - (vertex_id - 3)) : vertex_id;
vec2 tile_corner = vec2(v & 1, v >> 1);
#ifdef PASS_DEPTH_STORE
/* Load where fragment should write the tile data. */
uvec3 dst_page_co = shadow_page_unpack(dst_coord_buf[tile_id]);
/* Interpolate output texel */
out_texel_xy = (vec2(dst_page_co.xy) + tile_corner) * vec2(SHADOW_PAGE_RES);
out_page_z = dst_page_co.z;
#endif
/* Load where the quad should be positioned. */
uvec3 src_page_co = unpackUvec4x8(src_coord_buf[tile_id]).xyz;
vec2 uv_pos = (tile_corner + vec2(src_page_co.xy)) / float(SHADOW_TILEMAP_RES);
vec2 ndc_pos = uv_pos * 2.0 - 1.0;
/* We initially clear depth to 1.0 only for update fragments.
* Non-updated tile depth will remain at 0.0 to ensure fragments are discarded. */
gl_Position = vec4(ndc_pos.x, ndc_pos.y, 1.0, 1.0);
gpu_Layer = int(src_page_co.z);
/* Assumes last viewport will always cover the whole framebuffer. */
gpu_ViewportIndex = 15;
}
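
Walking through the corner mapping above (a worked check of the vertex math):

// vertex_id : 0      1      2      3      4      5
// v         : 0      1      2      3      2      1
// corner    : (0,0)  (1,0)  (0,1)  (1,1)  (0,1)  (1,0)
// Triangles (0,0)-(1,0)-(0,1) and (1,1)-(0,1)-(1,0) share a diagonal and have
// the same winding, which is what lets some hardware merge them.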

View File

@@ -160,9 +160,14 @@ void main()
if (do_page_render) {
/* Tag tile as rendered. There is a barrier after the read. So it is safe. */
tiles_buf[tile_index] |= SHADOW_IS_RENDERED;
/* Add page to clear list. */
uint clear_page_index = atomicAdd(clear_dispatch_buf.num_groups_z, 1u);
clear_list_buf[clear_page_index] = page_packed;
/* Add page to clear dispatch. */
uint page_index = atomicAdd(clear_dispatch_buf.num_groups_z, 1u);
/* Add page to tile processing. */
atomicAdd(tile_draw_buf.vertex_len, 6u);
/* Add page mapping for indexing the page position in atlas and in the framebuffer. */
dst_coord_buf[page_index] = page_packed;
src_coord_buf[page_index] = packUvec4x8(
uvec4(relative_tile_co.x, relative_tile_co.y, view_index, 0));
/* Statistics. */
atomicAdd(statistics_buf.page_rendered_count, 1);
}

View File

@@ -37,7 +37,7 @@ ivec2 shadow_tile_coord_in_atlas(ivec2 tile, int tilemap_index)
*/
int shadow_tile_offset(ivec2 tile, int tiles_index, int lod)
{
#if SHADOW_TILEMAP_LOD != 5
#if SHADOW_TILEMAP_LOD > 5
# error This needs to be adjusted
#endif
const int lod0_width = SHADOW_TILEMAP_RES / 1;

View File

@@ -37,7 +37,9 @@ void main()
}
#endif
#ifdef USE_ATOMIC
float f_depth = gl_FragCoord.z + fwidth(gl_FragCoord.z);
#ifdef SHADOW_UPDATE_ATOMIC_RASTER
ivec2 texel_co = ivec2(gl_FragCoord.xy);
/* Using bitwise ops is way faster than integer ops. */
@@ -54,9 +56,16 @@ void main()
ivec3 page = ivec3(shadow_page_unpack(page_packed));
ivec3 out_texel = ivec3((page.xy << page_shift) | texel_page, page.z);
uint u_depth = floatBitsToUint(gl_FragCoord.z + fwidth(gl_FragCoord.z));
uint u_depth = floatBitsToUint(f_depth);
/* Quantization bias. Equivalent to `nextafter()` in C without all the safety. */
u_depth += 2;
imageAtomicMin(shadow_atlas_img, out_texel, u_depth);
#endif
#ifdef SHADOW_UPDATE_TBDR
/* Store output depth in tile memory using F32 attachment. NOTE: As depth testing is enabled,
* only the closest fragment will store the result. */
out_depth = f_depth;
#endif
}
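
The quantization bias works because, for positive finite IEEE-754 floats,
incrementing the raw bit pattern yields the next representable value, so
adding to the bits is a cheap `nextafter()`. A standalone C++ illustration
(not part of the patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
  float depth = 0.5f;
  uint32_t u_depth;
  std::memcpy(&u_depth, &depth, sizeof(u_depth));
  u_depth += 2; /* Same bias as the shader: two ULPs towards +inf. */
  float biased;
  std::memcpy(&biased, &u_depth, sizeof(biased));
  std::printf("%.9g -> %.9g\n", depth, biased); /* Prints: 0.5 -> 0.500000119 */
  return 0;
}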

View File

@@ -198,28 +198,38 @@ GPU_SHADER_CREATE_INFO(eevee_surf_world)
"eevee_utility_texture");
GPU_SHADER_CREATE_INFO(eevee_surf_shadow)
.define("DRW_VIEW_LEN", "64")
.define("DRW_VIEW_LEN", STRINGIFY(SHADOW_VIEW_MAX))
.define("MAT_SHADOW")
.define("USE_ATOMIC")
.builtins(BuiltinBits::VIEWPORT_INDEX)
.builtins(BuiltinBits::LAYER)
.builtins(BuiltinBits::TEXTURE_ATOMIC)
.storage_buf(SHADOW_RENDER_MAP_BUF_SLOT,
Qualifier::READ,
"uint",
"render_map_buf[SHADOW_RENDER_MAP_SIZE]")
.builtins(BuiltinBits::VIEWPORT_INDEX | BuiltinBits::LAYER)
.storage_buf(SHADOW_VIEWPORT_INDEX_BUF_SLOT,
Qualifier::READ,
"uint",
"viewport_index_buf[SHADOW_VIEW_MAX]")
.storage_buf(SHADOW_PAGE_INFO_SLOT, Qualifier::READ, "ShadowPagesInfoData", "pages_infos_buf")
.fragment_source("eevee_surf_shadow_frag.glsl")
.additional_info("eevee_global_ubo", "eevee_utility_texture", "eevee_sampling_data");
GPU_SHADER_CREATE_INFO(eevee_surf_shadow_atomic)
.additional_info("eevee_surf_shadow")
.define("SHADOW_UPDATE_ATOMIC_RASTER")
.builtins(BuiltinBits::TEXTURE_ATOMIC)
/* Early fragment test for speeding up platforms that require a depth buffer. */
/* NOTE: This removes the possibility of using gl_FragDepth. */
.early_fragment_test(true)
.storage_buf(SHADOW_RENDER_MAP_BUF_SLOT,
Qualifier::READ,
"uint",
"render_map_buf[SHADOW_RENDER_MAP_SIZE]")
.image(SHADOW_ATLAS_IMG_SLOT,
GPU_R32UI,
Qualifier::READ_WRITE,
ImageType::UINT_2D_ARRAY,
"shadow_atlas_img")
.fragment_source("eevee_surf_shadow_frag.glsl")
.additional_info("eevee_global_ubo", "eevee_utility_texture", "eevee_sampling_data");
"shadow_atlas_img");
GPU_SHADER_CREATE_INFO(eevee_surf_shadow_tbdr)
.additional_info("eevee_surf_shadow")
.define("SHADOW_UPDATE_TBDR")
/* F32 color attachment for on-tile depth accumulation without atomics. */
.fragment_out(0, Type::FLOAT, "out_depth", DualBlend::NONE, SHADOW_ROG_ID);
#undef image_out
#undef image_array_out
@@ -340,7 +350,8 @@ GPU_SHADER_CREATE_INFO(eevee_material_stub)
EEVEE_MAT_GEOM_VARIATIONS(name##_deferred, "eevee_surf_deferred", __VA_ARGS__) \
EEVEE_MAT_GEOM_VARIATIONS(name##_forward, "eevee_surf_forward", __VA_ARGS__) \
EEVEE_MAT_GEOM_VARIATIONS(name##_capture, "eevee_surf_capture", __VA_ARGS__) \
EEVEE_MAT_GEOM_VARIATIONS(name##_shadow, "eevee_surf_shadow", __VA_ARGS__)
EEVEE_MAT_GEOM_VARIATIONS(name##_shadow_atomic, "eevee_surf_shadow_atomic", __VA_ARGS__) \
EEVEE_MAT_GEOM_VARIATIONS(name##_shadow_tbdr, "eevee_surf_shadow_tbdr", __VA_ARGS__)
EEVEE_MAT_PIPE_VARIATIONS(eevee_surface, "eevee_material_stub")

View File

@@ -155,7 +155,8 @@ GPU_SHADER_CREATE_INFO(eevee_shadow_page_defrag)
.storage_buf(3, Qualifier::READ_WRITE, "uint", "pages_free_buf[]")
.storage_buf(4, Qualifier::READ_WRITE, "uvec2", "pages_cached_buf[]")
.storage_buf(5, Qualifier::WRITE, "DispatchCommand", "clear_dispatch_buf")
.storage_buf(6, Qualifier::READ_WRITE, "ShadowStatistics", "statistics_buf")
.storage_buf(6, Qualifier::WRITE, "DrawCommand", "tile_draw_buf")
.storage_buf(7, Qualifier::READ_WRITE, "ShadowStatistics", "statistics_buf")
.additional_info("eevee_shared")
.compute_source("eevee_shadow_page_defrag_comp.glsl");
@@ -182,26 +183,73 @@ GPU_SHADER_CREATE_INFO(eevee_shadow_tilemap_finalize)
.storage_buf(3, Qualifier::WRITE, "ViewMatrices", "view_infos_buf[SHADOW_VIEW_MAX]")
.storage_buf(4, Qualifier::READ_WRITE, "ShadowStatistics", "statistics_buf")
.storage_buf(5, Qualifier::READ_WRITE, "DispatchCommand", "clear_dispatch_buf")
.storage_buf(6, Qualifier::WRITE, SHADOW_PAGE_PACKED, "clear_list_buf[SHADOW_RENDER_MAP_SIZE]")
.storage_buf(7, Qualifier::WRITE, SHADOW_PAGE_PACKED, "render_map_buf[SHADOW_RENDER_MAP_SIZE]")
.storage_buf(8, Qualifier::WRITE, "uint", "viewport_index_buf[SHADOW_VIEW_MAX]")
.storage_buf(9, Qualifier::READ, "ShadowTileMapClip", "tilemaps_clip_buf[]")
.storage_buf(6, Qualifier::READ_WRITE, "DrawCommand", "tile_draw_buf")
.storage_buf(7, Qualifier::WRITE, SHADOW_PAGE_PACKED, "dst_coord_buf[SHADOW_RENDER_MAP_SIZE]")
.storage_buf(8, Qualifier::WRITE, SHADOW_PAGE_PACKED, "src_coord_buf[SHADOW_RENDER_MAP_SIZE]")
.storage_buf(9, Qualifier::WRITE, SHADOW_PAGE_PACKED, "render_map_buf[SHADOW_RENDER_MAP_SIZE]")
.storage_buf(10, Qualifier::WRITE, "uint", "viewport_index_buf[SHADOW_VIEW_MAX]")
.storage_buf(11, Qualifier::READ, "ShadowTileMapClip", "tilemaps_clip_buf[]")
/* 12 is the minimum number of storage buffers we require. Do not go above this limit. */
.image(0, GPU_R32UI, Qualifier::WRITE, ImageType::UINT_2D, "tilemaps_img")
.additional_info("eevee_shared")
.compute_source("eevee_shadow_tilemap_finalize_comp.glsl");
/* AtomicMin clear implementation. */
GPU_SHADER_CREATE_INFO(eevee_shadow_page_clear)
.do_static_compilation(true)
.local_group_size(SHADOW_PAGE_CLEAR_GROUP_SIZE, SHADOW_PAGE_CLEAR_GROUP_SIZE)
.storage_buf(2, Qualifier::READ, "ShadowPagesInfoData", "pages_infos_buf")
.storage_buf(6, Qualifier::READ, SHADOW_PAGE_PACKED, "clear_list_buf[SHADOW_RENDER_MAP_SIZE]")
.storage_buf(6, Qualifier::READ, SHADOW_PAGE_PACKED, "dst_coord_buf[SHADOW_RENDER_MAP_SIZE]")
.additional_info("eevee_shared")
.compute_source("eevee_shadow_page_clear_comp.glsl")
.image(SHADOW_ATLAS_IMG_SLOT,
GPU_R32UI,
Qualifier::READ_WRITE,
ImageType::UINT_2D_ARRAY,
"shadow_atlas_img");
/* TBDR clear implementation. */
GPU_SHADER_CREATE_INFO(eevee_shadow_page_tile_clear)
.do_static_compilation(true)
.define("PASS_CLEAR")
.additional_info("eevee_shared")
.builtins(BuiltinBits::VIEWPORT_INDEX | BuiltinBits::LAYER)
.storage_buf(8, Qualifier::READ, SHADOW_PAGE_PACKED, "src_coord_buf[SHADOW_RENDER_MAP_SIZE]")
.vertex_source("eevee_shadow_page_tile_vert.glsl")
.fragment_source("eevee_shadow_page_tile_frag.glsl")
.fragment_out(0, Type::FLOAT, "out_tile_depth", DualBlend::NONE, SHADOW_ROG_ID);
#ifdef APPLE
/* Metal supports USHORT which saves a bit of perf here. */
# define PAGE_Z_TYPE Type::USHORT
#else
# define PAGE_Z_TYPE Type::UINT
#endif
/* Interface for passing precalculated values from the tile vertex shader to the fragment shader. */
GPU_SHADER_INTERFACE_INFO(eevee_shadow_page_tile_store_iface, "")
.no_perspective(Type::VEC2, "out_texel_xy")
.flat(PAGE_Z_TYPE, "out_page_z");
#undef PAGE_Z_TYPE
/* 2nd tile pass to store shadow depths in atlas. */
GPU_SHADER_CREATE_INFO(eevee_shadow_page_tile_store)
.do_static_compilation(true)
.define("PASS_DEPTH_STORE")
.additional_info("eevee_shared")
.builtins(BuiltinBits::VIEWPORT_INDEX | BuiltinBits::LAYER)
.storage_buf(7, Qualifier::READ, SHADOW_PAGE_PACKED, "dst_coord_buf[SHADOW_RENDER_MAP_SIZE]")
.storage_buf(8, Qualifier::READ, SHADOW_PAGE_PACKED, "src_coord_buf[SHADOW_RENDER_MAP_SIZE]")
.subpass_in(0, Type::FLOAT, "in_tile_depth", SHADOW_ROG_ID)
.image(SHADOW_ATLAS_IMG_SLOT,
GPU_R32UI,
Qualifier::READ_WRITE,
ImageType::UINT_2D_ARRAY,
"shadow_atlas_img")
.additional_info("eevee_shared")
.compute_source("eevee_shadow_page_clear_comp.glsl");
.vertex_out(eevee_shadow_page_tile_store_iface)
.vertex_source("eevee_shadow_page_tile_vert.glsl")
.fragment_source("eevee_shadow_page_tile_frag.glsl");
/** \} */
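
Note how the raster order group index threads through these create-infos: the
clear pass's `out_tile_depth`, the TBDR surface pass's `out_depth` and the
store pass's `in_tile_depth` all use SHADOW_ROG_ID, tying them to the same
on-tile storage (with implicit ordering on Metal, per the pipeline note
above). A compact view of the linkage, as read from the definitions in this
hunk:

// SHADOW_ROG_ID (= 0) links the tile attachment across the three passes:
//   eevee_shadow_page_tile_clear: fragment_out(0, FLOAT, "out_tile_depth", ..., SHADOW_ROG_ID)
//   eevee_surf_shadow_tbdr:       fragment_out(0, FLOAT, "out_depth", ..., SHADOW_ROG_ID)
//   eevee_shadow_page_tile_store: subpass_in(0, FLOAT, "in_tile_depth", SHADOW_ROG_ID)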

View File

@@ -37,3 +37,10 @@
#define DRW_FINALIZE_GROUP_SIZE 64
/* Must be multiple of 32. Set to 32 for shader simplicity. */
#define DRW_VISIBILITY_GROUP_SIZE 32
/**
* The maximum of indexable views is dictated by:
* - The UBO limit (16KiB) of the ViewMatrices container.
* - The maximum resource index supported for shaders using multi-view (see DRW_VIEW_SHIFT).
*/
#define DRW_VIEW_MAX 64
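
A back-of-envelope check of the UBO-limit reasoning (assuming a ViewMatrices
entry holds four 4x4 float matrices, i.e. 256 bytes per view; illustrative
only):

static_assert(4 * 4 * 4 * sizeof(float) == 256, "bytes per ViewMatrices entry (assumed)");
static_assert(16 * 1024 / 256 == 64, "16KiB UBO limit / 256B per view = DRW_VIEW_MAX");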

View File

@@ -66,13 +66,6 @@ extern "C" {
/** \name Views
* \{ */
/**
* The maximum of indexable views is dictated by:
* - The UBO limit (16KiB) of the ViewMatrices container.
* - The maximum resource index supported for shaders using multi-view (see DRW_VIEW_SHIFT).
*/
#define DRW_VIEW_MAX 64
#ifndef DRW_VIEW_LEN
/* Single-view case (default). */
# define drw_view_id 0

View File

@@ -198,8 +198,8 @@ GPU_SHADER_CREATE_INFO(draw_resource_finalize)
GPU_SHADER_CREATE_INFO(draw_view_finalize)
.do_static_compilation(true)
.local_group_size(64) /* DRW_VIEW_MAX */
.define("DRW_VIEW_LEN", "64")
.local_group_size(DRW_VIEW_MAX)
.define("DRW_VIEW_LEN", STRINGIFY(DRW_VIEW_MAX))
.storage_buf(0, Qualifier::READ_WRITE, "ViewCullingData", "view_culling_buf[DRW_VIEW_LEN]")
.compute_source("draw_view_finalize_comp.glsl")
.additional_info("draw_view");
@@ -207,7 +207,7 @@ GPU_SHADER_CREATE_INFO(draw_view_finalize)
GPU_SHADER_CREATE_INFO(draw_visibility_compute)
.do_static_compilation(true)
.local_group_size(DRW_VISIBILITY_GROUP_SIZE)
.define("DRW_VIEW_LEN", "64")
.define("DRW_VIEW_LEN", STRINGIFY(DRW_VIEW_MAX))
.storage_buf(0, Qualifier::READ, "ObjectBounds", "bounds_buf[]")
.storage_buf(1, Qualifier::READ_WRITE, "uint", "visibility_buf[]")
.push_constant(Type::INT, "resource_len")

View File

@@ -502,7 +502,8 @@ class TestDefrag {
ShadowPageHeapBuf pages_free_data = {"PagesFreeBuf"};
ShadowPageCacheBuf pages_cached_data = {"PagesCachedBuf"};
ShadowPagesInfoDataBuf pages_infos_data = {"PagesInfosBuf"};
StorageBuffer<DispatchCommand> clear_draw_buf;
StorageBuffer<DispatchCommand> clear_dispatch_buf;
StorageBuffer<DrawCommand> tile_draw_buf;
ShadowStatisticsBuf statistics_buf = {"statistics_buf"};
public:
@@ -572,8 +573,9 @@ class TestDefrag {
pass.bind_ssbo("pages_infos_buf", pages_infos_data);
pass.bind_ssbo("pages_free_buf", pages_free_data);
pass.bind_ssbo("pages_cached_buf", pages_cached_data);
pass.bind_ssbo("clear_dispatch_buf", clear_dispatch_buf);
pass.bind_ssbo("tile_draw_buf", tile_draw_buf);
pass.bind_ssbo("statistics_buf", statistics_buf);
pass.bind_ssbo("clear_draw_buf", clear_draw_buf);
pass.dispatch(int3(1, 1, 1));
pass.barrier(GPU_BARRIER_BUFFER_UPDATE);
@@ -807,6 +809,9 @@ static void test_eevee_shadow_finalize()
tilemap.viewmat = float4x4::identity();
tilemap.tiles_index = 0;
tilemap.clip_data_index = 0;
tilemap.clip_far = 10.0f;
tilemap.clip_near = 1.0f;
tilemap.half_size = 1.0f;
tilemap.projection_type = SHADOW_PROJECTION_CUBEFACE;
tilemaps_data.append(tilemap);
@@ -841,7 +846,9 @@ static void test_eevee_shadow_finalize()
StorageArrayBuffer<ViewMatrices, DRW_VIEW_MAX> shadow_multi_view_buf = {"ShadowMultiView"};
StorageBuffer<DispatchCommand> clear_dispatch_buf;
StorageArrayBuffer<uint, SHADOW_MAX_PAGE> clear_list_buf = {"clear_list_buf"};
StorageBuffer<DrawCommand> tile_draw_buf;
StorageArrayBuffer<uint, SHADOW_MAX_PAGE> dst_coord_buf = {"dst_coord_buf"};
StorageArrayBuffer<uint, SHADOW_MAX_PAGE> src_coord_buf = {"src_coord_buf"};
StorageArrayBuffer<uint, SHADOW_RENDER_MAP_SIZE> render_map_buf = {"render_map_buf"};
StorageArrayBuffer<uint, SHADOW_VIEW_MAX> viewport_index_buf = {"viewport_index_buf"};
@@ -857,7 +864,9 @@ static void test_eevee_shadow_finalize()
pass.bind_ssbo("view_infos_buf", shadow_multi_view_buf);
pass.bind_ssbo("statistics_buf", statistics_buf);
pass.bind_ssbo("clear_dispatch_buf", clear_dispatch_buf);
pass.bind_ssbo("clear_list_buf", clear_list_buf);
pass.bind_ssbo("tile_draw_buf", tile_draw_buf);
pass.bind_ssbo("dst_coord_buf", dst_coord_buf);
pass.bind_ssbo("src_coord_buf", src_coord_buf);
pass.bind_ssbo("render_map_buf", render_map_buf);
pass.bind_ssbo("viewport_index_buf", viewport_index_buf);
pass.bind_ssbo("pages_infos_buf", pages_infos_data);

View File

@@ -184,6 +184,9 @@ void GPU_framebuffer_bind_loadstore(GPUFrameBuffer *framebuffer,
* This enables a number of bandwidth optimizations, especially on Tile Based Deferred Renderers
* where the attachments can be kept in tile memory and used in place by later sub-passes.
*
* IMPORTANT: When using this, the framebuffer initial state is undefined. A sub-pass transition
* needs to be issued before any draw-call.
*
* Example:
* \code{.c}
* GPU_framebuffer_bind_loadstore(&fb, {

View File

@@ -29,6 +29,7 @@
*/
#ifdef GPU_SHADER
# define BLI_STATIC_ASSERT(cond, msg)
# define BLI_STATIC_ASSERT_ALIGN(type_, align_)
# define BLI_STATIC_ASSERT_SIZE(type_, size_)
# define static

View File

@@ -198,7 +198,22 @@ void gpu::MTLTexture::bake_mip_swizzle_view()
mip_texture_base_level_,
min_ii(mip_texture_max_level_, (int)texture_.mipmapLevelCount),
range_len);
#ifndef NDEBUG
mip_swizzle_view_.label = [NSString
stringWithFormat:
@"MipSwizzleView_%s__format=%u_type=%u_baselevel=%u_numlevels=%u_swizzle='%c%c%c%c'",
[[texture_ label] UTF8String],
(uint)texture_view_pixel_format,
(uint)texture_view_texture_type,
(uint)mip_texture_base_level_,
(uint)range_len,
tex_swizzle_mask_[0],
tex_swizzle_mask_[1],
tex_swizzle_mask_[2],
tex_swizzle_mask_[3]];
#else
mip_swizzle_view_.label = [texture_ label];
#endif
texture_view_dirty_flags_ = TEXTURE_VIEW_NOT_DIRTY;
}
}
@@ -1189,7 +1204,7 @@ void gpu::MTLTexture::generate_mipmap()
}
/* Ensure mipmaps. */
this->ensure_mipmaps(9999);
this->ensure_mipmaps(mtl_max_mips_);
/* Ensure texture is baked. */
this->ensure_baked();

View File

@@ -305,6 +305,28 @@ void GLFrameBuffer::attachment_set_loadstore_op(GPUAttachmentType type, GPULoadS
/* TODO(fclem): Add support for other ops. */
if (ls.load_action == eGPULoadOp::GPU_LOADACTION_CLEAR) {
if (tmp_detached_[type].tex != nullptr) {
/* GPULoadStore is used to define the framebuffer before it is used for rendering.
* Binding back an unattached attachment makes its state undefined. This is described in the
* documentation, and userland code should specify a sub-pass at the start of drawing to
* explicitly set the attachment state.
*/
if (GLContext::framebuffer_fetch_support) {
/* Noop. */
}
else if (GLContext::texture_barrier_support) {
/* Reset default attachment state. */
for (int i : IndexRange(ARRAY_SIZE(tmp_detached_))) {
tmp_detached_[i] = GPU_ATTACHMENT_NONE;
}
glDrawBuffers(ARRAY_SIZE(gl_attachments_), gl_attachments_);
}
else {
tmp_detached_[type] = GPU_ATTACHMENT_NONE;
this->attachment_set(type, tmp_detached_[type]);
this->update_attachments();
}
}
clear_attachment(type, GPU_DATA_FLOAT, ls.clear_value);
}
}