From 7de4e6d2ee4ae6a89781823856deadcc7fb558ab Mon Sep 17 00:00:00 2001
From: Jeroen Bakker <jeroen@blender.org>
Date: Mon, 16 Dec 2024 10:09:33 +0100
Subject: [PATCH] Vulkan: Add support for ReBAR

This adds initial support for ReBAR capable platforms.

When allocating a buffer that is not required to be host visible, it still
tries to allocate it in host visible memory. When there is space in this memory
heap the buffer will be automatically mapped to host memory.

When the buffer is mapped, staging buffers can be skipped if the buffer was
newly created. In order to make better use of ReBAR the `VKBuffer::create`
function will need to be revisited. It currently hides too many of the options
needed to allocate in the correct memory heap. This change isn't part of this
PR.

Using shader_balls.blend rendering the first 50 frames in main takes 1516ms.
When using ReBAR it takes 1416ms.
```
Operating system: Linux-6.8.0-49-generic-x86_64-with-glibc2.39 64 Bits, X11 UI
Graphics card: AMD Radeon Pro W7700 (RADV NAVI32) Advanced Micro Devices radv Mesa 24.3.1 - kisak-mesa PPA Vulkan Backend
```

Pull Request: https://projects.blender.org/blender/blender/pulls/131856
---
 source/blender/gpu/vulkan/vk_buffer.cc        | 21 ++++++-------
 source/blender/gpu/vulkan/vk_buffer.hh        | 13 +++++++-
 source/blender/gpu/vulkan/vk_device.cc        |  3 +-
 source/blender/gpu/vulkan/vk_immediate.cc     |  3 +-
 source/blender/gpu/vulkan/vk_index_buffer.cc  | 18 ++++++++---
 source/blender/gpu/vulkan/vk_index_buffer.hh  |  1 +
 .../blender/gpu/vulkan/vk_push_constants.cc   |  1 +
 source/blender/gpu/vulkan/vk_texture.cc       |  6 ++--
 .../blender/gpu/vulkan/vk_uniform_buffer.cc   | 31 +++++++++++++------
 .../blender/gpu/vulkan/vk_uniform_buffer.hh   | 15 +++++++++
 source/blender/gpu/vulkan/vk_vertex_buffer.cc |  3 +-
 source/blender/gpu/vulkan/vk_vertex_buffer.hh |  1 +
 12 files changed, 85 insertions(+), 31 deletions(-)

diff --git a/source/blender/gpu/vulkan/vk_buffer.cc b/source/blender/gpu/vulkan/vk_buffer.cc
index e340bf8e278..0488dba65b5 100644
--- a/source/blender/gpu/vulkan/vk_buffer.cc
+++ b/source/blender/gpu/vulkan/vk_buffer.cc
@@ -40,9 +40,13 @@ static VmaAllocationCreateFlags vma_allocation_flags(GPUUsageType usage)
   return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
 }
 
-static VkMemoryPropertyFlags vma_preferred_flags()
+static VkMemoryPropertyFlags vma_preferred_flags(const bool is_host_visible)
 {
-  return VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+  /* When is_host_visible is true, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT is set in
+   * `vma_required_flags`. When false it is only added as a preferred flag to support ReBAR. */
+  return is_host_visible ?
+             VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT :
+             VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
 }
 
 static VkMemoryPropertyFlags vma_required_flags(const bool is_host_visible)
@@ -55,12 +59,6 @@ bool VKBuffer::create(size_t size_in_bytes,
                       VkBufferUsageFlags buffer_usage,
                       const bool is_host_visible)
 {
-  /*
-   * TODO: Check which memory is selected and adjust the creation flag to add mapping. This way the
-   * staging buffer can be skipped, or in case of a vertex buffer an intermediate buffer can be
-   * removed.
-   */
-
   BLI_assert(!is_allocated());
   BLI_assert(vk_buffer_ == VK_NULL_HANDLE);
   BLI_assert(mapped_memory_ == nullptr);
@@ -89,7 +87,7 @@ bool VKBuffer::create(size_t size_in_bytes,
   vma_create_info.flags = vma_allocation_flags(usage);
   vma_create_info.priority = 1.0f;
   vma_create_info.requiredFlags = vma_required_flags(is_host_visible);
-  vma_create_info.preferredFlags = vma_preferred_flags();
+  vma_create_info.preferredFlags = vma_preferred_flags(is_host_visible);
   vma_create_info.usage = VMA_MEMORY_USAGE_AUTO;
 
   VkResult result = vmaCreateBuffer(
@@ -100,7 +98,9 @@ bool VKBuffer::create(size_t size_in_bytes,
 
   device.resources.add_buffer(vk_buffer_);
 
-  if (is_host_visible) {
+  vmaGetAllocationMemoryProperties(allocator, allocation_, &vk_memory_property_flags_);
+
+  if (vk_memory_property_flags_ & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
     return map();
   }
   return true;
@@ -110,7 +110,6 @@ void VKBuffer::update_immediately(const void *data) const
 {
   BLI_assert_msg(is_mapped(), "Cannot update a non-mapped buffer.");
   memcpy(mapped_memory_, data, size_in_bytes_);
-  flush();
 }
 
 void VKBuffer::update_render_graph(VKContext &context, void *data) const
diff --git a/source/blender/gpu/vulkan/vk_buffer.hh b/source/blender/gpu/vulkan/vk_buffer.hh
index fb3265e5eec..fb76f5375f0 100644
--- a/source/blender/gpu/vulkan/vk_buffer.hh
+++ b/source/blender/gpu/vulkan/vk_buffer.hh
@@ -24,6 +24,8 @@ class VKBuffer : public NonCopyable {
   size_t size_in_bytes_ = 0;
   VkBuffer vk_buffer_ = VK_NULL_HANDLE;
   VmaAllocation allocation_ = VK_NULL_HANDLE;
+  VkMemoryPropertyFlags vk_memory_property_flags_;
+
   /* Pointer to the virtually mapped memory. */
   void *mapped_memory_ = nullptr;
 
@@ -33,10 +35,19 @@ class VKBuffer : public NonCopyable {
 
   /** Has this buffer been allocated? */
   bool is_allocated() const;
+
+  /**
+   * Allocate the buffer.
+   *
+   * When `is_host_visible` is set to true it will allocate from a host visible memory heap. When
+   * `is_host_visible` is false it will still try to allocate from a host visible memory heap
+   * (available on Resizable BAR/ReBAR platforms). When that isn't possible it will allocate from
+   * a memory heap that is not host visible.
+   */
   bool create(size_t size,
               GPUUsageType usage,
               VkBufferUsageFlags buffer_usage,
-              bool is_host_visible = true);
+              bool is_host_visible);
   void clear(VKContext &context, uint32_t clear_value);
   void update_immediately(const void *data) const;
 
diff --git a/source/blender/gpu/vulkan/vk_device.cc b/source/blender/gpu/vulkan/vk_device.cc
index 6c8a55b1a14..b24410f50e0 100644
--- a/source/blender/gpu/vulkan/vk_device.cc
+++ b/source/blender/gpu/vulkan/vk_device.cc
@@ -196,7 +196,8 @@ void VKDevice::init_dummy_buffer()
 {
   dummy_buffer.create(sizeof(float4x4),
                       GPU_USAGE_DEVICE_ONLY,
-                      VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
+                      VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+                      true);
   debug::object_label(dummy_buffer.vk_handle(), "DummyBuffer");
   /* Default dummy buffer. Set the 4th element to 1 to fix missing orcos. */
   float data[16] = {
diff --git a/source/blender/gpu/vulkan/vk_immediate.cc b/source/blender/gpu/vulkan/vk_immediate.cc
index d2c758152aa..5bbea337f85 100644
--- a/source/blender/gpu/vulkan/vk_immediate.cc
+++ b/source/blender/gpu/vulkan/vk_immediate.cc
@@ -169,7 +169,8 @@ VKBuffer &VKImmediate::ensure_space(VkDeviceSize bytes_needed, VkDeviceSize offs
   result.create(alloc_size,
                 GPU_USAGE_DYNAMIC,
                 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
-                    VK_BUFFER_USAGE_TRANSFER_DST_BIT);
+                    VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+                true);
   debug::object_label(result.vk_handle(), "Immediate");
 
   return result;
diff --git a/source/blender/gpu/vulkan/vk_index_buffer.cc b/source/blender/gpu/vulkan/vk_index_buffer.cc
index 8b498d33562..b5a264fb851 100644
--- a/source/blender/gpu/vulkan/vk_index_buffer.cc
+++ b/source/blender/gpu/vulkan/vk_index_buffer.cc
@@ -29,11 +29,19 @@ void VKIndexBuffer::ensure_updated()
     return;
   }
 
-  VKContext &context = *VKContext::get();
-  VKStagingBuffer staging_buffer(buffer_, VKStagingBuffer::Direction::HostToDevice);
-  staging_buffer.host_buffer_get().update_immediately(data_);
-  staging_buffer.copy_to_device(context);
-  MEM_SAFE_FREE(data_);
+  if (!data_uploaded_ && buffer_.is_mapped()) {
+    buffer_.update_immediately(data_);
+    MEM_SAFE_FREE(data_);
+  }
+  else {
+    VKContext &context = *VKContext::get();
+    VKStagingBuffer staging_buffer(buffer_, VKStagingBuffer::Direction::HostToDevice);
+    staging_buffer.host_buffer_get().update_immediately(data_);
+    staging_buffer.copy_to_device(context);
+    MEM_SAFE_FREE(data_);
+  }
+
+  data_uploaded_ = true;
 }
 
 void VKIndexBuffer::upload_data()
diff --git a/source/blender/gpu/vulkan/vk_index_buffer.hh b/source/blender/gpu/vulkan/vk_index_buffer.hh
index b5b32eb9094..ae4265e53cb 100644
--- a/source/blender/gpu/vulkan/vk_index_buffer.hh
+++ b/source/blender/gpu/vulkan/vk_index_buffer.hh
@@ -16,6 +16,7 @@ namespace blender::gpu {
 
 class VKIndexBuffer : public IndexBuf {
   VKBuffer buffer_;
+  bool data_uploaded_ = false;
 
  public:
   void upload_data() override;
diff --git a/source/blender/gpu/vulkan/vk_push_constants.cc b/source/blender/gpu/vulkan/vk_push_constants.cc
index 384f539259c..b0adb583d57 100644
--- a/source/blender/gpu/vulkan/vk_push_constants.cc
+++ b/source/blender/gpu/vulkan/vk_push_constants.cc
@@ -151,6 +151,7 @@ void VKPushConstants::update_uniform_buffer()
   BLI_assert(data_ != nullptr);
   VKContext &context = *VKContext::get();
   std::unique_ptr<VKUniformBuffer> &uniform_buffer = tracked_resource_for(context, is_dirty_);
+  uniform_buffer->reset_data_uploaded();
   uniform_buffer->update(data_);
   is_dirty_ = false;
 }
diff --git a/source/blender/gpu/vulkan/vk_texture.cc b/source/blender/gpu/vulkan/vk_texture.cc
index 22a0e7ff750..b72fdc7b53a 100644
--- a/source/blender/gpu/vulkan/vk_texture.cc
+++ b/source/blender/gpu/vulkan/vk_texture.cc
@@ -194,7 +194,8 @@ void VKTexture::read_sub(
   /* Vulkan images cannot be directly mapped to host memory and requires a staging buffer. */
   VKBuffer staging_buffer;
   size_t device_memory_size = sample_len * to_bytesize(device_format_);
-  staging_buffer.create(device_memory_size, GPU_USAGE_DYNAMIC, VK_BUFFER_USAGE_TRANSFER_DST_BIT);
+  staging_buffer.create(
+      device_memory_size, GPU_USAGE_DYNAMIC, VK_BUFFER_USAGE_TRANSFER_DST_BIT, true);
 
   render_graph::VKCopyImageToBufferNode::CreateInfo copy_image_to_buffer = {};
   render_graph::VKCopyImageToBufferNode::Data &node_data = copy_image_to_buffer.node_data;
@@ -300,7 +301,8 @@ void VKTexture::update_sub(
   }
 
   VKBuffer staging_buffer;
-  staging_buffer.create(device_memory_size, GPU_USAGE_DYNAMIC, VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
+  staging_buffer.create(
+      device_memory_size, GPU_USAGE_DYNAMIC, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, true);
   /* Rows are sequentially stored, when unpack row length is 0, or equal to the extent width. In
    * other cases we unpack the rows to reduce the size of the staging buffer and data transfer. */
   const uint texture_unpack_row_length =
diff --git a/source/blender/gpu/vulkan/vk_uniform_buffer.cc b/source/blender/gpu/vulkan/vk_uniform_buffer.cc
index 7198796ffb7..d4bf1480766 100644
--- a/source/blender/gpu/vulkan/vk_uniform_buffer.cc
+++ b/source/blender/gpu/vulkan/vk_uniform_buffer.cc
@@ -21,11 +21,16 @@ void VKUniformBuffer::update(const void *data)
     allocate();
   }
 
-  /* TODO: when buffer is mapped and newly created we should use `buffer_.update_immediately`. */
-  void *data_copy = MEM_mallocN(size_in_bytes_, __func__);
-  memcpy(data_copy, data, size_in_bytes_);
-  VKContext &context = *VKContext::get();
-  buffer_.update_render_graph(context, data_copy);
+  if (!data_uploaded_ && buffer_.is_mapped()) {
+    buffer_.update_immediately(data);
+  }
+  else {
+    void *data_copy = MEM_mallocN(size_in_bytes_, __func__);
+    memcpy(data_copy, data, size_in_bytes_);
+    VKContext &context = *VKContext::get();
+    buffer_.update_render_graph(context, data_copy);
+  }
+  data_uploaded_ = true;
 }
 
 void VKUniformBuffer::allocate()
@@ -45,6 +50,7 @@ void VKUniformBuffer::clear_to_zero()
   }
   VKContext &context = *VKContext::get();
   buffer_.clear(context, 0);
+  data_uploaded_ = true;
 }
 
 void VKUniformBuffer::ensure_updated()
@@ -55,10 +61,17 @@ void VKUniformBuffer::ensure_updated()
 
   /* Upload attached data, during bind time. */
   if (data_) {
-    /* TODO: when buffer is mapped and newly created we should use `buffer_.update_immediately`. */
-    VKContext &context = *VKContext::get();
-    buffer_.update_render_graph(context, std::move(data_));
-    data_ = nullptr;
+    if (!data_uploaded_ && buffer_.is_mapped()) {
+      buffer_.update_immediately(data_);
+      MEM_freeN(data_);
+      data_ = nullptr;
+    }
+    else {
+      VKContext &context = *VKContext::get();
+      buffer_.update_render_graph(context, std::move(data_));
+      data_ = nullptr;
+    }
+    data_uploaded_ = true;
   }
 }
 
diff --git a/source/blender/gpu/vulkan/vk_uniform_buffer.hh b/source/blender/gpu/vulkan/vk_uniform_buffer.hh
index 7b27f915691..75f8adbd546 100644
--- a/source/blender/gpu/vulkan/vk_uniform_buffer.hh
+++ b/source/blender/gpu/vulkan/vk_uniform_buffer.hh
@@ -19,6 +19,12 @@ namespace blender::gpu {
 class VKUniformBuffer : public UniformBuf, NonCopyable {
   VKBuffer buffer_;
 
+  /**
+   * Has this uniform buffer already been fed with data? If so we are not allowed to directly
+   * overwrite the data as it could still be in use.
+   */
+  bool data_uploaded_ = false;
+
  public:
   VKUniformBuffer(size_t size, const char *name) : UniformBuf(size, name) {}
 
@@ -44,6 +50,15 @@ class VKUniformBuffer : public UniformBuf, NonCopyable {
 
   void ensure_updated();
 
+  /**
+   * Reset data uploaded flag. When the caller is sure the resource isn't in use, it can call
+   * reset_data_uploaded so the next update can use ReBAR when available.
+   */
+  void reset_data_uploaded()
+  {
+    data_uploaded_ = false;
+  }
+
  private:
   void allocate();
 };
diff --git a/source/blender/gpu/vulkan/vk_vertex_buffer.cc b/source/blender/gpu/vulkan/vk_vertex_buffer.cc
index 097cf2e0bf7..7795ac7e08f 100644
--- a/source/blender/gpu/vulkan/vk_vertex_buffer.cc
+++ b/source/blender/gpu/vulkan/vk_vertex_buffer.cc
@@ -148,7 +148,7 @@ void VKVertexBuffer::upload_data()
 
   if (flag & GPU_VERTBUF_DATA_DIRTY) {
     device_format_ensure();
-    if (buffer_.is_mapped()) {
+    if (buffer_.is_mapped() && !data_uploaded_) {
       upload_data_direct(buffer_);
     }
     else {
@@ -158,6 +158,7 @@ void VKVertexBuffer::upload_data()
     if (usage_ == GPU_USAGE_STATIC) {
       MEM_SAFE_FREE(data_);
     }
+    data_uploaded_ = true;
 
     flag &= ~GPU_VERTBUF_DATA_DIRTY;
     flag |= GPU_VERTBUF_DATA_UPLOADED;
diff --git a/source/blender/gpu/vulkan/vk_vertex_buffer.hh b/source/blender/gpu/vulkan/vk_vertex_buffer.hh
index 829fb62addd..512c83b64ba 100644
--- a/source/blender/gpu/vulkan/vk_vertex_buffer.hh
+++ b/source/blender/gpu/vulkan/vk_vertex_buffer.hh
@@ -21,6 +21,7 @@ class VKVertexBuffer : public VertBuf {
   VkBufferView vk_buffer_view_ = VK_NULL_HANDLE;
 
   VertexFormatConverter vertex_format_converter;
+  bool data_uploaded_ = false;
 
  public:
   ~VKVertexBuffer();