GPU: Storage buffer allocation alignment
Since the introduction of storage buffers in Blender, the calling code has been responsible for ensuring the buffer meets allocation requirements: all backends require the allocation size to be divisible by 16 bytes. Until now this was sufficient, but with the GPU subdivision changes an external library must also adhere to these requirements. For OpenSubdiv (OSD), some buffers are not 16-byte aligned, so their allocations would violate this requirement. Currently this is mitigated by allocating a few extra bytes, but that approach can read unintended bytes beyond the end of the source buffer.

This PR adopts a similar approach to vertex buffers: the backend allocates the extra bytes, while ensuring that data uploads and downloads function correctly without requiring those additional bytes from the caller. No changes were needed for Metal, as its allocation size is already aligned to 256 bytes.

**Alternative solutions considered**:

- Copying the CPU buffer to a larger buffer when needed (performance impact).
- Modifying OSD buffers to allocate extra space (requires changes to an external library).
- Implementing `GPU_storagebuf_update_sub`.

Ref #135873

Pull Request: https://projects.blender.org/blender/blender/pulls/135716
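For context, the alignment rule that all backends share is a simple round-up to the next multiple of 16 bytes (the diff below uses `ceil_to_multiple_ul` for this). A minimal standalone sketch of that rounding, not the actual Blender helper:

```cpp
#include <cassert>
#include <cstddef>

/* Round a byte size up to the next multiple of 16, the alignment that
 * storage buffer allocations must satisfy on all backends. */
static size_t align_to_16(size_t size)
{
  return (size + 15) & ~size_t(15);
}

int main()
{
  assert(align_to_16(16) == 16); /* Already aligned: unchanged. */
  assert(align_to_16(36) == 48); /* E.g. 9 * sizeof(int32_t) rounds up. */
  assert(align_to_16(0) == 0);   /* Zero stays zero; Vulkan clamps separately. */
  return 0;
}
```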
@@ -18,11 +18,9 @@ namespace blender::opensubdiv {
 static GPUStorageBuf *create_patch_array_buffer(const PatchArrayVector &patch_arrays)
 {
   const size_t patch_array_size = sizeof(PatchArray);
-  const size_t patch_array_byte_site = patch_array_size * patch_arrays.size();
-  const size_t patch_array_alloc_size = (patch_array_byte_site + 15) & ~0b1111;
-  // TODO: potential read out of bounds.
+  const size_t patch_array_byte_size = patch_array_size * patch_arrays.size();
   GPUStorageBuf *storage_buf = GPU_storagebuf_create_ex(
-      patch_array_alloc_size, patch_arrays.data(), GPU_USAGE_STATIC, "osd_patch_array");
+      patch_array_byte_size, patch_arrays.data(), GPU_USAGE_STATIC, "osd_patch_array");
   return storage_buf;
 }
@@ -55,13 +55,10 @@ template<class T> GPUStorageBuf *create_buffer(std::vector<T> const &src, const
     return nullptr;
   }
 
-  // TODO: this can lead to an out of bound read. `GPU_storagebuf_update` should have a number of
-  // bytes.
-  size_t buffer_size = src.size() * sizeof(T);
-  size_t buffer_alloc_size = (buffer_size + 15) & ~0b1111;
+  const size_t buffer_size = src.size() * sizeof(T);
 
   GPUStorageBuf *storage_buffer = GPU_storagebuf_create_ex(
-      buffer_alloc_size, &src.at(0), GPU_USAGE_STATIC, name);
+      buffer_size, &src.at(0), GPU_USAGE_STATIC, name);
 
   return storage_buffer;
 }
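To make the overread risk of the removed caller-side padding concrete: a source vector that owns, say, 36 bytes was padded to a 48-byte allocation, and the full padded size was uploaded from the source pointer. A hypothetical illustration (names and sizes invented):

```cpp
#include <cstdint>
#include <vector>

void illustrate_overread()
{
  std::vector<int32_t> src(9);                       /* Owns exactly 36 bytes. */
  size_t buffer_size = src.size() * sizeof(int32_t); /* 36 */
  size_t buffer_alloc_size = (buffer_size + 15) & ~size_t(0b1111); /* 48 */
  /* Uploading `buffer_alloc_size` bytes from src.data() reads bytes 36..47,
   * which lie past the end of the vector's storage. */
  (void)buffer_alloc_size;
}
```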
@@ -47,29 +47,6 @@ GPUPatchTable::~GPUPatchTable()
   discard_list(_fvarParamBuffers);
 }
 
-/**
- * Storage buffers sizes needs to be divisable by 16 (float4).
- */
-static size_t storage_buffer_size(size_t size)
-{
-  return (size + 15) & (~0b1111);
-}
-
-/**
- * Function to create a storage buffer and upload it with data.
- *
- * - Ensures that allocated size is aligned to 16 byte
- * - WARNING: Can read from not allocated data after `data`.
- */
-// TODO: this means that if buffer size is adjusted we need to copy into a temp buffer otherwise we
-// could read out of bounds. The performance impact of this is measurable so I would suggest to
-// support GPU_storagebuf_update() with a max len to update.
-static GPUStorageBuf *storage_buffer_create(size_t size, const void *data, const char *name)
-{
-  size_t storage_size = storage_buffer_size(size);
-  return GPU_storagebuf_create_ex(storage_size, data, GPU_USAGE_STATIC, name);
-}
-
 bool GPUPatchTable::allocate(PatchTable const *far_patch_table)
 {
   CpuPatchTable patch_table(far_patch_table);
@@ -81,25 +58,29 @@ bool GPUPatchTable::allocate(PatchTable const *far_patch_table)
 
   /* Patch index buffer */
   const size_t index_size = patch_table.GetPatchIndexSize();
-  _patchIndexBuffer = storage_buffer_create(
+  _patchIndexBuffer = GPU_storagebuf_create_ex(
       index_size * sizeof(int32_t),
       static_cast<const void *>(patch_table.GetPatchIndexBuffer()),
+      GPU_USAGE_STATIC,
       "osd_patch_index");
 
   /* Patch param buffer */
   const size_t patch_param_size = patch_table.GetPatchParamSize();
-  _patchParamBuffer = storage_buffer_create(
-      patch_param_size * sizeof(PatchParam), patch_table.GetPatchParamBuffer(), "osd_patch_param");
+  _patchParamBuffer = GPU_storagebuf_create_ex(patch_param_size * sizeof(PatchParam),
+                                               patch_table.GetPatchParamBuffer(),
+                                               GPU_USAGE_STATIC,
+                                               "osd_patch_param");
 
   /* Varying patch array */
   _varyingPatchArrays.assign(patch_table.GetVaryingPatchArrayBuffer(),
                              patch_table.GetVaryingPatchArrayBuffer() + num_patch_arrays);
 
   /* Varying index buffer */
-  _varyingIndexBuffer = storage_buffer_create(patch_table.GetVaryingPatchIndexSize() *
-                                                  sizeof(uint32_t),
-                                              patch_table.GetVaryingPatchIndexBuffer(),
-                                              "osd_varying_index");
+  _varyingIndexBuffer = GPU_storagebuf_create_ex(patch_table.GetVaryingPatchIndexSize() *
+                                                     sizeof(uint32_t),
+                                                 patch_table.GetVaryingPatchIndexBuffer(),
+                                                 GPU_USAGE_STATIC,
+                                                 "osd_varying_index");
 
   /* Face varying */
   const int num_face_varying_channels = patch_table.GetNumFVarChannels();
@@ -112,16 +93,18 @@ bool GPUPatchTable::allocate(PatchTable const *far_patch_table)
                              patch_table.GetFVarPatchArrayBuffer() + num_patch_arrays);
 
     /* Face varying patch index buffer */
-    _fvarIndexBuffers[index] = storage_buffer_create(patch_table.GetFVarPatchIndexSize(index) *
-                                                         sizeof(int32_t),
-                                                     patch_table.GetFVarPatchIndexBuffer(index),
-                                                     "osd_face_varying_index");
+    _fvarIndexBuffers[index] = GPU_storagebuf_create_ex(patch_table.GetFVarPatchIndexSize(index) *
+                                                            sizeof(int32_t),
+                                                        patch_table.GetFVarPatchIndexBuffer(index),
+                                                        GPU_USAGE_STATIC,
+                                                        "osd_face_varying_index");
 
     /* Face varying patch param buffer */
-    _fvarParamBuffers[index] = storage_buffer_create(patch_table.GetFVarPatchParamSize(index) *
-                                                         sizeof(PatchParam),
-                                                     patch_table.GetFVarPatchParamBuffer(index),
-                                                     "osd_face_varying_params");
+    _fvarParamBuffers[index] = GPU_storagebuf_create_ex(patch_table.GetFVarPatchParamSize(index) *
+                                                            sizeof(PatchParam),
+                                                        patch_table.GetFVarPatchParamBuffer(index),
+                                                        GPU_USAGE_STATIC,
+                                                        "osd_face_varying_params");
   }
 
   return true;
@@ -29,11 +29,7 @@ namespace blender::gpu {
 
 StorageBuf::StorageBuf(size_t size, const char *name)
 {
-  /* Make sure that UBO is padded to size of vec4 */
-  BLI_assert((size % 16) == 0);
-
   size_in_bytes_ = size;
-
   STRNCPY(name_, name);
 }
@@ -28,7 +28,7 @@ class VertBuf;
  */
 class StorageBuf {
  protected:
-  /** Data size in bytes. */
+  /** Data size in bytes. Doesn't need to match actual allocation size due to alignment rules. */
   size_t size_in_bytes_;
   /** Continuous memory block to copy to GPU. This data is owned by the StorageBuf. */
   void *data_ = nullptr;
@@ -65,9 +65,10 @@ void GLStorageBuf::init()
 {
   BLI_assert(GLContext::get());
 
+  alloc_size_in_bytes_ = ceil_to_multiple_ul(size_in_bytes_, 16);
   glGenBuffers(1, &ssbo_id_);
   glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_id_);
-  glBufferData(GL_SHADER_STORAGE_BUFFER, size_in_bytes_, nullptr, to_gl(this->usage_));
+  glBufferData(GL_SHADER_STORAGE_BUFFER, alloc_size_in_bytes_, nullptr, to_gl(this->usage_));
 
   debug::object_label(GL_SHADER_STORAGE_BUFFER, ssbo_id_, name_);
 }
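With the GL allocation now sized to `alloc_size_in_bytes_`, uploads still have to be clamped to the logical `size_in_bytes_` so the padding is never read from the caller's pointer. A sketch of that upload pattern (assumes a GL loader header such as epoxy is available; this is not the actual `GLStorageBuf` update code):

```cpp
#include <epoxy/gl.h>
#include <cstddef>

/* Upload only the caller-visible bytes; the padded tail of the GL
 * allocation is left untouched, so the source pointer is never overread. */
static void upload_logical_bytes(GLuint ssbo_id, const void *data, size_t size_in_bytes)
{
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_id);
  glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size_in_bytes, data);
  glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
}
```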
@@ -189,23 +190,25 @@ void GLStorageBuf::async_flush_to_host()
     glGenBuffers(1, &read_ssbo_id_);
     glBindBuffer(GL_SHADER_STORAGE_BUFFER, read_ssbo_id_);
     glBufferStorage(GL_SHADER_STORAGE_BUFFER,
-                    size_in_bytes_,
+                    alloc_size_in_bytes_,
                     nullptr,
                     GL_MAP_PERSISTENT_BIT | GL_MAP_READ_BIT);
-    persistent_ptr_ = glMapBufferRange(
-        GL_SHADER_STORAGE_BUFFER, 0, size_in_bytes_, GL_MAP_PERSISTENT_BIT | GL_MAP_READ_BIT);
+    persistent_ptr_ = glMapBufferRange(GL_SHADER_STORAGE_BUFFER,
+                                       0,
+                                       alloc_size_in_bytes_,
+                                       GL_MAP_PERSISTENT_BIT | GL_MAP_READ_BIT);
     BLI_assert(persistent_ptr_);
     debug::object_label(GL_SHADER_STORAGE_BUFFER, read_ssbo_id_, name_);
     glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
   }
 
   if (GLContext::direct_state_access_support) {
-    glCopyNamedBufferSubData(ssbo_id_, read_ssbo_id_, 0, 0, size_in_bytes_);
+    glCopyNamedBufferSubData(ssbo_id_, read_ssbo_id_, 0, 0, alloc_size_in_bytes_);
   }
   else {
     glBindBuffer(GL_COPY_READ_BUFFER, ssbo_id_);
     glBindBuffer(GL_COPY_WRITE_BUFFER, read_ssbo_id_);
-    glCopyBufferSubData(GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, 0, 0, size_in_bytes_);
+    glCopyBufferSubData(GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, 0, 0, alloc_size_in_bytes_);
     glBindBuffer(GL_COPY_READ_BUFFER, 0);
     glBindBuffer(GL_COPY_WRITE_BUFFER, 0);
   }
@@ -30,6 +30,7 @@ class GLStorageBuf : public StorageBuf {
   GLuint read_ssbo_id_ = 0;
   GLsync read_fence_ = 0;
   void *persistent_ptr_ = nullptr;
+  size_t alloc_size_in_bytes_ = 0;
 
  public:
   GLStorageBuf(size_t size, GPUUsageType usage, const char *name);
@@ -35,6 +35,7 @@ bool VKBuffer::create(size_t size_in_bytes,
   BLI_assert(mapped_memory_ == nullptr);
 
   size_in_bytes_ = size_in_bytes;
+  alloc_size_in_bytes_ = ceil_to_multiple_ul(max_ulul(size_in_bytes_, 16), 16);
   VKDevice &device = VKBackend::get().device;
 
   VmaAllocator allocator = device.mem_allocator_get();
@@ -45,7 +46,7 @@ bool VKBuffer::create(size_t size_in_bytes,
   /* Vulkan doesn't allow empty buffers but some areas (DrawManager Instance data, PyGPU) create
    * them.
    */
-  create_info.size = max_ulul(size_in_bytes, 1);
+  create_info.size = alloc_size_in_bytes_;
   create_info.usage = buffer_usage;
   /* We use the same command queue for the compute and graphics pipeline, so it is safe to use
    * exclusive resource handling. */
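The Vulkan sizing expression above does two jobs at once: it clamps the size to at least 16 bytes (Vulkan forbids zero-sized buffers, which the replaced `max_ulul(size_in_bytes, 1)` previously handled) and rounds up to a 16-byte multiple. An equivalent standalone sketch using the standard library, not the BLI helpers:

```cpp
#include <algorithm>
#include <cstddef>

/* Mirrors `ceil_to_multiple_ul(max_ulul(size, 16), 16)` from the hunk above. */
static size_t vk_alloc_size(size_t size_in_bytes)
{
  const size_t clamped = std::max<size_t>(size_in_bytes, 16);
  return (clamped + 15) & ~size_t(15);
}
/* vk_alloc_size(0) == 16, vk_alloc_size(20) == 32, vk_alloc_size(48) == 48. */
```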
@@ -112,7 +113,7 @@ void VKBuffer::clear(VKContext &context, uint32_t clear_value)
   render_graph::VKFillBufferNode::CreateInfo fill_buffer = {};
   fill_buffer.vk_buffer = vk_buffer_;
   fill_buffer.data = clear_value;
-  fill_buffer.size = size_in_bytes_;
+  fill_buffer.size = alloc_size_in_bytes_;
   context.render_graph().add_node(fill_buffer);
 }
@@ -22,6 +22,7 @@ class VKDevice;
  */
 class VKBuffer : public NonCopyable {
   size_t size_in_bytes_ = 0;
+  size_t alloc_size_in_bytes_ = 0;
   VkBuffer vk_buffer_ = VK_NULL_HANDLE;
   VmaAllocation allocation_ = VK_NULL_HANDLE;
   VkMemoryPropertyFlags vk_memory_property_flags_;