Subdiv: Split eval shaders

Both eval shaders were implemented in a single file, osd_kernel_comp.glsl. This PR
separates them into dedicated compute-shader files so each shader is easier to understand.

Pull Request: https://projects.blender.org/blender/blender/pulls/135719
This commit is contained in:
Jeroen Bakker
2025-03-10 16:06:00 +01:00
parent 5a029fdf1f
commit 46cfba075d
4 changed files with 345 additions and 198 deletions

View File

@@ -91,7 +91,8 @@ if(WITH_OPENSUBDIV)
endif()
set(GLSL_SRC
internal/evaluator/shaders/osd_kernel_comp.glsl
internal/evaluator/shaders/osd_eval_stencils_comp.glsl
internal/evaluator/shaders/osd_eval_patches_comp.glsl
)
set(GLSL_C)

View File

@@ -135,113 +135,6 @@ GPUComputeEvaluator::~GPUComputeEvaluator()
}
}
/* Compile the combined subdivision evaluation compute shader
 * (osd_kernel_comp.glsl), which contains both the stencil and the patch
 * evaluation kernels; `use_eval_stencil_kernel` selects which one is enabled
 * via a preprocessor define.
 *
 * Buffer descriptors carry (offset, length, stride); a derivative output is
 * considered requested when its descriptor has a non-zero length.
 * Returns the compiled shader, or nullptr on failure. */
static GPUShader *compileKernel(BufferDescriptor const &srcDesc,
                                BufferDescriptor const &dstDesc,
                                BufferDescriptor const &duDesc,
                                BufferDescriptor const &dvDesc,
                                BufferDescriptor const &duuDesc,
                                BufferDescriptor const &duvDesc,
                                BufferDescriptor const &dvvDesc,
                                bool use_eval_stencil_kernel,
                                int workGroupSize)
{
  using namespace blender::gpu::shader;
  ShaderCreateInfo info("opensubdiv_compute_eval");
  info.local_group_size(workGroupSize, 1, 1);
  /* The patch-basis implementation differs per GPU backend. */
  if (GPU_backend_get_type() == GPU_BACKEND_METAL) {
    info.define("OSD_PATCH_BASIS_METAL");
  }
  else {
    info.define("OSD_PATCH_BASIS_GLSL");
  }
  /* Select which of the two kernels in osd_kernel_comp.glsl gets compiled. */
  if (use_eval_stencil_kernel) {
    info.define("OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS");
  }
  else {
    info.define("OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_PATCHES");
  }
  // TODO: use specialization constants for src_stride, dst_stride. Not sure we can use
  // work group size as that requires extensions. This allows us to compile fewer shaders and
  // improve overall performance. Adding length as specialization constant will not work as it is
  // used to define an array length. This is not supported by Metal.
  std::string length = std::to_string(srcDesc.length);
  std::string src_stride = std::to_string(srcDesc.stride);
  std::string dst_stride = std::to_string(dstDesc.stride);
  std::string work_group_size = std::to_string(workGroupSize);
  info.define("LENGTH", length);
  info.define("SRC_STRIDE", src_stride);
  info.define("DST_STRIDE", dst_stride);
  info.define("WORK_GROUP_SIZE", work_group_size);
  info.typedef_source("osd_patch_basis.glsl");
  /* Source/destination primvar buffers shared by both kernels. */
  info.storage_buf(
      SHADER_SRC_VERTEX_BUFFER_BUF_SLOT, Qualifier::READ, "float", "srcVertexBuffer[]");
  info.storage_buf(
      SHADER_DST_VERTEX_BUFFER_BUF_SLOT, Qualifier::WRITE, "float", "dstVertexBuffer[]");
  info.push_constant(Type::INT, "srcOffset");
  info.push_constant(Type::INT, "dstOffset");
  /* A derivative output is requested when its descriptor has a length. */
  bool deriv1 = (duDesc.length > 0 || dvDesc.length > 0);
  bool deriv2 = (duuDesc.length > 0 || duvDesc.length > 0 || dvvDesc.length > 0);
  if (deriv1) {
    info.define("OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES");
    info.storage_buf(SHADER_DU_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duBuffer[]");
    info.storage_buf(SHADER_DV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "dvBuffer[]");
    info.push_constant(Type::IVEC3, "duDesc");
    info.push_constant(Type::IVEC3, "dvDesc");
  }
  if (deriv2) {
    info.define("OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES");
    info.storage_buf(SHADER_DUU_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duuBuffer[]");
    info.storage_buf(SHADER_DUV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duvBuffer[]");
    info.storage_buf(SHADER_DVV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "dvvBuffer[]");
    info.push_constant(Type::IVEC3, "duuDesc");
    info.push_constant(Type::IVEC3, "duvDesc");
    info.push_constant(Type::IVEC3, "dvvDesc");
  }
  if (use_eval_stencil_kernel) {
    /* Stencil-table inputs and the stencil range for this dispatch. */
    info.storage_buf(SHADER_SIZES_BUF_SLOT, Qualifier::READ, "int", "sizes_buf[]");
    info.storage_buf(SHADER_OFFSETS_BUF_SLOT, Qualifier::READ, "int", "offsets_buf[]");
    info.storage_buf(SHADER_INDICES_BUF_SLOT, Qualifier::READ, "int", "indices_buf[]");
    info.storage_buf(SHADER_WEIGHTS_BUF_SLOT, Qualifier::READ, "float", "weights_buf[]");
    if (deriv1) {
      info.storage_buf(
          SHADER_DU_WEIGHTS_BUF_SLOT, Qualifier::READ_WRITE, "float", "du_weights_buf[]");
      info.storage_buf(
          SHADER_DV_WEIGHTS_BUF_SLOT, Qualifier::READ_WRITE, "float", "dv_weights_buf[]");
    }
    if (deriv2) {
      info.storage_buf(
          SHADER_DUU_WEIGHTS_BUF_SLOT, Qualifier::READ_WRITE, "float", "duu_weights_buf[]");
      info.storage_buf(
          SHADER_DUV_WEIGHTS_BUF_SLOT, Qualifier::READ_WRITE, "float", "duv_weights_buf[]");
      info.storage_buf(
          SHADER_DVV_WEIGHTS_BUF_SLOT, Qualifier::READ_WRITE, "float", "dvv_weights_buf[]");
    }
    info.push_constant(Type::INT, "batchStart");
    info.push_constant(Type::INT, "batchEnd");
  }
  else {
    /* Patch-evaluation inputs: patch arrays, coords, topology and params. */
    info.storage_buf(SHADER_PATCH_ARRAY_BUFFER_BUF_SLOT,
                     Qualifier::READ,
                     "OsdPatchArray",
                     "patchArrayBuffer[]");
    info.storage_buf(
        SHADER_PATCH_COORDS_BUF_SLOT, Qualifier::READ, "OsdPatchCoord", "patchCoords[]");
    info.storage_buf(
        SHADER_PATCH_INDEX_BUFFER_BUF_SLOT, Qualifier::READ, "int", "patchIndexBuffer[]");
    info.storage_buf(SHADER_PATCH_PARAM_BUFFER_BUF_SLOT,
                     Qualifier::READ,
                     "OsdPatchParam",
                     "patchParamBuffer[]");
  }
  info.compute_source("osd_kernel_comp.glsl");
  GPUShader *shader = GPU_shader_create_from_info(
      reinterpret_cast<const GPUShaderCreateInfo *>(&info));
  return shader;
}
bool GPUComputeEvaluator::Compile(BufferDescriptor const &srcDesc,
BufferDescriptor const &dstDesc,
BufferDescriptor const &duDesc,
@@ -581,6 +474,90 @@ GPUComputeEvaluator::_StencilKernel::~_StencilKernel()
shader = nullptr;
}
}
/* Build and compile the stencil-table evaluation compute shader
 * (osd_eval_stencils_comp.glsl).
 *
 * Buffer descriptors carry (offset, length, stride); first/second derivative
 * outputs are enabled when the corresponding descriptor has a non-zero
 * length. Returns the compiled shader, or nullptr on failure. */
static GPUShader *compile_eval_stencil_shader(BufferDescriptor const &srcDesc,
                                              BufferDescriptor const &dstDesc,
                                              BufferDescriptor const &duDesc,
                                              BufferDescriptor const &dvDesc,
                                              BufferDescriptor const &duuDesc,
                                              BufferDescriptor const &duvDesc,
                                              BufferDescriptor const &dvvDesc,
                                              int workGroupSize)
{
  using namespace blender::gpu::shader;
  ShaderCreateInfo info("opensubdiv_compute_eval");
  info.local_group_size(workGroupSize, 1, 1);
  /* The patch-basis implementation differs per GPU backend. */
  if (GPU_backend_get_type() == GPU_BACKEND_METAL) {
    info.define("OSD_PATCH_BASIS_METAL");
  }
  else {
    info.define("OSD_PATCH_BASIS_GLSL");
  }
  // TODO: use specialization constants for src_stride, dst_stride. Not sure we can use
  // work group size as that requires extensions. This allows us to compile fewer shaders and
  // improve overall performance. Adding length as specialization constant will not work as it is
  // used to define an array length. This is not supported by Metal.
  std::string length = std::to_string(srcDesc.length);
  std::string src_stride = std::to_string(srcDesc.stride);
  std::string dst_stride = std::to_string(dstDesc.stride);
  std::string work_group_size = std::to_string(workGroupSize);
  info.define("LENGTH", length);
  info.define("SRC_STRIDE", src_stride);
  info.define("DST_STRIDE", dst_stride);
  info.define("WORK_GROUP_SIZE", work_group_size);
  info.typedef_source("osd_patch_basis.glsl");
  info.storage_buf(
      SHADER_SRC_VERTEX_BUFFER_BUF_SLOT, Qualifier::READ, "float", "srcVertexBuffer[]");
  info.storage_buf(
      SHADER_DST_VERTEX_BUFFER_BUF_SLOT, Qualifier::WRITE, "float", "dstVertexBuffer[]");
  info.push_constant(Type::INT, "srcOffset");
  info.push_constant(Type::INT, "dstOffset");
  /* A derivative output is requested when its descriptor has a length. */
  bool deriv1 = (duDesc.length > 0 || dvDesc.length > 0);
  bool deriv2 = (duuDesc.length > 0 || duvDesc.length > 0 || dvvDesc.length > 0);
  if (deriv1) {
    info.define("OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES");
    info.storage_buf(SHADER_DU_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duBuffer[]");
    info.storage_buf(SHADER_DV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "dvBuffer[]");
    info.push_constant(Type::IVEC3, "duDesc");
    info.push_constant(Type::IVEC3, "dvDesc");
  }
  if (deriv2) {
    info.define("OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES");
    info.storage_buf(SHADER_DUU_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duuBuffer[]");
    info.storage_buf(SHADER_DUV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duvBuffer[]");
    info.storage_buf(SHADER_DVV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "dvvBuffer[]");
    info.push_constant(Type::IVEC3, "duuDesc");
    info.push_constant(Type::IVEC3, "duvDesc");
    info.push_constant(Type::IVEC3, "dvvDesc");
  }
  /* Stencil tables are inputs only. */
  info.storage_buf(SHADER_SIZES_BUF_SLOT, Qualifier::READ, "int", "sizes_buf[]");
  info.storage_buf(SHADER_OFFSETS_BUF_SLOT, Qualifier::READ, "int", "offsets_buf[]");
  info.storage_buf(SHADER_INDICES_BUF_SLOT, Qualifier::READ, "int", "indices_buf[]");
  info.storage_buf(SHADER_WEIGHTS_BUF_SLOT, Qualifier::READ, "float", "weights_buf[]");
  if (deriv1) {
    /* Derivative weight tables are only ever read by the shader (see
     * osd_eval_stencils_comp.glsl), so bind them READ rather than READ_WRITE:
     * the generated declaration becomes `readonly`, which documents intent
     * and lets drivers optimize. */
    info.storage_buf(SHADER_DU_WEIGHTS_BUF_SLOT, Qualifier::READ, "float", "du_weights_buf[]");
    info.storage_buf(SHADER_DV_WEIGHTS_BUF_SLOT, Qualifier::READ, "float", "dv_weights_buf[]");
  }
  if (deriv2) {
    info.storage_buf(SHADER_DUU_WEIGHTS_BUF_SLOT, Qualifier::READ, "float", "duu_weights_buf[]");
    info.storage_buf(SHADER_DUV_WEIGHTS_BUF_SLOT, Qualifier::READ, "float", "duv_weights_buf[]");
    info.storage_buf(SHADER_DVV_WEIGHTS_BUF_SLOT, Qualifier::READ, "float", "dvv_weights_buf[]");
  }
  /* Range of stencils processed by this dispatch. */
  info.push_constant(Type::INT, "batchStart");
  info.push_constant(Type::INT, "batchEnd");
  info.compute_source("osd_eval_stencils_comp.glsl");
  GPUShader *shader = GPU_shader_create_from_info(
      reinterpret_cast<const GPUShaderCreateInfo *>(&info));
  return shader;
}
bool GPUComputeEvaluator::_StencilKernel::Compile(BufferDescriptor const &srcDesc,
BufferDescriptor const &dstDesc,
@@ -596,8 +573,8 @@ bool GPUComputeEvaluator::_StencilKernel::Compile(BufferDescriptor const &srcDes
shader = nullptr;
}
shader = compileKernel(
srcDesc, dstDesc, duDesc, dvDesc, duuDesc, duvDesc, dvvDesc, true, workGroupSize);
shader = compile_eval_stencil_shader(
srcDesc, dstDesc, duDesc, dvDesc, duuDesc, duvDesc, dvvDesc, workGroupSize);
if (shader == nullptr) {
return false;
}
@@ -627,6 +604,79 @@ GPUComputeEvaluator::_PatchKernel::~_PatchKernel()
}
}
/* Build and compile the patch evaluation compute shader
 * (osd_eval_patches_comp.glsl).
 *
 * Buffer descriptors carry (offset, length, stride); first/second derivative
 * outputs are enabled when the corresponding descriptor has a non-zero
 * length. Returns the compiled shader, or nullptr on failure. */
static GPUShader *compile_eval_patches_shader(BufferDescriptor const &srcDesc,
                                              BufferDescriptor const &dstDesc,
                                              BufferDescriptor const &duDesc,
                                              BufferDescriptor const &dvDesc,
                                              BufferDescriptor const &duuDesc,
                                              BufferDescriptor const &duvDesc,
                                              BufferDescriptor const &dvvDesc,
                                              int workGroupSize)
{
  using namespace blender::gpu::shader;

  /* Derivative outputs are requested when their descriptors have a length. */
  const bool has_first_derivatives = duDesc.length > 0 || dvDesc.length > 0;
  const bool has_second_derivatives = duuDesc.length > 0 || duvDesc.length > 0 ||
                                      dvvDesc.length > 0;

  ShaderCreateInfo info("opensubdiv_compute_eval");
  info.local_group_size(workGroupSize, 1, 1);

  /* The patch-basis implementation differs per GPU backend. */
  info.define(GPU_backend_get_type() == GPU_BACKEND_METAL ? "OSD_PATCH_BASIS_METAL" :
                                                            "OSD_PATCH_BASIS_GLSL");

  // TODO: use specialization constants for src_stride, dst_stride. Not sure we can use
  // work group size as that requires extensions. This allows us to compile fewer shaders and
  // improve overall performance. Adding length as specialization constant will not work as it is
  // used to define an array length. This is not supported by Metal.
  const std::string length_def = std::to_string(srcDesc.length);
  const std::string src_stride_def = std::to_string(srcDesc.stride);
  const std::string dst_stride_def = std::to_string(dstDesc.stride);
  const std::string group_size_def = std::to_string(workGroupSize);
  info.define("LENGTH", length_def);
  info.define("SRC_STRIDE", src_stride_def);
  info.define("DST_STRIDE", dst_stride_def);
  info.define("WORK_GROUP_SIZE", group_size_def);

  info.typedef_source("osd_patch_basis.glsl");

  /* Primvar source/destination buffers and their base offsets. */
  info.storage_buf(
      SHADER_SRC_VERTEX_BUFFER_BUF_SLOT, Qualifier::READ, "float", "srcVertexBuffer[]");
  info.storage_buf(
      SHADER_DST_VERTEX_BUFFER_BUF_SLOT, Qualifier::WRITE, "float", "dstVertexBuffer[]");
  info.push_constant(Type::INT, "srcOffset");
  info.push_constant(Type::INT, "dstOffset");

  if (has_first_derivatives) {
    info.define("OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES");
    info.storage_buf(SHADER_DU_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duBuffer[]");
    info.storage_buf(SHADER_DV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "dvBuffer[]");
    info.push_constant(Type::IVEC3, "duDesc");
    info.push_constant(Type::IVEC3, "dvDesc");
  }
  if (has_second_derivatives) {
    info.define("OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES");
    info.storage_buf(SHADER_DUU_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duuBuffer[]");
    info.storage_buf(SHADER_DUV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "duvBuffer[]");
    info.storage_buf(SHADER_DVV_BUFFER_BUF_SLOT, Qualifier::READ_WRITE, "float", "dvvBuffer[]");
    info.push_constant(Type::IVEC3, "duuDesc");
    info.push_constant(Type::IVEC3, "duvDesc");
    info.push_constant(Type::IVEC3, "dvvDesc");
  }

  /* Patch-evaluation inputs: patch arrays, coords, topology and params. */
  info.storage_buf(
      SHADER_PATCH_ARRAY_BUFFER_BUF_SLOT, Qualifier::READ, "OsdPatchArray", "patchArrayBuffer[]");
  info.storage_buf(
      SHADER_PATCH_COORDS_BUF_SLOT, Qualifier::READ, "OsdPatchCoord", "patchCoords[]");
  info.storage_buf(
      SHADER_PATCH_INDEX_BUFFER_BUF_SLOT, Qualifier::READ, "int", "patchIndexBuffer[]");
  info.storage_buf(
      SHADER_PATCH_PARAM_BUFFER_BUF_SLOT, Qualifier::READ, "OsdPatchParam", "patchParamBuffer[]");

  info.compute_source("osd_eval_patches_comp.glsl");
  return GPU_shader_create_from_info(reinterpret_cast<const GPUShaderCreateInfo *>(&info));
}
bool GPUComputeEvaluator::_PatchKernel::Compile(BufferDescriptor const &srcDesc,
BufferDescriptor const &dstDesc,
BufferDescriptor const &duDesc,
@@ -641,8 +691,8 @@ bool GPUComputeEvaluator::_PatchKernel::Compile(BufferDescriptor const &srcDesc,
shader = nullptr;
}
shader = compileKernel(
srcDesc, dstDesc, duDesc, dvDesc, duuDesc, duvDesc, dvvDesc, false, workGroupSize);
shader = compile_eval_patches_shader(
srcDesc, dstDesc, duDesc, dvDesc, duuDesc, duvDesc, dvvDesc, workGroupSize);
if (shader == nullptr) {
return false;
}

View File

@@ -24,15 +24,6 @@
//------------------------------------------------------------------------------
#if defined(OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS)
// Flatten the 2D dispatch grid into a single linear invocation index
// (row-major over gl_GlobalInvocationID.x/y).
uint getGlobalInvocationIndex()
{
  uint invocations_per_row = gl_WorkGroupSize.x * gl_NumWorkGroups.x;
  return gl_GlobalInvocationID.x + gl_GlobalInvocationID.y * invocations_per_row;
}
#endif
#if defined(OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_PATCHES)
OsdPatchCoord GetPatchCoord(int coordIndex)
{
return patchCoords[coordIndex];
@@ -47,7 +38,6 @@ OsdPatchParam GetPatchParam(int patchIndex)
{
return patchParamBuffer[patchIndex];
}
#endif
//------------------------------------------------------------------------------
@@ -132,76 +122,6 @@ void writeDvv(int index, Vertex dvv)
#endif
//------------------------------------------------------------------------------
#if defined(OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS)
// Stencil evaluation entry point: each invocation computes one destination
// vertex (and optional derivatives) as a weighted sum of source vertices.
void main()
{
  // Map this invocation onto its stencil within [batchStart, batchEnd).
  int current = int(getGlobalInvocationIndex()) + batchStart;
  if (current >= batchEnd) {
    return;
  }
  Vertex dst;
  clear(dst);
  // offsets_buf/sizes_buf locate this stencil's run inside indices_buf/weights_buf.
  int offset = offsets_buf[current], size = sizes_buf[current];
  for (int stencil = 0; stencil < size; ++stencil) {
    int vindex = offset + stencil;
    addWithWeight(dst, readVertex(indices_buf[vindex]), weights_buf[vindex]);
  }
  writeVertex(current, dst);
#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
  // First derivatives reuse the same stencil indices with dedicated weight tables.
  Vertex du, dv;
  clear(du);
  clear(dv);
  for (int i = 0; i < size; ++i) {
    // expects the compiler optimizes readVertex out here.
    Vertex src = readVertex(indices_buf[offset + i]);
    addWithWeight(du, src, du_weights_buf[offset + i]);
    addWithWeight(dv, src, dv_weights_buf[offset + i]);
  }
  // Descriptor .y holds the length; only write outputs that were requested.
  if (duDesc.y > 0) {  // length
    writeDu(current, du);
  }
  if (dvDesc.y > 0) {
    writeDv(current, dv);
  }
#endif
#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
  Vertex duu, duv, dvv;
  clear(duu);
  clear(duv);
  clear(dvv);
  for (int i = 0; i < size; ++i) {
    // expects the compiler optimizes readVertex out here.
    Vertex src = readVertex(indices_buf[offset + i]);
    addWithWeight(duu, src, duu_weights_buf[offset + i]);
    addWithWeight(duv, src, duv_weights_buf[offset + i]);
    addWithWeight(dvv, src, dvv_weights_buf[offset + i]);
  }
  if (duuDesc.y > 0) {  // length
    writeDuu(current, duu);
  }
  if (duvDesc.y > 0) {
    writeDuv(current, duv);
  }
  if (dvvDesc.y > 0) {
    writeDvv(current, dvv);
  }
#endif
}
#endif
//------------------------------------------------------------------------------
#if defined(OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_PATCHES)
// PERFORMANCE: stride could be constant, but not as significant as length
void main()
@@ -240,15 +160,15 @@ void main()
}
writeVertex(current, dst);
# if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
if (duDesc.y > 0) { // length
writeDu(current, du);
}
if (dvDesc.y > 0) {
writeDv(current, dv);
}
# endif
# if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
#endif
#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
if (duuDesc.y > 0) { // length
writeDuu(current, duu);
}
@@ -258,7 +178,5 @@ void main()
if (dvvDesc.y > 0) {
writeDvv(current, dvv);
}
# endif
}
#endif
}

View File

@@ -0,0 +1,178 @@
//
// Copyright 2013 Pixar
//
// Licensed under the Apache License, Version 2.0 (the "Apache License")
// with the following modification; you may not use this file except in
// compliance with the Apache License and the following modification to it:
// Section 6. Trademarks. is deleted and replaced with:
//
// 6. Trademarks. This License does not grant permission to use the trade
// names, trademarks, service marks, or product names of the Licensor
// and its affiliates, except as required to comply with Section 4(c) of
// the License and to reproduce the content of the NOTICE file.
//
// You may obtain a copy of the Apache License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Apache License with the above modification is
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the Apache License for the specific
// language governing permissions and limitations under the Apache License.
//
//------------------------------------------------------------------------------
// Flatten the 2D dispatch grid into one linear invocation index (row-major).
uint getGlobalInvocationIndex()
{
  uint row_width = gl_WorkGroupSize.x * gl_NumWorkGroups.x;
  return gl_GlobalInvocationID.y * row_width + gl_GlobalInvocationID.x;
}
//------------------------------------------------------------------------------
// One primvar element: LENGTH floats of interleaved vertex data.
struct Vertex {
  float vertexData[LENGTH];
};
// Zero-initialize every component of `v`.
void clear(out Vertex v)
{
  for (int c = 0; c < LENGTH; ++c) {
    v.vertexData[c] = 0;
  }
}
// Load vertex `index` from the source buffer, honoring srcOffset/SRC_STRIDE.
Vertex readVertex(int index)
{
  Vertex result;
  int base = srcOffset + index * SRC_STRIDE;
  for (int c = 0; c < LENGTH; ++c) {
    result.vertexData[c] = srcVertexBuffer[base + c];
  }
  return result;
}
// Store `v` at slot `index` in the destination buffer, honoring dstOffset/DST_STRIDE.
void writeVertex(int index, Vertex v)
{
  int base = dstOffset + index * DST_STRIDE;
  for (int c = 0; c < LENGTH; ++c) {
    dstVertexBuffer[base + c] = v.vertexData[c];
  }
}
// Accumulate `src` scaled by `weight` into `v`, component by component.
void addWithWeight(inout Vertex v, const Vertex src, float weight)
{
  for (int c = 0; c < LENGTH; ++c) {
    v.vertexData[c] += src.vertexData[c] * weight;
  }
}
#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
// Store the u-derivative; duDesc packs (offset, length, stride) as .x/.y/.z.
void writeDu(int index, Vertex du)
{
  int base = duDesc.x + index * duDesc.z;
  for (int c = 0; c < LENGTH; ++c) {
    duBuffer[base + c] = du.vertexData[c];
  }
}
// Store the v-derivative; dvDesc packs (offset, length, stride) as .x/.y/.z.
void writeDv(int index, Vertex dv)
{
  int base = dvDesc.x + index * dvDesc.z;
  for (int c = 0; c < LENGTH; ++c) {
    dvBuffer[base + c] = dv.vertexData[c];
  }
}
#endif
#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
// Store the uu second derivative; duuDesc packs (offset, length, stride).
void writeDuu(int index, Vertex duu)
{
  int base = duuDesc.x + index * duuDesc.z;
  for (int c = 0; c < LENGTH; ++c) {
    duuBuffer[base + c] = duu.vertexData[c];
  }
}
// Store the uv mixed second derivative; duvDesc packs (offset, length, stride).
void writeDuv(int index, Vertex duv)
{
  int base = duvDesc.x + index * duvDesc.z;
  for (int c = 0; c < LENGTH; ++c) {
    duvBuffer[base + c] = duv.vertexData[c];
  }
}
// Store the vv second derivative; dvvDesc packs (offset, length, stride).
void writeDvv(int index, Vertex dvv)
{
  int base = dvvDesc.x + index * dvvDesc.z;
  for (int c = 0; c < LENGTH; ++c) {
    dvvBuffer[base + c] = dvv.vertexData[c];
  }
}
#endif
//------------------------------------------------------------------------------
// Stencil evaluation entry point: each invocation computes one destination
// vertex (and optional derivatives) as a weighted sum of source vertices.
void main()
{
  // Map this invocation onto its stencil within [batchStart, batchEnd).
  int current = int(getGlobalInvocationIndex()) + batchStart;
  if (current >= batchEnd) {
    return;
  }
  Vertex dst;
  clear(dst);
  // offsets_buf/sizes_buf locate this stencil's run inside indices_buf/weights_buf.
  int offset = offsets_buf[current], size = sizes_buf[current];
  for (int stencil = 0; stencil < size; ++stencil) {
    int vindex = offset + stencil;
    addWithWeight(dst, readVertex(indices_buf[vindex]), weights_buf[vindex]);
  }
  writeVertex(current, dst);
#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
  // First derivatives reuse the same stencil indices with dedicated weight tables.
  Vertex du, dv;
  clear(du);
  clear(dv);
  for (int i = 0; i < size; ++i) {
    // expects the compiler optimizes readVertex out here.
    Vertex src = readVertex(indices_buf[offset + i]);
    addWithWeight(du, src, du_weights_buf[offset + i]);
    addWithWeight(dv, src, dv_weights_buf[offset + i]);
  }
  // Descriptor .y holds the length; only write outputs that were requested.
  if (duDesc.y > 0) {  // length
    writeDu(current, du);
  }
  if (dvDesc.y > 0) {
    writeDv(current, dv);
  }
#endif
#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
  Vertex duu, duv, dvv;
  clear(duu);
  clear(duv);
  clear(dvv);
  for (int i = 0; i < size; ++i) {
    // expects the compiler optimizes readVertex out here.
    Vertex src = readVertex(indices_buf[offset + i]);
    addWithWeight(duu, src, duu_weights_buf[offset + i]);
    addWithWeight(duv, src, duv_weights_buf[offset + i]);
    addWithWeight(dvv, src, dvv_weights_buf[offset + i]);
  }
  if (duuDesc.y > 0) {  // length
    writeDuu(current, duu);
  }
  if (duvDesc.y > 0) {
    writeDuv(current, duv);
  }
  if (dvvDesc.y > 0) {
    writeDvv(current, dvv);
  }
#endif
}