Metal: Simplify MSL translation

Move most of the string preprocessing used for MSL
compatibility to `glsl_preprocess`.

Enforce some changes in the GLSL codebase, such as the matrix and
array constructor syntax, for C++ compatibility.

Additionally reduce the amount of code duplication
inside the compatibility code.

Pull Request: https://projects.blender.org/blender/blender/pulls/128634
Authored by Clément Foucault on 2024-10-07 12:54:10 +02:00, committed by Clément Foucault.
Parent 9e604f0e00, commit 9c0321ae9b.
64 changed files with 1519 additions and 2928 deletions.
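
For illustration, the rewrite this enforces can be reproduced with a few lines of standalone C++ using the same array-constructor regex that the new `glsl_preprocess.hh` header applies (see `array_constructor_macro_injection` further down). This is only a sketch: the sample GLSL line is made up, and `ARRAY_T`/`ARRAY_V` are assumed to be resolved by each backend's compatibility headers.

#include <iostream>
#include <regex>
#include <string>

int main()
{
  /* Legacy GLSL-style array constructor; the linter now flags this form and
   * sources are expected to spell it float4_array(...) instead. */
  std::string legacy = "vec4 inputs[2] = vec4[](a, b);";
  /* Same regex as array_constructor_macro_injection() in glsl_preprocess.hh. */
  std::regex regex("=\\s*(\\w+)\\s*\\[[^\\]]*\\]\\s*\\(");
  std::cout << std::regex_replace(legacy, regex, "= ARRAY_T($1) ARRAY_V(") << "\n";
  /* Prints: vec4 inputs[2] = ARRAY_T(vec4) ARRAY_V(a, b); */
  return 0;
}

The matrix-constructor change works the same way at the source level: reshaping constructors such as `mat3(some_mat4)` are spelled `to_float3x3(some_mat4)` so the same expression also compiles as C++/MSL, which is what most of the hunks below do.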


@@ -35,7 +35,7 @@ void main()
* boundary condition, so we initialize all inputs by the boundary pixel. */
ivec2 boundary_texel = is_causal ? ivec2(0, y) : ivec2(width - 1, y);
vec4 input_boundary = texture_load(input_tx, boundary_texel);
vec4 inputs[FILTER_ORDER + 1] = vec4[](
vec4 inputs[FILTER_ORDER + 1] = float4_array(
input_boundary, input_boundary, input_boundary, input_boundary, input_boundary);
/* Create an array that holds the last FILTER_ORDER outputs along with the current output. The
@@ -44,7 +44,7 @@ void main()
* boundary coefficient. See the DericheGaussianCoefficients class for more information on the
* boundary handing. */
vec4 output_boundary = input_boundary * boundary_coefficient;
vec4 outputs[FILTER_ORDER + 1] = vec4[](
vec4 outputs[FILTER_ORDER + 1] = float4_array(
output_boundary, output_boundary, output_boundary, output_boundary, output_boundary);
for (int x = 0; x < width; x++) {


@@ -16,8 +16,9 @@ void main()
int radius = max(0, size);
#endif
vec4 mean_of_squared_color_of_quadrants[4] = vec4[](vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0));
vec4 mean_of_color_of_quadrants[4] = vec4[](vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0));
vec4 mean_of_squared_color_of_quadrants[4] = float4_array(
vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0));
vec4 mean_of_color_of_quadrants[4] = float4_array(vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0));
/* Compute the above statistics for each of the quadrants around the current pixel. */
for (int q = 0; q < 4; q++) {


@@ -11,7 +11,7 @@ void main()
vec2 coordinates = (vec2(texel) + vec2(0.5)) / output_size;
vec3 transformed_coordinates = mat3(homography_matrix) * vec3(coordinates, 1.0);
vec3 transformed_coordinates = to_float3x3(homography_matrix) * vec3(coordinates, 1.0);
vec2 projected_coordinates = transformed_coordinates.xy / transformed_coordinates.z;
/* The derivatives of the projected coordinates with respect to x and y are the first and


@@ -8,7 +8,7 @@ void main()
vec2 coordinates = (vec2(texel) + vec2(0.5)) / vec2(imageSize(mask_img));
vec3 transformed_coordinates = mat3(homography_matrix) * vec3(coordinates, 1.0);
vec3 transformed_coordinates = to_float3x3(homography_matrix) * vec3(coordinates, 1.0);
vec2 projected_coordinates = transformed_coordinates.xy / transformed_coordinates.z;
bool is_inside_plane = all(greaterThanEqual(projected_coordinates, vec2(0.0))) &&


@@ -13,7 +13,7 @@ void main()
vec4 accumulated_color = vec4(0.0);
for (int i = 0; i < number_of_motion_blur_samples; i++) {
mat3 homography_matrix = mat3(homography_matrices[i]);
mat3 homography_matrix = to_float3x3(homography_matrices[i]);
vec3 transformed_coordinates = homography_matrix * vec3(coordinates, 1.0);
vec2 projected_coordinates = transformed_coordinates.xy / transformed_coordinates.z;


@@ -10,7 +10,7 @@ void main()
float accumulated_mask = 0.0;
for (int i = 0; i < number_of_motion_blur_samples; i++) {
mat3 homography_matrix = mat3(homography_matrices[i]);
mat3 homography_matrix = to_float3x3(homography_matrices[i]);
vec3 transformed_coordinates = homography_matrix * vec3(coordinates, 1.0);
vec2 projected_coordinates = transformed_coordinates.xy / transformed_coordinates.z;


@@ -15,7 +15,7 @@ void main()
/* Transform the input image by transforming the domain coordinates with the inverse of input
* image's transformation. The inverse transformation is an affine matrix and thus the
* coordinates should be in homogeneous coordinates. */
coordinates = (mat3(inverse_transformation) * vec3(coordinates, 1.0)).xy;
coordinates = (to_float3x3(inverse_transformation) * vec3(coordinates, 1.0)).xy;
/* Subtract the offset and divide by the input image size to get the relevant coordinates into
* the sampler's expected [0, 1] range. */


@@ -56,7 +56,7 @@ void main()
* boundary condition, so we initialize all inputs by the boundary pixel. */
ivec2 boundary_texel = is_causal ? ivec2(0, y) : ivec2(width - 1, y);
vec4 input_boundary = texture_load(input_tx, boundary_texel);
vec4 inputs[FILTER_ORDER + 1] = vec4[](input_boundary, input_boundary, input_boundary);
vec4 inputs[FILTER_ORDER + 1] = float4_array(input_boundary, input_boundary, input_boundary);
/* Create an array that holds the last FILTER_ORDER outputs along with the current output. The
* current output is at index 0 and the oldest output is at index FILTER_ORDER. We assume Neumann
@@ -64,7 +64,7 @@ void main()
* boundary coefficient. See the VanVlietGaussianCoefficients class for more information on the
* boundary handing. */
vec4 output_boundary = input_boundary * boundary_coefficient;
vec4 outputs[FILTER_ORDER + 1] = vec4[](output_boundary, output_boundary, output_boundary);
vec4 outputs[FILTER_ORDER + 1] = float4_array(output_boundary, output_boundary, output_boundary);
for (int x = 0; x < width; x++) {
/* Run forward across rows for the causal filter and backward for the non causal filter. */


@@ -57,8 +57,8 @@ const float dof_layer_offset = 0.5 + 0.5;
const int dof_max_slight_focus_radius = DOF_MAX_SLIGHT_FOCUS_RADIUS;
const uvec2 quad_offsets_u[4] = uvec2[4](uvec2(0, 1), uvec2(1, 1), uvec2(1, 0), uvec2(0, 0));
const vec2 quad_offsets[4] = vec2[4](
const uvec2 quad_offsets_u[4] = uint2_array(uvec2(0, 1), uvec2(1, 1), uvec2(1, 0), uvec2(0, 0));
const vec2 quad_offsets[4] = float2_array(
vec2(-0.5, 0.5), vec2(0.5, 0.5), vec2(0.5, -0.5), vec2(-0.5, -0.5));
/** \} */


@@ -80,7 +80,7 @@ vec3 dof_neighborhood_clamp(vec2 frag_coord, vec3 color, float center_coc, float
{
/* Stabilize color by clamping with the stable half res neighborhood. */
vec3 neighbor_min, neighbor_max;
const vec2 corners[4] = vec2[4](vec2(-1, -1), vec2(1, -1), vec2(-1, 1), vec2(1, 1));
const vec2 corners[4] = float2_array(vec2(-1, -1), vec2(1, -1), vec2(-1, 1), vec2(1, 1));
for (int i = 0; i < 4; i++) {
/**
* Visit the 4 half-res texels around (and containing) the full-resolution texel.


@@ -125,7 +125,7 @@ float dof_bilateral_weight(float reference_coc, float sample_coc)
DofSample dof_spatial_filtering()
{
/* Plus (+) shape offsets. */
const ivec2 plus_offsets[4] = ivec2[4](ivec2(-1, 0), ivec2(0, -1), ivec2(1, 0), ivec2(0, 1));
const ivec2 plus_offsets[4] = int2_array(ivec2(-1, 0), ivec2(0, -1), ivec2(1, 0), ivec2(0, 1));
DofSample center = dof_fetch_input_sample(ivec2(0));
DofSample accum = DofSample(vec4(0.0), 0.0);
float accum_weight = 0.0;
@@ -165,7 +165,7 @@ struct DofNeighborhoodMinMax {
DofNeighborhoodMinMax dof_neighbor_boundbox()
{
/* Plus (+) shape offsets. */
const ivec2 plus_offsets[4] = ivec2[4](ivec2(-1, 0), ivec2(0, -1), ivec2(1, 0), ivec2(0, 1));
const ivec2 plus_offsets[4] = int2_array(ivec2(-1, 0), ivec2(0, -1), ivec2(1, 0), ivec2(0, 1));
/**
* Simple bounding box calculation in YCoCg as described in:
* "High Quality Temporal Supersampling" by Brian Karis at SIGGRAPH 2014
@@ -183,7 +183,7 @@ DofNeighborhoodMinMax dof_neighbor_boundbox()
* Round bbox shape by averaging 2 different min/max from 2 different neighborhood. */
DofSample min_c_3x3 = min_c;
DofSample max_c_3x3 = max_c;
const ivec2 corners[4] = ivec2[4](ivec2(-1, -1), ivec2(1, -1), ivec2(-1, 1), ivec2(1, 1));
const ivec2 corners[4] = int2_array(ivec2(-1, -1), ivec2(1, -1), ivec2(-1, 1), ivec2(1, 1));
for (int i = 0; i < 4; i++) {
DofSample samp = dof_fetch_input_sample(corners[i]);
min_c_3x3.color = min(min_c_3x3.color, samp.color);
@@ -206,7 +206,7 @@ vec2 dof_pixel_history_motion_vector(ivec2 texel_sample)
* Dilate velocity by using the nearest pixel in a cross pattern.
* "High Quality Temporal Supersampling" by Brian Karis at SIGGRAPH 2014 (Slide 27)
*/
const ivec2 corners[4] = ivec2[4](ivec2(-2, -2), ivec2(2, -2), ivec2(-2, 2), ivec2(2, 2));
const ivec2 corners[4] = int2_array(ivec2(-2, -2), ivec2(2, -2), ivec2(-2, 2), ivec2(2, 2));
float min_depth = dof_fetch_half_depth(ivec2(0));
ivec2 nearest_texel = ivec2(0);
for (int i = 0; i < 4; i++) {


@@ -9,13 +9,13 @@ void main()
{
/* Constant array moved inside function scope.
* Minimizes local register allocation in MSL. */
const vec2 pos[6] = vec2[6](vec2(-1.0, -1.0),
vec2(1.0, -1.0),
vec2(-1.0, 1.0),
const vec2 pos[6] = float2_array(vec2(-1.0, -1.0),
vec2(1.0, -1.0),
vec2(-1.0, 1.0),
vec2(1.0, -1.0),
vec2(1.0, 1.0),
vec2(-1.0, 1.0));
vec2(1.0, -1.0),
vec2(1.0, 1.0),
vec2(-1.0, 1.0));
vec2 lP = pos[gl_VertexID % 6];
int display_index = gl_VertexID / 6;


@@ -9,13 +9,13 @@ void main()
{
/* Constant array moved inside function scope.
* Minimizes local register allocation in MSL. */
const vec2 pos[6] = vec2[6](vec2(-1.0, -1.0),
vec2(1.0, -1.0),
vec2(-1.0, 1.0),
const vec2 pos[6] = float2_array(vec2(-1.0, -1.0),
vec2(1.0, -1.0),
vec2(-1.0, 1.0),
vec2(1.0, -1.0),
vec2(1.0, 1.0),
vec2(-1.0, 1.0));
vec2(1.0, -1.0),
vec2(1.0, 1.0),
vec2(-1.0, 1.0));
lP = pos[gl_VertexID % 6];
int display_index = gl_VertexID / 6;


@@ -9,13 +9,13 @@ void main()
{
/* Constant array moved inside function scope.
* Minimizes local register allocation in MSL. */
const vec2 pos[6] = vec2[6](vec2(-1.0, -1.0),
vec2(1.0, -1.0),
vec2(-1.0, 1.0),
const vec2 pos[6] = float2_array(vec2(-1.0, -1.0),
vec2(1.0, -1.0),
vec2(-1.0, 1.0),
vec2(1.0, -1.0),
vec2(1.0, 1.0),
vec2(-1.0, 1.0));
vec2(1.0, -1.0),
vec2(1.0, 1.0),
vec2(-1.0, 1.0));
lP = pos[gl_VertexID % 6];
int cell_index = gl_VertexID / 6;


@@ -195,7 +195,7 @@ void film_cryptomatte_layer_accum_and_store(
return;
}
/* x = hash, y = accumulated weight. Only keep track of 4 highest weighted samples. */
vec2 crypto_samples[4] = vec2[4](vec2(0.0), vec2(0.0), vec2(0.0), vec2(0.0));
vec2 crypto_samples[4] = float2_array(vec2(0.0), vec2(0.0), vec2(0.0), vec2(0.0));
for (int i = 0; i < samples_len; i++) {
FilmSample src = film_sample_get(i, texel_film);
film_sample_cryptomatte_accum(src, layer_component, cryptomatte_tx, crypto_samples);
@@ -246,7 +246,7 @@ vec2 film_pixel_history_motion_vector(ivec2 texel_sample)
* Dilate velocity by using the nearest pixel in a cross pattern.
* "High Quality Temporal Supersampling" by Brian Karis at SIGGRAPH 2014 (Slide 27)
*/
const ivec2 corners[4] = ivec2[4](ivec2(-2, -2), ivec2(2, -2), ivec2(-2, 2), ivec2(2, 2));
const ivec2 corners[4] = int2_array(ivec2(-2, -2), ivec2(2, -2), ivec2(-2, 2), ivec2(2, 2));
float min_depth = texelFetch(depth_tx, texel_sample, 0).x;
ivec2 nearest_texel = texel_sample;
for (int i = 0; i < 4; i++) {
@@ -342,11 +342,11 @@ vec4 film_sample_catmull_rom(sampler2D color_tx, vec2 input_texel)
void film_combined_neighbor_boundbox(ivec2 texel, out vec4 min_c, out vec4 max_c)
{
/* Plus (+) shape offsets. */
const ivec2 plus_offsets[5] = ivec2[5](ivec2(0, 0), /* Center */
ivec2(-1, 0),
ivec2(0, -1),
ivec2(1, 0),
ivec2(0, 1));
const ivec2 plus_offsets[5] = int2_array(ivec2(0, 0), /* Center */
ivec2(-1, 0),
ivec2(0, -1),
ivec2(1, 0),
ivec2(0, 1));
#if 0
/**
* Compute Variance of neighborhood as described in:
@@ -389,7 +389,7 @@ void film_combined_neighbor_boundbox(ivec2 texel, out vec4 min_c, out vec4 max_c
* Round bbox shape by averaging 2 different min/max from 2 different neighborhood. */
vec4 min_c_3x3 = min_c;
vec4 max_c_3x3 = max_c;
const ivec2 corners[4] = ivec2[4](ivec2(-1, -1), ivec2(1, -1), ivec2(-1, 1), ivec2(1, 1));
const ivec2 corners[4] = int2_array(ivec2(-1, -1), ivec2(1, -1), ivec2(-1, 1), ivec2(1, 1));
for (int i = 0; i < 4; i++) {
vec4 color = film_texelfetch_as_YCoCg_opacity(combined_tx, texel + corners[i]);
min_c_3x3 = min(min_c_3x3, color);


@@ -220,22 +220,22 @@ void cubeface_sync(int tilemap_id,
/* Use switch instead of inline array of float3x3. */
switch (cubeface) {
case Z_NEG:
viewmat = mat4x4(mat3x3(+1, +0, +0, +0, +1, +0, +0, +0, +1)) * viewmat;
viewmat = to_float4x4(mat3x3(+1, +0, +0, +0, +1, +0, +0, +0, +1)) * viewmat;
break;
case X_POS:
viewmat = mat4x4(mat3x3(+0, +0, -1, -1, +0, +0, +0, +1, +0)) * viewmat;
viewmat = to_float4x4(mat3x3(+0, +0, -1, -1, +0, +0, +0, +1, +0)) * viewmat;
break;
case X_NEG:
viewmat = mat4x4(mat3x3(+0, +0, +1, +1, +0, +0, +0, +1, +0)) * viewmat;
viewmat = to_float4x4(mat3x3(+0, +0, +1, +1, +0, +0, +0, +1, +0)) * viewmat;
break;
case Y_POS:
viewmat = mat4x4(mat3x3(+1, +0, +0, +0, +0, -1, +0, +1, +0)) * viewmat;
viewmat = to_float4x4(mat3x3(+1, +0, +0, +0, +0, -1, +0, +1, +0)) * viewmat;
break;
case Y_NEG:
viewmat = mat4x4(mat3x3(-1, +0, +0, +0, +0, +1, +0, +1, +0)) * viewmat;
viewmat = to_float4x4(mat3x3(-1, +0, +0, +0, +0, +1, +0, +1, +0)) * viewmat;
break;
case Z_POS:
viewmat = mat4x4(mat3x3(+1, +0, +0, +0, -1, +0, +0, +0, -1)) * viewmat;
viewmat = to_float4x4(mat3x3(+1, +0, +0, +0, -1, +0, +0, +0, -1)) * viewmat;
break;
}
@@ -289,7 +289,7 @@ void main()
* set to 0 only when the light radius is also 0 to detect this case. */
}
else {
light.object_to_world = transform_from_matrix(mat4x4(from_up_axis(shadow_direction)));
light.object_to_world = transform_from_matrix(to_float4x4(from_up_axis(shadow_direction)));
}
}


@@ -49,7 +49,7 @@ vec3 lightprobe_sphere_parallax(SphereProbeData probe, vec3 P, vec3 L)
}
/* Correct reflection ray using parallax volume intersection. */
vec3 lP = vec4(P, 1.0) * probe.world_to_probe_transposed;
vec3 lL = (mat3x3(probe.world_to_probe_transposed) * L) / probe.parallax_distance;
vec3 lL = (to_float3x3(probe.world_to_probe_transposed) * L) / probe.parallax_distance;
float dist = (probe.parallax_shape == SHAPE_ELIPSOID) ? line_unit_sphere_intersect_dist(lP, lL) :
line_unit_box_intersect_dist(lP, lL);


@@ -170,7 +170,7 @@ SphericalHarmonicL1 lightprobe_volume_sample(
VolumeProbeData grid_data = grids_infos_buf[index];
mat3x3 world_to_grid_transposed = mat3x3(grid_data.world_to_grid_transposed);
mat3x3 world_to_grid_transposed = to_float3x3(grid_data.world_to_grid_transposed);
vec3 lNg = safe_normalize(Ng * world_to_grid_transposed);
vec3 lV = safe_normalize(V * world_to_grid_transposed);


@@ -104,7 +104,8 @@ void main()
}
/* Rotate Spherical Harmonic into world space. */
mat3 grid_to_world_rot = normalize(mat3(grids_infos_buf[grid_index].world_to_grid_transposed));
mat3 grid_to_world_rot = normalize(
to_float3x3(grids_infos_buf[grid_index].world_to_grid_transposed));
sh_local = spherical_harmonics_rotate(grid_to_world_rot, sh_local);
SphericalHarmonicL1 sh_visibility;


@@ -36,7 +36,7 @@ void main()
sp = bone_mat * sp.xzy + headSphere.xyz;
nor = bone_mat * nor.xzy;
normalView = mat3(drw_view.viewmat) * nor;
normalView = to_float3x3(drw_view.viewmat) * nor;
finalStateColor = stateColor;
finalBoneColor = boneColor;


@@ -34,7 +34,7 @@ VertOut vertex_main(VertIn v_in)
/* This is slow and run per vertex, but it's still faster than
* doing it per instance on CPU and sending it on via instance attribute. */
mat3 normal_mat = transpose(inverse(mat3(model_mat)));
mat3 normal_mat = transpose(inverse(to_float3x3(model_mat)));
VertOut v_out;
v_out.vPos = view_pos.xyz;


@@ -39,7 +39,7 @@ void do_vertex_shader(mat4 in_inst_obmat,
/* This is slow and run per vertex, but it's still faster than
* doing it per instance on CPU and sending it on via instance attribute. */
mat3 normal_mat = transpose(inverse(mat3(model_mat)));
mat3 normal_mat = transpose(inverse(to_float3x3(model_mat)));
out_ssPos = proj(out_pPos);


@@ -15,7 +15,7 @@ void main()
/* This is slow and run per vertex, but it's still faster than
* doing it per instance on CPU and sending it on via instance attribute. */
mat3 normal_mat = transpose(inverse(mat3(model_mat)));
mat3 normal_mat = transpose(inverse(to_float3x3(model_mat)));
vec3 normal = normalize(normal_world_to_view(normal_mat * nor));
inverted = int(dot(cross(model_mat[0].xyz, model_mat[1].xyz), model_mat[2].xyz) < 0.0);


@@ -27,7 +27,7 @@ void main()
* In perspective mode it's also the view-space position
* of the sphere center. */
vec3 cam_ray = (is_persp) ? model_view_matrix[3].xyz : vec3(0.0, 0.0, -1.0);
cam_ray = mat3(sphereMatrix) * cam_ray;
cam_ray = to_float3x3(sphereMatrix) * cam_ray;
/* Sphere center distance from the camera (persp) in local space. */
float cam_dist = length(cam_ray);


@@ -25,7 +25,7 @@ void main()
* In perspective mode it's also the view-space position
* of the sphere center. */
vec3 cam_ray = (is_persp) ? model_view_matrix[3].xyz : vec3(0.0, 0.0, -1.0);
cam_ray = mat3(sphereMatrix) * cam_ray;
cam_ray = to_float3x3(sphereMatrix) * cam_ray;
/* Sphere center distance from the camera (persp) in local space. */
float cam_dist = length(cam_ray);


@@ -11,10 +11,10 @@ float dither(void)
{
/* NOTE(Metal): Declaring constant array in function scope to avoid increasing local shader
* memory pressure. */
const vec4 dither_mat4x4[4] = vec4[4](vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
vec4(P(15.0), P(7.0), P(13.0), P(5.0)));
const vec4 dither_mat4x4[4] = float4_array(vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
vec4(P(15.0), P(7.0), P(13.0), P(5.0)));
ivec2 co = ivec2(gl_FragCoord.xy) % 4;
return dither_mat4x4[co.x][co.y];


@@ -8,7 +8,7 @@
void main()
{
mat3 imat = mat3(ModelMatrixInverse);
mat3 imat = to_float3x3(ModelMatrixInverse);
vec3 right = normalize(imat * ViewMatrixInverse[0].xyz);
vec3 up = normalize(imat * ViewMatrixInverse[1].xyz);
#ifdef VERTEX_PULL


@@ -61,7 +61,7 @@ void main()
thick_time = thickness * (thick_time * 2.0 - 1.0);
/* Take object scale into account.
* NOTE: This only works fine with uniform scaling. */
float scale = 1.0 / length(mat3(ModelMatrixInverse) * binor);
float scale = 1.0 / length(to_float3x3(ModelMatrixInverse) * binor);
world_pos = center_wpos + binor * thick_time * scale;
}
else {


@@ -80,7 +80,7 @@ void main()
/* Weights are available */
else {
float weight = weight_interp.x;
vec4 weight_color = texture(colorramp, weight, 0);
vec4 weight_color = texture(colorramp, weight);
weight_color = apply_color_fac(weight_color);
/* Contour display */


@@ -87,14 +87,14 @@ void main()
#endif
/* NOTE(Metal): Declaring constant arrays in function scope to avoid increasing local shader
* memory pressure. */
const int indices[8] = int[8](0, 1, 1, 2, 2, 3, 3, 0);
const int indices[8] = int_array(0, 1, 1, 2, 2, 3, 3, 0);
/* Corners for cell outlines. 0.45 is arbitrary. Any value below 0.5 can be used to avoid
* overlapping of the outlines. */
const vec3 corners[4] = vec3[4](vec3(-0.45, 0.45, 0.0),
vec3(0.45, 0.45, 0.0),
vec3(0.45, -0.45, 0.0),
vec3(-0.45, -0.45, 0.0));
const vec3 corners[4] = float3_array(vec3(-0.45, 0.45, 0.0),
vec3(0.45, 0.45, 0.0),
vec3(0.45, -0.45, 0.0),
vec3(-0.45, -0.45, 0.0));
vec3 pos = domainOriginOffset + cellSize * (vec3(cell_co + adaptiveCellOffset) + cell_offset);
vec3 rotated_pos = rot_mat * corners[indices[gl_VertexID % 8]];


@@ -178,12 +178,12 @@ void main()
# ifdef USE_NEEDLE
/* NOTE(Metal): Declaring constant arrays in function scope to avoid increasing local shader
* memory pressure. */
const vec3 corners[4] = vec3[4](vec3(0.0, 0.2, -0.5),
vec3(-0.2 * 0.866, -0.2 * 0.5, -0.5),
vec3(0.2 * 0.866, -0.2 * 0.5, -0.5),
vec3(0.0, 0.0, 0.5));
const vec3 corners[4] = float3_array(vec3(0.0, 0.2, -0.5),
vec3(-0.2 * 0.866, -0.2 * 0.5, -0.5),
vec3(0.2 * 0.866, -0.2 * 0.5, -0.5),
vec3(0.0, 0.0, 0.5));
const int indices[12] = int[12](0, 1, 1, 2, 2, 0, 0, 3, 1, 3, 2, 3);
const int indices[12] = int_array(0, 1, 1, 2, 2, 0, 0, 3, 1, 3, 2, 3);
vec3 rotated_pos = rot_mat * corners[indices[gl_VertexID % 12]];
pos += rotated_pos * vector_length * displaySize * cellSize;


@@ -100,7 +100,7 @@ void main()
if (isHair) {
mat4 obmat = hairDupliMatrix;
wpos = (obmat * vec4(pos, 1.0)).xyz;
wnor = -normalize(mat3(obmat) * nor);
wnor = -normalize(to_float3x3(obmat) * nor);
}
bool is_persp = (drw_view.winmat[3][3] == 0.0);


@@ -190,10 +190,10 @@ vec4 volume_integration(vec3 ray_ori, vec3 ray_dir, float ray_inc, float ray_max
{
/* NOTE: Constant array declared inside function scope to reduce shader core thread memory
* pressure on Apple Silicon. */
const vec4 dither_mat[4] = vec4[4](vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
vec4(P(15.0), P(7.0), P(13.0), P(5.0)));
const vec4 dither_mat[4] = float4_array(vec4(P(0.0), P(8.0), P(2.0), P(10.0)),
vec4(P(12.0), P(4.0), P(14.0), P(6.0)),
vec4(P(3.0), P(11.0), P(1.0), P(9.0)),
vec4(P(15.0), P(7.0), P(13.0), P(5.0)));
/* Start with full transmittance and no scattered light. */
vec3 final_scattering = vec3(0.0);
float final_transmittance = 1.0;


@@ -176,7 +176,7 @@ void drw_debug_sphere(vec3 p, float radius)
*/
void drw_debug_matrix(mat4 mat, vec4 v_color)
{
vec4 p[4] = vec4[4](vec4(0, 0, 0, 1), vec4(1, 0, 0, 1), vec4(0, 1, 0, 1), vec4(0, 0, 1, 1));
vec4 p[4] = float4_array(vec4(0, 0, 0, 1), vec4(1, 0, 0, 1), vec4(0, 1, 0, 1), vec4(0, 0, 1, 1));
for (int i = 0; i < 4; i++) {
p[i] = mat * p[i];
p[i].xyz /= p[i].w;
@@ -195,14 +195,14 @@ void drw_debug_matrix(mat4 mat)
*/
void drw_debug_matrix_as_bbox(mat4 mat, vec4 v_color)
{
vec4 p[8] = vec4[8](vec4(-1, -1, -1, 1),
vec4(1, -1, -1, 1),
vec4(1, 1, -1, 1),
vec4(-1, 1, -1, 1),
vec4(-1, -1, 1, 1),
vec4(1, -1, 1, 1),
vec4(1, 1, 1, 1),
vec4(-1, 1, 1, 1));
vec4 p[8] = float4_array(vec4(-1, -1, -1, 1),
vec4(1, -1, -1, 1),
vec4(1, 1, -1, 1),
vec4(-1, 1, -1, 1),
vec4(-1, -1, 1, 1),
vec4(1, -1, 1, 1),
vec4(1, 1, 1, 1),
vec4(-1, 1, 1, 1));
for (int i = 0; i < 8; i++) {
p[i] = mat * p[i];
p[i].xyz /= p[i].w;


@@ -124,7 +124,7 @@ void drw_print_value_binary(uint value)
{
drw_print_no_endl("0b");
drw_print_string_start(10u * 4u);
uint digits[10] = uint[10](0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u);
uint digits[10] = uint_array(0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u);
uint digit = 0u;
for (uint i = 0u; i < 32u; i++) {
drw_print_append_digit(((value >> i) & 1u), digits[digit / 4u]);
@@ -155,7 +155,7 @@ void drw_print_value_uint(uint value, const bool hex, bool is_negative, const bo
drw_print_string_start(3u * 4u);
const uint blank_value = hex ? 0x30303030u : 0x20202020u;
const uint prefix = hex ? 0x78302020u : 0x20202020u;
uint digits[3] = uint[3](blank_value, blank_value, prefix);
uint digits[3] = uint_array(blank_value, blank_value, prefix);
const uint base = hex ? 16u : 10u;
uint digit = 0u;
/* Add `u` suffix. */
@@ -247,7 +247,7 @@ void drw_print_value(float val)
* and exponent (4). */
const float significant_digits = 6.0;
drw_print_string_start(3u * 4u);
uint digits[3] = uint[3](0x20202020u, 0x20202020u, 0x20202020u);
uint digits[3] = uint_array(0x20202020u, 0x20202020u, 0x20202020u);
float exponent = floor(log(abs(val)) / log(10.0));
bool display_exponent = exponent >= (significant_digits) ||


@@ -210,7 +210,7 @@ void hair_get_center_pos_tan_binor_time(bool is_persp,
mat4 obmat = hairDupliMatrix;
wpos = (obmat * vec4(wpos, 1.0)).xyz;
wtan = -normalize(mat3(obmat) * wtan);
wtan = -normalize(to_float3x3(obmat) * wtan);
vec3 camera_vec = (is_persp) ? camera_pos - wpos : camera_z;
wbinor = normalize(cross(camera_vec, wtan));
@@ -236,7 +236,7 @@ void hair_get_pos_tan_binor_time(bool is_persp,
thick_time = thickness * (thick_time * 2.0 - 1.0);
/* Take object scale into account.
* NOTE: This only works fine with uniform scaling. */
float scale = 1.0 / length(mat3(invmodel_mat) * wbinor);
float scale = 1.0 / length(to_float3x3(invmodel_mat) * wbinor);
wpos += wbinor * thick_time * scale;
}
else {


@@ -41,7 +41,7 @@ mat3 mul(mat3 m1, mat3 m2)
# ifndef GPU_SHADER_MATH_MATRIX_LIB_GLSL
vec3 transform_direction(mat4 m, vec3 v)
{
return mat3(m) * v;
return to_float3x3(m) * v;
}
vec3 transform_point(mat4 m, vec3 v)
{


@@ -37,7 +37,7 @@ void pointcloud_get_pos_and_radius(out vec3 outpos, out float outradius)
int id = pointcloud_get_point_id();
vec4 pos_rad = texelFetch(ptcloud_pos_rad_tx, id);
outpos = point_object_to_world(pos_rad.xyz);
outradius = dot(abs(mat3(ModelMatrix) * pos_rad.www), vec3(1.0 / 3.0));
outradius = dot(abs(to_float3x3(ModelMatrix) * pos_rad.www), vec3(1.0 / 3.0));
}
/* Return world position and normal. */


@@ -227,14 +227,14 @@ uniform mat4 ModelMatrixInverse;
* NOTE: This is only valid because we are only using the mat3 of the ViewMatrixInverse.
* ViewMatrix * transpose(ModelMatrixInverse)
*/
# define NormalMatrix transpose(mat3(ModelMatrixInverse))
# define NormalMatrixInverse transpose(mat3(ModelMatrix))
# define NormalMatrix transpose(to_float3x3(ModelMatrixInverse))
# define NormalMatrixInverse transpose(to_float3x3(ModelMatrix))
# define normal_object_to_view(n) (mat3(ViewMatrix) * (NormalMatrix * n))
# define normal_object_to_view(n) (to_float3x3(ViewMatrix) * (NormalMatrix * n))
# define normal_object_to_world(n) (NormalMatrix * n)
# define normal_world_to_object(n) (NormalMatrixInverse * n)
# define normal_world_to_view(n) (mat3(ViewMatrix) * n)
# define normal_view_to_world(n) (mat3(ViewMatrixInverse) * n)
# define normal_world_to_view(n) (to_float3x3(ViewMatrix) * n)
# define normal_view_to_world(n) (to_float3x3(ViewMatrixInverse) * n)
# define point_object_to_ndc(p) \
(ProjectionMatrix * (ViewMatrix * vec4((ModelMatrix * vec4(p, 1.0)).xyz, 1.0)))


@@ -9,102 +9,102 @@
bool char_intersect(uvec2 bitmap_position)
{
/* Using 8x8 = 64bits = uvec2. */
uvec2 ascii_bitmap[96] = uvec2[96](uvec2(0x00000000u, 0x00000000u),
uvec2(0x18001800u, 0x183c3c18u),
uvec2(0x00000000u, 0x36360000u),
uvec2(0x7f363600u, 0x36367f36u),
uvec2(0x301f0c00u, 0x0c3e031eu),
uvec2(0x0c666300u, 0x00633318u),
uvec2(0x3b336e00u, 0x1c361c6eu),
uvec2(0x00000000u, 0x06060300u),
uvec2(0x060c1800u, 0x180c0606u),
uvec2(0x180c0600u, 0x060c1818u),
uvec2(0x3c660000u, 0x00663cffu),
uvec2(0x0c0c0000u, 0x000c0c3fu),
uvec2(0x000c0c06u, 0x00000000u),
uvec2(0x00000000u, 0x0000003fu),
uvec2(0x000c0c00u, 0x00000000u),
uvec2(0x06030100u, 0x6030180cu),
uvec2(0x6f673e00u, 0x3e63737bu),
uvec2(0x0c0c3f00u, 0x0c0e0c0cu),
uvec2(0x06333f00u, 0x1e33301cu),
uvec2(0x30331e00u, 0x1e33301cu),
uvec2(0x7f307800u, 0x383c3633u),
uvec2(0x30331e00u, 0x3f031f30u),
uvec2(0x33331e00u, 0x1c06031fu),
uvec2(0x0c0c0c00u, 0x3f333018u),
uvec2(0x33331e00u, 0x1e33331eu),
uvec2(0x30180e00u, 0x1e33333eu),
uvec2(0x000c0c00u, 0x000c0c00u),
uvec2(0x000c0c06u, 0x000c0c00u),
uvec2(0x060c1800u, 0x180c0603u),
uvec2(0x003f0000u, 0x00003f00u),
uvec2(0x180c0600u, 0x060c1830u),
uvec2(0x0c000c00u, 0x1e333018u),
uvec2(0x7b031e00u, 0x3e637b7bu),
uvec2(0x3f333300u, 0x0c1e3333u),
uvec2(0x66663f00u, 0x3f66663eu),
uvec2(0x03663c00u, 0x3c660303u),
uvec2(0x66361f00u, 0x1f366666u),
uvec2(0x16467f00u, 0x7f46161eu),
uvec2(0x16060f00u, 0x7f46161eu),
uvec2(0x73667c00u, 0x3c660303u),
uvec2(0x33333300u, 0x3333333fu),
uvec2(0x0c0c1e00u, 0x1e0c0c0cu),
uvec2(0x33331e00u, 0x78303030u),
uvec2(0x36666700u, 0x6766361eu),
uvec2(0x46667f00u, 0x0f060606u),
uvec2(0x6b636300u, 0x63777f7fu),
uvec2(0x73636300u, 0x63676f7bu),
uvec2(0x63361c00u, 0x1c366363u),
uvec2(0x06060f00u, 0x3f66663eu),
uvec2(0x3b1e3800u, 0x1e333333u),
uvec2(0x36666700u, 0x3f66663eu),
uvec2(0x38331e00u, 0x1e33070eu),
uvec2(0x0c0c1e00u, 0x3f2d0c0cu),
uvec2(0x33333f00u, 0x33333333u),
uvec2(0x331e0c00u, 0x33333333u),
uvec2(0x7f776300u, 0x6363636bu),
uvec2(0x1c366300u, 0x6363361cu),
uvec2(0x0c0c1e00u, 0x3333331eu),
uvec2(0x4c667f00u, 0x7f633118u),
uvec2(0x06061e00u, 0x1e060606u),
uvec2(0x30604000u, 0x03060c18u),
uvec2(0x18181e00u, 0x1e181818u),
uvec2(0x00000000u, 0x081c3663u),
uvec2(0x000000ffu, 0x00000000u),
uvec2(0x00000000u, 0x0c0c1800u),
uvec2(0x3e336e00u, 0x00001e30u),
uvec2(0x66663b00u, 0x0706063eu),
uvec2(0x03331e00u, 0x00001e33u),
uvec2(0x33336e00u, 0x3830303eu),
uvec2(0x3f031e00u, 0x00001e33u),
uvec2(0x06060f00u, 0x1c36060fu),
uvec2(0x333e301fu, 0x00006e33u),
uvec2(0x66666700u, 0x0706366eu),
uvec2(0x0c0c1e00u, 0x0c000e0cu),
uvec2(0x3033331eu, 0x30003030u),
uvec2(0x1e366700u, 0x07066636u),
uvec2(0x0c0c1e00u, 0x0e0c0c0cu),
uvec2(0x7f6b6300u, 0x0000337fu),
uvec2(0x33333300u, 0x00001f33u),
uvec2(0x33331e00u, 0x00001e33u),
uvec2(0x663e060fu, 0x00003b66u),
uvec2(0x333e3078u, 0x00006e33u),
uvec2(0x66060f00u, 0x00003b6eu),
uvec2(0x1e301f00u, 0x00003e03u),
uvec2(0x0c2c1800u, 0x080c3e0cu),
uvec2(0x33336e00u, 0x00003333u),
uvec2(0x331e0c00u, 0x00003333u),
uvec2(0x7f7f3600u, 0x0000636bu),
uvec2(0x1c366300u, 0x00006336u),
uvec2(0x333e301fu, 0x00003333u),
uvec2(0x0c263f00u, 0x00003f19u),
uvec2(0x0c0c3800u, 0x380c0c07u),
uvec2(0x18181800u, 0x18181800u),
uvec2(0x0c0c0700u, 0x070c0c38u),
uvec2(0x00000000u, 0x6e3b0000u),
uvec2(0x00000000u, 0x00000000u));
uvec2 ascii_bitmap[96] = uint2_array(uvec2(0x00000000u, 0x00000000u),
uvec2(0x18001800u, 0x183c3c18u),
uvec2(0x00000000u, 0x36360000u),
uvec2(0x7f363600u, 0x36367f36u),
uvec2(0x301f0c00u, 0x0c3e031eu),
uvec2(0x0c666300u, 0x00633318u),
uvec2(0x3b336e00u, 0x1c361c6eu),
uvec2(0x00000000u, 0x06060300u),
uvec2(0x060c1800u, 0x180c0606u),
uvec2(0x180c0600u, 0x060c1818u),
uvec2(0x3c660000u, 0x00663cffu),
uvec2(0x0c0c0000u, 0x000c0c3fu),
uvec2(0x000c0c06u, 0x00000000u),
uvec2(0x00000000u, 0x0000003fu),
uvec2(0x000c0c00u, 0x00000000u),
uvec2(0x06030100u, 0x6030180cu),
uvec2(0x6f673e00u, 0x3e63737bu),
uvec2(0x0c0c3f00u, 0x0c0e0c0cu),
uvec2(0x06333f00u, 0x1e33301cu),
uvec2(0x30331e00u, 0x1e33301cu),
uvec2(0x7f307800u, 0x383c3633u),
uvec2(0x30331e00u, 0x3f031f30u),
uvec2(0x33331e00u, 0x1c06031fu),
uvec2(0x0c0c0c00u, 0x3f333018u),
uvec2(0x33331e00u, 0x1e33331eu),
uvec2(0x30180e00u, 0x1e33333eu),
uvec2(0x000c0c00u, 0x000c0c00u),
uvec2(0x000c0c06u, 0x000c0c00u),
uvec2(0x060c1800u, 0x180c0603u),
uvec2(0x003f0000u, 0x00003f00u),
uvec2(0x180c0600u, 0x060c1830u),
uvec2(0x0c000c00u, 0x1e333018u),
uvec2(0x7b031e00u, 0x3e637b7bu),
uvec2(0x3f333300u, 0x0c1e3333u),
uvec2(0x66663f00u, 0x3f66663eu),
uvec2(0x03663c00u, 0x3c660303u),
uvec2(0x66361f00u, 0x1f366666u),
uvec2(0x16467f00u, 0x7f46161eu),
uvec2(0x16060f00u, 0x7f46161eu),
uvec2(0x73667c00u, 0x3c660303u),
uvec2(0x33333300u, 0x3333333fu),
uvec2(0x0c0c1e00u, 0x1e0c0c0cu),
uvec2(0x33331e00u, 0x78303030u),
uvec2(0x36666700u, 0x6766361eu),
uvec2(0x46667f00u, 0x0f060606u),
uvec2(0x6b636300u, 0x63777f7fu),
uvec2(0x73636300u, 0x63676f7bu),
uvec2(0x63361c00u, 0x1c366363u),
uvec2(0x06060f00u, 0x3f66663eu),
uvec2(0x3b1e3800u, 0x1e333333u),
uvec2(0x36666700u, 0x3f66663eu),
uvec2(0x38331e00u, 0x1e33070eu),
uvec2(0x0c0c1e00u, 0x3f2d0c0cu),
uvec2(0x33333f00u, 0x33333333u),
uvec2(0x331e0c00u, 0x33333333u),
uvec2(0x7f776300u, 0x6363636bu),
uvec2(0x1c366300u, 0x6363361cu),
uvec2(0x0c0c1e00u, 0x3333331eu),
uvec2(0x4c667f00u, 0x7f633118u),
uvec2(0x06061e00u, 0x1e060606u),
uvec2(0x30604000u, 0x03060c18u),
uvec2(0x18181e00u, 0x1e181818u),
uvec2(0x00000000u, 0x081c3663u),
uvec2(0x000000ffu, 0x00000000u),
uvec2(0x00000000u, 0x0c0c1800u),
uvec2(0x3e336e00u, 0x00001e30u),
uvec2(0x66663b00u, 0x0706063eu),
uvec2(0x03331e00u, 0x00001e33u),
uvec2(0x33336e00u, 0x3830303eu),
uvec2(0x3f031e00u, 0x00001e33u),
uvec2(0x06060f00u, 0x1c36060fu),
uvec2(0x333e301fu, 0x00006e33u),
uvec2(0x66666700u, 0x0706366eu),
uvec2(0x0c0c1e00u, 0x0c000e0cu),
uvec2(0x3033331eu, 0x30003030u),
uvec2(0x1e366700u, 0x07066636u),
uvec2(0x0c0c1e00u, 0x0e0c0c0cu),
uvec2(0x7f6b6300u, 0x0000337fu),
uvec2(0x33333300u, 0x00001f33u),
uvec2(0x33331e00u, 0x00001e33u),
uvec2(0x663e060fu, 0x00003b66u),
uvec2(0x333e3078u, 0x00006e33u),
uvec2(0x66060f00u, 0x00003b6eu),
uvec2(0x1e301f00u, 0x00003e03u),
uvec2(0x0c2c1800u, 0x080c3e0cu),
uvec2(0x33336e00u, 0x00003333u),
uvec2(0x331e0c00u, 0x00003333u),
uvec2(0x7f7f3600u, 0x0000636bu),
uvec2(0x1c366300u, 0x00006336u),
uvec2(0x333e301fu, 0x00003333u),
uvec2(0x0c263f00u, 0x00003f19u),
uvec2(0x0c0c3800u, 0x380c0c07u),
uvec2(0x18181800u, 0x18181800u),
uvec2(0x0c0c0700u, 0x070c0c38u),
uvec2(0x00000000u, 0x6e3b0000u),
uvec2(0x00000000u, 0x00000000u));
if (any(lessThan(bitmap_position, uvec2(0))) || any(greaterThanEqual(bitmap_position, uvec2(8))))
{


@@ -53,11 +53,11 @@ mat4x4 drw_modelinv()
*/
mat3x3 drw_normat()
{
return transpose(mat3x3(drw_modelinv()));
return transpose(to_float3x3(drw_modelinv()));
}
mat3x3 drw_norinv()
{
return transpose(mat3x3(drw_modelmat()));
return transpose(to_float3x3(drw_modelmat()));
}
/* -------------------------------------------------------------------- */
@@ -77,11 +77,11 @@ vec3 drw_normal_world_to_object(vec3 N)
vec3 drw_normal_object_to_view(vec3 lN)
{
return (mat3x3(drw_view.viewmat) * (drw_normat() * lN));
return (to_float3x3(drw_view.viewmat) * (drw_normat() * lN));
}
vec3 drw_normal_view_to_object(vec3 vN)
{
return (drw_norinv() * (mat3x3(drw_view.viewinv) * vN));
return (drw_norinv() * (to_float3x3(drw_view.viewinv) * vN));
}
/** \} */


@@ -108,12 +108,12 @@ float drw_ndc_to_screen(float ndc_P)
vec3 drw_normal_view_to_world(vec3 vN)
{
return (mat3x3(drw_view.viewinv) * vN);
return (to_float3x3(drw_view.viewinv) * vN);
}
vec3 drw_normal_world_to_view(vec3 N)
{
return (mat3x3(drw_view.viewmat) * N);
return (to_float3x3(drw_view.viewmat) * N);
}
/** \} */


@@ -41,6 +41,11 @@ struct GPUShader;
*/
GPUShader *GPU_shader_create_from_info(const GPUShaderCreateInfo *_info);
/**
* Same as GPU_shader_create_from_info but will run preprocessor on source strings.
*/
GPUShader *GPU_shader_create_from_info_python(const GPUShaderCreateInfo *_info);
/**
* Create a shader using a named #GPUShaderCreateInfo registered at startup.
* These are declared inside `*_info.hh` files using the `GPU_SHADER_CREATE_INFO()` macro.


@@ -7,6 +7,8 @@
# Build `glsl_preprocess` executable.
set(SRC
glsl_preprocess.cc
glsl_preprocess.hh
)
# `SRC_DNA_INC` is defined in the parent directory.


@@ -1,4 +1,4 @@
/* SPDX-FileCopyrightText: 2001-2002 NaN Holding BV. All rights reserved.
/* SPDX-FileCopyrightText: 2024 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
@@ -6,11 +6,12 @@
* \ingroup glsl_preprocess
*/
#include <algorithm>
#include <fstream>
#include <iostream>
#include <regex>
#include <string>
#include <vector>
#include "glsl_preprocess.hh"
int main(int argc, char **argv)
{
@@ -40,8 +41,29 @@ int main(int argc, char **argv)
bool first_comment = true;
bool inside_comment = false;
int error = 0;
size_t line_index = 0;
auto report_error =
[&](const std::string &src_line, const std::smatch &match, const char *err_msg) {
size_t err_line = line_index;
size_t err_char = match.position();
std::cerr << input_file_name;
std::cerr << ':' << std::to_string(err_line) << ':' << std::to_string(err_char);
std::cerr << ": error: " << err_msg << std::endl;
std::cerr << src_line << std::endl;
std::cerr << std::string(err_char, ' ') << '^' << std::endl;
error++;
};
blender::gpu::shader::Preprocessor processor(report_error);
std::string line;
while (std::getline(input_file, line)) {
line_index++;
/* Remove licence headers (first comment). */
if (line.rfind("/*", 0) == 0 && first_comment) {
first_comment = false;
@@ -55,17 +77,17 @@ int main(int argc, char **argv)
}
if (skip_line) {
line = "";
output_file << "\n";
}
else if (line.rfind("#include ", 0) == 0 || line.rfind("#pragma once", 0) == 0) {
line[0] = line[1] = '/';
else {
processor << line << '\n';
}
output_file << line << "\n";
}
output_file << processor.str();
input_file.close();
output_file.close();
return 0;
return error;
}


@@ -0,0 +1,219 @@
/* SPDX-FileCopyrightText: 2024 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup glsl_preprocess
*/
#pragma once
#include <algorithm>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
namespace blender::gpu::shader {
/**
 * Shader source preprocessor that allows mutating GLSL into cross-API source that can be
 * interpreted by the different GPU backends. Some syntax is mutated or reported as incompatible.
 *
 * Implementation speed is not a huge concern as we only apply this at compile time or on Python
 * shader sources.
*/
template<typename T, bool no_linting = false> class Preprocessor {
T &report_error;
struct SharedVar {
std::string type;
std::string name;
std::string array;
};
std::vector<SharedVar> shared_vars_;
std::stringstream output_;
public:
Preprocessor(T &error_cb) : report_error(error_cb) {}
Preprocessor &operator<<(std::string str)
{
threadgroup_variable_parsing(str);
matrix_constructor_linting(str);
array_constructor_linting(str);
str = preprocessor_directive_mutation(str);
str = argument_decorator_macro_injection(str);
str = array_constructor_macro_injection(str);
output_ << str;
return *this;
}
Preprocessor &operator<<(char c)
{
output_ << c;
return *this;
}
std::string str()
{
return output_.str() + suffix();
}
private:
std::string preprocessor_directive_mutation(const std::string &str)
{
/* Example: `#include "deps.glsl"` > `//include "deps.glsl"` */
std::regex regex("#\\s*(include|pragma once)");
return std::regex_replace(str, regex, "//$1");
}
void threadgroup_variable_parsing(std::string str)
{
std::regex regex("shared\\s+(\\w+)\\s+(\\w+)([^;]*);");
for (std::smatch match; std::regex_search(str, match, regex); str = match.suffix()) {
shared_vars_.push_back({match[1].str(), match[2].str(), match[3].str()});
}
}
std::string argument_decorator_macro_injection(const std::string &str)
{
/* Example: `out float var[2]` > `out float _out_sta var _out_end[2]` */
std::regex regex("(out|inout|in|shared)\\s+(\\w+)\\s+(\\w+)");
return std::regex_replace(str, regex, "$1 $2 _$1_sta $3 _$1_end");
}
std::string array_constructor_macro_injection(const std::string &str)
{
/* Example: `= float[2](0.0, 0.0)` > `= ARRAY_T(float) ARRAY_V(0.0, 0.0)` */
std::regex regex("=\\s*(\\w+)\\s*\\[[^\\]]*\\]\\s*\\(");
return std::regex_replace(str, regex, "= ARRAY_T($1) ARRAY_V(");
}
/* TODO(fclem): Too many false positive and false negative to be applied to python shaders. */
void matrix_constructor_linting(std::string str)
{
if constexpr (no_linting) {
return;
}
/* Example: `mat4(other_mat)`. */
std::regex regex("\\s+(mat(\\d|\\dx\\d)|float\\dx\\d)\\([^,\\s\\d]+\\)");
for (std::smatch match; std::regex_search(str, match, regex); str = match.suffix()) {
/* This only catches some invalid usage. For the rest, the CI will catch them. */
const char *msg =
"Matrix constructor is not cross API compatible. "
"Use to_floatNxM to reshape the matrix or use other constructors instead.";
report_error(str, match, msg);
}
}
void array_constructor_linting(std::string str)
{
if constexpr (no_linting) {
return;
}
std::regex regex("=\\s*(\\w+)\\s*\\[[^\\]]*\\]\\s*\\(");
for (std::smatch match; std::regex_search(str, match, regex); str = match.suffix()) {
/* This only catches some invalid usage. For the rest, the CI will catch them. */
const char *msg =
"Array constructor is not cross API compatible. Use type_array instead of type[].";
report_error(str, match, msg);
}
}
std::string suffix()
{
if (shared_vars_.empty()) {
return "";
}
std::stringstream suffix;
/**
* For Metal shaders to compile, shared (threadgroup) variable cannot be declared globally.
* They must reside within a function scope. Hence, we need to extract these declarations and
* generate shared memory blocks within the entry point function. These shared memory blocks
* can then be passed as references to the remaining shader via the class function scope.
*
* The shared variable definitions from the source file are replaced with references to
* threadgroup memory blocks (using _shared_sta and _shared_end macros), but kept in-line in
* case external macros are used to declare the dimensions.
*
* Each part of the codegen is stored inside macros so that we don't have to do string
* replacement at runtime.
*/
/* Arguments of the wrapper class constructor. */
suffix << "#undef MSL_SHARED_VARS_ARGS\n";
/* References assignment inside wrapper class constructor. */
suffix << "#undef MSL_SHARED_VARS_ASSIGN\n";
/* Declaration of threadgroup variables in entry point function. */
suffix << "#undef MSL_SHARED_VARS_DECLARE\n";
/* Arguments for wrapper class constructor call. */
suffix << "#undef MSL_SHARED_VARS_PASS\n";
/**
* Example replacement:
*
* `
* // Source
* shared float bar[10]; // Source declaration.
* shared float foo; // Source declaration.
* // Rest of the source ...
* // End of Source
*
* // Backend Output
* class Wrapper { // Added at runtime by backend.
*
* threadgroup float (&foo); // Replaced by regex and macros.
* threadgroup float (&bar)[10]; // Replaced by regex and macros.
* // Rest of the source ...
*
* Wrapper ( // Added at runtime by backend.
* threadgroup float (&_foo), threadgroup float (&_bar)[10] // MSL_SHARED_VARS_ARGS
* ) // Added at runtime by backend.
* : foo(_foo), bar(_bar) // MSL_SHARED_VARS_ASSIGN
* {} // Added at runtime by backend.
*
* }; // End of Wrapper // Added at runtime by backend.
*
* kernel entry_point() { // Added at runtime by backend.
*
* threadgroup float foo; // MSL_SHARED_VARS_DECLARE
* threadgroup float bar[10] // MSL_SHARED_VARS_DECLARE
*
* Wrapper wrapper // Added at runtime by backend.
* (foo, bar) // MSL_SHARED_VARS_PASS
* ; // Added at runtime by backend.
*
* } // Added at runtime by backend.
* // End of Backend Output
* `
*/
std::stringstream args, assign, declare, pass;
bool first = true;
for (SharedVar &var : shared_vars_) {
char sep = first ? ' ' : ',';
/* */
args << sep << "threadgroup " << var.type << "(&_" << var.name << ")" << var.array;
assign << (first ? ':' : ',') << var.name << "(_" << var.name << ")";
declare << "threadgroup " << var.type << ' ' << var.name << var.array << ";";
pass << sep << var.name;
first = false;
}
suffix << "#define MSL_SHARED_VARS_ARGS " << args.str() << "\n";
suffix << "#define MSL_SHARED_VARS_ASSIGN " << assign.str() << "\n";
suffix << "#define MSL_SHARED_VARS_DECLARE " << declare.str() << "\n";
suffix << "#define MSL_SHARED_VARS_PASS (" << pass.str() << ")\n";
return suffix.str();
}
};
template<typename T> class PreprocessorPython : public Preprocessor<T, true> {
public:
PreprocessorPython(T &error_cb) : Preprocessor<T, true>(error_cb){};
};
} // namespace blender::gpu::shader
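
As a rough usage sketch (mirroring the `main()` loop in `glsl_preprocess.cc` above and `preprocess_source()` in `gpu_shader.cc` further down), the template is fed line by line and the mutated source is read back with `str()`; the error callback receives the offending line, the regex match and a message. The function and file names below are illustrative, not part of the commit.

#include <iostream>
#include <regex>
#include <string>

#include "glsl_preprocess.hh"

/* Sketch: run one GLSL source through the preprocessor and report lint errors
 * in a "file:line:char: error: message" style similar to glsl_preprocess.cc. */
static std::string preprocess_glsl(const std::string &filename, std::istream &src)
{
  size_t line_index = 0;
  auto report_error = [&](const std::string &src_line, const std::smatch &match, const char *msg) {
    std::cerr << filename << ':' << line_index << ':' << match.position() << ": error: " << msg
              << std::endl
              << src_line << std::endl;
  };
  blender::gpu::shader::Preprocessor processor(report_error);
  std::string line;
  while (std::getline(src, line)) {
    line_index++;
    processor << line << '\n';
  }
  return processor.str();
}

The Python/runtime path goes through `PreprocessorPython`, which is the same class with the `no_linting` template parameter set, so the matrix and array constructor lints are skipped for user-supplied shader sources.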


@@ -17,6 +17,8 @@
#include "GPU_matrix.hh"
#include "GPU_platform.hh"
#include "glsl_preprocess/glsl_preprocess.hh"
#include "gpu_backend.hh"
#include "gpu_context_private.hh"
#include "gpu_shader_create_info.hh"
@@ -299,6 +301,40 @@ GPUShader *GPU_shader_create_from_info(const GPUShaderCreateInfo *_info)
return wrap(Context::get()->compiler->compile(info, false));
}
static std::string preprocess_source(StringRefNull original)
{
auto no_err_report = [](std::string, std::smatch, const char *) {};
gpu::shader::PreprocessorPython processor(no_err_report);
processor << std::string(original);
return processor.str();
};
GPUShader *GPU_shader_create_from_info_python(const GPUShaderCreateInfo *_info)
{
using namespace blender::gpu::shader;
ShaderCreateInfo &info = *const_cast<ShaderCreateInfo *>(
reinterpret_cast<const ShaderCreateInfo *>(_info));
std::string vertex_source_original = info.vertex_source_generated;
std::string fragment_source_original = info.fragment_source_generated;
std::string geometry_source_original = info.geometry_source_generated;
std::string compute_source_original = info.compute_source_generated;
info.vertex_source_generated = preprocess_source(info.vertex_source_generated);
info.fragment_source_generated = preprocess_source(info.fragment_source_generated);
info.geometry_source_generated = preprocess_source(info.geometry_source_generated);
info.compute_source_generated = preprocess_source(info.compute_source_generated);
GPUShader *result = wrap(Context::get()->compiler->compile(info, false));
info.vertex_source_generated = vertex_source_original;
info.fragment_source_generated = fragment_source_original;
info.geometry_source_generated = geometry_source_original;
info.compute_source_generated = compute_source_original;
return result;
}
GPUShader *GPU_shader_create_from_python(const char *vertcode,
const char *fragcode,
const char *geomcode,
@@ -315,6 +351,28 @@ GPUShader *GPU_shader_create_from_python(const char *vertcode,
libcode = libcodecat = BLI_strdupcat(libcode, datatoc_gpu_shader_colorspace_lib_glsl);
}
std::string vertex_source_processed;
std::string fragment_source_processed;
std::string geometry_source_processed;
std::string library_source_processed;
if (vertcode != nullptr) {
vertex_source_processed = preprocess_source(vertcode);
vertcode = vertex_source_processed.c_str();
}
if (fragcode != nullptr) {
fragment_source_processed = preprocess_source(fragcode);
fragcode = fragment_source_processed.c_str();
}
if (geomcode != nullptr) {
geometry_source_processed = preprocess_source(geomcode);
geomcode = geometry_source_processed.c_str();
}
if (libcode != nullptr) {
library_source_processed = preprocess_source(libcode);
libcode = library_source_processed.c_str();
}
/* Use pyGPUShader as default name for shader. */
const char *shname = name != nullptr ? name : "pyGPUShader";


@@ -484,7 +484,13 @@ struct GPUSource {
int64_t keyword_cursor = 0;
out_qualifier = keyword_parse(arg, keyword_cursor);
out_type = keyword_parse(arg, keyword_cursor);
/* Skip qualifier prefix macro expanded by GLSL preprocessing (e.g. _out_sta). */
StringRef qualifier_prefix = keyword_parse(arg, keyword_cursor);
out_name = keyword_parse(arg, keyword_cursor);
if (out_qualifier == "const") {
out_name = qualifier_prefix;
}
if (out_name.is_empty()) {
/* No qualifier case. */
out_name = out_type;
@@ -1031,8 +1037,8 @@ struct GPUSource {
GPUSource *dependency_source = nullptr;
{
/* Include directive has been mangled on purpose. See `datatoc.cc`. */
pos = source.find("//nclude \"", pos + 1);
/* Include directive has been mangled on purpose. See `glsl_preprocess.hh`. */
pos = source.find("\n//include \"", pos + 1);
if (pos == -1) {
return 0;
}


@@ -426,8 +426,6 @@ class MSLGeneratorInterface {
blender::Vector<MSLVertexOutputAttribute> vertex_output_varyings_tf;
/* Clip Distances. */
blender::Vector<char> clip_distances;
/* Shared Memory Blocks. */
blender::Vector<MSLSharedMemoryBlock> shared_memory_blocks;
/* Max bind IDs. */
int max_tex_bind_index = 0;
/** GL Global usage. */


@@ -182,26 +182,6 @@ static void remove_singleline_comments_func(std::string &str)
}
}
static bool is_program_word(const char *chr, int *len)
{
int numchars = 0;
for (const char *c = chr; *c != '\0'; c++) {
char ch = *c;
/* NOTE: Hash (`#`) is not valid in var names, but is used by Closure macro patterns. */
if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
(numchars > 0 && ch >= '0' && ch <= '9') || ch == '_' || ch == '#')
{
numchars++;
}
else {
*len = numchars;
return (numchars > 0);
}
}
*len = numchars;
return true;
}
static int backwards_program_word_scan(const char *array_loc, const char *min)
{
const char *start;
@@ -267,130 +247,6 @@ static void extract_and_replace_clipping_distances(std::string &vertex_source,
}
}
/**
* Replace function parameter patterns containing:
* `out vec3 somevar` with `THD vec3&somevar`.
* which enables pass by reference via resolved macro:
* `thread vec3& somevar`.
*/
static void replace_outvars(std::string &str)
{
char *current_str_begin = &*str.begin();
char *current_str_end = &*str.end();
for (char *c = current_str_begin + 2; c < current_str_end - 6; c++) {
char *start = strstr(c, "out ");
if (start == nullptr) {
return;
}
else {
c = start;
if (strncmp(c - 2, "in", 2) == 0) {
start = c - 2;
}
/* Check that the following are words. */
int len1, len2;
char *word_base1 = c + 4;
char *word_base2 = word_base1;
if (is_program_word(word_base1, &len1) && (*(word_base1 + len1) == ' ')) {
word_base2 = word_base1 + len1 + 1;
if (is_program_word(word_base2, &len2)) {
/* Match found. */
bool is_array = (*(word_base2 + len2) == '[');
if (is_array) {
/* Generate out-variable pattern for arrays, of form
* `OUT(vec2,samples,CRYPTOMATTE_LEVELS_MAX)`
* replacing original `out vec2 samples[SAMPLE_LEN]`
* using 'OUT' macro declared in `mtl_shader_defines.msl`. */
char *array_end = strchr(word_base2 + len2, ']');
if (array_end != nullptr) {
*start = 'O';
*(start + 1) = 'U';
*(start + 2) = 'T';
*(start + 3) = '(';
for (char *clear = start + 4; clear < c + 4; clear++) {
*clear = ' ';
}
*(word_base2 - 1) = ',';
*(word_base2 + len2) = ',';
*array_end = ')';
}
}
else {
/* Generate out-variable pattern of form `THD type&var` from original `out vec4 var`.
*/
*start = 'T';
*(start + 1) = 'H';
*(start + 2) = 'D';
for (char *clear = start + 3; clear < c + 4; clear++) {
*clear = ' ';
}
*(word_base2 - 1) = '&';
}
}
}
}
}
}
static void replace_matrix_constructors(std::string &str)
{
/* Replace matrix constructors with GLSL-compatible constructors for Metal.
* Base matrix constructors e.g. mat3x3 do not have as many overload variants as GLSL.
* To add compatibility, we declare custom constructors e.g. MAT3x3 in `mtl_shader_defines.msl`.
* If the GLSL syntax matches, we map mat3x3(..) -> MAT3x3(..) and implement a custom
* constructor. This supports both mat3(..) and mat3x3(..) style syntax. */
char *current_str_begin = &*str.begin();
char *current_str_end = &*str.end();
for (char *c = current_str_begin; c < current_str_end - 10; c++) {
char *base_scan = strstr(c, "mat");
if (base_scan == nullptr) {
break;
}
/* Track end of constructor. */
char *constructor_end = nullptr;
/* check if next character is matrix dim. */
c = base_scan + 3;
if (!(*c == '2' || *c == '3' || *c == '4')) {
/* Not constructor, skip. */
continue;
}
/* Possible multiple dimensional matrix constructor. Verify if next char is a dim. */
c++;
if (*c == 'x') {
c++;
if (*c == '2' || *c == '3' || *c == '4') {
c++;
}
else {
/* Not matrix constructor, continue. */
continue;
}
}
/* Check for constructor opening brace. */
if (*c == '(') {
constructor_end = c;
}
else {
/* Not matrix constructor, continue. */
continue;
}
/* If is constructor, replace with MATN(..) syntax. */
if (constructor_end != nullptr) {
ARRAY_SET_ITEMS(base_scan, 'M', 'A', 'T');
continue;
}
}
}
static void replace_array_initializers_func(std::string &str)
{
char *current_str_begin = &*str.begin();
@@ -572,135 +428,6 @@ static bool extract_ssbo_pragma_info(const MTLShader *shader,
return false;
}
/* Extract shared memory declaration and their parameters.
* Inserts extracted cases as entries in MSLGeneratorInterface's shared memory block
* list. These will later be used to generate shared memory declarations within the entry point.
*
* TODO(Metal/GPU): Move shared memory declarations to GPUShaderCreateInfo. This is currently a
* necessary workaround to match GLSL functionality and enable full compute shader support. In the
* long term, best to avoid needing to perform this operation. */
void extract_shared_memory_blocks(MSLGeneratorInterface &msl_iface,
std::string &glsl_compute_source)
{
msl_iface.shared_memory_blocks.clear();
char *current_str_begin = &*glsl_compute_source.begin();
char *current_str_end = &*glsl_compute_source.end();
for (char *c = current_str_begin; c < current_str_end - 6; c++) {
/* Find first instance of "shared ". */
char *c_expr_start = strstr(c, "shared ");
if (c_expr_start == nullptr) {
break;
}
/* Check if "shared" was part of a previous word. If so, this is not valid. */
if (next_word_in_range(c_expr_start - 1, c_expr_start) != nullptr) {
c += 7; /* Jump forward by length of "shared ". */
continue;
}
/* Jump to shared declaration and detect end of statement. */
c = c_expr_start;
char *c_expr_end = strstr(c, ";");
if (c_expr_end == nullptr) {
break;
}
/* Prepare MSLSharedMemoryBlock instance. */
MSLSharedMemoryBlock new_shared_block;
char buf[256];
/* Read type-name. */
c += 7; /* Jump forward by length of "shared ". */
c = next_word_in_range(c, c_expr_end);
if (c == nullptr) {
c = c_expr_end + 1;
continue;
}
char *c_next_space = next_symbol_in_range(c, c_expr_end, ' ');
if (c_next_space == nullptr) {
c = c_expr_end + 1;
continue;
}
int len = c_next_space - c;
BLI_assert(len < 256);
BLI_strncpy(buf, c, len + 1);
new_shared_block.type_name = std::string(buf);
/* Read var-name.
* `varname` can either come right before the final semi-colon, or
* with following array syntax.
* spaces may exist before closing symbol. */
c = c_next_space + 1;
c = next_word_in_range(c, c_expr_end);
if (c == nullptr) {
c = c_expr_end + 1;
continue;
}
char *c_array_begin = next_symbol_in_range(c, c_expr_end, '[');
c_next_space = next_symbol_in_range(c, c_expr_end, ' ');
char *varname_end = nullptr;
if (c_array_begin != nullptr) {
/* Array path. */
if (c_next_space != nullptr) {
varname_end = (c_next_space < c_array_begin) ? c_next_space : c_array_begin;
}
else {
varname_end = c_array_begin;
}
new_shared_block.is_array = true;
}
else {
/* Ending semi-colon. */
if (c_next_space != nullptr) {
varname_end = (c_next_space < c_expr_end) ? c_next_space : c_expr_end;
}
else {
varname_end = c_expr_end;
}
new_shared_block.is_array = false;
}
len = varname_end - c;
BLI_assert(len < 256);
BLI_strncpy(buf, c, len + 1);
new_shared_block.varname = std::string(buf);
/* Determine if array. */
if (new_shared_block.is_array) {
int len = c_expr_end - c_array_begin;
BLI_strncpy(buf, c_array_begin, len + 1);
new_shared_block.array_decl = std::string(buf);
}
/* Shared block is valid, add it to the list and replace declaration with class member.
* reference. This declaration needs to have one of the formats:
* TG int& varname;
* TG int (&varname)[len][len]
*
* In order to fit in the same space, replace `threadgroup` with `TG` macro.
*/
for (char *c = c_expr_start; c <= c_expr_end; c++) {
*c = ' ';
}
std::string out_str = "TG ";
out_str += new_shared_block.type_name;
out_str += (new_shared_block.is_array) ? "(&" : "&";
out_str += new_shared_block.varname;
if (new_shared_block.is_array) {
out_str += ")" + new_shared_block.array_decl;
}
out_str += ";;";
memcpy(c_expr_start, out_str.c_str(), (out_str.length() - 1) * sizeof(char));
/* Jump to end of statement. */
c = c_expr_end + 1;
msl_iface.shared_memory_blocks.append(new_shared_block);
}
}
/** \} */
/* -------------------------------------------------------------------- */
@@ -973,11 +700,9 @@ bool MTLShader::generate_msl_from_glsl(const shader::ShaderCreateInfo *info)
}
/* Special condition - mat3 and array constructor replacement. */
replace_matrix_constructors(shd_builder_->glsl_vertex_source_);
replace_array_initializers_func(shd_builder_->glsl_vertex_source_);
if (!msl_iface.uses_transform_feedback) {
replace_matrix_constructors(shd_builder_->glsl_fragment_source_);
replace_array_initializers_func(shd_builder_->glsl_fragment_source_);
}
@@ -1058,12 +783,6 @@ bool MTLShader::generate_msl_from_glsl(const shader::ShaderCreateInfo *info)
/* Extract gl_ClipDistances. */
extract_and_replace_clipping_distances(shd_builder_->glsl_vertex_source_, msl_iface);
/* Replace 'out' attribute on function parameters with pass-by-reference. */
replace_outvars(shd_builder_->glsl_vertex_source_);
if (!msl_iface.uses_transform_feedback) {
replace_outvars(shd_builder_->glsl_fragment_source_);
}
/**** METAL Shader source generation. ****/
/* Setup `stringstream` for populating generated MSL shader vertex/frag shaders. */
std::stringstream ss_vertex;
@@ -1252,7 +971,7 @@ bool MTLShader::generate_msl_from_glsl(const shader::ShaderCreateInfo *info)
/* Add Texture members.
* These members pack both a texture and a sampler into a single
* struct, as both are needed within texture functions.
* e.g. `_mtl_combined_image_sampler_2d<float, access::read>`
* e.g. `_mtl_sampler_2d<float, access::read>`
* The exact typename is generated inside `get_msl_typestring_wrapper()`. */
for (const MSLTextureResource &tex : msl_iface.texture_samplers) {
if (bool(tex.stage & ShaderStage::VERTEX)) {
@@ -1485,7 +1204,6 @@ bool MTLShader::generate_msl_from_glsl_compute(const shader::ShaderCreateInfo *i
BLI_assert(shd_builder_->glsl_compute_source_.size() > 0);
/*** Source cleanup. ***/
replace_matrix_constructors(shd_builder_->glsl_compute_source_);
replace_array_initializers_func(shd_builder_->glsl_compute_source_);
/**** Extract usage of GL globals. ****/
@@ -1526,19 +1244,6 @@ bool MTLShader::generate_msl_from_glsl_compute(const shader::ShaderCreateInfo *i
remove_multiline_comments_func(shd_builder_->glsl_compute_source_);
remove_singleline_comments_func(shd_builder_->glsl_compute_source_);
/** Extract usage of shared memory.
* For Metal shaders to compile, shared (threadgroup) memory cannot be declared globally.
* It must reside within a function scope. Hence, we need to extract these uses and generate
* shared memory blocks within the entry point function, which can then be passed as references
* to the remaining shader via the class function scope.
*
* The existing block definitions are then replaced with references to threadgroup memory blocks,
* but kept in-line in case external macros are used to declare the dimensions. */
extract_shared_memory_blocks(msl_iface, shd_builder_->glsl_compute_source_);
/* Replace 'out' attribute on function parameters with pass-by-reference. */
replace_outvars(shd_builder_->glsl_compute_source_);
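/* Illustrative sketch of the mechanism described above (not code from this patch): a declaration
* such as `shared float cache[64];` cannot stay at global scope in MSL. After extraction it
* conceptually becomes:
*
*   kernel void entry(...) {
*     threadgroup float cache[64];   // Declared in entry-point scope.
*     StageClass inst(cache);        // Passed by reference into the stage class instance.
*     ...
*   }
*
* while the original global declaration is rewritten into a `TG float (&cache)[64];` class member
* reference, as shown by the extraction code earlier in this file. `StageClass` is a placeholder
* name. */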
/** Generate Compute shader stage. **/
std::stringstream ss_compute;
ss_compute << "#line 1 \"msl_wrapper_code\"\n";
@@ -1599,7 +1304,7 @@ bool MTLShader::generate_msl_from_glsl_compute(const shader::ShaderCreateInfo *i
/* Add Texture members.
* These members pack both a texture and a sampler into a single
* struct, as both are needed within texture functions.
* e.g. `_mtl_combined_image_sampler_2d<float, access::read>`
* e.g. `_mtl_sampler_2d<float, access::read>`
* The exact typename is generated inside `get_msl_typestring_wrapper()`. */
for (const MSLTextureResource &tex : msl_iface.texture_samplers) {
if (bool(tex.stage & ShaderStage::COMPUTE)) {
@@ -1631,37 +1336,8 @@ bool MTLShader::generate_msl_from_glsl_compute(const shader::ShaderCreateInfo *i
/* Compute constructor for Shared memory blocks, as we must pass
* local references from entry-point function scope into the class
* instantiation. */
ss_compute << get_stage_class_name(ShaderStage::COMPUTE) << "(";
bool first = true;
if (msl_iface.shared_memory_blocks.size() > 0) {
for (const MSLSharedMemoryBlock &block : msl_iface.shared_memory_blocks) {
if (!first) {
ss_compute << ",";
}
if (block.is_array) {
ss_compute << "TG " << block.type_name << " (&_" << block.varname << ")"
<< block.array_decl;
}
else {
ss_compute << "TG " << block.type_name << " &_" << block.varname;
}
ss_compute << std::endl;
first = false;
}
ss_compute << ") : ";
first = true;
for (const MSLSharedMemoryBlock &block : msl_iface.shared_memory_blocks) {
if (!first) {
ss_compute << ",";
}
ss_compute << block.varname << "(_" << block.varname << ")";
first = false;
}
}
else {
ss_compute << ") ";
}
ss_compute << "{ }" << std::endl;
ss_compute << get_stage_class_name(ShaderStage::COMPUTE)
<< "(MSL_SHARED_VARS_ARGS) MSL_SHARED_VARS_ASSIGN {}\n";
/* Class Closing Bracket to end shader global scope. */
ss_compute << "};" << std::endl;
@@ -2440,28 +2116,9 @@ std::string MSLGeneratorInterface::generate_msl_compute_entry_stub()
out << this->generate_msl_compute_inputs_string();
out << ") {" << std::endl << std::endl;
/* Generate Compute shader instance constructor. If shared memory blocks are used,
* these must be declared and then passed into the constructor. */
std::string stage_instance_constructor = "";
bool first = true;
if (shared_memory_blocks.size() > 0) {
stage_instance_constructor += "(";
for (const MSLSharedMemoryBlock &block : shared_memory_blocks) {
if (block.is_array) {
out << "TG " << block.type_name << " " << block.varname << block.array_decl << ";";
}
else {
out << "TG " << block.type_name << " " << block.varname << ";";
}
stage_instance_constructor += ((!first) ? "," : "") + block.varname;
first = false;
out << std::endl;
}
stage_instance_constructor += ")";
}
out << "MSL_SHARED_VARS_DECLARE\n";
out << "\t" << get_stage_class_name(ShaderStage::COMPUTE) << " " << shader_stage_inst_name
<< stage_instance_constructor << ";" << std::endl;
<< " MSL_SHARED_VARS_PASS;\n";
/* Copy global variables. */
/* Entry point parameters for gl Globals. */
@@ -3584,9 +3241,10 @@ std::string MSLGeneratorInterface::generate_msl_texture_vars(ShaderStage shader_
if (tex_buf_id != -1) {
MSLBufferBlock &ssbo = this->storage_blocks[tex_buf_id];
out << "\t" << get_shader_stage_instance_name(shader_stage) << "."
<< this->texture_samplers[i].name << ".buffer = " << ssbo.name << ";" << std::endl;
<< this->texture_samplers[i].name << ".atomic.buffer = " << ssbo.name << ";"
<< std::endl;
out << "\t" << get_shader_stage_instance_name(shader_stage) << "."
<< this->texture_samplers[i].name << ".aligned_width = uniforms->"
<< this->texture_samplers[i].name << ".atomic.aligned_width = uniforms->"
<< this->texture_samplers[i].name << "_metadata.w;" << std::endl;
/* Buffer-backed 2D Array and 3D texture types are not natively supported so texture size
@@ -3598,7 +3256,7 @@ std::string MSLGeneratorInterface::generate_msl_texture_vars(ShaderStage shader_
ImageType::INT_3D_ATOMIC))
{
out << "\t" << get_shader_stage_instance_name(shader_stage) << "."
<< this->texture_samplers[i].name << ".texture_size = ushort3(uniforms->"
<< this->texture_samplers[i].name << ".atomic.texture_size = ushort3(uniforms->"
<< this->texture_samplers[i].name << "_metadata.xyz);" << std::endl;
}
}
@@ -4022,128 +3680,128 @@ std::string MSLTextureResource::get_msl_wrapper_type_str() const
/* Add Types as needed. */
switch (this->type) {
case ImageType::FLOAT_1D: {
return "_mtl_combined_image_sampler_1d";
return "_mtl_sampler_1d";
}
case ImageType::FLOAT_2D: {
return "_mtl_combined_image_sampler_2d";
return "_mtl_sampler_2d";
}
case ImageType::FLOAT_3D: {
return "_mtl_combined_image_sampler_3d";
return "_mtl_sampler_3d";
}
case ImageType::FLOAT_CUBE: {
return "_mtl_combined_image_sampler_cube";
return "_mtl_sampler_cube";
}
case ImageType::FLOAT_1D_ARRAY: {
return "_mtl_combined_image_sampler_1d_array";
return "_mtl_sampler_1d_array";
}
case ImageType::FLOAT_2D_ARRAY: {
return "_mtl_combined_image_sampler_2d_array";
return "_mtl_sampler_2d_array";
}
case ImageType::FLOAT_CUBE_ARRAY: {
return "_mtl_combined_image_sampler_cube_array";
return "_mtl_sampler_cube_array";
}
case ImageType::FLOAT_BUFFER: {
return "_mtl_combined_image_sampler_buffer";
return "_mtl_sampler_buffer";
}
case ImageType::DEPTH_2D: {
return "_mtl_combined_image_sampler_depth_2d";
return "_mtl_sampler_depth_2d";
}
case ImageType::SHADOW_2D: {
return "_mtl_combined_image_sampler_depth_2d";
return "_mtl_sampler_depth_2d";
}
case ImageType::DEPTH_2D_ARRAY: {
return "_mtl_combined_image_sampler_depth_2d_array";
return "_mtl_sampler_depth_2d_array";
}
case ImageType::SHADOW_2D_ARRAY: {
return "_mtl_combined_image_sampler_depth_2d_array";
return "_mtl_sampler_depth_2d_array";
}
case ImageType::DEPTH_CUBE: {
return "_mtl_combined_image_sampler_depth_cube";
return "_mtl_sampler_depth_cube";
}
case ImageType::SHADOW_CUBE: {
return "_mtl_combined_image_sampler_depth_cube";
return "_mtl_sampler_depth_cube";
}
case ImageType::DEPTH_CUBE_ARRAY: {
return "_mtl_combined_image_sampler_depth_cube_array";
return "_mtl_sampler_depth_cube_array";
}
case ImageType::SHADOW_CUBE_ARRAY: {
return "_mtl_combined_image_sampler_depth_cube_array";
return "_mtl_sampler_depth_cube_array";
}
case ImageType::INT_1D: {
return "_mtl_combined_image_sampler_1d";
return "_mtl_sampler_1d";
}
case ImageType::INT_2D: {
return "_mtl_combined_image_sampler_2d";
return "_mtl_sampler_2d";
}
case ImageType::INT_3D: {
return "_mtl_combined_image_sampler_3d";
return "_mtl_sampler_3d";
}
case ImageType::INT_CUBE: {
return "_mtl_combined_image_sampler_cube";
return "_mtl_sampler_cube";
}
case ImageType::INT_1D_ARRAY: {
return "_mtl_combined_image_sampler_1d_array";
return "_mtl_sampler_1d_array";
}
case ImageType::INT_2D_ARRAY: {
return "_mtl_combined_image_sampler_2d_array";
return "_mtl_sampler_2d_array";
}
case ImageType::INT_CUBE_ARRAY: {
return "_mtl_combined_image_sampler_cube_array";
return "_mtl_sampler_cube_array";
}
case ImageType::INT_BUFFER: {
return "_mtl_combined_image_sampler_buffer";
return "_mtl_sampler_buffer";
}
case ImageType::UINT_1D: {
return "_mtl_combined_image_sampler_1d";
return "_mtl_sampler_1d";
}
case ImageType::UINT_2D: {
return "_mtl_combined_image_sampler_2d";
return "_mtl_sampler_2d";
}
case ImageType::UINT_3D: {
return "_mtl_combined_image_sampler_3d";
return "_mtl_sampler_3d";
}
case ImageType::UINT_CUBE: {
return "_mtl_combined_image_sampler_cube";
return "_mtl_sampler_cube";
}
case ImageType::UINT_1D_ARRAY: {
return "_mtl_combined_image_sampler_1d_array";
return "_mtl_sampler_1d_array";
}
case ImageType::UINT_2D_ARRAY: {
return "_mtl_combined_image_sampler_2d_array";
return "_mtl_sampler_2d_array";
}
case ImageType::UINT_CUBE_ARRAY: {
return "_mtl_combined_image_sampler_cube_array";
return "_mtl_sampler_cube_array";
}
case ImageType::UINT_BUFFER: {
return "_mtl_combined_image_sampler_buffer";
return "_mtl_sampler_buffer";
}
/* If native texture atomics are unsupported, map types to fallback atomic structures which
* contain a buffer pointer and metadata members for size and alignment. */
case ImageType::INT_2D_ATOMIC:
case ImageType::UINT_2D_ATOMIC: {
if (supports_native_atomics) {
return "_mtl_combined_image_sampler_2d";
return "_mtl_sampler_2d";
}
else {
return "_mtl_combined_image_sampler_2d_atomic_fallback";
return "_mtl_sampler_2d_atomic";
}
}
case ImageType::INT_3D_ATOMIC:
case ImageType::UINT_3D_ATOMIC: {
if (supports_native_atomics) {
return "_mtl_combined_image_sampler_3d";
return "_mtl_sampler_3d";
}
else {
return "_mtl_combined_image_sampler_3d_atomic_fallback";
return "_mtl_sampler_3d_atomic";
}
}
case ImageType::INT_2D_ARRAY_ATOMIC:
case ImageType::UINT_2D_ARRAY_ATOMIC: {
if (supports_native_atomics) {
return "_mtl_combined_image_sampler_2d_array";
return "_mtl_sampler_2d_array";
}
else {
return "_mtl_combined_image_sampler_2d_array_atomic_fallback";
return "_mtl_sampler_2d_array_atomic";
}
}
default: {

View File

@@ -427,24 +427,6 @@ static void detect_workarounds()
{
GLContext::unused_fb_slot_workaround = true;
}
/* dFdx/dFdy calculation factors, those are dependent on driver. */
if (GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_ANY) && strstr(version, "3.3.10750"))
{
GLContext::derivative_signs[0] = 1.0;
GLContext::derivative_signs[1] = -1.0;
}
else if (GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_WIN, GPU_DRIVER_ANY)) {
if (strstr(version, "4.0.0 - Build 10.18.10.3308") ||
strstr(version, "4.0.0 - Build 9.18.10.3186") ||
strstr(version, "4.0.0 - Build 9.18.10.3165") ||
strstr(version, "3.1.0 - Build 9.17.10.3347") ||
strstr(version, "3.1.0 - Build 9.17.10.4101") ||
strstr(version, "3.3.0 - Build 8.15.10.2618"))
{
GLContext::derivative_signs[0] = -1.0;
GLContext::derivative_signs[1] = 1.0;
}
}
/* Draw shader parameters are broken on Qualcomm Windows ARM64 devices
* on Mesa version < 24.0.0 */
@@ -560,7 +542,6 @@ bool GLContext::texture_filter_anisotropic_support = false;
bool GLContext::debug_layer_workaround = false;
bool GLContext::unused_fb_slot_workaround = false;
bool GLContext::generate_mipmap_workaround = false;
float GLContext::derivative_signs[2] = {1.0f, 1.0f};
void GLBackend::capabilities_init()
{

View File

@@ -67,7 +67,6 @@ class GLContext : public Context {
static bool debug_layer_workaround;
static bool unused_fb_slot_workaround;
static bool generate_mipmap_workaround;
static float derivative_signs[2];
/** VBO for missing vertex attribute binding. Avoid undefined behavior on some implementation. */
GLuint default_attr_vbo_;

View File

@@ -1059,10 +1059,6 @@ static const char *glsl_patch_default_get()
/* Array compatibility. */
ss << "#define gpu_Array(_type) _type[]\n";
/* Derivative sign can change depending on implementation. */
ss << "#define DFDX_SIGN " << std::setprecision(2) << GLContext::derivative_signs[0] << "\n";
ss << "#define DFDY_SIGN " << std::setprecision(2) << GLContext::derivative_signs[1] << "\n";
/* GLSL Backend Lib. */
ss << datatoc_glsl_shader_defines_glsl;

View File

@@ -803,7 +803,7 @@ mat3x3 rotate(mat3x3 mat, EulerXYZ rotation)
mat4x4 rotate(mat4x4 mat, AxisAngle rotation)
{
mat4x4 result = mat4x4(rotate(mat3x3(mat), rotation));
mat4x4 result = to_float4x4(rotate(to_float3x3(mat), rotation));
result[0][3] = mat[0][3];
result[1][3] = mat[1][3];
result[2][3] = mat[2][3];
@@ -815,7 +815,7 @@ mat4x4 rotate(mat4x4 mat, AxisAngle rotation)
}
mat4x4 rotate(mat4x4 mat, EulerXYZ rotation)
{
mat4x4 result = mat4x4(rotate(mat3x3(mat), rotation));
mat4x4 result = to_float4x4(rotate(to_float3x3(mat), rotation));
result[0][3] = mat[0][3];
result[1][3] = mat[1][3];
result[2][3] = mat[2][3];
@@ -992,38 +992,38 @@ mat3x3 from_rot_scale(AxisAngle rotation, vec3 scale)
mat4x4 from_loc_rot(vec3 location, EulerXYZ rotation)
{
mat4x4 ret = mat4x4(from_rotation(rotation));
mat4x4 ret = to_float4x4(from_rotation(rotation));
ret[3].xyz = location;
return ret;
}
mat4x4 from_loc_rot(vec3 location, Quaternion rotation)
{
mat4x4 ret = mat4x4(from_rotation(rotation));
mat4x4 ret = to_float4x4(from_rotation(rotation));
ret[3].xyz = location;
return ret;
}
mat4x4 from_loc_rot(vec3 location, AxisAngle rotation)
{
mat4x4 ret = mat4x4(from_rotation(rotation));
mat4x4 ret = to_float4x4(from_rotation(rotation));
ret[3].xyz = location;
return ret;
}
mat4x4 from_loc_rot_scale(vec3 location, EulerXYZ rotation, vec3 scale)
{
mat4x4 ret = mat4x4(from_rot_scale(rotation, scale));
mat4x4 ret = to_float4x4(from_rot_scale(rotation, scale));
ret[3].xyz = location;
return ret;
}
mat4x4 from_loc_rot_scale(vec3 location, Quaternion rotation, vec3 scale)
{
mat4x4 ret = mat4x4(from_rot_scale(rotation, scale));
mat4x4 ret = to_float4x4(from_rot_scale(rotation, scale));
ret[3].xyz = location;
return ret;
}
mat4x4 from_loc_rot_scale(vec3 location, AxisAngle rotation, vec3 scale)
{
mat4x4 ret = mat4x4(from_rot_scale(rotation, scale));
mat4x4 ret = to_float4x4(from_rot_scale(rotation, scale));
ret[3].xyz = location;
return ret;
}
@@ -1079,11 +1079,11 @@ EulerXYZ to_euler(mat3x3 mat, const bool normalized)
}
EulerXYZ to_euler(mat4x4 mat)
{
return to_euler(mat3(mat));
return to_euler(to_float3x3(mat));
}
EulerXYZ to_euler(mat4x4 mat, const bool normalized)
{
return to_euler(mat3(mat), normalized);
return to_euler(to_float3x3(mat), normalized);
}
Quaternion normalized_to_quat_fast(mat3 mat)
@@ -1193,11 +1193,11 @@ Quaternion to_quaternion(mat3x3 mat, const bool normalized)
}
Quaternion to_quaternion(mat4x4 mat)
{
return to_quaternion(mat3(mat));
return to_quaternion(to_float3x3(mat));
}
Quaternion to_quaternion(mat4x4 mat, const bool normalized)
{
return to_quaternion(mat3(mat), normalized);
return to_quaternion(to_float3x3(mat), normalized);
}
vec3 to_scale(mat3x3 mat)
@@ -1216,11 +1216,11 @@ vec3 to_scale(mat3x3 mat, const bool allow_negative_scale)
}
vec3 to_scale(mat4x4 mat)
{
return to_scale(mat3(mat));
return to_scale(to_float3x3(mat));
}
vec3 to_scale(mat4x4 mat, const bool allow_negative_scale)
{
return to_scale(mat3(mat), allow_negative_scale);
return to_scale(to_float3x3(mat), allow_negative_scale);
}
void to_rot_scale(mat3x3 mat, out EulerXYZ r_rotation, out vec3 r_scale)
@@ -1265,7 +1265,7 @@ void to_rot_scale(mat3x3 mat,
void to_loc_rot_scale(mat4x4 mat, out vec3 r_location, out EulerXYZ r_rotation, out vec3 r_scale)
{
r_location = mat[3].xyz;
to_rot_scale(mat3(mat), r_rotation, r_scale);
to_rot_scale(to_float3x3(mat), r_rotation, r_scale);
}
void to_loc_rot_scale(mat4x4 mat,
out vec3 r_location,
@@ -1274,12 +1274,12 @@ void to_loc_rot_scale(mat4x4 mat,
const bool allow_negative_scale)
{
r_location = mat[3].xyz;
to_rot_scale(mat3(mat), r_rotation, r_scale, allow_negative_scale);
to_rot_scale(to_float3x3(mat), r_rotation, r_scale, allow_negative_scale);
}
void to_loc_rot_scale(mat4x4 mat, out vec3 r_location, out Quaternion r_rotation, out vec3 r_scale)
{
r_location = mat[3].xyz;
to_rot_scale(mat3(mat), r_rotation, r_scale);
to_rot_scale(to_float3x3(mat), r_rotation, r_scale);
}
void to_loc_rot_scale(mat4x4 mat,
out vec3 r_location,
@@ -1288,7 +1288,7 @@ void to_loc_rot_scale(mat4x4 mat,
const bool allow_negative_scale)
{
r_location = mat[3].xyz;
to_rot_scale(mat3(mat), r_rotation, r_scale, allow_negative_scale);
to_rot_scale(to_float3x3(mat), r_rotation, r_scale, allow_negative_scale);
}
vec3 transform_point(mat3x3 mat, vec3 point)
@@ -1308,7 +1308,7 @@ vec3 transform_direction(mat3x3 mat, vec3 direction)
vec3 transform_direction(mat4x4 mat, vec3 direction)
{
return mat3x3(mat) * direction;
return to_float3x3(mat) * direction;
}
vec2 project_point(mat3x3 mat, vec2 point)
@@ -1423,7 +1423,7 @@ bool is_negative(mat3x3 mat)
}
bool is_negative(mat4x4 mat)
{
return is_negative(mat3x3(mat));
return is_negative(to_float3x3(mat));
}
bool is_equal(mat2x2 a, mat2x2 b, float epsilon)
@@ -1505,15 +1505,15 @@ bool is_uniformly_scaled(mat3x3 mat)
bool is_orthogonal(mat4x4 mat)
{
return is_orthogonal(mat3x3(mat));
return is_orthogonal(to_float3x3(mat));
}
bool is_orthonormal(mat4x4 mat)
{
return is_orthonormal(mat3x3(mat));
return is_orthonormal(to_float3x3(mat));
}
bool is_uniformly_scaled(mat4x4 mat)
{
return is_uniformly_scaled(mat3x3(mat));
return is_uniformly_scaled(to_float3x3(mat));
}
/* Returns true if each individual columns are unit scaled. Mainly for assert usage. */

View File

@@ -85,12 +85,15 @@ vec2 do_tria()
float size = (tria2) ? -tria2Size : tria1Size;
vec2 center = (tria2) ? tria2Center : tria1Center;
vec2 arrow_pos[4] = vec2[4](vec2(0.0, 0.6), vec2(0.6, 0.0), vec2(-0.6, 0.0), vec2(0.0, -0.6));
vec2 arrow_pos[4] = float2_array(
vec2(0.0, 0.6), vec2(0.6, 0.0), vec2(-0.6, 0.0), vec2(0.0, -0.6));
/* Rotated uv space by 45deg and mirrored. */
vec2 arrow_uvs[4] = vec2[4](vec2(0.0, 0.85), vec2(0.85, 0.85), vec2(0.0, 0.0), vec2(0.0, 0.85));
vec2 arrow_uvs[4] = float2_array(
vec2(0.0, 0.85), vec2(0.85, 0.85), vec2(0.0, 0.0), vec2(0.0, 0.85));
vec2 point_pos[4] = vec2[4](vec2(-1.0, -1.0), vec2(-1.0, 1.0), vec2(1.0, -1.0), vec2(1.0, 1.0));
vec2 point_uvs[4] = vec2[4](vec2(0.0, 0.0), vec2(0.0, 1.0), vec2(1.0, 0.0), vec2(1.0, 1.0));
vec2 point_pos[4] = float2_array(
vec2(-1.0, -1.0), vec2(-1.0, 1.0), vec2(1.0, -1.0), vec2(1.0, 1.0));
vec2 point_uvs[4] = float2_array(vec2(0.0, 0.0), vec2(0.0, 1.0), vec2(1.0, 0.0), vec2(1.0, 1.0));
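/* Note: the `float_array`/`float2_array`/`float4_array` helpers used here come from the shared
* compatibility defines and are not defined in this diff; presumably they expand to a plain GLSL
* array constructor such as `vec2[](...)` on the GL side while mapping to a C++-compatible
* initializer for MSL. */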
/* We reuse the SDF round-box rendering of widget to render the tria shapes.
* This means we do clever tricks to position the rectangle the way we want using

View File

@@ -30,44 +30,44 @@ void main()
{
/* NOTE(Metal): Declaring constant array in function scope to avoid increasing local shader
* memory pressure. */
const vec2 cornervec[36] = vec2[36](vec2(0.0, 1.0),
vec2(0.02, 0.805),
vec2(0.067, 0.617),
vec2(0.169, 0.45),
vec2(0.293, 0.293),
vec2(0.45, 0.169),
vec2(0.617, 0.076),
vec2(0.805, 0.02),
vec2(1.0, 0.0),
vec2(-1.0, 0.0),
vec2(-0.805, 0.02),
vec2(-0.617, 0.067),
vec2(-0.45, 0.169),
vec2(-0.293, 0.293),
vec2(-0.169, 0.45),
vec2(-0.076, 0.617),
vec2(-0.02, 0.805),
vec2(0.0, 1.0),
vec2(0.0, -1.0),
vec2(-0.02, -0.805),
vec2(-0.067, -0.617),
vec2(-0.169, -0.45),
vec2(-0.293, -0.293),
vec2(-0.45, -0.169),
vec2(-0.617, -0.076),
vec2(-0.805, -0.02),
vec2(-1.0, 0.0),
vec2(1.0, 0.0),
vec2(0.805, -0.02),
vec2(0.617, -0.067),
vec2(0.45, -0.169),
vec2(0.293, -0.293),
vec2(0.169, -0.45),
vec2(0.076, -0.617),
vec2(0.02, -0.805),
vec2(0.0, -1.0));
const vec2 cornervec[36] = float2_array(vec2(0.0, 1.0),
vec2(0.02, 0.805),
vec2(0.067, 0.617),
vec2(0.169, 0.45),
vec2(0.293, 0.293),
vec2(0.45, 0.169),
vec2(0.617, 0.076),
vec2(0.805, 0.02),
vec2(1.0, 0.0),
vec2(-1.0, 0.0),
vec2(-0.805, 0.02),
vec2(-0.617, 0.067),
vec2(-0.45, 0.169),
vec2(-0.293, 0.293),
vec2(-0.169, 0.45),
vec2(-0.076, 0.617),
vec2(-0.02, 0.805),
vec2(0.0, 1.0),
vec2(0.0, -1.0),
vec2(-0.02, -0.805),
vec2(-0.067, -0.617),
vec2(-0.169, -0.45),
vec2(-0.293, -0.293),
vec2(-0.45, -0.169),
vec2(-0.617, -0.076),
vec2(-0.805, -0.02),
vec2(-1.0, 0.0),
vec2(1.0, 0.0),
vec2(0.805, -0.02),
vec2(0.617, -0.067),
vec2(0.45, -0.169),
vec2(0.293, -0.293),
vec2(0.169, -0.45),
vec2(0.076, -0.617),
vec2(0.02, -0.805),
vec2(0.0, -1.0));
const vec2 center_offset[4] = vec2[4](
const vec2 center_offset[4] = float2_array(
vec2(1.0, 1.0), vec2(-1.0, 1.0), vec2(-1.0, -1.0), vec2(1.0, -1.0));
uint cflag = vflag & CNR_FLAG_RANGE;

View File

@@ -338,8 +338,8 @@ vec3 dF_impl(vec3 v)
void dF_branch(float fn, out vec2 result)
{
result.x = DFDX_SIGN * dFdx(fn);
result.y = DFDY_SIGN * dFdy(fn);
result.x = dFdx(fn);
result.y = dFdy(fn);
}
#else
@@ -349,10 +349,10 @@ int g_derivative_flag = 0;
vec3 dF_impl(vec3 v)
{
if (g_derivative_flag > 0) {
return DFDX_SIGN * dFdx(v);
return dFdx(v);
}
else if (g_derivative_flag < 0) {
return DFDY_SIGN * dFdy(v);
return dFdy(v);
}
return vec3(0.0);
}

View File

@@ -133,7 +133,7 @@ void main()
/* 3x3 blur */
/* clang-format off */
const float weights3x3[16] = float[16](
const float weights3x3[16] = float_array(
1.0, 2.0, 1.0, 0.0,
2.0, 4.0, 2.0, 0.0,
1.0, 2.0, 1.0, 0.0,
@@ -169,7 +169,7 @@ void main()
/* 5x5 blur */
/* clang-format off */
const float weights5x5[36] = float[36](
const float weights5x5[36] = float_array(
1.0, 2.0, 2.0, 2.0, 1.0, 0.0,
2.0, 5.0, 6.0, 5.0, 2.0, 0.0,
2.0, 6.0, 8.0, 6.0, 2.0, 0.0,

View File

@@ -7,45 +7,45 @@
void normal_transform_object_to_world(vec3 vin, out vec3 vout)
{
/* Expansion of NormalMatrix. */
vout = vin * mat3(ModelMatrixInverse);
vout = vin * to_float3x3(ModelMatrixInverse);
}
void normal_transform_world_to_object(vec3 vin, out vec3 vout)
{
/* Expansion of NormalMatrixInverse. */
vout = vin * mat3(ModelMatrix);
vout = vin * to_float3x3(ModelMatrix);
}
void direction_transform_object_to_world(vec3 vin, out vec3 vout)
{
vout = mat3x3(ModelMatrix) * vin;
vout = to_float3x3(ModelMatrix) * vin;
}
void direction_transform_object_to_view(vec3 vin, out vec3 vout)
{
vout = mat3x3(ModelMatrix) * vin;
vout = mat3x3(ViewMatrix) * vout;
vout = to_float3x3(ModelMatrix) * vin;
vout = to_float3x3(ViewMatrix) * vout;
}
void direction_transform_view_to_world(vec3 vin, out vec3 vout)
{
vout = mat3x3(ViewMatrixInverse) * vin;
vout = to_float3x3(ViewMatrixInverse) * vin;
}
void direction_transform_view_to_object(vec3 vin, out vec3 vout)
{
vout = mat3x3(ViewMatrixInverse) * vin;
vout = mat3x3(ModelMatrixInverse) * vout;
vout = to_float3x3(ViewMatrixInverse) * vin;
vout = to_float3x3(ModelMatrixInverse) * vout;
}
void direction_transform_world_to_view(vec3 vin, out vec3 vout)
{
vout = mat3x3(ViewMatrix) * vin;
vout = to_float3x3(ViewMatrix) * vin;
}
void direction_transform_world_to_object(vec3 vin, out vec3 vout)
{
vout = mat3x3(ModelMatrixInverse) * vin;
vout = to_float3x3(ModelMatrixInverse) * vin;
}
void point_transform_object_to_world(vec3 vin, out vec3 vout)

View File

@@ -13,13 +13,37 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wcomment"
#define ENABLE_IF(cond) thread metal::enable_if_t<(cond)> * = nullptr
/* Base instance with offsets. */
#define gpu_BaseInstance gl_BaseInstanceARB
#define gpu_InstanceIndex (gl_InstanceID + gpu_BaseInstance)
/* derivative signs. */
#define DFDX_SIGN 1.0
#define DFDY_SIGN 1.0
#ifdef MTL_WORKGROUP_SIZE_X
/* Older Metal compiler versions don't treat vector component access as constexpr.
* We have to make a wrapper class for that, otherwise we cannot use WorkGroupSize for sizing
* threadgroup arrays. Note that this bug is not present in version 4.1 of the compiler. */
struct mtl_WorkGroupSize {
union {
struct {
uint x, y, z;
};
uint2 xy;
uint3 xyz;
};
constexpr mtl_WorkGroupSize()
: x(MTL_WORKGROUP_SIZE_X), y(MTL_WORKGROUP_SIZE_Y), z(MTL_WORKGROUP_SIZE_Z)
{
}
constexpr inline operator uint3() const
{
return xyz;
}
};
# define gl_WorkGroupSize mtl_WorkGroupSize()
#endif
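/* Illustrative use (assumption, not part of this diff): because each component above is constexpr,
* a declaration such as `threadgroup float cache[gl_WorkGroupSize.x * gl_WorkGroupSize.y];`
* compiles even on Metal compiler versions older than 4.1. */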
/* Type definitions. */
/* int implicitly cast to bool in MSL. */
@@ -53,64 +77,9 @@ using bvec3 = bool3;
using bvec4 = bool4;
/* Compute decorators. */
#define TG threadgroup
#define barrier() \
threadgroup_barrier(mem_flags::mem_threadgroup | mem_flags::mem_device | mem_flags::mem_texture)
#ifdef MTL_USE_WORKGROUP_SIZE
/* Compute work-group size. */
struct constexp_uvec3 {
/* Type union to cover all syntax accessors:
* .x, .y, .z, .xy, .xyz
* Swizzle types invalid.*/
union {
struct {
uint x, y, z;
};
struct {
uint2 xy;
};
uint3 xyz;
};
constexpr constexp_uvec3(uint _x, uint _y, uint _z) : x(_x), y(_y), z(_z) {}
constexpr uint operator[](int i)
{
/* Note: Need to switch on each elem value as array accessor triggers
* non-constant sizing error. This will be statically evaluated at compile time. */
switch (i) {
case 0:
return x;
case 1:
return y;
case 2:
return z;
default:
return 0;
}
}
constexpr inline operator uint3() const
{
return xyz;
}
constexpr inline operator uint2() const
{
return xy;
}
constexpr inline operator uint() const
{
return x;
}
};
constexpr constexp_uvec3 __internal_workgroupsize_get()
{
return constexp_uvec3(MTL_WORKGROUP_SIZE_X, MTL_WORKGROUP_SIZE_Y, MTL_WORKGROUP_SIZE_Z);
}
# define gl_WorkGroupSize __internal_workgroupsize_get()
#endif
/** Shader atomics:
* In order to emulate GLSL-style atomic operations, wherein variables can be used within atomic
* operations, even if they are not explicitly declared atomic, we can cast the pointer to atomic,
@@ -118,162 +87,652 @@ constexpr constexp_uvec3 __internal_workgroupsize_get()
*
* NOTE: We cannot hoist the address space into the template declaration, so these must be declared
* for each relevant address space. */
#define ATOMIC_OP_EX(qualifier, glsl_op, mtl_op) \
template<typename T> T atomic##glsl_op(qualifier T &mem, T data) \
{ \
return atomic_##mtl_op##_explicit((qualifier _atomic<T> *)&mem, data, memory_order_relaxed); \
}
/* Thread-group memory. */
template<typename T> T atomicMax(threadgroup T &mem, T data)
{
return atomic_fetch_max_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicMin(threadgroup T &mem, T data)
{
return atomic_fetch_min_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicAdd(threadgroup T &mem, T data)
{
return atomic_fetch_add_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicSub(threadgroup T &mem, T data)
{
return atomic_fetch_sub_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicAnd(threadgroup T &mem, T data)
{
return atomic_fetch_and_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicOr(threadgroup T &mem, T data)
{
return atomic_fetch_or_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicXor(threadgroup T &mem, T data)
{
return atomic_fetch_xor_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicExchange(threadgroup T &mem, T data)
{
return atomic_exchange_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
}
#define ATOMIC_OP(glsl_op, mtl_op) \
ATOMIC_OP_EX(threadgroup, glsl_op, mtl_op) \
ATOMIC_OP_EX(device, glsl_op, mtl_op)
/* Device memory. */
template<typename T> T atomicMax(device T &mem, T data)
{
return atomic_fetch_max_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicMin(device T &mem, T data)
{
return atomic_fetch_min_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicAdd(device T &mem, T data)
{
return atomic_fetch_add_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicSub(device T &mem, T data)
{
return atomic_fetch_sub_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicAnd(device T &mem, T data)
{
return atomic_fetch_and_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicOr(device T &mem, T data)
{
return atomic_fetch_or_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicXor(device T &mem, T data)
{
return atomic_fetch_xor_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
template<typename T> T atomicExchange(device T &mem, T data)
{
return atomic_exchange_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
}
ATOMIC_OP(Max, fetch_max)
ATOMIC_OP(Min, fetch_min)
ATOMIC_OP(Add, fetch_add)
ATOMIC_OP(Sub, fetch_sub)
ATOMIC_OP(And, fetch_and)
ATOMIC_OP(Or, fetch_or)
ATOMIC_OP(Xor, fetch_xor)
ATOMIC_OP(Exchange, exchange)
/* Used to replace 'out' in function parameters with a thread-local reference,
* shortened to avoid expanding the GLSL source string. */
#define THD thread
#define OUT(type, name, array) thread type(&name)[array]
#undef ATOMIC_OP
/* Generate wrapper structs for combined texture and sampler type. */
#ifdef USE_ARGUMENT_BUFFER_FOR_SAMPLERS
# define SAMPLER_DECLARATION constant sampler *samp;
using sampler_ptr = constant sampler *;
#else
# define SAMPLER_DECLARATION thread sampler *samp;
using sampler_ptr = thread sampler *;
#endif
#define COMBINED_SAMPLER_TYPE(STRUCT_NAME, TEX_TYPE) \
template<typename T, access A = access::sample> struct STRUCT_NAME { \
thread TEX_TYPE<T, A> *texture; \
SAMPLER_DECLARATION \
/* Use point sampler instead of texture read to benefit from texture caching and reduce branching
* through removal of bounds tests, as these are handled by the sample operation. */
constexpr sampler _mtl_fetch_samp(address::clamp_to_zero, filter::nearest, coord::pixel);
template<typename T,
access A,
typename TextureT,
bool is_depth,
int Dim,
int Cube,
int Array,
bool Atomic = false>
struct _mtl_sampler {
template<typename U, int S>
using vec_or_scalar = typename metal::conditional<S == 1, U, vec<U, S>>::type;
using FltCoord = vec_or_scalar<float, Dim + Cube + Array>;
using FltDeriv = vec_or_scalar<float, Dim + Cube>;
using IntCoord = vec_or_scalar<int, Dim + Cube + Array>;
using IntDeriv = vec_or_scalar<int, Dim + Cube>;
using UintCoord = vec_or_scalar<uint, Dim + Cube + Array>;
using UintDeriv = vec_or_scalar<uint, Dim + Cube>;
using SizeVec = vec_or_scalar<int, Dim + Array>;
using DataVec = vec<T, 4>;
using AtomicT = T;
/* Template-compatible gradient type selection. */
template<int D, int C> struct gradient_n {};
/* clang-format off */
template<> struct gradient_n<2, 0> { using type = gradient2d; };
template<> struct gradient_n<3, 0> { using type = gradient3d; };
template<> struct gradient_n<2, 1> { using type = gradientcube; };
/* clang-format on */
/* Using `using` would invalidate the whole class. */
#define gradient typename gradient_n<Dim, Cube>::type
#ifndef MTL_SUPPORTS_TEXTURE_ATOMICS
/* If native texture atomics are unsupported, we instead utilize a custom type which wraps a
* buffer-backed texture. This texture will always be a Texture2D, but will emulate access to
* Texture3D and Texture2DArray by stacking layers.
* Access pattern will be derived based on the source type. 2DArray and 3D atomic texture
* support will require information on the size of each layer within the source 2D texture.
*
* A device pointer to the backing buffer will also be available for the atomic operations.
* NOTE: For atomic ops, it will only be valid to use access::read_write.
* We still need to use the wrapped type for access::sample, as texture2DArray and texture3D
* will require access indirection.
*
* NOTE: Only type of UINT is valid, but full template provided to match syntax of standard
* textures. */
template<typename U, int D, bool At> struct AtomicEmulation {};
template<typename U> struct AtomicEmulation<U, 2, true> {
/** Buffer to do atomic operations on. */
device U *buffer;
/* Aligned width matches the number of buffer elements in bytes_per_row. This may be greater
* than the texture's native width to satisfy device alignment rules. We need to use the padded
* width when writing to ensure the correct writing location aligns with a given pixel location
* in the texture. */
ushort aligned_width;
int2 to_internal_coord(IntCoord coord) const
{
return coord.xy;
}
uint to_linear_coord(IntCoord coord) const
{
return coord.x + coord.y * aligned_width;
}
};
template<typename U> struct AtomicEmulation<U, 3, true> {
/** Buffer to do atomic operations on. */
device U *buffer;
/** Required to compute the pixel location inside the backing 2D texture space, and for texture size queries. */
ushort3 texture_size;
/* Aligned width matches the number of buffer elements in bytes_per_row. This may be greater
* than the texture's native width to satisfy device alignment rules. We need to use the padded
* width when writing to ensure the correct writing location aligns with a given pixel location
* in the texture. */
ushort aligned_width;
int2 to_internal_coord(IntCoord coord) const
{
return int2(coord.x, coord.y + texture_size.y * coord.z);
}
uint to_linear_coord(IntCoord coord) const
{
uint row = coord.y + coord.z * texture_size.y;
return coord.x + row * aligned_width;
}
};
AtomicEmulation<T, Dim + Array, Atomic> atomic;
#endif
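/* Worked example for the 3D emulation path above: with a per-layer size of
* texture_size = (W, H, D), texel (x, y, z) lives at 2D location (x, y + H * z) inside the backing
* texture, and its buffer element for atomic operations is x + (y + z * H) * aligned_width, where
* aligned_width >= W because of row alignment. */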
thread TextureT *texture;
sampler_ptr samp;
template<int dim, int array, bool At = false> SizeVec size_impl(uint lod) const {}
template<> SizeVec size_impl<1, 0>(uint lod) const
{
return SizeVec(texture->get_width());
}
template<> SizeVec size_impl<1, 1>(uint lod) const
{
return SizeVec(texture->get_width(), texture->get_array_size());
}
template<> SizeVec size_impl<2, 0>(uint lod) const
{
return SizeVec(texture->get_width(lod), texture->get_height(lod));
}
template<> SizeVec size_impl<2, 1>(uint lod) const
{
return SizeVec(texture->get_width(lod), texture->get_height(lod), texture->get_array_size());
}
template<> SizeVec size_impl<3, 0>(uint lod) const
{
return SizeVec(texture->get_width(lod), texture->get_height(lod), texture->get_depth(lod));
}
#ifndef MTL_SUPPORTS_TEXTURE_ATOMICS
template<> SizeVec size_impl<2, 1, true>(uint lod) const
{
return SizeVec(atomic.texture_size);
}
template<> SizeVec size_impl<3, 0, true>(uint lod) const
{
return SizeVec(atomic.texture_size);
}
#endif
SizeVec size(int lod = 0) const
{
return size_impl<Dim, Array, Atomic>(uint(lod));
}
/* If native texture atomics are unsupported, we instead utilize a custom type which wraps a
* buffer-backed texture. This texture will always be a Texture2D, but will emulate access to
* Texture3D and Texture2DArray by stacking layers.
* Access pattern will be derived based on the source type. 2DArray and 3D atomic texture
* support will require information on the size of each layer within the source 2D texture.
*
* A device pointer to the backing buffer will also be available for the atomic operations.
* NOTE: For atomic ops, it will only be valid to use access::read_write.
* We still need to use the wrapped type for access::sample, as texture2DArray and texture3D
* will require access indirection.
*
* NOTE: Only type of UINT is valid, but full template provided to match syntax of standard
* textures. */
#define ARRAY_FN \
template<int Ar = Array, bool At = Atomic, ENABLE_IF(Ar == 1), ENABLE_IF(At == false)>
#define NON_ARRAY_FN \
template<int Ar = Array, bool At = Atomic, ENABLE_IF(Ar == 0), ENABLE_IF(At == false)>
NON_ARRAY_FN DataVec sample(FltCoord coord) const
{
return texture->sample(*samp, coord);
}
NON_ARRAY_FN DataVec sample_grad(FltCoord coord, FltDeriv dPdx, FltDeriv dPdy) const
{
return texture->sample(*samp, coord, gradient(dPdx, dPdy));
}
NON_ARRAY_FN DataVec sample_bias(FltCoord coord, bias lod_bias) const
{
return texture->sample(*samp, coord, lod_bias);
}
NON_ARRAY_FN DataVec sample_lod(FltCoord coord, level lod, IntDeriv offset = {0}) const
{
return texture->sample(*samp, coord, lod, offset);
}
NON_ARRAY_FN DataVec gather(FltCoord coord) const
{
return texture->gather(*samp, coord);
}
NON_ARRAY_FN DataVec fetch(IntCoord coord) const
{
return texture->sample(_mtl_fetch_samp, FltCoord(coord));
}
NON_ARRAY_FN DataVec fetch(IntCoord coord, level lod, IntDeriv offset = {0}) const
{
return texture->sample(_mtl_fetch_samp, FltCoord(coord), lod, offset);
}
ARRAY_FN DataVec sample(FltCoord coord) const
{
return texture->sample(*samp, uv_mask(coord), layer_mask(coord));
}
ARRAY_FN DataVec sample_grad(FltCoord coord, FltDeriv dPdx, FltDeriv dPdy) const
{
return texture->sample(*samp, uv_mask(coord), layer_mask(coord), gradient(dPdx, dPdy));
}
ARRAY_FN DataVec sample_bias(FltCoord coord, bias lod_bias) const
{
return texture->sample(*samp, uv_mask(coord), layer_mask(coord), lod_bias);
}
ARRAY_FN DataVec sample_lod(FltCoord coord, level lod, IntDeriv offset = {0}) const
{
return texture->sample(*samp, uv_mask(coord), layer_mask(coord), lod, offset);
}
ARRAY_FN DataVec gather(FltCoord coord) const
{
return texture->gather(*samp, uv_mask(coord), layer_mask(coord));
}
ARRAY_FN DataVec fetch(IntCoord coord) const
{
return texture->sample(_mtl_fetch_samp, uv_mask(coord), layer_mask(coord));
}
ARRAY_FN DataVec fetch(IntCoord coord, level lod, IntDeriv ofs = {0}) const
{
return texture->sample(_mtl_fetch_samp, uv_mask(coord), layer_mask(coord), lod, ofs);
}
#undef gradient
#undef ARRAY_FN
#undef NON_ARRAY_FN
/**
* Image functions.
* To be split to its own class.
*/
#define ARRAY_FN \
template<int Ar = Array, bool At = Atomic, ENABLE_IF(Ar == 1), ENABLE_IF(At == false)>
#define NON_ARRAY_FN \
template<int Ar = Array, bool At = Atomic, ENABLE_IF(Ar == 0), ENABLE_IF(At == false)>
NON_ARRAY_FN DataVec load(IntCoord coord) const
{
return texture->read(UintCoord(coord), 0);
}
NON_ARRAY_FN void store(DataVec data, IntCoord coord) const
{
texture->write(data, UintCoord(coord), 0);
}
ARRAY_FN DataVec load(IntCoord coord) const
{
return texture->read(uv_mask_img(coord), layer_mask_img(coord), 0);
}
ARRAY_FN void store(DataVec data, IntCoord coord) const
{
texture->write(data, uv_mask_img(coord), layer_mask_img(coord), 0);
}
#undef ARRAY_FN
#undef NON_ARRAY_FN
#ifndef MTL_SUPPORTS_TEXTURE_ATOMICS
/* Atomic samplers only support `textureFetch` as the texture layout doesn't allow filtering. */
template<typename T, access A = access::sample>
struct _mtl_combined_image_sampler_2d_atomic_fallback {
thread texture2d<T, A> *texture;
SAMPLER_DECLARATION
device T *buffer;
/* Aligned width matches the number of buffer elements in bytes_per_row. This may be greater than
* the texture's native width to satisfy device alignment rules. We need to use the padded width
* when writing to ensure the
* correct writing location aligns with a given pixel location in the texture. */
uint aligned_width;
};
# define ATOMIC_FN template<bool At = Atomic, ENABLE_IF(At == true)>
template<typename T, access A = access::sample>
struct _mtl_combined_image_sampler_2d_array_atomic_fallback {
thread texture2d<T, A> *texture;
SAMPLER_DECLARATION
device T *buffer;
/* Aligned width matches the number of buffer elements in bytes_per_row. This may be greater than
* the texture's native width to satisfy device alignment rules. We need to use the padded width
* when writing to ensure the
* correct writing location aligns with a given pixel location in the texture. */
uint aligned_width;
/* Texture size required to determine location offset of array layer within 2D texture space. */
ushort3 texture_size;
};
ATOMIC_FN DataVec fetch(IntCoord coord) const
{
int2 coord_2d = atomic.to_internal_coord(coord);
return texture->sample(_mtl_fetch_samp, float2(coord_2d));
}
ATOMIC_FN DataVec fetch(IntCoord coord, level lod, IntDeriv offset = {0}) const
{
int2 coord_2d = atomic.to_internal_coord(coord);
return texture->sample(_mtl_fetch_samp, float2(coord_2d), lod, offset);
}
template<typename T, access A = access::sample>
struct _mtl_combined_image_sampler_3d_atomic_fallback {
thread texture2d<T, A> *texture;
SAMPLER_DECLARATION
device T *buffer;
/* Aligned width matches the number of buffer elements in bytes_per_row. This may be greater than
* the texture's native width to satisfy device alignment rules. We need to use the padded width
* when writing to ensure the
* correct writing location aligns with a given pixel location in the texture. */
uint aligned_width;
/* Texture size required to determine location offset of array layer within 2D texture space. */
ushort3 texture_size;
};
ATOMIC_FN DataVec load(IntCoord coord) const
{
int2 coord_2d = atomic.to_internal_coord(coord);
return texture->read(uint2(coord_2d), 0);
}
ATOMIC_FN void store(DataVec data, IntCoord coord) const
{
int2 coord_2d = atomic.to_internal_coord(coord);
texture->write(data, uint2(coord_2d), 0);
}
# undef ATOMIC_FN
AtomicT atomic_min(IntCoord coord, AtomicT data) const
{
return atomicMin(atomic.buffer[atomic.to_linear_coord(coord)], data);
}
AtomicT atomic_max(IntCoord coord, AtomicT data) const
{
return atomicMax(atomic.buffer[atomic.to_linear_coord(coord)], data);
}
AtomicT atomic_add(IntCoord coord, AtomicT data) const
{
return atomicAdd(atomic.buffer[atomic.to_linear_coord(coord)], data);
}
AtomicT atomic_and(IntCoord coord, AtomicT data) const
{
return atomicAnd(atomic.buffer[atomic.to_linear_coord(coord)], data);
}
AtomicT atomic_or(IntCoord coord, AtomicT data) const
{
return atomicOr(atomic.buffer[atomic.to_linear_coord(coord)], data);
}
AtomicT atomic_xor(IntCoord coord, AtomicT data) const
{
return atomicXor(atomic.buffer[atomic.to_linear_coord(coord)], data);
}
AtomicT atomic_exchange(IntCoord coord, AtomicT data) const
{
return atomicExchange(atomic.buffer[atomic.to_linear_coord(coord)], data);
}
#else
# define NON_ARRAY_ATOMIC \
template<typename U = T, \
int Ar = Array, \
ENABLE_IF(metal::is_integral_v<U> == true), \
ENABLE_IF(sizeof(U) == 4), \
ENABLE_IF(Ar == 0)>
# define ARRAY_ATOMIC \
template<typename U = T, \
int Ar = Array, \
ENABLE_IF(metal::is_integral_v<U> == true), \
ENABLE_IF(sizeof(U) == 4), \
ENABLE_IF(Ar == 1)>
NON_ARRAY_ATOMIC AtomicT atomic_min(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_min(UintCoord(coord), data).x;
}
NON_ARRAY_ATOMIC AtomicT atomic_max(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_max(UintCoord(coord), data).x;
}
NON_ARRAY_ATOMIC AtomicT atomic_add(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_add(UintCoord(coord), data).x;
}
NON_ARRAY_ATOMIC AtomicT atomic_and(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_and(UintCoord(coord), data).x;
}
NON_ARRAY_ATOMIC AtomicT atomic_or(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_or(UintCoord(coord), data).x;
}
NON_ARRAY_ATOMIC AtomicT atomic_xor(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_xor(UintCoord(coord), data).x;
}
NON_ARRAY_ATOMIC AtomicT atomic_exchange(IntCoord coord, AtomicT data) const
{
return texture->atomic_exchange(UintCoord(coord), data).x;
}
ARRAY_ATOMIC AtomicT atomic_min(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_min(uv_mask_img(coord), layer_mask_img(coord), data).x;
}
ARRAY_ATOMIC AtomicT atomic_max(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_max(uv_mask_img(coord), layer_mask_img(coord), data).x;
}
ARRAY_ATOMIC AtomicT atomic_add(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_add(uv_mask_img(coord), layer_mask_img(coord), data).x;
}
ARRAY_ATOMIC AtomicT atomic_and(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_and(uv_mask_img(coord), layer_mask_img(coord), data).x;
}
ARRAY_ATOMIC AtomicT atomic_or(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_or(uv_mask_img(coord), layer_mask_img(coord), data).x;
}
ARRAY_ATOMIC AtomicT atomic_xor(IntCoord coord, AtomicT data) const
{
return texture->atomic_fetch_xor(uv_mask_img(coord), layer_mask_img(coord), data).x;
}
ARRAY_ATOMIC AtomicT atomic_exchange(IntCoord coord, AtomicT data) const
{
return texture->atomic_exchange(uv_mask_img(coord), layer_mask_img(coord), data).x;
}
# undef NON_ARRAY_ATOMIC
# undef ARRAY_ATOMIC
#endif
void fence()
{
texture->fence();
}
private:
template<typename U, typename V> static U reshape(V v) {}
/* clang-format off */
template<> float reshape<float>(float2 v) { return v.x; }
template<> float2 reshape<float2>(float3 v) { return v.xy; }
template<> float3 reshape<float3>(float4 v) { return v.xyz; }
template<> int reshape<int>(int2 v) { return v.x; }
template<> int2 reshape<int2>(int3 v) { return v.xy; }
template<> int3 reshape<int3>(int4 v) { return v.xyz; }
/* clang-format on */
FltDeriv uv_mask(FltCoord coord) const
{
return reshape<FltDeriv>(coord);
}
FltDeriv uv_mask(IntCoord coord) const
{
return FltDeriv(reshape<IntDeriv>(coord));
}
uint layer_mask(FltCoord coord) const
{
return coord[Dim + Cube];
}
uint layer_mask(IntCoord coord) const
{
return coord[Dim + Cube];
}
UintDeriv uv_mask_img(IntCoord coord) const
{
return UintDeriv(reshape<IntDeriv>(coord));
}
uint layer_mask_img(IntCoord coord) const
{
return coord[Dim + Cube];
}
};
/** Sampler functions */
#define SAMPLER_FN \
template<typename SamplerT, \
typename FltCoord = typename SamplerT::FltCoord, \
typename FltDeriv = typename SamplerT::FltDeriv, \
typename IntCoord = typename SamplerT::IntCoord, \
typename IntDeriv = typename SamplerT::IntDeriv, \
typename UintCoord = typename SamplerT::UintCoord, \
typename UintDeriv = typename SamplerT::UintDeriv, \
typename SizeVec = typename SamplerT::SizeVec, \
typename DataVec = typename SamplerT::DataVec>
SAMPLER_FN SizeVec textureSize(SamplerT texture, int lod)
{
return texture.size(lod);
}
SAMPLER_FN DataVec texture(SamplerT texture, FltCoord coord)
{
return texture.sample(coord);
}
SAMPLER_FN DataVec texture(SamplerT texture, FltCoord coord, float lod_bias)
{
return texture.sample_bias(coord, bias(lod_bias));
}
SAMPLER_FN DataVec textureLod(SamplerT texture, FltCoord coord, float lod)
{
return texture.sample_lod(coord, level(lod));
}
SAMPLER_FN DataVec textureLodOffset(SamplerT texture, FltCoord coord, float lod, IntDeriv offset)
{
return texture.sample_lod(coord, level(lod), offset);
}
SAMPLER_FN DataVec textureGather(SamplerT texture, FltCoord coord)
{
return texture.gather(coord);
}
SAMPLER_FN DataVec textureGrad(SamplerT texture, FltCoord coord, FltDeriv dPdx, FltDeriv dPdy)
{
return texture.sample_grad(coord, dPdx, dPdy);
}
SAMPLER_FN DataVec texelFetch(SamplerT texture, IntCoord coord, int lod)
{
return texture.fetch(coord, level(lod));
}
SAMPLER_FN DataVec texelFetchOffset(SamplerT texture, IntCoord coord, int lod, IntDeriv offset)
{
return texture.fetch(coord, level(lod), offset);
}
#undef SAMPLER_FN
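/* Example usage (illustrative): with `tex` of type `_mtl_sampler_2d<float>`, `textureSize(tex, 0)`
* yields an int2, `texture(tex, float2(0.5))` a float4 and `texelFetch(tex, int2(16, 16), 0)` a
* point-sampled float4. */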
/** Image functions */
#define IMAGE_FN \
template<typename SamplerT, \
typename IntCoord = typename SamplerT::IntCoord, \
typename IntDeriv = typename SamplerT::IntDeriv, \
typename UintCoord = typename SamplerT::UintCoord, \
typename UintDeriv = typename SamplerT::UintDeriv, \
typename SizeVec = typename SamplerT::SizeVec, \
typename DataVec = typename SamplerT::DataVec, \
typename AtomicT = typename SamplerT::AtomicT>
IMAGE_FN SizeVec imageSize(SamplerT texture)
{
return texture.size();
}
IMAGE_FN void imageFence(SamplerT texture)
{
return texture.fence();
}
IMAGE_FN DataVec imageLoad(SamplerT texture, IntCoord coord)
{
if (any(UintCoord(coord) >= UintCoord(texture.size()))) {
return DataVec(0);
}
return texture.load(coord);
}
IMAGE_FN DataVec imageLoadFast(SamplerT texture, IntCoord coord)
{
return texture.load(coord);
}
IMAGE_FN void imageStore(SamplerT texture, IntCoord coord, DataVec data)
{
if (any(UintCoord(coord) >= UintCoord(texture.size()))) {
return;
}
texture.store(data, coord);
}
IMAGE_FN
void imageStoreFast(SamplerT texture, IntCoord coord, DataVec data)
{
texture.store(data, coord);
}
IMAGE_FN AtomicT imageAtomicMin(SamplerT texture, IntCoord coord, AtomicT data)
{
return texture.atomic_min(coord, data);
}
IMAGE_FN AtomicT imageAtomicMax(SamplerT texture, IntCoord coord, AtomicT data)
{
return texture.atomic_max(coord, data);
}
IMAGE_FN AtomicT imageAtomicAdd(SamplerT texture, IntCoord coord, AtomicT data)
{
return texture.atomic_add(coord, data);
}
IMAGE_FN AtomicT imageAtomicAnd(SamplerT texture, IntCoord coord, AtomicT data)
{
return texture.atomic_and(coord, data);
}
IMAGE_FN AtomicT imageAtomicOr(SamplerT texture, IntCoord coord, AtomicT data)
{
return texture.atomic_or(coord, data);
}
IMAGE_FN AtomicT imageAtomicXor(SamplerT texture, IntCoord coord, AtomicT data)
{
return texture.atomic_xor(coord, data);
}
IMAGE_FN AtomicT imageAtomicExchange(SamplerT texture, IntCoord coord, AtomicT data)
{
return texture.atomic_exchange(coord, data);
}
#undef IMAGE_FN
/* Add any types as needed. */
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_1d, texture1d);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_1d_array, texture1d_array);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_2d, texture2d);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_depth_2d, depth2d);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_2d_array, texture2d_array);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_depth_2d_array, depth2d_array);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_3d, texture3d);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_buffer, texture_buffer);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_cube, texturecube);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_cube_array, texturecube_array);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_depth_cube, texturecube_array);
COMBINED_SAMPLER_TYPE(_mtl_combined_image_sampler_depth_cube_array, texturecube_array);
#define TEMPLATE template<typename T = float, access A = access::sample>
TEMPLATE using depth2D = _mtl_sampler<T, A, depth2d<T, A>, true, 2, 0, 0>;
TEMPLATE using depth2DArray = _mtl_sampler<T, A, depth2d_array<T, A>, true, 2, 0, 1>;
TEMPLATE using depthCube = _mtl_sampler<T, A, texturecube<T, A>, true, 2, 1, 1>;
TEMPLATE using depthCubeArray = _mtl_sampler<T, A, texturecube_array<T, A>, true, 2, 1, 1>;
TEMPLATE using sampler1D = _mtl_sampler<T, A, texture1d<T, A>, false, 1, 0, 0>;
TEMPLATE using sampler1DArray = _mtl_sampler<T, A, texture1d_array<T, A>, false, 1, 0, 1>;
TEMPLATE using sampler2D = _mtl_sampler<T, A, texture2d<T, A>, false, 2, 0, 0>;
TEMPLATE using sampler2DArray = _mtl_sampler<T, A, texture2d_array<T, A>, false, 2, 0, 1>;
TEMPLATE using sampler3D = _mtl_sampler<T, A, texture3d<T, A>, false, 3, 0, 0>;
TEMPLATE using samplerBuffer = _mtl_sampler<T, A, texture_buffer<T, A>, false, 1, 0, 0>;
TEMPLATE using samplerCube = _mtl_sampler<T, A, texturecube<T, A>, false, 2, 1, 0>;
TEMPLATE using samplerCubeArray = _mtl_sampler<T, A, texturecube_array<T, A>, false, 2, 1, 1>;
/* Atomic textures are defined as 2D textures with special layout for 3D texture emulation. */
TEMPLATE using sampler2DAtomic = _mtl_sampler<T, A, texture2d<T, A>, false, 2, 0, 0, true>;
TEMPLATE using sampler2DArrayAtomic = _mtl_sampler<T, A, texture2d<T, A>, false, 2, 0, 1, true>;
TEMPLATE using sampler3DAtomic = _mtl_sampler<T, A, texture2d<T, A>, false, 3, 0, 0, true>;
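/* For illustration, reading the template arguments above: `sampler2DArray<float>` has Dim = 2,
* Cube = 0, Array = 1, so its FltCoord/IntCoord are float3/int3 (uv + layer), SizeVec is int3 and
* DataVec is float4; `samplerCube<float>` (Dim = 2, Cube = 1, Array = 0) samples with a float3
* direction instead. */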
/* Used by backend to declare the samplers. Could be removed. */
TEMPLATE using _mtl_sampler_depth_2d = depth2D<T, A>;
TEMPLATE using _mtl_sampler_depth_2d_array = depth2DArray<T, A>;
TEMPLATE using _mtl_sampler_depth_cube = depthCube<T, A>;
TEMPLATE using _mtl_sampler_depth_cube_array = depthCubeArray<T, A>;
TEMPLATE using _mtl_sampler_1d = sampler1D<T, A>;
TEMPLATE using _mtl_sampler_1d_array = sampler1DArray<T, A>;
TEMPLATE using _mtl_sampler_2d = sampler2D<T, A>;
TEMPLATE using _mtl_sampler_2d_array = sampler2DArray<T, A>;
TEMPLATE using _mtl_sampler_3d = sampler3D<T, A>;
TEMPLATE using _mtl_sampler_buffer = samplerBuffer<T, A>;
TEMPLATE using _mtl_sampler_cube = samplerCube<T, A>;
TEMPLATE using _mtl_sampler_cube_array = samplerCubeArray<T, A>;
TEMPLATE using _mtl_sampler_2d_atomic = sampler2DAtomic<T, A>;
TEMPLATE using _mtl_sampler_2d_array_atomic = sampler2DArrayAtomic<T, A>;
TEMPLATE using _mtl_sampler_3d_atomic = sampler3DAtomic<T, A>;
#undef TEMPLATE
/* Variant for 1D samplers. Discard the lod. */
template<typename T, access A>
typename sampler1D<T, A>::DataVec texelFetch(sampler1D<T, A> texture, int coord, int lod = 0)
{
return texture.fetch(coord);
}
/* Variant for 1DArray samplers. Discard the lod. */
template<typename T, access A>
typename sampler1DArray<T, A>::DataVec texelFetch(sampler1DArray<T, A> texture,
int2 coord,
int lod = 0)
{
return texture.fetch(coord);
}
/* Variant for buffer samplers. Discard the lod. */
template<typename T, access A>
typename samplerBuffer<T, A>::DataVec texelFetch(samplerBuffer<T, A> texture,
int coord,
int lod = 0)
{
uint texel = uint(coord);
if (texel < texture.texture->get_width()) {
return texture.texture->read(texel);
}
return typename samplerBuffer<T, A>::DataVec(0);
}
/* Sampler struct for argument buffer. */
#ifdef USE_ARGUMENT_BUFFER_FOR_SAMPLERS
@@ -283,51 +742,51 @@ struct SStruct {
#endif
/* Samplers as function parameters. */
#define sampler1D thread _mtl_combined_image_sampler_1d<float>
#define sampler1DArray thread _mtl_combined_image_sampler_1d_array<float>
#define sampler2D thread _mtl_combined_image_sampler_2d<float>
#define depth2D thread _mtl_combined_image_sampler_depth_2d<float>
#define sampler2DArray thread _mtl_combined_image_sampler_2d_array<float>
#define sampler2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<float>
#define depth2DArray thread _mtl_combined_image_sampler_depth_2d_array<float>
#define depth2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<float>
#define sampler3D thread _mtl_combined_image_sampler_3d<float>
#define samplerBuffer thread _mtl_combined_image_sampler_buffer<float, access::read>
#define samplerCube thread _mtl_combined_image_sampler_cube<float>
#define samplerCubeArray thread _mtl_combined_image_sampler_cube_array<float>
#define sampler1D thread _mtl_sampler_1d<float>
#define sampler1DArray thread _mtl_sampler_1d_array<float>
#define sampler2D thread _mtl_sampler_2d<float>
#define depth2D thread _mtl_sampler_depth_2d<float>
#define sampler2DArray thread _mtl_sampler_2d_array<float>
#define sampler2DArrayShadow thread _mtl_sampler_depth_2d_array<float>
#define depth2DArray thread _mtl_sampler_depth_2d_array<float>
#define depth2DArrayShadow thread _mtl_sampler_depth_2d_array<float>
#define sampler3D thread _mtl_sampler_3d<float>
#define samplerBuffer thread _mtl_sampler_buffer<float, access::read>
#define samplerCube thread _mtl_sampler_cube<float>
#define samplerCubeArray thread _mtl_sampler_cube_array<float>
#define usampler1D thread _mtl_combined_image_sampler_1d<uint>
#define usampler1DArray thread _mtl_combined_image_sampler_1d_array<uint>
#define usampler2D thread _mtl_combined_image_sampler_2d<uint>
#define udepth2D thread _mtl_combined_image_sampler_depth_2d<uint>
#define usampler2DArray thread _mtl_combined_image_sampler_2d_array<uint>
#define usampler2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<uint>
#define udepth2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<uint>
#define usampler3D thread _mtl_combined_image_sampler_3d<uint>
#define usamplerBuffer thread _mtl_combined_image_sampler_buffer<uint, access::read>
#define usamplerCube thread _mtl_combined_image_sampler_cube<uint>
#define usamplerCubeArray thread _mtl_combined_image_sampler_cube_array<uint>
#define usampler1D thread _mtl_sampler_1d<uint>
#define usampler1DArray thread _mtl_sampler_1d_array<uint>
#define usampler2D thread _mtl_sampler_2d<uint>
#define udepth2D thread _mtl_sampler_depth_2d<uint>
#define usampler2DArray thread _mtl_sampler_2d_array<uint>
#define usampler2DArrayShadow thread _mtl_sampler_depth_2d_array<uint>
#define udepth2DArrayShadow thread _mtl_sampler_depth_2d_array<uint>
#define usampler3D thread _mtl_sampler_3d<uint>
#define usamplerBuffer thread _mtl_sampler_buffer<uint, access::read>
#define usamplerCube thread _mtl_sampler_cube<uint>
#define usamplerCubeArray thread _mtl_sampler_cube_array<uint>
#define isampler1D thread _mtl_combined_image_sampler_1d<int>
#define isampler1DArray thread _mtl_combined_image_sampler_1d_array<int>
#define isampler2D thread _mtl_combined_image_sampler_2d<int>
#define idepth2D thread _mtl_combined_image_sampler_depth_2d<int>
#define isampler2DArray thread _mtl_combined_image_sampler_2d_array<int>
#define isampler2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<int>
#define idepth2DArrayShadow thread _mtl_combined_image_sampler_depth_2d_array<int>
#define isampler3D thread _mtl_combined_image_sampler_3d<int>
#define isamplerBuffer thread _mtl_combined_image_sampler_buffer<int, access::read>
#define isamplerCube thread _mtl_combined_image_sampler_cube<int>
#define isamplerCubeArray thread _mtl_combined_image_sampler_cube_array<int>
#define isampler1D thread _mtl_sampler_1d<int>
#define isampler1DArray thread _mtl_sampler_1d_array<int>
#define isampler2D thread _mtl_sampler_2d<int>
#define idepth2D thread _mtl_sampler_depth_2d<int>
#define isampler2DArray thread _mtl_sampler_2d_array<int>
#define isampler2DArrayShadow thread _mtl_sampler_depth_2d_array<int>
#define idepth2DArrayShadow thread _mtl_sampler_depth_2d_array<int>
#define isampler3D thread _mtl_sampler_3d<int>
#define isamplerBuffer thread _mtl_sampler_buffer<int, access::read>
#define isamplerCube thread _mtl_sampler_cube<int>
#define isamplerCubeArray thread _mtl_sampler_cube_array<int>
#ifndef MTL_SUPPORTS_TEXTURE_ATOMICS
/* If texture atomics are unsupported, map atomic types to internal atomic fallback type. */
# define usampler2DArrayAtomic _mtl_combined_image_sampler_2d_array_atomic_fallback<uint>
# define usampler2DAtomic _mtl_combined_image_sampler_2d_atomic_fallback<uint>
# define usampler3DAtomic _mtl_combined_image_sampler_3d_atomic_fallback<uint>
# define isampler2DArrayAtomic _mtl_combined_image_sampler_2d_array_atomic_fallback<int>
# define isampler2DAtomic _mtl_combined_image_sampler_2d_atomic_fallback<int>
# define isampler3DAtomic _mtl_combined_image_sampler_3d_atomic_fallback<int>
# define usampler2DArrayAtomic _mtl_sampler_2d_array_atomic<uint>
# define usampler2DAtomic _mtl_sampler_2d_atomic<uint>
# define usampler3DAtomic _mtl_sampler_3d_atomic<uint>
# define isampler2DArrayAtomic _mtl_sampler_2d_array_atomic<int>
# define isampler2DAtomic _mtl_sampler_2d_atomic<int>
# define isampler3DAtomic _mtl_sampler_3d_atomic<int>
#else
# define usampler2DArrayAtomic usampler2DArray
# define usampler2DAtomic usampler2D
@@ -340,1544 +799,32 @@ struct SStruct {
/* Vector accessor aliases. */
#define st xy
/* Texture functions. */
#define texelFetch _texelFetch_internal
#define texelFetchOffset(__tex, __texel, __lod, __offset) \
_texelFetch_internal(__tex, __texel, __lod, __offset)
#define imageLoad(__image, __coord) _texelFetch_internal(__image, __coord, 0)
#define imageLoadFast(__image, __coord) _texelFetch_internal_fast(__image, __coord, 0)
#define texture2(__tex, __uv) _texture_internal_samp(__tex, __uv)
#define texture3(__tex, __uv, _bias) _texture_internal_bias(__tex, __uv, bias(float(_bias)))
#define textureLod(__tex, __uv, __lod) _texture_internal_level(__tex, __uv, level(float(__lod)))
#define textureLodOffset(__tex, __uv, __lod, __offset) \
_texture_internal_level(__tex, __uv, level(float(__lod)), __offset)
#define textureGather2(__tex, __uv) _texture_gather_internal(__tex, __uv, 0)
#define textureGather3(__tex, __uv, __comp) _texture_gather_internal(__tex, __uv, __comp)
#define textureGatherOffset(__tex, __offset, __uv, __comp) \
_texture_gather_internal(__tex, __uv, __comp, __offset)
#define textureGrad(__tex, __uv, __dpdx, __dpdy) \
_texture_grad_internal(__tex, __uv, __dpdx, __dpdy)
#define TEXURE_MACRO(_1, _2, _3, TEXNAME, ...) TEXNAME
#define texture(...) TEXURE_MACRO(__VA_ARGS__, texture3, texture2)(__VA_ARGS__)
#define textureGather(...) TEXURE_MACRO(__VA_ARGS__, textureGather3, textureGather2)(__VA_ARGS__)
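/* Expansion sketch (illustrative): the variadic TEXURE_MACRO selects the overload by argument
 * count, so a two-argument call maps to `texture2` and a three-argument call maps to `texture3`:
 *   texture(tex, uv)       -> _texture_internal_samp(tex, uv)
 *   texture(tex, uv, 1.5)  -> _texture_internal_bias(tex, uv, bias(float(1.5)))
 * `tex` and `uv` here are hypothetical sampler and coordinate arguments. */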
/* Texture-write functions. */
#define imageStore(_tex, _coord, _value) _texture_write_internal(_tex, _coord, _value)
#define imageStoreFast(_tex, _coord, _value) _texture_write_internal_fast(_tex, _coord, _value)
/* Texture synchronization functions. */
#define imageFence(image) image.texture->fence()
/* Singular return values from texture functions of type DEPTH are often indexed with either .r or
* .x. This is a lightweight wrapper type for handling this syntax. */
union _msl_return_float {
float r;
float x;
inline operator float() const
{
return r;
}
};
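/* Usage sketch (illustrative): both GLSL spellings below resolve through the union members, so
 * depth reads keep working with either accessor:
 *   float d = texelFetch(depth_tx, texel, 0).r;
 *   float s = texture(shadow_tx, uv).x;
 * `depth_tx`, `shadow_tx`, `texel` and `uv` are hypothetical arguments. */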
/* Add custom texture sampling/reading routines for each type to account for special return cases,
 * e.g. returning a float exposing an `.r` accessor.
 * NOTE: Cannot use template specialization for the input type, as return types are specific to
 * the signature of `tex`. */
/* Use point sampler instead of texture read to benefit from texture caching and reduce branching
* through removal of bounds tests, as these are handled by the sample operation. */
constexpr sampler _point_sample_(address::clamp_to_zero, filter::nearest, coord::pixel);
/* Texture Read via point sampling.
* NOTE: These templates will evaluate first for texture resources bound with sample. */
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, access::sample> tex,
T texel,
uint lod = 0)
{
return tex.texture->sample(_point_sample_, float(texel));
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, access::sample> tex,
T texel,
uint lod,
T offset)
{
return tex.texture->sample(_point_sample_, float(texel + offset));
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(
thread _mtl_combined_image_sampler_1d_array<S, access::sample> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0, 0))
{
return tex.texture->sample(_point_sample_, float(texel.x + offset.x), uint(texel.y + offset.y));
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d<S, access::sample> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0))
{
return tex.texture->sample(_point_sample_, float2(texel.xy + offset.xy), level(lod));
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(
thread _mtl_combined_image_sampler_2d_array<S, access::sample> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
return tex.texture->sample(
_point_sample_, float2(texel.xy + offset.xy), uint(texel.z + offset.z), level(lod));
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_3d<S, access::sample> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
return tex.texture->sample(_point_sample_, float3(texel.xyz + offset.xyz), level(lod));
}
template<typename T>
inline _msl_return_float _texelFetch_internal(
thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0))
{
_msl_return_float fl = {
tex.texture->sample(_point_sample_, float2(texel.xy + offset.xy), level(lod))};
return fl;
}
template<typename S, typename T>
inline vec<S, 4> _texture_internal_samp(
thread _mtl_combined_image_sampler_2d_array<S, access::sample> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
return tex.texture->sample(
_point_sample_, float2(texel.xy + offset.xy), uint(texel.z + offset.z), level(lod));
}
/* Texture Read via read operation. Required by compute/image-bindings. */
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
T texel,
uint lod = 0)
{
float w = tex.texture->get_width();
if (texel >= 0 && texel < w) {
return tex.texture->read(uint(texel));
}
else {
return vec<S, 4>(0);
}
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_1d<S, A> tex,
T texel,
uint lod = 0)
{
return tex.texture->read(uint(texel));
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(
const thread _mtl_combined_image_sampler_buffer<S, access::read> tex, T texel, uint lod = 0)
{
float w = tex.texture->get_width();
if (texel >= 0 && texel < w) {
return tex.texture->read(uint(texel));
}
else {
return vec<S, 4>(0);
}
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal_fast(
const thread _mtl_combined_image_sampler_buffer<S, access::read> tex, T texel, uint lod = 0)
{
return tex.texture->read(uint(texel));
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
T texel,
uint lod,
T offset)
{
float w = tex.texture->get_width();
if ((texel + offset) >= 0 && (texel + offset) < w) {
/* LODs not supported for 1d textures. This must be zero. */
return tex.texture->read(uint(texel + offset), 0);
}
else {
return vec<S, 4>(0);
}
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_1d<S, A> tex,
T texel,
uint lod,
T offset)
{
/* LODs not supported for 1d textures. This must be zero. */
return tex.texture->read(uint(texel + offset), 0);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0, 0))
{
float w = tex.texture->get_width();
float h = tex.texture->get_array_size();
if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
(texel.y + offset.y) < h)
{
/* LODs not supported for 1d textures. This must be zero. */
return tex.texture->read(uint(texel.x + offset.x), uint(texel.y + offset.y), 0);
}
else {
return vec<S, 4>(0);
}
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0, 0))
{
/* LODs not supported for 1d textures. This must be zero. */
return tex.texture->read(uint(texel.x + offset.x), uint(texel.y + offset.y), 0);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0))
{
float w = tex.texture->get_width() >> lod;
float h = tex.texture->get_height() >> lod;
if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
(texel.y + offset.y) < h)
{
return tex.texture->read(uint2(texel + offset), lod);
}
else {
return vec<S, 4>(0);
}
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_2d<S, A> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0))
{
return tex.texture->read(uint2(texel + offset), lod);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
float w = tex.texture->get_width() >> lod;
float h = tex.texture->get_height() >> lod;
float d = tex.texture->get_array_size();
if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
(texel.y + offset.y) < h && (texel.z + offset.z) >= 0 && (texel.z + offset.z) < d)
{
return tex.texture->read(uint2(texel.xy + offset.xy), uint(texel.z + offset.z), lod);
}
else {
return vec<S, 4>(0);
}
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
return tex.texture->read(uint2(texel.xy + offset.xy), uint(texel.z + offset.z), lod);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
float w = tex.texture->get_width() >> lod;
float h = tex.texture->get_height() >> lod;
float d = tex.texture->get_depth() >> lod;
if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
(texel.y + offset.y) < h && (texel.z + offset.z) >= 0 && (texel.z + offset.z) < d)
{
return tex.texture->read(uint3(texel + offset), lod);
}
else {
return vec<S, 4>(0);
}
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_3d<S, A> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
return tex.texture->read(uint3(texel + offset), lod);
}
template<typename T, access A>
inline _msl_return_float _texelFetch_internal(
thread _mtl_combined_image_sampler_depth_2d<float, A> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0))
{
float w = tex.texture->get_width() >> lod;
float h = tex.texture->get_height() >> lod;
if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
(texel.y + offset.y) < h)
{
_msl_return_float fl = {tex.texture->read(uint2(texel + offset), lod)};
return fl;
}
else {
_msl_return_float fl = {0};
return fl;
}
}
template<typename T, access A>
inline _msl_return_float _texelFetch_internal_fast(
thread _mtl_combined_image_sampler_depth_2d<float, A> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0))
{
_msl_return_float fl = {tex.texture->read(uint2(texel + offset), lod)};
return fl;
}
template<typename S, typename T, access A>
inline vec<S, 4> _texture_internal_samp(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
float w = tex.texture->get_width() >> lod;
float h = tex.texture->get_height() >> lod;
float d = tex.texture->get_array_size();
if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
(texel.y + offset.y) < h && (texel.z + offset.z) >= 0 && (texel.z + offset.z) < d)
{
return tex.texture->read(uint2(texel.xy + offset.xy), uint(texel.z + offset.z), lod);
}
else {
return vec<S, 4>(0);
}
}
/* Sample. */
template<typename T>
inline vec<T, 4> _texture_internal_samp(
thread _mtl_combined_image_sampler_1d<T, access::sample> tex, float u)
{
return tex.texture->sample(*tex.samp, u);
}
inline float4 _texture_internal_samp(
thread _mtl_combined_image_sampler_1d_array<float, access::sample> tex, float2 ua)
{
return tex.texture->sample(*tex.samp, ua.x, uint(ua.y));
}
inline int4 _texture_internal_samp(thread _mtl_combined_image_sampler_2d<int, access::sample> tex,
float2 uv)
{
return tex.texture->sample(*tex.samp, uv);
}
inline uint4 _texture_internal_samp(
thread _mtl_combined_image_sampler_2d<uint, access::sample> tex, float2 uv)
{
return tex.texture->sample(*tex.samp, uv);
}
inline float4 _texture_internal_samp(
thread _mtl_combined_image_sampler_2d<float, access::sample> tex, float2 uv)
{
return tex.texture->sample(*tex.samp, uv);
}
inline _msl_return_float _texture_internal_samp(
thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex, float2 uv)
{
_msl_return_float fl = {tex.texture->sample(*tex.samp, uv)};
return fl;
}
template<typename T>
inline vec<T, 4> _texture_internal_samp(
thread _mtl_combined_image_sampler_3d<T, access::sample> tex, float3 uvw)
{
return tex.texture->sample(*tex.samp, uvw);
}
template<typename T>
inline vec<T, 4> _texture_internal_samp(
thread _mtl_combined_image_sampler_2d_array<T, access::sample> tex, float3 uva)
{
return tex.texture->sample(*tex.samp, uva.xy, uint(uva.z));
}
inline _msl_return_float _texture_internal_samp(
thread _mtl_combined_image_sampler_depth_2d_array<float, access::sample> tex, float3 uva)
{
_msl_return_float fl = {tex.texture->sample(*tex.samp, uva.xy, uint(uva.z))};
return fl;
}
inline _msl_return_float _texture_internal_samp(
thread _mtl_combined_image_sampler_depth_2d_array<float, access::sample> tex, float4 uvac)
{
_msl_return_float fl = {
tex.texture->sample_compare(*tex.samp, uvac.xy, uint(uvac.z), uvac.w, level(0))};
return fl;
}
template<typename T>
inline vec<T, 4> _texture_internal_samp(
thread _mtl_combined_image_sampler_cube<T, access::sample> tex, float3 uvs)
{
return tex.texture->sample(*tex.samp, uvs.xyz);
}
template<typename T>
inline vec<T, 4> _texture_internal_samp(
thread _mtl_combined_image_sampler_cube_array<T, access::sample> tex, float4 coord_a)
{
return tex.texture->sample(*tex.samp, coord_a.xyz, uint(coord_a.w));
}
/* Sample Level. */
template<typename T>
inline vec<T, 4> _texture_internal_level(
thread _mtl_combined_image_sampler_1d<T, access::sample> tex,
float u,
level options,
int offset = 0)
{
/* LODs not supported for 1d textures. This must be zero. */
return tex.texture->sample(*tex.samp, u);
}
inline float4 _texture_internal_level(
thread _mtl_combined_image_sampler_1d_array<float, access::sample> tex,
float2 ua,
level options,
int offset = 0)
{
/* LODs not supported for 1d textures. This must be zero. */
return tex.texture->sample(*tex.samp, ua.x, uint(ua.y));
}
inline int4 _texture_internal_level(thread _mtl_combined_image_sampler_2d<int, access::sample> tex,
float2 uv,
level options,
int2 offset = int2(0))
{
return tex.texture->sample(*tex.samp, uv, options, offset);
}
inline uint4 _texture_internal_level(
thread _mtl_combined_image_sampler_2d<uint, access::sample> tex,
float2 uv,
level options,
int2 offset = int2(0))
{
return tex.texture->sample(*tex.samp, uv, options, offset);
}
inline float4 _texture_internal_level(
thread _mtl_combined_image_sampler_2d<float, access::sample> tex,
float2 uv,
level options,
int2 offset = int2(0))
{
return tex.texture->sample(*tex.samp, uv, options, offset);
}
inline _msl_return_float _texture_internal_level(
thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex,
float2 uv,
level options,
int2 offset = int2(0))
{
_msl_return_float fl = {tex.texture->sample(*tex.samp, uv, options, offset)};
return fl;
}
template<typename T>
inline vec<T, 4> _texture_internal_level(
thread _mtl_combined_image_sampler_3d<T, access::sample> tex,
float3 uvw,
level options = level(0),
int3 offset = int3(0))
{
return tex.texture->sample(*tex.samp, uvw, options, offset);
}
template<typename T>
inline vec<T, 4> _texture_internal_level(
thread _mtl_combined_image_sampler_2d_array<T, access::sample> tex,
float3 uva,
level options = level(0),
int2 offset = int2(0))
{
return tex.texture->sample(*tex.samp, uva.xy, uint(uva.z), options, offset);
}
inline _msl_return_float _texture_internal_level(
thread _mtl_combined_image_sampler_depth_2d_array<float, access::sample> tex,
float3 uva,
level options = level(0),
int2 offset = int2(0))
{
_msl_return_float fl = {tex.texture->sample(*tex.samp, uva.xy, uint(uva.z), options, offset)};
return fl;
}
inline _msl_return_float _texture_internal_level(
thread _mtl_combined_image_sampler_depth_2d_array<float, access::sample> tex,
float4 uvac,
level options = level(0),
int2 offset = int2(0))
{
_msl_return_float fl = {
tex.texture->sample_compare(*tex.samp, uvac.xy, uint(uvac.z), uvac.w, level(0), offset)};
return fl;
}
template<typename T>
inline vec<T, 4> _texture_internal_level(
thread _mtl_combined_image_sampler_cube<T, access::sample> tex,
float3 uvs,
level options = level(0),
int2 offset = int2(0))
{
return tex.texture->sample(*tex.samp, uvs.xyz, options);
}
template<typename T>
inline vec<T, 4> _texture_internal_level(
thread _mtl_combined_image_sampler_cube_array<T, access::sample> tex,
float4 coord_a,
level options = level(0),
int3 offset = int3(0))
{
return tex.texture->sample(*tex.samp, coord_a.xyz, uint(coord_a.w), options);
}
/* Sample Bias. */
template<typename T>
inline vec<T, 4> _texture_internal_bias(
thread _mtl_combined_image_sampler_1d<T, access::sample> tex,
float u,
bias options = bias(0.0),
int offset = 0)
{
return tex.texture->sample(*tex.samp, u);
}
inline float4 _texture_internal_bias(
thread _mtl_combined_image_sampler_2d<float, access::sample> tex,
float2 uv,
bias options = bias(0.0),
int2 offset = int2(0))
{
return tex.texture->sample(*tex.samp, uv, options, offset);
}
inline _msl_return_float _texture_internal_bias(
thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex,
float2 uv,
bias options = bias(0),
int2 offset = int2(0))
{
_msl_return_float fl = {tex.texture->sample(*tex.samp, uv, options, offset)};
return fl;
}
/* Texture Gather. */
component int_to_component(const int comp)
{
switch (comp) {
default:
case 0:
return component::x;
case 1:
return component::y;
case 2:
return component::z;
case 3:
return component::w;
}
return component::x;
}
inline float4 _texture_gather_internal(
thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex,
float2 uv,
const int comp = 0,
int2 offset = int2(0))
{
return tex.texture->gather(*tex.samp, uv, offset);
}
inline float4 _texture_gather_internal(
thread _mtl_combined_image_sampler_depth_2d_array<float, access::sample> tex,
float3 uva,
const int comp = 0,
int2 offset = int2(0))
{
return tex.texture->gather(*tex.samp, uva.xy, uint(uva.z), offset);
}
template<typename T>
inline vec<T, 4> _texture_gather_internal(
thread _mtl_combined_image_sampler_2d<T, access::sample> tex,
float2 uv,
const int comp = 0,
int2 offset = int2(0))
{
return tex.texture->gather(*tex.samp, uv, offset);
}
template<typename T>
inline vec<T, 4> _texture_gather_internal(
thread _mtl_combined_image_sampler_2d_array<T, access::sample> tex,
float3 uva,
const int comp = 0,
int2 offset = int2(0))
{
return tex.texture->gather(*tex.samp, uva.xy, uint(uva.z), offset);
}
/* Texture Grad. */
inline float4 _texture_grad_internal(
thread _mtl_combined_image_sampler_2d<float, access::sample> tex,
float2 uv,
float2 dpdx,
float2 dpdy)
{
return tex.texture->sample(*tex.samp, uv, gradient2d(dpdx, dpdy));
}
inline float4 _texture_grad_internal(
thread _mtl_combined_image_sampler_2d_array<float, access::sample> tex,
float3 uva,
float2 dpdx,
float2 dpdy)
{
return tex.texture->sample(*tex.samp, uva.xy, uint(uva.z), gradient2d(dpdx, dpdy));
}
inline float4 _texture_grad_internal(
thread _mtl_combined_image_sampler_3d<float, access::sample> tex,
float3 uvw,
float3 dpdx,
float3 dpdy)
{
return tex.texture->sample(*tex.samp, uvw, gradient3d(dpdx, dpdy));
}
/* Texture write support. */
template<typename S, typename T, access A>
inline void _texture_write_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
T _coord,
vec<S, 4> value)
{
float w = tex.texture->get_width();
if (_coord >= 0 && _coord < w) {
tex.texture->write(value, uint(_coord));
}
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(thread _mtl_combined_image_sampler_1d<S, A> tex,
T _coord,
vec<S, 4> value)
{
tex.texture->write(value, uint(_coord));
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(thread _mtl_combined_image_sampler_1d<S, A> tex,
T _coord,
S value)
{
tex.texture->write(value, uint(_coord));
}
template<typename S, typename T, access A>
inline void _texture_write_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
T _coord,
vec<S, 4> value)
{
float w = tex.texture->get_width();
float h = tex.texture->get_height();
if (_coord.x >= 0 && _coord.x < w && _coord.y >= 0 && _coord.y < h) {
tex.texture->write(value, uint2(_coord.xy));
}
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(thread _mtl_combined_image_sampler_2d<S, A> tex,
T _coord,
vec<S, 4> value)
{
tex.texture->write(value, uint2(_coord.xy));
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(thread _mtl_combined_image_sampler_2d<S, A> tex,
T _coord,
S value)
{
tex.texture->write(value, uint2(_coord.xy));
}
template<typename S, typename T, access A>
inline void _texture_write_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
T _coord,
vec<S, 4> value)
{
float w = tex.texture->get_width();
float h = tex.texture->get_height();
float d = tex.texture->get_array_size();
if (_coord.x >= 0 && _coord.x < w && _coord.y >= 0 && _coord.y < h && _coord.z >= 0 &&
_coord.z < d)
{
tex.texture->write(value, uint2(_coord.xy), _coord.z);
}
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
T _coord,
vec<S, 4> value)
{
tex.texture->write(value, uint2(_coord.xy), _coord.z);
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
T _coord,
S value)
{
tex.texture->write(value, uint2(_coord.xy), _coord.z);
}
template<typename S, typename T, access A>
inline void _texture_write_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
T _coord,
vec<S, 4> value)
{
float w = tex.texture->get_width();
float h = tex.texture->get_height();
float d = tex.texture->get_depth();
if (_coord.x >= 0 && _coord.x < w && _coord.y >= 0 && _coord.y < h && _coord.z >= 0 &&
_coord.z < d)
{
tex.texture->write(value, uint3(_coord.xyz));
}
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(thread _mtl_combined_image_sampler_3d<S, A> tex,
T _coord,
vec<S, 4> value)
{
tex.texture->write(value, uint3(_coord.xyz));
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(thread _mtl_combined_image_sampler_3d<S, A> tex,
T _coord,
S value)
{
tex.texture->write(value, uint3(_coord.xyz));
}
/* Texture atomic operations are only supported in Metal 3.1 and onward (macOS 14.0 Sonoma). */
#ifdef MTL_SUPPORTS_TEXTURE_ATOMICS
/* Image atomic operations. */
# define imageAtomicMin(tex, coord, data) _texture_image_atomic_min_internal(tex, coord, data)
# define imageAtomicAdd(tex, coord, data) _texture_image_atomic_add_internal(tex, coord, data)
# define imageAtomicExchange(tex, coord, data) \
_texture_image_atomic_exchange_internal(tex, coord, data)
# define imageAtomicXor(tex, coord, data) _texture_image_atomic_xor_internal(tex, coord, data)
# define imageAtomicOr(tex, coord, data) _texture_image_atomic_or_internal(tex, coord, data)
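/* Usage sketch (illustrative): the GLSL entry points map one to one onto the templated wrappers
 * below. For a hypothetical unsigned integer image binding `counter_img` and coordinate `texel`:
 *   uint prev = imageAtomicAdd(counter_img, texel, 1u);
 * The returned value is the texel content before the atomic operation was applied. */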
/* Atomic OR. */
template<typename S, access A>
S _texture_image_atomic_or_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
int coord,
S data)
{
return tex.texture->atomic_fetch_or(uint(coord), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_or_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_fetch_or(uint(coord.x), uint(coord.y), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_or_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_fetch_or(uint2(coord.xy), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_or_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_fetch_or(uint2(coord.xy), uint(coord.z), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_or_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_fetch_or(uint3(coord), vec<S, 4>(data)).x;
}
/* Atomic XOR. */
template<typename S, access A>
S _texture_image_atomic_xor_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
int coord,
S data)
{
return tex.texture->atomic_fetch_xor(uint(coord), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_xor_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_fetch_xor(uint(coord.x), uint(coord.y), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_xor_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_fetch_xor(uint2(coord.xy), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_xor_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_fetch_xor(uint2(coord.xy), uint(coord.z), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_xor_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_fetch_xor(uint3(coord), vec<S, 4>(data)).x;
}
/* Atomic Min. */
template<typename S, access A>
S _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
int coord,
S data)
{
return tex.texture->atomic_fetch_min(uint(coord), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_fetch_min(uint(coord.x), uint(coord.y), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_fetch_min(uint2(coord.xy), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_fetch_min(uint2(coord.xy), uint(coord.z), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_min_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_fetch_min(uint3(coord), vec<S, 4>(data)).x;
}
/* Atomic Add. */
template<typename S, access A>
S _texture_image_atomic_add_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
int coord,
S data)
{
return tex.texture->atomic_fetch_add(uint(coord), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_add_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_fetch_add(uint(coord.x), uint(coord.y), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_add_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_fetch_add(uint2(coord.xy), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_add_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_fetch_add(uint2(coord.xy), uint(coord.z), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_add_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_fetch_add(uint3(coord), vec<S, 4>(data)).x;
}
/* Atomic Exchange. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
int coord,
S data)
{
return tex.texture->atomic_exchange(uint(coord), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_exchange(uint(coord.x), uint(coord.y), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
int2 coord,
S data)
{
return tex.texture->atomic_exchange(uint2(coord.xy), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_exchange(uint2(coord.xy), uint(coord.z), vec<S, 4>(data)).x;
}
template<typename S, access A>
S _texture_image_atomic_exchange_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
int3 coord,
S data)
{
return tex.texture->atomic_exchange(uint3(coord), vec<S, 4>(data)).x;
}
#else
/**
* Texture atomic fallback function entry points.
* NOTE: When texture atomics are unsupported, the wrapped type contains a buffer-backed 2D
* texture. Atomic operations happen directly on the underlying buffer, and texture coordinates are
* remapped into 2D texture space from 2D Array or 3D texture coordinates.
*/
/* Image atomic operations. */
# define imageAtomicMin(tex, coord, data) \
_texture_image_atomic_min_internal_fallback(tex, coord, data)
# define imageAtomicAdd(tex, coord, data) \
_texture_image_atomic_add_internal_fallack(tex, coord, data)
# define imageAtomicExchange(tex, coord, data) \
_texture_image_atomic_exchange_internal_fallack(tex, coord, data)
# define imageAtomicXor(tex, coord, data) \
_texture_image_atomic_xor_internal_fallack(tex, coord, data)
# define imageAtomicOr(tex, coord, data) \
_texture_image_atomic_or_internal_fallack(tex, coord, data)
/** Pixel address location remapping. */
/* Map 2D/3D texture coordinate into a linear pixel ID. */
template<typename S, access A>
uint tex_coord_to_linear_px(thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex,
uint2 coord)
{
return (coord.x + coord.y * uint(tex.texture->get_width()));
}
template<typename S, access A>
uint tex_coord_to_linear_px(thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex,
int2 coord)
{
return tex_coord_to_linear_px(tex, uint2(coord));
}
template<typename S, access A>
uint tex_coord_to_linear_px(thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex,
uint3 coord)
{
return (coord.x + coord.y * tex.texture_size.x +
coord.z * (tex.texture_size.x * tex.texture_size.y));
}
template<typename S, access A>
uint tex_coord_to_linear_px(thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex,
int3 coord)
{
return tex_coord_to_linear_px(tex, uint3(coord));
}
template<typename S, access A>
uint tex_coord_to_linear_px(thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex,
uint3 coord)
{
return (coord.x + coord.y * tex.texture_size.x +
coord.z * (tex.texture_size.x * tex.texture_size.y));
}
template<typename S, access A>
uint tex_coord_to_linear_px(thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex,
int3 coord)
{
return tex_coord_to_linear_px(tex, uint3(coord));
}
/* Map 3D texture coordinate into 2D texture space. */
template<typename S, access A>
uint2 tex_coord_3d_to_2d(thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex,
uint3 coord)
{
uint linear_id = tex_coord_to_linear_px(tex, coord);
uint tex_full_w = uint(tex.texture->get_width());
uint2 out_2dcoord;
out_2dcoord.y = linear_id / tex_full_w;
out_2dcoord.x = linear_id - (out_2dcoord.y * tex_full_w);
return out_2dcoord;
}
template<typename S, access A>
uint2 tex_coord_3d_to_2d(thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex,
uint3 coord)
{
uint linear_id = tex_coord_to_linear_px(tex, coord);
uint tex_full_w = uint(tex.texture->get_width());
uint2 out_2dcoord;
out_2dcoord.y = linear_id / tex_full_w;
out_2dcoord.x = linear_id - (out_2dcoord.y * tex_full_w);
return out_2dcoord;
}
template<int N> bool in_range(vec<int, N> value, vec<int, N> min, vec<int, N> max)
{
return (all(value >= min) && all(value < max));
}
/* Map 2D/3D texture coordinate into buffer index, accounting for padded row widths. */
template<typename S, access A>
uint tex_coord_to_linear_buffer_id(thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex,
uint2 coord)
{
return (coord.x + coord.y * uint(tex.aligned_width));
}
template<typename S, access A>
uint tex_coord_to_linear_buffer_id(thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex,
int2 coord)
{
return tex_coord_to_linear_buffer_id(tex, uint2(coord));
}
template<typename S, access A>
uint tex_coord_to_linear_buffer_id(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex, uint3 coord)
{
uint2 coord2d = tex_coord_3d_to_2d(tex, coord);
return (coord2d.x + coord2d.y * uint(tex.aligned_width));
}
template<typename S, access A>
uint tex_coord_to_linear_buffer_id(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex, int3 coord)
{
return tex_coord_to_linear_buffer_id(tex, uint3(coord));
}
template<typename S, access A>
uint tex_coord_to_linear_buffer_id(thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex,
uint3 coord)
{
uint2 coord2d = tex_coord_3d_to_2d(tex, coord);
return (coord2d.x + coord2d.y * uint(tex.aligned_width));
}
template<typename S, access A>
uint tex_coord_to_linear_buffer_id(thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex,
int3 coord)
{
return tex_coord_to_linear_buffer_id(tex, uint3(coord));
}
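/* Worked sketch (illustrative, hypothetical sizes): for a fallback 2D array texture with
 * `texture_size = (4, 4, 2)`, a backing 2D texture of width 16 and `aligned_width = 16`,
 * the coordinate (1, 2, 1) remaps as follows:
 *   linear pixel id = 1 + 2 * 4 + 1 * (4 * 4)  = 25
 *   2D coordinate   = (25 % 16, 25 / 16)       = (9, 1)
 *   buffer index    = 9 + 1 * aligned_width    = 25
 * With a padded `aligned_width` larger than the backing texture width, the buffer index diverges
 * from the linear pixel id, which is why the two mappings are kept separate. */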
/* imageAtomicMin. */
template<typename S, access A>
S _texture_image_atomic_min_internal_fallback(
thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex, int2 coord, S data)
{
if (!in_range(coord.xy, int2(0, 0), int2(tex.texture->get_width(0), tex.texture->get_height(0))))
{
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicMin(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_min_internal_fallback(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicMin(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_min_internal_fallback(
thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicMin(tex.buffer[linear_id], data);
}
/* imageAtomicAdd. */
template<typename S, access A>
S _texture_image_atomic_add_internal_fallack(
thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex, int2 coord, S data)
{
if (!in_range(coord.xy, int2(0, 0), int2(tex.texture->get_width(0), tex.texture->get_height(0))))
{
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicAdd(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_add_internal_fallack(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicAdd(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_add_internal_fallack(
thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicAdd(tex.buffer[linear_id], data);
}
/* imageAtomicExchange. */
template<typename S, access A>
S _texture_image_atomic_exchange_internal_fallack(
thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex, int2 coord, S data)
{
if (!in_range(coord.xy, int2(0, 0), int2(tex.texture->get_width(0), tex.texture->get_height(0))))
{
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicExchange(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_exchange_internal_fallack(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicExchange(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_exchange_internal_fallack(
thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicExchange(tex.buffer[linear_id], data);
}
/* imageAtomicXor. */
template<typename S, access A>
S _texture_image_atomic_xor_internal_fallack(
thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex, int2 coord, S data)
{
if (!in_range(coord.xy, int2(0, 0), int2(tex.texture->get_width(0), tex.texture->get_height(0))))
{
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicXor(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_xor_internal_fallack(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicXor(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_xor_internal_fallack(
thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicXor(tex.buffer[linear_id], data);
}
/* imageAtomicOr. */
template<typename S, access A>
S _texture_image_atomic_or_internal_fallack(
thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex, int2 coord, S data)
{
if (!in_range(coord.xy, int2(0, 0), int2(tex.texture->get_width(0), tex.texture->get_height(0))))
{
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicOr(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_or_internal_fallack(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicOr(tex.buffer[linear_id], data);
}
template<typename S, access A>
S _texture_image_atomic_or_internal_fallack(
thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex, int3 coord, S data)
{
if (!in_range(coord.xyz, int3(0, 0, 0), int3(tex.texture_size.xyz))) {
return S(0);
}
uint linear_id = tex_coord_to_linear_buffer_id(tex, coord);
return atomicOr(tex.buffer[linear_id], data);
}
/** Texture sampling, reading and writing functions with layer mapping. */
/* Texel Fetch. */
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(
thread _mtl_combined_image_sampler_2d_atomic_fallback<S, access::sample> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0))
{
return tex.texture->sample(_point_sample_, float2(texel.xy + offset.xy), level(lod));
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, access::sample> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
return tex.texture->sample(
_point_sample_, float2(tex_coord_3d_to_2d(tex, uint3(texel + offset))), level(lod));
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(
thread _mtl_combined_image_sampler_3d_atomic_fallback<S, access::sample> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
return tex.texture->sample(
_point_sample_, float2(tex_coord_3d_to_2d(tex, uint3(texel + offset))), level(lod));
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(
thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex,
vec<T, 2> texel,
uint lod,
vec<T, 2> offset = vec<T, 2>(0))
{
float w = tex.texture->get_width() >> lod;
float h = tex.texture->get_height() >> lod;
if ((texel.x + offset.x) >= 0 && (texel.x + offset.x) < w && (texel.y + offset.y) >= 0 &&
(texel.y + offset.y) < h)
{
return tex.texture->read(uint2(texel + offset), lod);
}
else {
return vec<S, 4>(0);
}
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
return tex.texture->read(tex_coord_3d_to_2d(tex, uint3(texel + offset)), lod);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(
thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex,
vec<T, 3> texel,
uint lod,
vec<T, 3> offset = vec<T, 3>(0))
{
return tex.texture->read(tex_coord_3d_to_2d(tex, uint3(texel + offset)), lod);
}
/* imageStore. */
template<typename S, typename T, access A>
inline void _texture_write_internal(
thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex, T _coord, vec<S, 4> value)
{
float w = tex.texture_size.x;
float h = tex.texture_size.y;
if (_coord.x >= 0 && _coord.x < w && _coord.y >= 0 && _coord.y < h) {
tex.texture->write(value, uint2(_coord.xy));
}
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(
thread _mtl_combined_image_sampler_2d_atomic_fallback<S, A> tex, T _coord, vec<S, 4> value)
{
tex.texture->write(value, uint2(_coord.xy));
}
template<typename S, typename T, access A>
inline void _texture_write_internal(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex,
T _coord,
vec<S, 4> value)
{
float w = tex.texture_size.x;
float h = tex.texture_size.y;
float d = tex.texture_size.z;
if (_coord.x >= 0 && _coord.x < w && _coord.y >= 0 && _coord.y < h && _coord.z >= 0 &&
_coord.z < d)
{
tex.texture->write(value, tex_coord_3d_to_2d(tex, uint3(_coord)));
}
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(
thread _mtl_combined_image_sampler_2d_array_atomic_fallback<S, A> tex,
T _coord,
vec<S, 4> value)
{
tex.texture->write(value, tex_coord_3d_to_2d(tex, uint3(_coord)));
}
template<typename S, typename T, access A>
inline void _texture_write_internal(
thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex, T _coord, vec<S, 4> value)
{
float w = tex.texture_size.x;
float h = tex.texture_size.y;
float d = tex.texture_size.z;
if (_coord.x >= 0 && _coord.x < w && _coord.y >= 0 && _coord.y < h && _coord.z >= 0 &&
_coord.z < d)
{
tex.texture->write(value, tex_coord_3d_to_2d(tex, uint3(_coord)));
}
}
template<typename S, typename T, access A>
inline void _texture_write_internal_fast(
thread _mtl_combined_image_sampler_3d_atomic_fallback<S, A> tex, T _coord, vec<S, 4> value)
{
tex.texture->write(value, tex_coord_3d_to_2d(tex, uint3(_coord)));
}
#endif
/* Matrix compare operators. */
/** TODO(fclem): Template. */
inline bool operator==(float4x4 a, float4x4 b)
{
for (int i = 0; i < 4; i++) {
if (any(a[i] != b[i])) {
return false;
}
#define EQ_OP(type, ...) \
inline bool operator==(type a, type b) \
{ \
return __VA_ARGS__; \
}
return true;
}
inline bool operator==(float3x3 a, float3x3 b)
{
for (int i = 0; i < 3; i++) {
if (any(a[i] != b[i])) {
return false;
}
}
return true;
}
inline bool operator==(float2x2 a, float2x2 b)
{
for (int i = 0; i < 2; i++) {
if (any(a[i] != b[i])) {
return false;
}
}
return true;
}
EQ_OP(float2x2, all(a[0] == b[0]) && all(a[1] == b[1]))
EQ_OP(float2x3, all(a[0] == b[0]) && all(a[1] == b[1]))
EQ_OP(float2x4, all(a[0] == b[0]) && all(a[1] == b[1]))
EQ_OP(float3x2, all(a[0] == b[0]) && all(a[1] == b[1]) && all(a[2] == b[2]))
EQ_OP(float3x3, all(a[0] == b[0]) && all(a[1] == b[1]) && all(a[2] == b[2]))
EQ_OP(float3x4, all(a[0] == b[0]) && all(a[1] == b[1]) && all(a[2] == b[2]))
EQ_OP(float4x2, all(a[0] == b[0]) && all(a[1] == b[1]) && all(a[2] == b[2]) && all(a[3] == b[3]))
EQ_OP(float4x3, all(a[0] == b[0]) && all(a[1] == b[1]) && all(a[2] == b[2]) && all(a[3] == b[3]))
EQ_OP(float4x4, all(a[0] == b[0]) && all(a[1] == b[1]) && all(a[2] == b[2]) && all(a[3] == b[3]))
#undef EQ_OP
inline bool operator!=(float4x4 a, float4x4 b)
{
return !(a == b);
}
inline bool operator!=(float3x3 a, float3x3 b)
{
return !(a == b);
}
inline bool operator!=(float2x2 a, float2x2 b)
template<int N, int M> inline bool operator!=(matrix<float, N, M> a, matrix<float, N, M> b)
{
return !(a == b);
}
/* Matrix unary minus operator. */
inline float4x4 operator-(float4x4 a)
template<int N, int M> inline matrix<float, N, M> operator-(matrix<float, N, M> a)
{
float4x4 b;
for (int i = 0; i < 4; i++) {
b[i] = -a[i];
}
return b;
}
inline float3x3 operator-(float3x3 a)
{
float3x3 b;
for (int i = 0; i < 3; i++) {
b[i] = -a[i];
}
return b;
}
inline float2x2 operator-(float2x2 a)
{
float2x2 b;
for (int i = 0; i < 2; i++) {
b[i] = -a[i];
}
return b;
return a * -1.0;
}
/* SSBO Vertex Fetch Mode. */
@@ -1954,115 +901,29 @@ inline float2x2 operator-(float2x2 a)
/* Common Functions. */
#define dFdx(x) dfdx(x)
#define dFdy(x) dfdy(x)
#define mod(x, y) _mtlmod(x, y)
#define discard discard_fragment()
#define inversesqrt rsqrt
inline float radians(float deg)
{
/* Constant factor: M_PI_F/180.0. */
return deg * 0.01745329251f;
}
inline float degrees(float rad)
{
/* Constant factor: 180.0/M_PI_F. */
return rad * 57.2957795131;
}
#define select(A, B, C) mix(A, B, C)
/* Type conversions and type truncation. */
inline float4 to_float4(float3 val)
{
return float4(val, 1.0);
}
/* Type conversions and type truncation (Utility Functions). */
inline float3x3 mat4_to_mat3(float4x4 matrix)
{
return float3x3(matrix[0].xyz, matrix[1].xyz, matrix[2].xyz);
}
inline int floatBitsToInt(float f)
{
return as_type<int>(f);
}
inline int2 floatBitsToInt(float2 f)
{
return as_type<int2>(f);
}
inline int3 floatBitsToInt(float3 f)
{
return as_type<int3>(f);
}
inline int4 floatBitsToInt(float4 f)
{
return as_type<int4>(f);
}
inline uint floatBitsToUint(float f)
{
return as_type<uint>(f);
}
inline uint2 floatBitsToUint(float2 f)
{
return as_type<uint2>(f);
}
inline uint3 floatBitsToUint(float3 f)
{
return as_type<uint3>(f);
}
inline uint4 floatBitsToUint(float4 f)
{
return as_type<uint4>(f);
}
inline float intBitsToFloat(int f)
{
return as_type<float>(f);
}
inline float2 intBitsToFloat(int2 f)
{
return as_type<float2>(f);
}
inline float3 intBitsToFloat(int3 f)
{
return as_type<float3>(f);
}
inline float4 intBitsToFloat(int4 f)
{
return as_type<float4>(f);
}
inline float uintBitsToFloat(uint f)
{
return as_type<float>(f);
}
inline float2 uintBitsToFloat(uint2 f)
{
return as_type<float2>(f);
}
inline float3 uintBitsToFloat(uint3 f)
{
return as_type<float3>(f);
}
inline float4 uintBitsToFloat(uint4 f)
{
return as_type<float4>(f);
}
/* clang-format off */
inline float radians(float deg) { return deg * 0.01745329251; /* M_PI_F / 180 */ }
inline float degrees(float rad) { return rad * 57.2957795131; /* 180 / M_PI_F */ }
inline int floatBitsToInt(float f) { return as_type<int>(f); }
inline int2 floatBitsToInt(float2 f) { return as_type<int2>(f); }
inline int3 floatBitsToInt(float3 f) { return as_type<int3>(f); }
inline int4 floatBitsToInt(float4 f) { return as_type<int4>(f); }
inline uint floatBitsToUint(float f) { return as_type<uint>(f); }
inline uint2 floatBitsToUint(float2 f) { return as_type<uint2>(f); }
inline uint3 floatBitsToUint(float3 f) { return as_type<uint3>(f); }
inline uint4 floatBitsToUint(float4 f) { return as_type<uint4>(f); }
inline float intBitsToFloat(int f) { return as_type<float>(f); }
inline float2 intBitsToFloat(int2 f) { return as_type<float2>(f); }
inline float3 intBitsToFloat(int3 f) { return as_type<float3>(f); }
inline float4 intBitsToFloat(int4 f) { return as_type<float4>(f); }
inline float uintBitsToFloat(uint f) { return as_type<float>(f); }
inline float2 uintBitsToFloat(uint2 f) { return as_type<float2>(f); }
inline float3 uintBitsToFloat(uint3 f) { return as_type<float3>(f); }
inline float4 uintBitsToFloat(uint4 f) { return as_type<float4>(f); }
/* clang-format on */
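/* Round-trip sketch (illustrative): the bit-cast pairs are exact inverses, e.g.
 *   uint bits = floatBitsToUint(1.0f);  // 0x3F800000u
 *   float one = uintBitsToFloat(bits);  // 1.0f again, bit for bit
 * which matches the GLSL semantics of reinterpreting the raw IEEE-754 representation. */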
#define bitfieldReverse reverse_bits
#define bitfieldExtract extract_bits
@@ -2093,89 +954,32 @@ template<typename T> int findMSB(T x)
return int(sizeof(T) * 8) - 1 - int(clz(x));
}
#define unpackUnorm4x8 unpack_unorm4x8_to_float
#define unpackSnorm4x8 unpack_snorm4x8_to_float
#define unpackUnorm2x16 unpack_unorm2x16_to_float
#define unpackSnorm2x16 unpack_snorm2x16_to_float
/* Texture size functions. Add texture types as needed. */
#define imageSize(image) textureSize(image, 0)
template<typename T, access A>
int textureSize(thread _mtl_combined_image_sampler_1d<T, A> image, uint lod)
{
return int(image.texture->get_width());
}
template<typename T, access A>
int2 textureSize(thread _mtl_combined_image_sampler_1d_array<T, A> image, uint lod)
{
return int2(image.texture->get_width(), image.texture->get_array_size());
}
template<typename T, access A>
int2 textureSize(thread _mtl_combined_image_sampler_2d<T, A> image, uint lod)
{
return int2(image.texture->get_width(lod), image.texture->get_height(lod));
}
template<typename T, access A>
int2 textureSize(thread _mtl_combined_image_sampler_depth_2d<T, A> image, uint lod)
{
return int2(image.texture->get_width(lod), image.texture->get_height(lod));
}
template<typename T, access A>
int3 textureSize(thread _mtl_combined_image_sampler_2d_array<T, A> image, uint lod)
{
return int3(image.texture->get_width(lod),
image.texture->get_height(lod),
image.texture->get_array_size());
}
template<typename T, access A>
int3 textureSize(thread _mtl_combined_image_sampler_depth_2d_array<T, A> image, uint lod)
{
return int3(image.texture->get_width(lod),
image.texture->get_height(lod),
image.texture->get_array_size());
}
template<typename T, access A>
int2 textureSize(thread _mtl_combined_image_sampler_cube<T, A> image, uint lod)
{
return int2(image.texture->get_width(lod), image.texture->get_height(lod));
}
template<typename T, access A>
int3 textureSize(thread _mtl_combined_image_sampler_3d<T, A> image, uint lod)
{
return int3(image.texture->get_width(lod),
image.texture->get_height(lod),
image.texture->get_depth(lod));
}
#ifndef MTL_SUPPORTS_TEXTURE_ATOMICS
/* textureSize functions for fallback atomic textures. */
template<typename T, access A>
int2 textureSize(thread _mtl_combined_image_sampler_2d_atomic_fallback<T, A> image, uint lod)
int2 textureSize(thread _mtl_sampler_2d_atomic<T, A> image, uint lod)
{
return int2(image.texture->get_width(lod), image.texture->get_height(lod));
}
template<typename T, access A>
int3 textureSize(thread _mtl_combined_image_sampler_2d_array_atomic_fallback<T, A> image, uint lod)
int3 textureSize(thread _mtl_sampler_2d_array_atomic<T, A> image, uint lod)
{
return int3(image.texture_size);
}
template<typename T, access A>
int3 textureSize(thread _mtl_combined_image_sampler_3d_atomic_fallback<T, A> image, uint lod)
int3 textureSize(thread _mtl_sampler_3d_atomic<T, A> image, uint lod)
{
return int3(image.texture_size);
}
#endif
#define unpackUnorm4x8 unpack_unorm4x8_to_float
#define unpackSnorm4x8 unpack_snorm4x8_to_float
#define unpackUnorm2x16 unpack_unorm2x16_to_float
#define unpackSnorm2x16 unpack_snorm2x16_to_float
/* Equality and comparison functions. */
#define lessThan(a, b) ((a) < (b))
#define lessThanEqual(a, b) ((a) <= (b))
@@ -2184,49 +988,18 @@ int3 textureSize(thread _mtl_combined_image_sampler_3d_atomic_fallback<T, A> ima
#define equal(a, b) ((a) == (b))
#define notEqual(a, b) ((a) != (b))
template<typename T, int n> bool all(vec<T, n> x)
{
bool _all = true;
for (int i = 0; i < n; i++) {
_all = _all && (x[i] > 0);
}
return _all;
}
template<typename T, int n> bool any(vec<T, n> x)
{
bool _any = false;
for (int i = 0; i < n; i++) {
_any = _any || (x[i] > 0);
}
return _any;
}
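/* Usage sketch (illustrative): the comparison macros return component-wise results that feed
 * directly into `all()`/`any()`, mirroring GLSL:
 *   if (all(lessThan(uv, vec2(1.0))) && any(notEqual(col, vec4(0.0)))) { ... }
 * `uv` and `col` are hypothetical local variables. */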
/* Modulo functionality. */
int _mtlmod(int a, int b)
{
return a - b * (a / b);
}
float _mtlmod(float a, float b)
{
return a - b * floor(a / b);
}
template<typename T, int n> vec<T, n> _mtlmod(vec<T, n> x, vec<T, n> y)
{
return x - (y * floor(x / y));
}
template<typename T, int n, typename U> vec<T, n> _mtlmod(vec<T, n> x, U y)
{
return x - (vec<T, n>(y) * floor(x / vec<T, n>(y)));
}
template<typename T, typename U, int n> vec<U, n> _mtlmod(T x, vec<U, n> y)
{
return vec<U, n>(x) - (y * floor(vec<U, n>(x) / y));
}
/* GLSL defines `mod(x, y)` as `x - (y * floor(x / y))`, whereas Metal's `fmod` truncates towards
 * zero. The results differ for operands of opposite signs, so override `mod` with a
 * GLSL-compatible implementation. */
#define mod(x, y) _compatible_mod(x, y)
#define MOD \
{ \
return x - y * floor(x / y); \
}
float _compatible_mod(float x, float y) MOD;
template<int S> vec<float, S> _compatible_mod(vec<float, S> x, float y) MOD;
template<int S> vec<float, S> _compatible_mod(vec<float, S> x, vec<float, S> y) MOD;
#undef MOD
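/* Worked sketch: the two definitions only differ when the operands have opposite signs, e.g.
 *   _compatible_mod(-1.0, 3.0) = -1.0 - 3.0 * floor(-1.0 / 3.0) =  2.0   (GLSL `mod`)
 *   fmod(-1.0, 3.0)            = -1.0 - 3.0 * trunc(-1.0 / 3.0) = -1.0   (Metal `fmod`) */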
/* Mathematical functions. */
template<typename T> T atan(T y, T x)
@@ -2249,10 +1022,7 @@ float4x4 inverse(float4x4 a)
float b09 = a[2][1] * a[3][2] - a[2][2] * a[3][1];
float b10 = a[2][1] * a[3][3] - a[2][3] * a[3][1];
float b11 = a[2][2] * a[3][3] - a[2][3] * a[3][2];
float inv_det = 1.0 / (b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06);
float4x4 adjoint{};
float4x4 adjoint;
adjoint[0][0] = a[1][1] * b11 - a[1][2] * b10 + a[1][3] * b09;
adjoint[0][1] = a[0][2] * b10 - a[0][1] * b11 - a[0][3] * b09;
adjoint[0][2] = a[3][1] * b05 - a[3][2] * b04 + a[3][3] * b03;
@@ -2269,40 +1039,38 @@ float4x4 inverse(float4x4 a)
adjoint[3][1] = a[0][0] * b09 - a[0][1] * b07 + a[0][2] * b06;
adjoint[3][2] = a[3][1] * b01 - a[3][0] * b03 - a[3][2] * b00;
adjoint[3][3] = a[2][0] * b03 - a[2][1] * b01 + a[2][2] * b00;
return adjoint * inv_det;
float determinant = b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06;
  /* Multiply by the reciprocal of the determinant since matrix types have no scalar division
   * operator. */
return adjoint * (1.0 / determinant);
}
float3x3 inverse(float3x3 m)
{
float b00 = m[1][1] * m[2][2] - m[2][1] * m[1][2];
float b01 = m[0][1] * m[2][2] - m[2][1] * m[0][2];
float b02 = m[0][1] * m[1][2] - m[1][1] * m[0][2];
float inv_det = 1.0 / (m[0][0] * b00 - m[1][0] * b01 + m[2][0] * b02);
float3x3 adjoint{};
adjoint[0][0] = +b00;
adjoint[0][1] = -b01;
adjoint[0][2] = +b02;
float3x3 adjoint;
adjoint[0][0] = +(m[1][1] * m[2][2] - m[2][1] * m[1][2]);
adjoint[0][1] = -(m[0][1] * m[2][2] - m[2][1] * m[0][2]);
adjoint[0][2] = +(m[0][1] * m[1][2] - m[1][1] * m[0][2]);
adjoint[1][0] = -(m[1][0] * m[2][2] - m[2][0] * m[1][2]);
adjoint[1][1] = +(m[0][0] * m[2][2] - m[2][0] * m[0][2]);
adjoint[1][2] = -(m[0][0] * m[1][2] - m[1][0] * m[0][2]);
adjoint[2][0] = +(m[1][0] * m[2][1] - m[2][0] * m[1][1]);
adjoint[2][1] = -(m[0][0] * m[2][1] - m[2][0] * m[0][1]);
adjoint[2][2] = +(m[0][0] * m[1][1] - m[1][0] * m[0][1]);
return adjoint * inv_det;
float determinant = m[0][0] * adjoint[0][0] + m[1][0] * adjoint[0][1] + m[2][0] * adjoint[0][2];
  /* Multiply by the reciprocal of the determinant since matrix types have no scalar division
   * operator. */
return adjoint * (1.0 / determinant);
}
float2x2 inverse(float2x2 m)
{
float inv_det = 1.0 / (m[0][0] * m[1][1] - m[1][0] * m[0][1]);
float2x2 adjoint{};
float2x2 adjoint;
adjoint[0][0] = +m[1][1];
adjoint[1][0] = -m[1][0];
adjoint[0][1] = -m[0][1];
adjoint[1][1] = +m[0][0];
return adjoint * inv_det;
float determinant = m[0][0] * m[1][1] - m[1][0] * m[0][1];
  /* Multiply by the reciprocal of the determinant since matrix types have no scalar division
   * operator. */
return adjoint * (1.0 / determinant);
}
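/* Verification sketch (illustrative): for any invertible `m`, `m * inverse(m)` reproduces the
 * identity up to floating-point error, e.g.
 *   float3x3 m = float3x3(float3(2, 0, 0), float3(0, 3, 0), float3(1, 1, 1));
 *   float3x3 id = m * inverse(m);  // approximately float3x3(1) */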
/* Additional overloads for builtin functions. */
@@ -2311,226 +1079,70 @@ float distance(float x, float y)
return abs(y - x);
}
/* Overload for mix(A, B, float ratio). */
template<typename T, int Size> vec<T, Size> mix(vec<T, Size> a, vec<T, Size> b, float val)
/* Overload for mix(vec<T>, vec<T>, float). */
template<typename T, int S> vec<T, S> mix(vec<T, S> a, vec<T, S> b, float fac)
{
return mix(a, b, vec<T, Size>(val));
}
/* Overload for mix(A, B, bvec<N>). */
template<typename T, int Size>
vec<T, Size> mix(vec<T, Size> a, vec<T, Size> b, vec<int, Size> mask)
{
vec<T, Size> result;
for (int i = 0; i < Size; i++) {
result[i] = mask[i] ? b[i] : a[i];
}
return result;
return a * (1.0 - fac) + b * fac;
}
/* Using vec<bool, S> does not appear to work, so the cases are split per vector size. */
/* Overload for mix(A, B, bvec<N>). */
template<typename T> vec<T, 4> mix(vec<T, 4> a, vec<T, 4> b, bvec4 mask)
{
vec<T, 4> result;
for (int i = 0; i < 4; i++) {
result[i] = mask[i] ? b[i] : a[i];
#define SELECT \
{ \
return select(a, b, mask); \
}
return result;
}
/* Overload for mix(A, B, bvec<N>). */
template<typename T> vec<T, 3> mix(vec<T, 3> a, vec<T, 3> b, bvec3 mask)
{
vec<T, 3> result;
for (int i = 0; i < 3; i++) {
result[i] = mask[i] ? b[i] : a[i];
}
return result;
}
template<typename T> vec<T, 4> mix(vec<T, 4> a, vec<T, 4> b, bool4 mask) SELECT;
template<typename T> vec<T, 3> mix(vec<T, 3> a, vec<T, 3> b, bool3 mask) SELECT;
template<typename T> vec<T, 2> mix(vec<T, 2> a, vec<T, 2> b, bool2 mask) SELECT;
/* Overload for mix(A, B, bvec<N>). */
template<typename T> vec<T, 2> mix(vec<T, 2> a, vec<T, 2> b, bvec2 mask)
{
vec<T, 2> result;
for (int i = 0; i < 2; i++) {
result[i] = mask[i] ? b[i] : a[i];
}
return result;
}
/* Overload for mix(A, B, bvec<N>). */
template<typename T> T mix(T a, T b, bool mask)
{
return (mask) ? b : a;
}
#undef SELECT
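/* Illustrative usage (not additional library code): with a boolean mask, mix() acts as a
 * per-component select that picks from `b` where the mask is true, e.g.
 * mix(float3(0.0), float3(1.0), bool3(true, false, true)) == float3(1.0, 0.0, 1.0),
 * which is exactly what Metal's select(a, b, mask) computes. */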
template<typename T, int Size> bool is_zero(vec<T, Size> a)
{
for (int i = 0; i < Size; i++) {
if (a[i] != T(0)) {
return false;
}
return all(a == vec<T, Size>(0));
}
#define in
#define out thread
#define inout thread
#define _in_sta
#define _in_end
#define _out_sta (&
#define _out_end )
#define _inout_sta (&
#define _inout_end )
#define shared threadgroup
#define _shared_sta (&
#define _shared_end )
/* Defines generated by glsl_preprocess that contain the codegen related to threadgroup
 * variables. See glsl_preprocess for more details. */
#define MSL_SHARED_VARS_ARGS
#define MSL_SHARED_VARS_ASSIGN
#define MSL_SHARED_VARS_DECLARE
#define MSL_SHARED_VARS_PASS
/* Matrix reshaping functions. */
#define RESHAPE(mat_to, mat_from, ...) \
mat_to to_##mat_to(mat_from m) \
{ \
return mat_to(__VA_ARGS__); \
}
return true;
}
/**
 * Matrix conversion fallbacks for functional-style casting & constructors.
 * To avoid name collisions with the types, the constructors are replaced with their uppercase
 * versions before compilation.
 */
/* clang-format off */
RESHAPE(float2x2, float3x3, m[0].xy, m[1].xy)
RESHAPE(float2x2, float4x4, m[0].xy, m[1].xy)
RESHAPE(float3x3, float4x4, m[0].xyz, m[1].xyz, m[2].xyz)
RESHAPE(float3x3, float2x2, m[0].x, m[0].y, 0, m[1].x, m[1].y, 0, 0, 0, 1)
RESHAPE(float4x4, float2x2, m[0].x, m[0].y, 0, 0, m[1].x, m[1].y, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1)
RESHAPE(float4x4, float3x3, m[0].x, m[0].y, m[0].z, 0, m[1].x, m[1].y, m[1].z, 0, m[2].x, m[2].y, m[2].z, 0, 0, 0, 0, 1)
/* clang-format on */
/* TODO(fclem): Remove. Use Transform instead. */
RESHAPE(float3x3, float3x4, m[0].xyz, m[1].xyz, m[2].xyz)
#undef RESHAPE
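/* For illustration (sketch of the expansion, not extra library code): the first RESHAPE line
 * above expands to
 *   float2x2 to_float2x2(float3x3 m) { return float2x2(m[0].xy, m[1].xy); }
 * so shaders can call to_float2x2() uniformly on both backends. */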
mat2 MAT2x2(vec2 a, vec2 b)
{
return mat2(a, b);
}
mat2 MAT2x2(float a1, float a2, float b1, float b2)
{
return mat2(vec2(a1, a2), vec2(b1, b2));
}
mat2 MAT2x2(float f)
{
return mat2(f);
}
mat2 MAT2x2(mat3 m)
{
return mat2(m[0].xy, m[1].xy);
}
mat2 MAT2x2(mat4 m)
{
return mat2(m[0].xy, m[1].xy);
}
mat3 MAT3x3(vec3 a, vec3 b, vec3 c)
{
return mat3(a, b, c);
}
mat3 MAT3x3(
float a1, float a2, float a3, float b1, float b2, float b3, float c1, float c2, float c3)
{
return mat3(vec3(a1, a2, a3), vec3(b1, b2, b3), vec3(c1, c2, c3));
}
mat3 MAT3x3(vec3 a, float b1, float b2, float b3, float c1, float c2, float c3)
{
return mat3(a, vec3(b1, b2, b3), vec3(c1, c2, c3));
}
mat3 MAT3x3(float f)
{
return mat3(f);
}
mat3 MAT3x3(mat4 m)
{
return mat3(m[0].xyz, m[1].xyz, m[2].xyz);
}
mat3 MAT3x3(mat3x4 m)
{
return mat3(m[0].xyz, m[1].xyz, m[2].xyz);
}
mat3 MAT3x3(mat2 m)
{
return mat3(vec3(m[0].xy, 0.0), vec3(m[1].xy, 0.0), vec3(0.0, 0.0, 1.0));
}
mat4 MAT4x4(vec4 a, vec4 b, vec4 c, vec4 d)
{
return mat4(a, b, c, d);
}
mat4 MAT4x4(float a1,
float a2,
float a3,
float a4,
float b1,
float b2,
float b3,
float b4,
float c1,
float c2,
float c3,
float c4,
float d1,
float d2,
float d3,
float d4)
{
return mat4(
vec4(a1, a2, a3, a4), vec4(b1, b2, b3, b4), vec4(c1, c2, c3, c4), vec4(d1, d2, d3, d4));
}
mat4 MAT4x4(float f)
{
return mat4(f);
}
mat4 MAT4x4(mat3 m)
{
return mat4(
vec4(m[0].xyz, 0.0), vec4(m[1].xyz, 0.0), vec4(m[2].xyz, 0.0), vec4(0.0, 0.0, 0.0, 1.0));
}
mat4 MAT4x4(mat3x4 m)
{
return mat4(m[0], m[1], m[2], vec4(0.0, 0.0, 0.0, 1.0));
}
mat4 MAT4x4(mat4x3 m)
{
return mat4(m[0][0],
m[0][1],
m[0][2],
0.0,
m[1][0],
m[1][1],
m[1][2],
0.0,
m[2][0],
m[2][1],
m[2][2],
0.0,
m[3][0],
m[3][1],
m[3][2],
0.0);
}
mat4 MAT4x4(mat2 m)
{
return mat4(vec4(m[0].xy, 0.0, 0.0),
vec4(m[1].xy, 0.0, 0.0),
vec4(0.0, 0.0, 1.0, 0.0),
vec4(0.0, 0.0, 0.0, 1.0));
}
mat3x4 MAT3x4(vec4 a, vec4 b, vec4 c)
{
return mat3x4(a, b, c);
}
mat3x4 MAT3x4(float a1,
float a2,
float a3,
float a4,
float b1,
float b2,
float b3,
float b4,
float c1,
float c2,
float c3,
float c4)
{
return mat3x4(vec4(a1, a2, a3, a4), vec4(b1, b2, b3, b4), vec4(c1, c2, c3, c4));
}
mat3x4 MAT3x4(float f)
{
return mat3x4(f);
}
mat3x4 MAT3x4(mat3 m)
{
return mat3x4(vec4(m[0].xyz, 0.0), vec4(m[1].xyz, 0.0), vec4(m[2].xyz, 0.0));
}
mat3x4 MAT3x4(mat2 m)
{
return mat3x4(vec4(m[0].xy, 0.0, 0.0), vec4(m[1].xy, 0.0, 0.0), vec4(0.0, 0.0, 1.0, 0.0));
}
#define MAT2 MAT2x2
#define MAT3 MAT3x3
#define MAT4 MAT4x4
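/* Illustrative note (assumption about the preprocessing step): a functional-style constructor
 * such as mat4(m) in the shared GLSL source is presumably rewritten to MAT4(m) before
 * compilation, which then resolves to the MAT4x4(mat3) overload above instead of clashing with
 * the mat4 type alias. */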
#undef ENABLE_IF
/* Array syntax compatibility. */
/* clang-format off */
@@ -2550,6 +1162,8 @@ mat3x4 MAT3x4(mat2 m)
#define bool2_array(...) { __VA_ARGS__ }
#define bool3_array(...) { __VA_ARGS__ }
#define bool4_array(...) { __VA_ARGS__ }
#define ARRAY_T(type)
#define ARRAY_V(...) {__VA_ARGS__}
/* clang-format on */
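/* Illustrative expansions (derived from the defines above and their GLSL counterparts):
 * bool2_array(true, false) becomes the MSL aggregate initializer {true, false} here, while the
 * GLSL backend expands the same call to bool2[](true, false). Likewise
 * ARRAY_T(float) ARRAY_V(1.0, 2.0) becomes {1.0, 2.0} in MSL and float[](1.0, 2.0) in GLSL. */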
#define SHADER_LIBRARY_CREATE_INFO(a)

View File

@@ -6,6 +6,26 @@
/** IMPORTANT: Be wary of size and alignment matching for types that are present
* in C++ shared code. */
/* Matrix reshaping functions. Needs to be declared before matrix type aliases. */
#define RESHAPE(name, mat_to, mat_from) \
mat_to to_##name(mat_from m) \
{ \
return mat_to(m); \
}
/* clang-format off */
RESHAPE(float2x2, mat2x2, mat3x3)
RESHAPE(float2x2, mat2x2, mat4x4)
RESHAPE(float3x3, mat3x3, mat4x4)
RESHAPE(float3x3, mat3x3, mat2x2)
RESHAPE(float4x4, mat4x4, mat2x2)
RESHAPE(float4x4, mat4x4, mat3x3)
/* clang-format on */
/* TODO(fclem): Remove. Use Transform instead. */
RESHAPE(float3x3, mat3x3, mat3x4)
#undef RESHAPE
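/* For illustration: on the GLSL side reshaping is a plain constructor call, e.g. the first
 * RESHAPE line above expands to
 *   mat2x2 to_float2x2(mat3x3 m) { return mat2x2(m); }
 * matching the to_float2x2() overload provided for MSL. */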
/* Boolean in GLSL are 32bit in interface structs. */
#define bool32_t bool
#define bool2 bvec2
@@ -121,8 +141,19 @@ bool is_zero(vec4 A)
#define bool2_array bool2[]
#define bool3_array bool3[]
#define bool4_array bool4[]
#define ARRAY_T(type) type[]
#define ARRAY_V
#define SHADER_LIBRARY_CREATE_INFO(a)
#define VERTEX_SHADER_CREATE_INFO(a)
#define FRAGMENT_SHADER_CREATE_INFO(a)
#define COMPUTE_SHADER_CREATE_INFO(a)
#define _in_sta
#define _in_end
#define _out_sta
#define _out_end
#define _inout_sta
#define _inout_end
#define _shared_sta
#define _shared_end

View File

@@ -70,22 +70,22 @@ void main()
EulerXYZ euler = EulerXYZ(1, 2, 3);
Quaternion quat = to_quaternion(euler);
AxisAngle axis_angle = to_axis_angle(euler);
m = mat4(from_rotation(euler));
m = to_float4x4(from_rotation(euler));
EXPECT_NEAR(m, expect, 1e-5);
m = mat4(from_rotation(quat));
m = to_float4x4(from_rotation(quat));
EXPECT_NEAR(m, expect, 1e-5);
m = mat4(from_rotation(axis_angle));
m = to_float4x4(from_rotation(axis_angle));
EXPECT_NEAR(m, expect, 3e-4); /* Has some precision issues on some platforms. */
m = from_scale(vec4(1, 2, 3, 4));
expect = mat4x4(vec4(1, 0, 0, 0), vec4(0, 2, 0, 0), vec4(0, 0, 3, 0), vec4(0, 0, 0, 4));
EXPECT_TRUE(is_equal(m, expect, 0.00001));
m = mat4(from_scale(vec3(1, 2, 3)));
m = to_float4x4(from_scale(vec3(1, 2, 3)));
expect = mat4x4(vec4(1, 0, 0, 0), vec4(0, 2, 0, 0), vec4(0, 0, 3, 0), vec4(0, 0, 0, 1));
EXPECT_TRUE(is_equal(m, expect, 0.00001));
m = mat4(from_scale(vec2(1, 2)));
m = to_float4x4(from_scale(vec2(1, 2)));
expect = mat4x4(vec4(1, 0, 0, 0), vec4(0, 2, 0, 0), vec4(0, 0, 1, 0), vec4(0, 0, 0, 1));
EXPECT_TRUE(is_equal(m, expect, 0.00001));
@@ -182,8 +182,8 @@ void main()
EulerXYZ eul;
Quaternion qt;
vec3 scale;
to_rot_scale(mat3x3(m), eul, scale);
to_rot_scale(mat3x3(m), qt, scale);
to_rot_scale(to_float3x3(m), eul, scale);
to_rot_scale(to_float3x3(m), qt, scale);
EXPECT_NEAR(scale, expect_scale, 0.00001);
EXPECT_NEAR(as_vec4(qt), as_vec4(expect_qt), 0.0002);
EXPECT_NEAR(as_vec3(eul), as_vec3(expect_eul), 0.0002);

View File

@@ -236,9 +236,6 @@ void VKDevice::init_glsl_patch()
ss << "#define gpu_BaryCoordNoPersp gl_BaryCoordNoPerspEXT\n";
}
ss << "#define DFDX_SIGN 1.0\n";
ss << "#define DFDY_SIGN 1.0\n";
/* GLSL Backend Lib. */
ss << datatoc_glsl_shader_defines_glsl;
glsl_patch_ = ss.str();

View File

@@ -1009,7 +1009,7 @@ static PyObject *pygpu_shader_create_from_info(BPyGPUShader * /*self*/, BPyGPUSh
return nullptr;
}
GPUShader *shader = GPU_shader_create_from_info(o->info);
GPUShader *shader = GPU_shader_create_from_info_python(o->info);
if (!shader) {
PyErr_SetString(PyExc_Exception, "Shader Compile Error, see console for more details");
return nullptr;