diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 46f08996a72..34735dc57e8 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -463,6 +463,41 @@ endif()
 
 # CUDA module
 
+# Append the nvcc options common to the callers in this file to the list
+# `in_flags` and store the result in the caller's variable named by `out_flags`.
+# `cuda_version` is compared numerically against 129 to decide whether
+# `--split-compile=0` may be added.
+function(cuda_add_common_flags cuda_version in_flags out_flags)
+  set(nvcc_flags ${in_flags})
+
+  if(CUDA_HOST_COMPILER)
+    list(APPEND nvcc_flags -ccbin="${CUDA_HOST_COMPILER}")
+  endif()
+
+  # -std=c++17 helps with compatibility when using recent clang host compiler.
+  list(APPEND nvcc_flags "-std=c++17" --use_fast_math -Wno-deprecated-gpu-targets)
+
+  if(WITH_CYCLES_DEBUG)
+    list(APPEND nvcc_flags -D WITH_CYCLES_DEBUG --ptxas-options="-v")
+  endif()
+
+  if(WITH_NANOVDB)
+    list(APPEND nvcc_flags -D WITH_NANOVDB)
+  endif()
+
+  # Only use split compile with few binaries, to avoid excessive memory usage.
+  # This is mainly helpful for quick local builds for one architecture.
+  list(LENGTH CYCLES_CUDA_BINARIES_ARCH num_archs)
+  if(NOT WITH_CYCLES_CUDA_BUILD_SERIAL
+     AND "${cuda_version}" GREATER_EQUAL 129
+     AND num_archs LESS_EQUAL 2)
+    list(APPEND nvcc_flags --split-compile=0)
+  endif()
+
+  # Function variables are local; publish the list to the caller explicitly.
+  set(${out_flags} ${nvcc_flags} PARENT_SCOPE)
+endfunction()
+
 if(WITH_CYCLES_CUDA_BINARIES)
   # 64 bit only
   set(CUDA_BITS 64)
@@ -520,33 +555,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
       -m ${CUDA_BITS}
       -I ${CMAKE_CURRENT_SOURCE_DIR}/..
       -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
-      --use_fast_math
-      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}
-      -Wno-deprecated-gpu-targets)
+      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
 
-    if(CUDA_HOST_COMPILER)
-      set(cuda_flags ${cuda_flags}
-        -ccbin="${CUDA_HOST_COMPILER}")
-    endif()
-
-    if(WITH_NANOVDB)
-      set(cuda_flags ${cuda_flags}
-        -D WITH_NANOVDB)
-    endif()
-
-    if(WITH_CYCLES_DEBUG)
-      set(cuda_flags ${cuda_flags}
-        -D WITH_CYCLES_DEBUG
-        --ptxas-options="-v")
-    endif()
-
-    if(NOT WITH_CYCLES_CUDA_BUILD_SERIAL AND "${cuda_version}" GREATER_EQUAL 129)
-      set(cuda_flags ${cuda_flags}
-        --split-compile=0)
-    endif()
-
-    # Helps with compatibility when using recent clang host compiler.
-    set(cuda_flags ${cuda_flags} -std=c++17)
+    cuda_add_common_flags(${cuda_version} "${cuda_flags}" cuda_flags)
 
     set(_cuda_nvcc_args
       -arch=${arch}
@@ -832,30 +843,13 @@ if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_DEVICE_OPTIX)
       -I "${OPTIX_INCLUDE_DIR}"
       -I "${CMAKE_CURRENT_SOURCE_DIR}/.."
       -I "${CMAKE_CURRENT_SOURCE_DIR}/device/cuda"
-      --use_fast_math
-      -Wno-deprecated-gpu-targets
       -o ${output})
 
-    if(CUDA_HOST_COMPILER)
-      set(cuda_flags ${cuda_flags}
-        -ccbin="${CUDA_HOST_COMPILER}")
-    endif()
-
-    if(WITH_NANOVDB)
-      set(cuda_flags ${cuda_flags}
-        -D WITH_NANOVDB)
-    endif()
-
     if(WITH_CYCLES_OSL)
       set(cuda_flags ${cuda_flags}
         -D OSL_LIBRARY_VERSION_CODE=${OSL_LIBRARY_VERSION_CODE})
     endif()
 
-    if(WITH_CYCLES_DEBUG)
-      set(cuda_flags ${cuda_flags}
-        -D WITH_CYCLES_DEBUG)
-    endif()
-
     set(arch compute_50)
     set(cuda_nvcc_executable ${CUDA_NVCC_EXECUTABLE})
     set(cuda_version ${CUDA_VERSION})
@@ -869,10 +863,7 @@ if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_DEVICE_OPTIX)
       endif()
     endif()
 
-    if(NOT WITH_CYCLES_CUDA_BUILD_SERIAL AND "${cuda_version}" GREATER_EQUAL 129)
-      set(cuda_flags ${cuda_flags}
-        --split-compile=0)
-    endif()
+    cuda_add_common_flags(${cuda_version} "${cuda_flags}" cuda_flags)
 
     add_custom_command(
       OUTPUT
@@ -887,7 +878,6 @@ if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_DEVICE_OPTIX)
       COMMAND
         ${cuda_nvcc_executable}
         --ptx
-        -std=c++17
         -arch=${arch}
         ${cuda_flags}
         ${input}
diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h
index af3093befcc..e3e533876aa 100644
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -298,14 +298,21 @@ ccl_device void volume_voxel_get(KernelGlobals kg, ccl_private OctreeTracing &oc
 /* If there exists a Light Path Node, it could affect the density evaluation at runtime.
  * Randomly sample a few points on the ray to estimate the extrema. */
 template<const bool shadow, typename IntegratorGenericState>
-ccl_device_noinline Extrema<float> volume_estimate_extrema(KernelGlobals kg,
-                                                           const ccl_private Ray *ccl_restrict ray,
-                                                           ccl_private ShaderData *ccl_restrict sd,
-                                                           const IntegratorGenericState state,
-                                                           const ccl_private RNGState *rng_state,
-                                                           const uint32_t path_flag,
-                                                           const Interval<float> t,
-                                                           const VolumeStack entry)
+/* Work around apparent HIP compiler bug. */
+#  ifdef __KERNEL_HIP__
+ccl_device
+#  else
+ccl_device_noinline
+#  endif
+    Extrema<float>
+    volume_estimate_extrema(KernelGlobals kg,
+                            const ccl_private Ray *ccl_restrict ray,
+                            ccl_private ShaderData *ccl_restrict sd,
+                            const IntegratorGenericState state,
+                            const ccl_private RNGState *rng_state,
+                            const uint32_t path_flag,
+                            const Interval<float> t,
+                            const VolumeStack entry)
 {
   const bool homogeneous = volume_is_homogeneous(kg, entry);
   const int samples = homogeneous ? 1 : 4;
diff --git a/tests/python/eevee_render_tests.py b/tests/python/eevee_render_tests.py
index 9d36bd809e3..b0bc97a3688 100644
--- a/tests/python/eevee_render_tests.py
+++ b/tests/python/eevee_render_tests.py
@@ -170,26 +170,6 @@ if inside_blender:
         sys.exit(1)
 
 
-def get_gpu_device_type(blender):
-    # TODO: This always fails.
-    command = [
-        blender,
-        "--background",
-        "--factory-startup",
-        "--python",
-        str(pathlib.Path(__file__).parent / "gpu_info.py")
-    ]
-    try:
-        completed_process = subprocess.run(command, stdout=subprocess.PIPE)
-        for line in completed_process.stdout.read_text():
-            if line.startswith("GPU_DEVICE_TYPE:"):
-                vendor = line.split(':')[1]
-                return vendor
-    except Exception:
-        return None
-    return None
-
-
 def get_arguments(filepath, output_filepath, gpu_backend):
     arguments = [
         "--background",
@@ -230,11 +210,6 @@ def main():
     parser = create_argparse()
     args = parser.parse_args()
 
-    gpu_device_type = get_gpu_device_type(args.blender)
-    reference_override_dir = None
-    if gpu_device_type == "AMD":
-        reference_override_dir = "eevee_renders/amd"
-
     blocklist = BLOCKLIST
     if args.gpu_backend == "metal":
         blocklist += BLOCKLIST_METAL
@@ -249,7 +224,6 @@ def main():
 
     report.set_pixelated(True)
     report.set_reference_dir("eevee_renders")
-    report.set_reference_override_dir(reference_override_dir)
 
     test_dir_name = Path(args.testdir).name
     if test_dir_name.startswith('image_mapping'):
diff --git a/tests/python/gpu_info.py b/tests/python/modules/gpu_info.py
similarity index 100%
rename from tests/python/gpu_info.py
rename to tests/python/modules/gpu_info.py
diff --git a/tests/python/modules/render_report.py b/tests/python/modules/render_report.py
index 4d888c9371a..2a163bd49a9 100644
--- a/tests/python/modules/render_report.py
+++ b/tests/python/modules/render_report.py
@@ -174,6 +174,25 @@ def diff_output(test, oiiotool, fail_threshold, fail_percent, verbose, update):
     return test
 
 
+def get_gpu_device_vendor(blender):
+    """Return the GPU device type printed by gpu_info.py, stripped and upper-cased.
+
+    Runs `blender` in background mode on the gpu_info.py next to this module and
+    scans stdout for a "GPU_DEVICE_TYPE:" line. Returns None if the process cannot
+    be run or no such line is printed (best effort).
+    """
+    script = str(pathlib.Path(__file__).parent / "gpu_info.py")
+    command = [blender, "--background", "--factory-startup", "--python", script]
+    try:
+        completed_process = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True)
+        for line in completed_process.stdout.splitlines():
+            if line.startswith("GPU_DEVICE_TYPE:"):
+                return line.split(':')[1].strip().upper()
+    except Exception:
+        return None
+    return None
+
+
 class Report:
     __slots__ = (
         'title',
diff --git a/tests/python/storm_render_tests.py b/tests/python/storm_render_tests.py
index d27c40cb94f..87facf20897 100644
--- a/tests/python/storm_render_tests.py
+++ b/tests/python/storm_render_tests.py
@@ -59,6 +59,25 @@ BLOCKLIST_METAL = [
     "autosmooth_custom_normals.blend",
 ]
 
+# Used on AMD GPUs, which seem to have limitations similar to Metal for transparency.
+BLOCKLIST_AMD = BLOCKLIST_METAL + [
+    "musgrave_.*_multifractal.*.blend",
+    "noise_lacunarity.blend",
+]
+
+# Used on Intel GPUs. Minor difference in texture coordinate for white noise hash.
+BLOCKLIST_INTEL = [
+    "autosmooth_custom_normals.blend",
+    "hair_reflection.blend",
+    "hair_transmission.blend",
+    "principled_bsdf_emission.blend",
+    "principled_bsdf_sheen.blend",
+    "musgrave_.*_multifractal.*.blend",
+    "noise_lacunarity.blend",
+    "sss_hair.blend",
+    "white_noise.*.blend",
+]
+
 
 def setup():
     import bpy
@@ -120,7 +139,16 @@ def main():
 
     from modules import render_report
 
-    blocklist = BLOCKLIST_METAL if sys.platform == "darwin" else []
+    if sys.platform == "darwin":
+        blocklist = BLOCKLIST_METAL
+    else:
+        gpu_vendor = render_report.get_gpu_device_vendor(args.blender)
+        if gpu_vendor == "AMD":
+            blocklist = BLOCKLIST_AMD
+        elif gpu_vendor == "INTEL":
+            blocklist = BLOCKLIST_INTEL
+        else:
+            blocklist = []
 
     if args.export_method == 'HYDRA':
         report = render_report.Report("Storm Hydra", args.outdir, args.oiiotool, blocklist=blocklist + BLOCKLIST_HYDRA)