From 74b7d663e12cbc8cf83005deb88042f38e283258 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@noreply.localhost>
Date: Thu, 16 Oct 2025 18:00:39 +0200
Subject: [PATCH] Build: Avoid excessive number of threads with CUDA split
 compile

Only use this feature when building for 1 or 2 CUDA architectures.
Otherwise CMake will build the binaries in parallel, and NVCC will then
also launch multiple threads for each binary.

We could add more manual control for this, but the main use case for
this is local builds and an automatic heuristic seems more likely to
help than an option that developers or users might not discover.

For minimal memory usage WITH_CYCLES_CUDA_BUILD_SERIAL still exists
to use only 1 thread for CUDA compilation.

Pull Request: https://projects.blender.org/blender/blender/pulls/147303
---
 intern/cycles/kernel/CMakeLists.txt | 86 +++++++++++++----------------
 1 file changed, 38 insertions(+), 48 deletions(-)

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 2306898f6b8..a6352c079cd 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -463,6 +463,41 @@ endif()
 
 # CUDA module
 
+# Extend a list of NVCC flags with the options shared by the kernel builds.
+# cuda_version: numeric CUDA version; split compile needs a value >= 129.
+# in_flags:     list of flags to start from.
+# out_flags:    name of the caller's variable that receives the result.
+function(cuda_add_common_flags cuda_version in_flags out_flags)
+  set(_nvcc_flags ${in_flags})
+
+  if(CUDA_HOST_COMPILER)
+    list(APPEND _nvcc_flags -ccbin="${CUDA_HOST_COMPILER}")
+  endif()
+
+  # C++17 helps with compatibility when using recent clang host compiler.
+  list(APPEND _nvcc_flags "-std=c++17" --use_fast_math -Wno-deprecated-gpu-targets)
+
+  if(WITH_CYCLES_DEBUG)
+    list(APPEND _nvcc_flags -D WITH_CYCLES_DEBUG --ptxas-options="-v")
+  endif()
+
+  if(WITH_NANOVDB)
+    list(APPEND _nvcc_flags -D WITH_NANOVDB)
+  endif()
+
+  # Only split compile with at most two target architectures, to limit memory
+  # usage. This mainly helps quick local builds for a single architecture.
+  if(NOT WITH_CYCLES_CUDA_BUILD_SERIAL AND "${cuda_version}" GREATER_EQUAL 129)
+    list(LENGTH CYCLES_CUDA_BINARIES_ARCH _arch_count)
+    if(_arch_count LESS_EQUAL 2)
+      list(APPEND _nvcc_flags --split-compile=0)
+    endif()
+  endif()
+
+  set(${out_flags} ${_nvcc_flags} PARENT_SCOPE)
+endfunction()
+
 if(WITH_CYCLES_CUDA_BINARIES)
   # 64 bit only
   set(CUDA_BITS 64)
@@ -520,33 +555,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
       -m ${CUDA_BITS}
       -I ${CMAKE_CURRENT_SOURCE_DIR}/..
       -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
-      --use_fast_math
-      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}
-      -Wno-deprecated-gpu-targets)
+      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
 
-    if(CUDA_HOST_COMPILER)
-      set(cuda_flags ${cuda_flags}
-        -ccbin="${CUDA_HOST_COMPILER}")
-    endif()
-
-    if(WITH_NANOVDB)
-      set(cuda_flags ${cuda_flags}
-        -D WITH_NANOVDB)
-    endif()
-
-    if(WITH_CYCLES_DEBUG)
-      set(cuda_flags ${cuda_flags}
-        -D WITH_CYCLES_DEBUG
-        --ptxas-options="-v")
-    endif()
-
-    if(NOT WITH_CYCLES_CUDA_BUILD_SERIAL AND "${cuda_version}" GREATER_EQUAL 129)
-      set(cuda_flags ${cuda_flags}
-        --split-compile=0)
-    endif()
-
-    # Helps with compatibility when using recent clang host compiler.
-    set(cuda_flags ${cuda_flags} -std=c++17)
+    cuda_add_common_flags(${cuda_version} "${cuda_flags}" cuda_flags)
 
     set(_cuda_nvcc_args
       -arch=${arch}
@@ -832,30 +843,13 @@ if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_DEVICE_OPTIX)
       -I "${OPTIX_INCLUDE_DIR}"
       -I "${CMAKE_CURRENT_SOURCE_DIR}/.."
       -I "${CMAKE_CURRENT_SOURCE_DIR}/device/cuda"
-      --use_fast_math
-      -Wno-deprecated-gpu-targets
       -o ${output})
 
-    if(CUDA_HOST_COMPILER)
-      set(cuda_flags ${cuda_flags}
-        -ccbin="${CUDA_HOST_COMPILER}")
-    endif()
-
-    if(WITH_NANOVDB)
-      set(cuda_flags ${cuda_flags}
-        -D WITH_NANOVDB)
-    endif()
-
     if(WITH_CYCLES_OSL)
       set(cuda_flags ${cuda_flags}
         -D OSL_LIBRARY_VERSION_CODE=${OSL_LIBRARY_VERSION_CODE})
     endif()
 
-    if(WITH_CYCLES_DEBUG)
-      set(cuda_flags ${cuda_flags}
-        -D WITH_CYCLES_DEBUG)
-    endif()
-
     set(arch compute_50)
     set(cuda_nvcc_executable ${CUDA_NVCC_EXECUTABLE})
     set(cuda_version ${CUDA_VERSION})
@@ -869,10 +863,7 @@ if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_DEVICE_OPTIX)
       endif()
     endif()
 
-    if(NOT WITH_CYCLES_CUDA_BUILD_SERIAL AND "${cuda_version}" GREATER_EQUAL 129)
-      set(cuda_flags ${cuda_flags}
-        --split-compile=0)
-    endif()
+    cuda_add_common_flags(${cuda_version} "${cuda_flags}" cuda_flags)
 
     add_custom_command(
       OUTPUT
@@ -887,7 +878,6 @@ if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_DEVICE_OPTIX)
       COMMAND
         ${cuda_nvcc_executable}
         --ptx
-        -std=c++17
         -arch=${arch}
         ${cuda_flags}
         ${input}