Build: Avoid excessive number of threads with CUDA split compile

Only use this feature when building for 1 or 2 CUDA architectures.
Otherwise CMake will build the binaries in parallel, and NVCC will then
also launch multiple threads for each binary.

We could add more manual control for this, but the main use case for
this is local builds and an automatic heuristic seems more likely to
help than an option that developers or users might not discover.

For minimal memory usage WITH_CYCLES_CUDA_BUILD_SERIAL still exists
to use only 1 thread for CUDA compilation.

Pull Request: https://projects.blender.org/blender/blender/pulls/147303
This commit is contained in:
Brecht Van Lommel
2025-10-16 18:00:39 +02:00
parent 6e96f1cca8
commit 74b7d663e1

View File

@@ -463,6 +463,41 @@ endif()
# CUDA module
# Extend `in_flags` with the NVCC flags shared by all Cycles CUDA/OptiX
# kernel compilations, storing the combined list in the caller's scope.
#
# Arguments:
#   cuda_version - CUDA toolkit version as an integer (e.g. 129 for 12.9).
#   in_flags     - existing flag list to extend.
#   out_flags    - name of the variable in the caller's scope that
#                  receives the resulting flag list.
function(cuda_add_common_flags cuda_version in_flags out_flags)
  set(_common_flags ${in_flags})

  # Route host-side compilation through the configured host compiler.
  if(CUDA_HOST_COMPILER)
    list(APPEND _common_flags -ccbin="${CUDA_HOST_COMPILER}")
  endif()

  list(APPEND _common_flags
    # Helps with compatibility when using recent clang host compiler.
    "-std=c++17"
    --use_fast_math
    -Wno-deprecated-gpu-targets)

  if(WITH_CYCLES_DEBUG)
    list(APPEND _common_flags
      -D WITH_CYCLES_DEBUG
      --ptxas-options="-v")
  endif()

  if(WITH_NANOVDB)
    list(APPEND _common_flags -D WITH_NANOVDB)
  endif()

  if(NOT WITH_CYCLES_CUDA_BUILD_SERIAL AND "${cuda_version}" GREATER_EQUAL 129)
    # Only use split compile with few binaries, to avoid excessive memory usage.
    # This is mainly helpful for quick local builds for one architecture.
    list(LENGTH CYCLES_CUDA_BINARIES_ARCH _binary_count)
    if(_binary_count LESS_EQUAL 2)
      list(APPEND _common_flags --split-compile=0)
    endif()
  endif()

  set(${out_flags} ${_common_flags} PARENT_SCOPE)
endfunction()
if(WITH_CYCLES_CUDA_BINARIES)
# 64 bit only
set(CUDA_BITS 64)
@@ -520,33 +555,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
-m ${CUDA_BITS}
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
-I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
--use_fast_math
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}
-Wno-deprecated-gpu-targets)
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
if(CUDA_HOST_COMPILER)
set(cuda_flags ${cuda_flags}
-ccbin="${CUDA_HOST_COMPILER}")
endif()
if(WITH_NANOVDB)
set(cuda_flags ${cuda_flags}
-D WITH_NANOVDB)
endif()
if(WITH_CYCLES_DEBUG)
set(cuda_flags ${cuda_flags}
-D WITH_CYCLES_DEBUG
--ptxas-options="-v")
endif()
if(NOT WITH_CYCLES_CUDA_BUILD_SERIAL AND "${cuda_version}" GREATER_EQUAL 129)
set(cuda_flags ${cuda_flags}
--split-compile=0)
endif()
# Helps with compatibility when using recent clang host compiler.
set(cuda_flags ${cuda_flags} -std=c++17)
cuda_add_common_flags(${cuda_version} "${cuda_flags}" cuda_flags)
set(_cuda_nvcc_args
-arch=${arch}
@@ -832,30 +843,13 @@ if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_DEVICE_OPTIX)
-I "${OPTIX_INCLUDE_DIR}"
-I "${CMAKE_CURRENT_SOURCE_DIR}/.."
-I "${CMAKE_CURRENT_SOURCE_DIR}/device/cuda"
--use_fast_math
-Wno-deprecated-gpu-targets
-o ${output})
if(CUDA_HOST_COMPILER)
set(cuda_flags ${cuda_flags}
-ccbin="${CUDA_HOST_COMPILER}")
endif()
if(WITH_NANOVDB)
set(cuda_flags ${cuda_flags}
-D WITH_NANOVDB)
endif()
if(WITH_CYCLES_OSL)
set(cuda_flags ${cuda_flags}
-D OSL_LIBRARY_VERSION_CODE=${OSL_LIBRARY_VERSION_CODE})
endif()
if(WITH_CYCLES_DEBUG)
set(cuda_flags ${cuda_flags}
-D WITH_CYCLES_DEBUG)
endif()
set(arch compute_50)
set(cuda_nvcc_executable ${CUDA_NVCC_EXECUTABLE})
set(cuda_version ${CUDA_VERSION})
@@ -869,10 +863,7 @@ if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_DEVICE_OPTIX)
endif()
endif()
if(NOT WITH_CYCLES_CUDA_BUILD_SERIAL AND "${cuda_version}" GREATER_EQUAL 129)
set(cuda_flags ${cuda_flags}
--split-compile=0)
endif()
cuda_add_common_flags(${cuda_version} "${cuda_flags}" cuda_flags)
add_custom_command(
OUTPUT
@@ -887,7 +878,6 @@ if(WITH_CYCLES_CUDA_BINARIES AND WITH_CYCLES_DEVICE_OPTIX)
COMMAND
${cuda_nvcc_executable}
--ptx
-std=c++17
-arch=${arch}
${cuda_flags}
${input}