Fix Cycles CUDA crash when building kernels without optimizations (for debug)

In this case the block size may not be the one we requested, which the kernel
code assumed to be the case. Instead, get the effective block size from the
compiler, as was already done for Metal and oneAPI.
Brecht Van Lommel
2022-11-30 21:38:57 +01:00
parent b25c301c15
commit 222b64fcdc
2 changed files with 18 additions and 42 deletions
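
For context, the failure mode and the fix pattern in plain CUDA. This is a
hedged sketch, not code from this commit, and the kernel name and host setup
are hypothetical. Building without optimizations (nvcc -G) tends to increase
register usage, so the compiled kernel may support fewer threads per block
than the host requests. The host should clamp to the kernel's reported
maximum, and device code should take the block size from the launch
configuration (blockDim.x, which is what ccl_gpu_block_dim_x expands to for
CUDA) instead of a compile-time constant.

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void count_active(const int *flags, int *num_active, const int n)
{
  /* Effective block size of this launch. Unlike a template parameter fixed
   * at compile time, this always matches what the kernel was launched with. */
  const unsigned int block_size = blockDim.x;
  const unsigned int i = blockIdx.x * block_size + threadIdx.x;
  if (i < (unsigned int)n && flags[i]) {
    atomicAdd(num_active, 1);
  }
}

int main()
{
  const int n = 1 << 20;
  int *flags, *num_active;
  cudaMallocManaged(&flags, n * sizeof(int));
  cudaMallocManaged(&num_active, sizeof(int));
  for (int i = 0; i < n; i++) flags[i] = (i % 3 == 0);
  *num_active = 0;

  /* "Effective block size from the compiler": a debug build may lower
   * maxThreadsPerBlock below the requested 1024, so clamp instead of
   * assuming the request is honored. */
  cudaFuncAttributes attr;
  cudaFuncGetAttributes(&attr, count_active);
  const int block_size = std::min(1024, attr.maxThreadsPerBlock);
  const int num_blocks = (n + block_size - 1) / block_size;

  count_active<<<num_blocks, block_size>>>(flags, num_active, n);
  cudaDeviceSynchronize();
  printf("block size %d, active %d\n", block_size, *num_active);
  cudaFree(flags);
  cudaFree(num_active);
  return 0;
}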


@@ -314,11 +314,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
@@ -333,11 +329,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
@@ -349,11 +341,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
@@ -366,11 +354,8 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices + indices_offset,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
@@ -383,11 +368,8 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices + indices_offset,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
@@ -431,11 +413,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
@@ -469,11 +447,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix


@@ -56,7 +56,7 @@ void gpu_parallel_active_index_array_impl(const uint num_states,
   const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
 #else /* !__KERNEL__ONEAPI__ */
 #  ifndef __KERNEL_METAL__
-template<uint blocksize, typename IsActiveOp>
+template<typename IsActiveOp>
 __device__
 #  endif
     void
@@ -79,6 +79,10 @@ __device__
 {
   extern ccl_gpu_shared int warp_offset[];
 
+#  ifndef __KERNEL_METAL__
+  const uint blocksize = ccl_gpu_block_dim_x;
+#  endif
+
   const uint thread_index = ccl_gpu_thread_idx_x;
   const uint thread_warp = thread_index % ccl_gpu_warp_size;
@@ -149,7 +153,7 @@ __device__
 #ifdef __KERNEL_METAL__
 
-#  define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
     const uint is_active = (ccl_gpu_global_id_x() < num_states) ? \
                               is_active_op(ccl_gpu_global_id_x()) : \
                               0; \
@@ -167,15 +171,13 @@ __device__
                                        simdgroup_offset)
 #elif defined(__KERNEL_ONEAPI__)
 
-#  define gpu_parallel_active_index_array( \
-      blocksize, num_states, indices, num_indices, is_active_op) \
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
     gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
 
 #else
 
-#  define gpu_parallel_active_index_array( \
-      blocksize, num_states, indices, num_indices, is_active_op) \
-    gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
 
 #endif
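
For readers unfamiliar with the primitive being changed here:
gpu_parallel_active_index_array compacts the indices of states whose
predicate passes into a contiguous output array. Below is a hedged sketch of
the same idea in plain CUDA. It uses a warp-aggregated atomic rather than the
shared-memory warp_offset[] prefix scan the real header implements, so the
output order differs, and the kernel and buffer names are hypothetical.

#include <cstdio>
#include <cuda_runtime.h>

/* Gather indices i where queued[i] != 0 into `indices`, unordered.
 * One ballot ranks the active lanes of a warp; the first active lane
 * reserves output space for the whole warp with a single atomic. */
__global__ void active_index_array(const int *queued,
                                   const int num_states,
                                   int *indices,
                                   int *num_indices)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  const bool active = (i < num_states) && (queued[i] != 0);

  const unsigned int mask = __ballot_sync(0xffffffffu, active);
  if (active) {
    const unsigned int lane = threadIdx.x & 31u;
    const unsigned int rank = __popc(mask & ((1u << lane) - 1u));
    const int leader = __ffs((int)mask) - 1;
    int base = 0;
    if (lane == (unsigned int)leader) {
      base = atomicAdd(num_indices, __popc(mask));
    }
    base = __shfl_sync(mask, base, leader);
    indices[base + rank] = i;
  }
}

int main()
{
  const int n = 1000;
  int *queued, *indices, *num_indices;
  cudaMallocManaged(&queued, n * sizeof(int));
  cudaMallocManaged(&indices, n * sizeof(int));
  cudaMallocManaged(&num_indices, sizeof(int));
  for (int i = 0; i < n; i++) queued[i] = (i % 4 == 0);
  *num_indices = 0;

  const int block_size = 128; /* any multiple of the warp size */
  active_index_array<<<(n + block_size - 1) / block_size, block_size>>>(
      queued, n, indices, num_indices);
  cudaDeviceSynchronize();
  printf("%d active states\n", *num_indices); /* expect 250 */
  return 0;
}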