Fix Cycles CUDA crash when building kernels without optimizations (for debug)
In this case the blocksize may not be the one we requested, which the code assumed to be the case. Instead, get the effective block size from the compiler, as was already done for Metal and oneAPI.
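For context, the hunks below touch two areas: the kernel entry points, which previously passed GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE down explicitly, and the parallel active-index helper, which now derives the block size itself. A minimal CUDA sketch of the pattern the fix adopts; the kernel and its names are illustrative assumptions, not Cycles code:

#include <cstdio>
#include <cuda_runtime.h>

/* Illustrative kernel, not Cycles code. Instead of assuming a compile-time
 * block size matches the launch, read blockDim.x, which is the block size
 * actually in effect for this launch. */
__global__ void count_active(const int *states, int num_states, int *num_active)
{
  const unsigned int blocksize = blockDim.x; /* effective, not requested */

  extern __shared__ int shared_counts[]; /* sized with blocksize at launch */

  const int i = blockIdx.x * blocksize + threadIdx.x;
  shared_counts[threadIdx.x] = (i < num_states && states[i] != 0) ? 1 : 0;
  __syncthreads();

  /* Naive per-block reduction, kept short for the sketch. */
  if (threadIdx.x == 0) {
    int sum = 0;
    for (unsigned int j = 0; j < blocksize; j++) {
      sum += shared_counts[j];
    }
    atomicAdd(num_active, sum);
  }
}

Launched as count_active<<<grid, block, block * sizeof(int)>>>(...), this works for any block size, because nothing in the kernel assumes the size that was requested at compile time.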
@@ -314,11 +314,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -333,11 +329,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -349,11 +341,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -366,11 +354,8 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices + indices_offset,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -383,11 +368,8 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices + indices_offset,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -431,11 +413,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -469,11 +447,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -56,7 +56,7 @@ void gpu_parallel_active_index_array_impl(const uint num_states,
   const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
 #else /* !__KERNEL__ONEAPI__ */
 #  ifndef __KERNEL_METAL__
-template<uint blocksize, typename IsActiveOp>
+template<typename IsActiveOp>
 __device__
 #  endif
     void
@@ -79,6 +79,10 @@ __device__
 {
   extern ccl_gpu_shared int warp_offset[];
 
+#  ifndef __KERNEL_METAL__
+  const uint blocksize = ccl_gpu_block_dim_x;
+#  endif
+
   const uint thread_index = ccl_gpu_thread_idx_x;
   const uint thread_warp = thread_index % ccl_gpu_warp_size;
 
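The added const uint blocksize = ccl_gpu_block_dim_x; line above is the heart of the fix: the block size is now read at run time rather than trusted from a template parameter. On the CUDA backend this abstraction is assumed to map to the built-in block dimension (the exact definitions live in Cycles' per-backend compat headers):

/* Assumed mapping, shown for context only; see the CUDA compat header. */
#define ccl_gpu_block_dim_x blockDim.x   /* block size actually launched */
#define ccl_gpu_thread_idx_x threadIdx.x /* thread index within the block */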
@@ -149,7 +153,7 @@ __device__
 
 #ifdef __KERNEL_METAL__
 
-#  define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
   const uint is_active = (ccl_gpu_global_id_x() < num_states) ? \
                              is_active_op(ccl_gpu_global_id_x()) : \
                              0; \
@@ -167,15 +171,13 @@ __device__
       simdgroup_offset)
 #elif defined(__KERNEL_ONEAPI__)
 
-#  define gpu_parallel_active_index_array( \
-      blocksize, num_states, indices, num_indices, is_active_op) \
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
     gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
 
 #else
 
-#  define gpu_parallel_active_index_array( \
-      blocksize, num_states, indices, num_indices, is_active_op) \
-    gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
 
 #endif
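One way to observe the mismatch this commit guards against: kernels built without optimizations use many more registers, which can cap the threads per block below the size the caller assumed, so the effective launch block is smaller. A hedged host-side sketch (the kernel name is hypothetical) that queries what a compiled kernel can actually support:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void debug_heavy_kernel(int *out)
{
  /* Hypothetical stand-in for an unoptimized integrator kernel. */
  out[threadIdx.x] = (int)threadIdx.x;
}

int main()
{
  /* In debug (-G/-O0) builds, register usage grows and maxThreadsPerBlock
   * can drop below the block size assumed at compile time. */
  cudaFuncAttributes attr;
  cudaFuncGetAttributes(&attr, debug_heavy_kernel);
  printf("max threads per block: %d\n", attr.maxThreadsPerBlock);
  printf("registers per thread:  %d\n", attr.numRegs);
  return 0;
}

When the two numbers disagree, code that baked in the requested size can index shared memory past the block's actual extent, consistent with the crash this commit fixes.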