diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index e44941a1313..5dacf2910be 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -282,7 +282,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
-  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -297,7 +297,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
-  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -309,7 +309,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
 
-  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -322,7 +322,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
 
-  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -335,7 +335,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
 
-  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -378,7 +378,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
 
-  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -411,7 +411,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
 
-  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 33b108f9625..32dbe0ddaa3 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -18,44 +18,26 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 
-#ifdef __KERNEL_METAL__
-struct ActiveIndexContext {
-  ActiveIndexContext(int _thread_index,
-                     int _global_index,
-                     int _threadgroup_size,
-                     int _simdgroup_size,
-                     int _simd_lane_index,
-                     int _simd_group_index,
-                     int _num_simd_groups,
-                     threadgroup int *_simdgroup_offset)
-      : thread_index(_thread_index),
-        global_index(_global_index),
-        blocksize(_threadgroup_size),
-        ccl_gpu_warp_size(_simdgroup_size),
-        thread_warp(_simd_lane_index),
-        warp_index(_simd_group_index),
-        num_warps(_num_simd_groups),
-        warp_offset(_simdgroup_offset)
-  {
-  }
-
-  const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
-      num_warps;
-  threadgroup int *warp_offset;
-
-  template<uint blocksizeDummy, typename IsActiveOp>
-  void active_index_array(const uint num_states,
-                          ccl_global int *indices,
-                          ccl_global int *num_indices,
-                          IsActiveOp is_active_op)
-  {
-    const uint state_index = global_index;
-#else
+#ifndef __KERNEL_METAL__
 template<uint blocksize, typename IsActiveOp>
-__device__ void gpu_parallel_active_index_array(const uint num_states,
-                                                ccl_global int *indices,
-                                                ccl_global int *num_indices,
-                                                IsActiveOp is_active_op)
+__device__
+#endif
+void gpu_parallel_active_index_array_impl(const uint num_states,
+                                          ccl_global int *indices,
+                                          ccl_global int *num_indices,
+#ifdef __KERNEL_METAL__
+                                          const uint is_active,
+                                          const uint blocksize,
+                                          const int thread_index,
+                                          const uint state_index,
+                                          const int ccl_gpu_warp_size,
+                                          const int thread_warp,
+                                          const int warp_index,
+                                          const int num_warps,
+                                          threadgroup int *warp_offset)
+{
+#else
+                                          IsActiveOp is_active_op)
 {
   extern ccl_gpu_shared int warp_offset[];
 
@@ -66,61 +48,59 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
   const uint num_warps = blocksize / ccl_gpu_warp_size;
 
   const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+
+  /* Test if state corresponding to this thread is active. */
+  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
 #endif
 
-  /* Test if state corresponding to this thread is active. */
-  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+  /* For each thread within a warp compute how many other active states precede it. */
+  const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+                                      ccl_gpu_thread_mask(thread_warp));
 
-  /* For each thread within a warp compute how many other active states precede it. */
-  const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
-                                      ccl_gpu_thread_mask(thread_warp));
-
-  /* Last thread in warp stores number of active states for each warp. */
-  if (thread_warp == ccl_gpu_warp_size - 1) {
-    warp_offset[warp_index] = thread_offset + is_active;
-  }
-
-  ccl_gpu_syncthreads();
-
-  /* Last thread in block converts per-warp sizes to offsets, increments global size of
-   * index array and gets offset to write to. */
-  if (thread_index == blocksize - 1) {
-    /* TODO: parallelize this. */
-    int offset = 0;
-    for (int i = 0; i < num_warps; i++) {
-      int num_active = warp_offset[i];
-      warp_offset[i] = offset;
-      offset += num_active;
-    }
-
-    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
-    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
-  }
-
-  ccl_gpu_syncthreads();
-
-  /* Write to index array. */
-  if (is_active) {
-    const uint block_offset = warp_offset[num_warps];
-    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
-  }
+  /* Last thread in warp stores number of active states for each warp. */
+  if (thread_warp == ccl_gpu_warp_size - 1) {
+    warp_offset[warp_index] = thread_offset + is_active;
 }
 
-#ifdef __KERNEL_METAL__
-};  /* end class ActiveIndexContext */
+  ccl_gpu_syncthreads();
+
+  /* Last thread in block converts per-warp sizes to offsets, increments global size of
+   * index array and gets offset to write to. */
+  if (thread_index == blocksize - 1) {
+    /* TODO: parallelize this. */
+    int offset = 0;
+    for (int i = 0; i < num_warps; i++) {
+      int num_active = warp_offset[i];
+      warp_offset[i] = offset;
+      offset += num_active;
+    }
+
+    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Write to index array. */
+  if (is_active) {
+    const uint block_offset = warp_offset[num_warps];
+    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+  }
+}
+
+#ifdef __KERNEL_METAL__
+
+#  define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
+    const uint is_active = (ccl_gpu_global_id_x() < num_states) ? is_active_op(ccl_gpu_global_id_x()) : 0; \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active, \
+      metal_local_size, metal_local_id, metal_global_id, simdgroup_size, simd_lane_index, \
+      simd_group_index, num_simd_groups, simdgroup_offset)
+
+#else
+
+#  define gpu_parallel_active_index_array(blocksize, num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
 
-/* inject the required thread params into a struct, and redirect to its templated member function
- */
-#  define gpu_parallel_active_index_array \
-    ActiveIndexContext(metal_local_id, \
-                       metal_global_id, \
-                       metal_local_size, \
-                       simdgroup_size, \
-                       simd_lane_index, \
-                       simd_group_index, \
-                       num_simd_groups, \
-                       simdgroup_offset) \
-      .active_index_array
 #endif
 
 CCL_NAMESPACE_END
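Not part of the patch: below is a minimal CPU-side sketch of the compaction scheme the kernel above implements (per-warp ballot/popcount offsets, a per-block scan over warp counts, then one atomic add per block to reserve space in the global index array). The names active_index_array_reference, WARP_SIZE and BLOCK_SIZE are made up for illustration and do not correspond to Cycles symbols; the real kernel does the same bookkeeping with ccl_gpu_ballot/popcount and shared memory.

// Illustrative serial reference of the ballot/popcount stream compaction above.
// WARP_SIZE and BLOCK_SIZE are hypothetical example values, not Cycles constants.
#include <atomic>
#include <cstdio>
#include <vector>

static const int WARP_SIZE = 4;
static const int BLOCK_SIZE = 8;

template<typename IsActiveOp>
void active_index_array_reference(int num_states,
                                  std::vector<int> &indices,
                                  std::atomic<int> &num_indices,
                                  IsActiveOp is_active_op)
{
  for (int block_start = 0; block_start < num_states; block_start += BLOCK_SIZE) {
    const int num_warps = BLOCK_SIZE / WARP_SIZE;
    std::vector<int> warp_count(num_warps, 0);
    std::vector<int> thread_offset(BLOCK_SIZE, 0);
    std::vector<int> active(BLOCK_SIZE, 0);

    /* Per warp: each "lane" counts the active lanes that precede it,
     * mirroring popcount(ballot & thread_mask). */
    for (int w = 0; w < num_warps; w++) {
      for (int lane = 0; lane < WARP_SIZE; lane++) {
        const int t = w * WARP_SIZE + lane;
        const int state_index = block_start + t;
        active[t] = (state_index < num_states) ? is_active_op(state_index) : 0;
        thread_offset[t] = warp_count[w];
        warp_count[w] += active[t];
      }
    }

    /* Per block: exclusive scan over warp counts, then one atomic add
     * reserves a contiguous range in the global index array. */
    int block_active = 0;
    std::vector<int> warp_offset(num_warps, 0);
    for (int w = 0; w < num_warps; w++) {
      warp_offset[w] = block_active;
      block_active += warp_count[w];
    }
    const int block_offset = num_indices.fetch_add(block_active);

    /* Write compacted state indices. */
    for (int t = 0; t < BLOCK_SIZE; t++) {
      if (active[t]) {
        indices[block_offset + warp_offset[t / WARP_SIZE] + thread_offset[t]] = block_start + t;
      }
    }
  }
}

int main()
{
  const int num_states = 20;
  std::vector<int> indices(num_states, -1);
  std::atomic<int> num_indices(0);
  active_index_array_reference(num_states, indices, num_indices,
                               [](int i) { return (i % 3 == 0) ? 1 : 0; });
  for (int i = 0; i < num_indices; i++) {
    printf("%d ", indices[i]);  /* Prints 0 3 6 9 12 15 18. */
  }
  printf("\n");
  return 0;
}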