This change puts all the block size macros in the same common header, so they can be included in host-side code without needing to also include the kernels that are defined in the device headers that contained these values. This change also removes a magic number used to enqueue a kernel, which happened to agree with the GPU_PARALLEL_SORT_BLOCK_SIZE macro. Pull Request: https://projects.blender.org/blender/blender/pulls/143646
37 lines
895 B
C
37 lines
895 B
C
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0 */
|
|
|
|
#pragma once
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
/* Parallel prefix sum.
 *
 * TODO: actually make this work in parallel.
 *
 * This is used for an array the size of the number of shaders in the scene,
 * which is not usually huge, so it might not be a significant bottleneck. */
|
|
|
|
#include "util/atomic.h"
|
|
|
|
/* Serial exclusive prefix sum over `counter`.
 *
 * Only the invocation with `global_id == 0` does any work; every other caller
 * returns without reading or writing either array.
 *
 * For each index `i` in `[0, num_values)`:
 *   - `prefix_sum[i]` receives the sum of the entry values of
 *     `counter[0]` .. `counter[i - 1]` (so `prefix_sum[0]` is 0),
 *   - `counter[i]` is reset to 0.
 *
 * Within one iteration `counter[i]` is read before `prefix_sum[i]` is
 * written. */
__device__ void gpu_parallel_prefix_sum(const int global_id,
                                        ccl_global int *counter,
                                        ccl_global int *prefix_sum,
                                        const int num_values)
{
  if (global_id == 0) {
    /* Sum of all counters visited so far. */
    int running_total = 0;
    int index = 0;

    while (index < num_values) {
      /* Fetch the count first, then publish the offset and clear the slot. */
      const int count = counter[index];
      prefix_sum[index] = running_total;
      counter[index] = 0;

      running_total += count;
      index++;
    }
  }
}
|
|
|
|
CCL_NAMESPACE_END
|