The performance of the sorted_paths_array kernel on B570 is problematic. Relying on local sorting+partitioning instead gives a 25% overall rendering speedup and no regression in shade_surface when rendering the Agent 327 Barbershop scene. On Arc A770, it still gives a 2% speedup when rendering Barbershop.

Pull Request: https://projects.blender.org/blender/blender/pulls/140308
57 lines
1.3 KiB
C++
57 lines
1.3 KiB
C++
/* SPDX-FileCopyrightText: 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0 */
|
|
|
|
#pragma once
|
|
|
|
#ifdef WITH_ONEAPI
|
|
|
|
# include "device/memory.h"
|
|
# include "device/queue.h"
|
|
|
|
# include "kernel/device/oneapi/kernel.h"
|
|
|
|
# include "util/unique_ptr.h"
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
/* Forward declarations: within this header these types are only referred to
 * through pointers (OneapiDevice) and references (device_memory). */
class OneapiDevice;
class device_memory;
|
|
|
|
/* Base class for OneAPI queues. */
|
|
class OneapiDeviceQueue : public DeviceQueue {
|
|
public:
|
|
explicit OneapiDeviceQueue(OneapiDevice *device);
|
|
|
|
int num_concurrent_states(const size_t state_size) const override;
|
|
|
|
int num_concurrent_busy_states(const size_t state_size) const override;
|
|
|
|
int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const override;
|
|
|
|
void init_execution() override;
|
|
|
|
bool enqueue(DeviceKernel kernel,
|
|
const int kernel_work_size,
|
|
const DeviceKernelArguments &args) override;
|
|
|
|
bool synchronize() override;
|
|
|
|
void zero_to_device(device_memory &mem) override;
|
|
void copy_to_device(device_memory &mem) override;
|
|
void copy_from_device(device_memory &mem) override;
|
|
|
|
bool supports_local_atomic_sort() const override
|
|
{
|
|
return true;
|
|
}
|
|
|
|
protected:
|
|
OneapiDevice *oneapi_device_;
|
|
unique_ptr<KernelContext> kernel_context_;
|
|
};
|
|
|
|
CCL_NAMESPACE_END
|
|
|
|
#endif /* WITH_ONEAPI */
|