test/intern/cycles/device/queue.h
Brecht Van Lommel 523bbf7065 Cycles: generalize shader sorting / locality heuristic to all GPU devices
This was added for Metal, but also gives good results with CUDA and OptiX.
Also enable it for future Apple GPUs instead of only M1 and M2: since this has
been shown to help across multiple GPUs, the better bet seems to be to enable
rather than disable it.

Also move some of the logic outside of the Metal device code, and always
enable the code in the kernel, since other devices don't do dynamic compilation.

Time per sample with OptiX + RTX A6000:
                                         new                  old
barbershop_interior                      0.0730s              0.0727s
bmw27                                    0.0047s              0.0053s
classroom                                0.0428s              0.0464s
fishy_cat                                0.0102s              0.0108s
junkshop                                 0.0366s              0.0395s
koro                                     0.0567s              0.0578s
monster                                  0.0206s              0.0223s
pabellon                                 0.0158s              0.0174s
sponza                                   0.0088s              0.0100s
spring                                   0.1267s              0.1280s
victor                                   0.0524s              0.0531s
wdas_cloud                               0.0817s              0.0816s

Ref D15331, T87836
2022-07-15 13:42:47 +02:00


/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

#pragma once

#include "device/kernel.h"
#include "device/graphics_interop.h"

#include "util/debug.h"
#include "util/log.h"
#include "util/map.h"
#include "util/string.h"
#include "util/unique_ptr.h"

CCL_NAMESPACE_BEGIN

class Device;
class device_memory;

struct KernelWorkTile;
/* Container for device kernel arguments with type correctness ensured by API. */
struct DeviceKernelArguments {

  enum Type {
    POINTER,
    INT32,
    FLOAT32,
    BOOLEAN,
    KERNEL_FILM_CONVERT,
  };

  static const int MAX_ARGS = 18;

  Type types[MAX_ARGS];
  void *values[MAX_ARGS];
  size_t sizes[MAX_ARGS];
  size_t count = 0;

  DeviceKernelArguments()
  {
  }

  template<class T> DeviceKernelArguments(const T *arg)
  {
    add(arg);
  }

  template<class T, class... Args> DeviceKernelArguments(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }

  void add(const KernelFilmConvert *value)
  {
    add(KERNEL_FILM_CONVERT, value, sizeof(KernelFilmConvert));
  }
  void add(const device_ptr *value)
  {
    add(POINTER, value, sizeof(device_ptr));
  }
  void add(const int32_t *value)
  {
    add(INT32, value, sizeof(int32_t));
  }
  void add(const float *value)
  {
    add(FLOAT32, value, sizeof(float));
  }
  void add(const bool *value)
  {
    add(BOOLEAN, value, 4);
  }
  void add(const Type type, const void *value, size_t size)
  {
    assert(count < MAX_ARGS);

    types[count] = type;
    values[count] = (void *)value;
    sizes[count] = size;
    count++;
  }
  template<typename T, typename... Args> void add(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }
};
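
/* Usage sketch (illustrative; the variable names are hypothetical, not part of
 * this API). Arguments are captured by pointer, so the pointed-to values must
 * stay alive until the kernel has been enqueued:
 *
 *   device_ptr buffer = mem.device_pointer;
 *   const int32_t work_size = 4096;
 *   const float scale = 0.5f;
 *   DeviceKernelArguments args(&buffer, &work_size, &scale);
 */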
/* Abstraction of a command queue for a device.
 *
 * Provides an API to schedule kernel execution in a specific queue with the
 * minimum possible driver-side overhead.
 *
 * This class encapsulates all properties needed for command execution. */
class DeviceQueue {
 public:
  virtual ~DeviceQueue();

  /* Number of concurrent states to process for the integrator,
   * based on the number of cores and/or available memory. */
  virtual int num_concurrent_states(const size_t state_size) const = 0;

  /* Number of states that keeps the device occupied with work without losing performance.
   * The renderer will add more work (when available) when the number of active paths falls
   * below this value. */
  virtual int num_concurrent_busy_states() const = 0;

  /* Number of elements in a partition of sorted shaders; this improves memory locality of
   * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
  virtual int num_sort_partition_elements() const
  {
    return 65536;
  }
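
  /* For example (a sketch): with the default of 65536 elements and 1048576
   * active integrator states, paths are sorted by shader within 16 partitions,
   * trading some sorting coherence for better state-fetch locality. A backend
   * may override this with a device-tuned value, e.g.:
   *
   *   int num_sort_partition_elements() const override
   *   {
   *     return 1024 * 1024;  // hypothetical device-tuned value
   *   }
   */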
  /* Initialize execution of kernels on this queue.
   *
   * Will, for example, load all data required by the kernels from Device to global or
   * path state.
   *
   * Call this after device synchronization has finished and before enqueueing any kernels. */
  virtual void init_execution() = 0;

  /* Enqueue kernel execution.
   *
   * Execute the kernel work_size times on the device.
   * Supported argument types:
   * - int: pass a pointer to the int
   * - device memory: pass a pointer to device_memory.device_pointer
   * Returns false if there was an error executing this or a previous kernel. */
  virtual bool enqueue(DeviceKernel kernel,
                       const int work_size,
                       DeviceKernelArguments const &args) = 0;
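
  /* Enqueue sketch (the kernel enum value is illustrative):
   *
   *   if (!queue->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, work_size, args)) {
   *     // Handle failure of this or a previously enqueued kernel.
   *   }
   */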
  /* Wait until all enqueued kernels have finished execution.
   * Returns false if there was an error executing any of the enqueued kernels. */
  virtual bool synchronize() = 0;

  /* Copy memory to/from the device as part of the command queue, to ensure
   * operations are done in order without having to synchronize. */
  virtual void zero_to_device(device_memory &mem) = 0;
  virtual void copy_to_device(device_memory &mem) = 0;
  virtual void copy_from_device(device_memory &mem) = 0;
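
  /* Ordering sketch: transfers and kernels issued on the same queue execute in
   * submission order, so no intermediate synchronize() is needed (names are
   * illustrative):
   *
   *   queue->zero_to_device(render_buffer);     // clear buffer on the device
   *   queue->enqueue(kernel, work_size, args);  // kernel writes into it
   *   queue->copy_from_device(render_buffer);   // read the result back
   *   queue->synchronize();                     // wait for all of the above
   */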
  /* Graphics resources interoperability.
   *
   * Interoperability here means that the device is capable of computing results
   * directly into an OpenGL (or other graphics library) buffer. */

  /* Create a graphics interoperability context which will take care of mapping a graphics
   * resource as a buffer writable by kernels of this device. */
  virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
  {
    LOG(FATAL) << "Requested GPU interop on a device which does not support it.";
    return nullptr;
  }
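
  /* Backends with interop support override this; a sketch, with hypothetical
   * class names:
   *
   *   unique_ptr<DeviceGraphicsInterop> MyGPUDeviceQueue::graphics_interop_create()
   *   {
   *     return make_unique<MyGPUDeviceGraphicsInterop>(this);
   *   }
   */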
  /* Device this queue has been created for. */
  Device *device;

 protected:
  /* Hide construction so that allocation via `Device` API is enforced. */
  explicit DeviceQueue(Device *device);

  /* Implementations call these from the corresponding methods to generate debugging logs. */
  void debug_init_execution();
  void debug_enqueue(DeviceKernel kernel, const int work_size);
  void debug_synchronize();
  string debug_active_kernels();

  /* Combination of kernels enqueued together since the last synchronize. */
  DeviceKernelMask last_kernels_enqueued_;
  /* Time of the last synchronize call. */
  double last_sync_time_;
  /* Accumulated execution time for combinations of kernels launched together. */
  map<DeviceKernelMask, double> stats_kernel_time_;
};
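
/* Typical lifecycle sketch, assuming the queue is obtained through the Device
 * factory (gpu_queue_create()); loop condition and error handling are
 * illustrative:
 *
 *   unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
 *   queue->init_execution();
 *   while (work_remains) {
 *     queue->enqueue(kernel, work_size, args);
 *   }
 *   if (!queue->synchronize()) {
 *     // Report kernel execution errors.
 *   }
 */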
CCL_NAMESPACE_END