This change switches Cycles to the open-source HIP-RT library, which
implements hardware ray tracing. This library is now used on
both Windows and Linux. While there should be no noticeable changes
on Windows, on Linux this adds support for hardware ray tracing on
AMD GPUs.
The majority of the change is typical platform code to add a new
library to the dependency builder, and a change in how
ahead-of-time (AoT) kernels are compiled. There are changes in
Cycles itself, but they are rather straightforward: some APIs
changed in the open-source version of the library.
A couple of extra files are needed for this to
work: hiprt02003_6.1_amd.hipfb and oro_compiled_kernels.hipfb.
The HIP-RT library makes some assumptions about where these files
are located. Currently they follow the same rule as AoT
kernels for oneAPI:
- On Windows they are next to blender.exe
- On Linux they are in the lib/ folder
Performance comparison on Ubuntu 22.04.5:
```
GPU: AMD Radeon PRO W7800
Driver: amdgpu-install_6.1.60103-1_all.deb
main hip-rt
attic 0.1414s 0.0932s
barbershop_interior 0.1563s 0.1258s
bistro 0.2134s 0.1597s
bmw27 0.0119s 0.0099s
classroom 0.1006s 0.0803s
fishy_cat 0.0248s 0.0178s
junkshop 0.0916s 0.0713s
koro 0.0589s 0.0720s
monster 0.0435s 0.0385s
pabellon 0.0543s 0.0391s
sponza 0.0223s 0.0180s
spring 0.1026s 1.5145s
victor 0.1901s 0.1239s
wdas_cloud 0.1153s 0.1125s
```
Co-authored-by: Brecht Van Lommel <brecht@blender.org>
Co-authored-by: Ray Molenkamp <github@lazydodo.com>
Co-authored-by: Sergey Sharybin <sergey@blender.org>
Pull Request: https://projects.blender.org/blender/blender/pulls/121050
189 lines
5.7 KiB
C++
189 lines
5.7 KiB
C++
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0 */
|
|
|
|
#pragma once
|
|
|
|
#include "device/kernel.h"
|
|
|
|
#include "device/graphics_interop.h"
|
|
#include "util/debug.h"
|
|
#include "util/log.h"
|
|
#include "util/map.h"
|
|
#include "util/string.h"
|
|
#include "util/unique_ptr.h"
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
class Device;
|
|
class device_memory;
|
|
|
|
struct KernelWorkTile;
|
|
|
|
/* Container for device kernel arguments with type correctness ensured by API. */
|
|
struct DeviceKernelArguments {
|
|
|
|
enum Type {
|
|
POINTER,
|
|
INT32,
|
|
FLOAT32,
|
|
KERNEL_FILM_CONVERT,
|
|
HIPRT_GLOBAL_STACK,
|
|
};
|
|
|
|
static const int MAX_ARGS = 18;
|
|
Type types[MAX_ARGS];
|
|
void *values[MAX_ARGS];
|
|
size_t sizes[MAX_ARGS];
|
|
size_t count = 0;
|
|
|
|
DeviceKernelArguments() {}
|
|
|
|
template<class T> DeviceKernelArguments(const T *arg)
|
|
{
|
|
add(arg);
|
|
}
|
|
|
|
template<class T, class... Args> DeviceKernelArguments(const T *first, Args... args)
|
|
{
|
|
add(first);
|
|
add(args...);
|
|
}
|
|
|
|
void add(const KernelFilmConvert *value)
|
|
{
|
|
add(KERNEL_FILM_CONVERT, value, sizeof(KernelFilmConvert));
|
|
}
|
|
void add(const device_ptr *value)
|
|
{
|
|
add(POINTER, value, sizeof(device_ptr));
|
|
}
|
|
void add(const int32_t *value)
|
|
{
|
|
add(INT32, value, sizeof(int32_t));
|
|
}
|
|
void add(const float *value)
|
|
{
|
|
add(FLOAT32, value, sizeof(float));
|
|
}
|
|
void add(const Type type, const void *value, size_t size)
|
|
{
|
|
assert(count < MAX_ARGS);
|
|
|
|
types[count] = type;
|
|
values[count] = (void *)value;
|
|
sizes[count] = size;
|
|
count++;
|
|
}
|
|
template<typename T, typename... Args> void add(const T *first, Args... args)
|
|
{
|
|
add(first);
|
|
add(args...);
|
|
}
|
|
};
|
|
|
|
/* Abstraction of a command queue for a device.
|
|
* Provides API to schedule kernel execution in a specific queue with minimal possible overhead
|
|
* from driver side.
|
|
*
|
|
* This class encapsulates all properties needed for commands execution. */
|
|
class DeviceQueue {
|
|
public:
|
|
virtual ~DeviceQueue();
|
|
|
|
/* Number of concurrent states to process for integrator,
|
|
* based on number of cores and/or available memory. */
|
|
virtual int num_concurrent_states(const size_t state_size) const = 0;
|
|
|
|
/* Number of states which keeps the device occupied with work without losing performance.
|
|
* The renderer will add more work (when available) when number of active paths falls below this
|
|
* value. */
|
|
virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
|
|
|
|
/* Number of elements in a partition of sorted shaders, that improves memory locality of
|
|
* integrator state fetch at the cost of decreased coherence for shader kernel execution. */
|
|
virtual int num_sort_partition_elements() const
|
|
{
|
|
return 65536;
|
|
}
|
|
|
|
/* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
|
|
* INTEGRATOR_SORT_WRITE_PASS)? */
|
|
virtual bool supports_local_atomic_sort() const
|
|
{
|
|
return false;
|
|
}
|
|
|
|
/* Initialize execution of kernels on this queue.
|
|
*
|
|
* Will, for example, load all data required by the kernels from Device to global or path state.
|
|
*
|
|
* Use this method after device synchronization has finished before enqueueing any kernels. */
|
|
virtual void init_execution() = 0;
|
|
|
|
/* Enqueue kernel execution.
|
|
*
|
|
* Execute the kernel work_size times on the device.
|
|
* Supported arguments types:
|
|
* - int: pass pointer to the int
|
|
* - device memory: pass pointer to device_memory.device_pointer
|
|
* Return false if there was an error executing this or a previous kernel. */
|
|
virtual bool enqueue(DeviceKernel kernel,
|
|
const int work_size,
|
|
DeviceKernelArguments const &args) = 0;
|
|
|
|
/* Wait unit all enqueued kernels have finished execution.
|
|
* Return false if there was an error executing any of the enqueued kernels. */
|
|
virtual bool synchronize() = 0;
|
|
|
|
/* Copy memory to/from device as part of the command queue, to ensure
|
|
* operations are done in order without having to synchronize. */
|
|
virtual void zero_to_device(device_memory &mem) = 0;
|
|
virtual void copy_to_device(device_memory &mem) = 0;
|
|
virtual void copy_from_device(device_memory &mem) = 0;
|
|
|
|
/* Graphics resources interoperability.
|
|
*
|
|
* The interoperability comes here by the meaning that the device is capable of computing result
|
|
* directly into an OpenGL (or other graphics library) buffer. */
|
|
|
|
/* Create graphics interoperability context which will be taking care of mapping graphics
|
|
* resource as a buffer writable by kernels of this device. */
|
|
virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
|
|
{
|
|
LOG(FATAL) << "Request of GPU interop of a device which does not support it.";
|
|
return nullptr;
|
|
}
|
|
|
|
/* Device this queue has been created for. */
|
|
Device *device;
|
|
|
|
virtual void *native_queue()
|
|
{
|
|
return nullptr;
|
|
}
|
|
|
|
protected:
|
|
/* Hide construction so that allocation via `Device` API is enforced. */
|
|
explicit DeviceQueue(Device *device);
|
|
|
|
/* Implementations call these from the corresponding methods to generate debugging logs. */
|
|
void debug_init_execution();
|
|
void debug_enqueue_begin(DeviceKernel kernel, const int work_size);
|
|
void debug_enqueue_end();
|
|
void debug_synchronize();
|
|
string debug_active_kernels();
|
|
|
|
/* Combination of kernels enqueued together sync last synchronize. */
|
|
DeviceKernelMask last_kernels_enqueued_;
|
|
/* Time of synchronize call. */
|
|
double last_sync_time_;
|
|
/* Accumulated execution time for combinations of kernels launched together. */
|
|
map<DeviceKernelMask, double> stats_kernel_time_;
|
|
/* If it is true, then a performance statistics in the debugging logs will have focus on kernels
|
|
* and an explicit queue synchronization will be added after each kernel execution. */
|
|
bool is_per_kernel_performance_;
|
|
};
|
|
|
|
CCL_NAMESPACE_END
|