Refactor: Cycles: Simplify Metal backend with direct bindless resource encoding

This re-applies pull request #140671, but with a fix for #144713 where the
non-pointer part of IntegratorStateGPU was not initialized.

This PR is a more extensive follow-on from #123551 (removal of AMD and Intel
GPU support).

All supported Apple GPUs have Metal 3 and tier 2 argument buffer support.
The invariant resource properties `gpuAddress` and `gpuResourceID` can be
written directly into GPU structs once at setup time rather than once per
dispatch. More background info can be found in this article:
https://developer.apple.com/documentation/metal/improving-cpu-performance-by-using-argument-buffers?language=objc
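As a rough illustration (not the Cycles code), a buffer's invariant gpuAddress
can be written into a plain argument struct once at setup time; the struct
layout and names below are hypothetical:

  #import <Metal/Metal.h>

  /* Hypothetical GPU-visible params struct; layout must match the shader side. */
  struct ExampleParams {
    uint64_t input_address; /* device pointer, read by the kernel */
  };

  static id<MTLBuffer> make_params_buffer(id<MTLDevice> device, id<MTLBuffer> input)
  {
    id<MTLBuffer> params = [device newBufferWithLength:sizeof(ExampleParams)
                                               options:MTLResourceStorageModeShared];
    if (@available(macOS 13.0, *)) {
      /* gpuAddress is invariant for the lifetime of the buffer, so writing it
       * once here is enough; no per-dispatch argument encoding is needed. */
      ((ExampleParams *)params.contents)->input_address = input.gpuAddress;
    }
    return params;
  }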

Code changes:
- All code relating to `MTLArgumentEncoder` is removed
- `KernelParamsMetal` updates are directly written into
  `id<MTLBuffer> launch_params_buffer` which is used for the "static"
  dispatch arguments
- Dynamic dispatch arguments are small enough to be encoded with
  `MTLComputeCommandEncoder.setBytes`, eliminating the need to cycle
  temporary argument buffers (see the sketch below)
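
A minimal sketch of the resulting dispatch encoding, with illustrative buffer
indices and argument names (not the actual Cycles bindings):

  /* Small per-dispatch arguments go inline via setBytes; the static launch
   * params buffer is written once at setup and simply re-bound here. */
  static void encode_dispatch(id<MTLComputeCommandEncoder> encoder,
                              id<MTLBuffer> launch_params_buffer,
                              const void *dynamic_args,
                              size_t dynamic_args_size)
  {
    [encoder setBytes:dynamic_args length:dynamic_args_size atIndex:0];
    [encoder setBuffer:launch_params_buffer offset:0 atIndex:1];
  }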

Fix #144713

Co-authored-by: Brecht Van Lommel <brecht@noreply.localhost>
Pull Request: https://projects.blender.org/blender/blender/pulls/145175
Author: Michael Jones
Committed by: Michael Jones (Apple)
Date: 2025-08-27 13:58:30 +02:00
Parent: 11afddc681
Commit: 193e22ee7e
9 changed files with 307 additions and 443 deletions

View File

@@ -24,12 +24,8 @@ class MetalDevice : public Device {
public:
id<MTLDevice> mtlDevice = nil;
id<MTLLibrary> mtlLibrary[PSO_NUM] = {nil};
id<MTLArgumentEncoder> mtlBufferKernelParamsEncoder =
nil; /* encoder used for fetching device pointers from MTLBuffers */
id<MTLCommandQueue> mtlComputeCommandQueue = nil;
id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
nil; /* encoder used for fetching device pointers from MTLBuffers */
id<MTLCounterSampleBuffer> mtlCounterSampleBuffer = nil;
string source[PSO_NUM];
string kernels_md5[PSO_NUM];
@@ -37,22 +33,24 @@ class MetalDevice : public Device {
bool capture_enabled = false;
KernelParamsMetal launch_params = {nullptr};
/* Argument buffer for static data. */
id<MTLBuffer> launch_params_buffer = nil;
KernelParamsMetal *launch_params = nullptr;
/* MetalRT members ----------------------------------*/
bool use_metalrt = false;
bool use_metalrt_extended_limits = false;
bool motion_blur = false;
bool use_pcmi = false;
id<MTLArgumentEncoder> mtlASArgEncoder =
nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
id<MTLBuffer> blas_buffer = nil;
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> unique_blas_array;
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
API_AVAILABLE(macos(11.0))
id<MTLAccelerationStructure> accel_struct = nil;
/*---------------------------------------------------*/
@@ -81,12 +79,8 @@ class MetalDevice : public Device {
/* Bindless Textures */
bool is_texture(const TextureInfo &tex);
device_vector<TextureInfo> texture_info;
bool need_texture_info = false;
id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
id<MTLArgumentEncoder> mtlBufferArgEncoder = nil;
id<MTLBuffer> buffer_bindings_1d = nil;
id<MTLBuffer> texture_bindings_2d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
id<MTLBuffer> texture_bindings = nil;
std::vector<id<MTLResource>> texture_slot_map;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;

View File

@@ -87,6 +87,9 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
mtlDevice = usable_devices[mtlDevId];
metal_printf("Creating new Cycles Metal device: %s", info.description.c_str());
/* Ensure that back-compatibility helpers for getting gpuAddress & gpuResourceID are set up. */
metal_gpu_address_helper_init(mtlDevice);
/* Enable increased concurrent shader compiler limit.
* This is also done by MTLContext::MTLContext, but only in GUI mode. */
if (@available(macOS 13.3, *)) {
@@ -160,26 +163,15 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
kernel_type_as_string(
(MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
arg_desc_params.dataType = MTLDataTypePointer;
arg_desc_params.access = MTLArgumentAccessReadOnly;
arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr);
mtlBufferKernelParamsEncoder = [mtlDevice
newArgumentEncoderWithArguments:@[ arg_desc_params ]];
texture_bindings = [mtlDevice newBufferWithLength:8192 options:MTLResourceStorageModeShared];
stats.mem_alloc(texture_bindings.allocatedSize);
MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init];
arg_desc_texture.dataType = MTLDataTypeTexture;
arg_desc_texture.access = MTLArgumentAccessReadOnly;
mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
arg_desc_buffer.dataType = MTLDataTypePointer;
arg_desc_buffer.access = MTLArgumentAccessReadOnly;
mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
launch_params_buffer = [mtlDevice newBufferWithLength:sizeof(KernelParamsMetal)
options:MTLResourceStorageModeShared];
stats.mem_alloc(sizeof(KernelParamsMetal));
buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:MTLResourceStorageModeShared];
texture_bindings_2d = [mtlDevice newBufferWithLength:8192
options:MTLResourceStorageModeShared];
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
/* Cache unified pointer so we can write kernel params directly in place. */
launch_params = (KernelParamsMetal *)launch_params_buffer.contents;
/* Command queue for path-tracing work on the GPU. In a situation where multiple
* MetalDeviceQueues are spawned from one MetalDevice, they share the same MTLCommandQueue.
@@ -189,97 +181,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
/* Command queue for non-tracing work on the GPU. */
mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
/* Acceleration structure arg encoder, if needed */
if (@available(macos 12.0, *)) {
if (use_metalrt) {
MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
[arg_desc_as release];
}
}
/* Build the arg encoder for the ancillary bindings */
{
NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init];
int index = 0;
MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init];
arg_desc_tex.dataType = MTLDataTypePointer;
arg_desc_tex.access = MTLArgumentAccessReadOnly;
arg_desc_tex.index = index++;
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
arg_desc_tex.index = index++;
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
[arg_desc_tex release];
if (@available(macos 12.0, *)) {
if (use_metalrt) {
MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
arg_desc_ptrs.dataType = MTLDataTypePointer;
arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
arg_desc_as.index = index++;
[ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */
/* Intersection function tables */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow_all */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_volume */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_mblur */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit_mblur */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas_accel_structs */
[arg_desc_ift release];
[arg_desc_as release];
[arg_desc_ptrs release];
}
}
mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
// preparing the blas arg encoder
if (use_metalrt) {
MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_blas.access = MTLArgumentAccessReadOnly;
mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
[arg_desc_blas release];
}
for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
[ancillary_desc release];
}
[arg_desc_params release];
[arg_desc_texture release];
}
}
@@ -292,27 +193,21 @@ MetalDevice::~MetalDevice()
* existing_devices_mutex). */
thread_scoped_lock lock(existing_devices_mutex);
int num_resources = texture_info.size();
for (int res = 0; res < num_resources; res++) {
if (is_texture(texture_info[res])) {
[texture_slot_map[res] release];
texture_slot_map[res] = nil;
}
/* Release textures that weren't already freed by tex_free. */
for (int res = 0; res < texture_info.size(); res++) {
[texture_slot_map[res] release];
texture_slot_map[res] = nil;
}
free_bvh();
flush_delayed_free_list();
if (texture_bindings_2d) {
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
[buffer_bindings_1d release];
[texture_bindings_2d release];
}
[mtlTextureArgEncoder release];
[mtlBufferKernelParamsEncoder release];
[mtlBufferArgEncoder release];
[mtlASArgEncoder release];
[mtlAncillaryArgEncoder release];
stats.mem_free(sizeof(KernelParamsMetal));
[launch_params_buffer release];
stats.mem_free(texture_bindings.allocatedSize);
[texture_bindings release];
[mtlComputeCommandQueue release];
[mtlGeneralCommandQueue release];
if (mtlCounterSampleBuffer) {
@@ -511,7 +406,7 @@ void MetalDevice::refresh_source_and_kernels_md5(MetalPipelineType pso_type)
# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
if (next_member_is_specialized) { \
constant_values += string(#parent "." #name "=") + \
to_string(_type(launch_params.data.parent.name)) + "\n"; \
to_string(_type(launch_params->data.parent.name)) + "\n"; \
} \
else { \
next_member_is_specialized = true; \
@@ -648,30 +543,7 @@ bool MetalDevice::is_texture(const TextureInfo &tex)
return tex.height > 0;
}
void MetalDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying. */
need_texture_info = false;
texture_info.copy_to_device();
int num_textures = texture_info.size();
for (int tex = 0; tex < num_textures; tex++) {
uint64_t offset = tex * sizeof(void *);
if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
id<MTLTexture> metal_texture = texture_slot_map[tex];
MTLTextureType type = metal_texture.textureType;
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
}
else {
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
}
}
}
}
void MetalDevice::load_texture_info() {}
void MetalDevice::erase_allocation(device_memory &mem)
{
@@ -683,9 +555,9 @@ void MetalDevice::erase_allocation(device_memory &mem)
if (it != metal_mem_map.end()) {
MetalMem *mmem = it->second.get();
/* blank out reference to MetalMem* in the launch params (fixes crash #94736) */
/* blank out reference to resource in the launch params (fixes crash #94736) */
if (mmem->pointer_index >= 0) {
device_ptr *pointers = (device_ptr *)&launch_params;
device_ptr *pointers = (device_ptr *)launch_params;
pointers[mmem->pointer_index] = 0;
}
metal_mem_map.erase(it);
@@ -1026,7 +898,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, const size_t size)
{
if (strcmp(name, "data") == 0) {
assert(size == sizeof(KernelData));
memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));
memcpy((uint8_t *)&launch_params->data, host, sizeof(KernelData));
/* Refresh the kernels_md5 checksums for specialized kernel sets. */
for (int level = 1; level <= int(kernel_specialization_level); level++) {
@@ -1035,30 +907,41 @@ void MetalDevice::const_copy_to(const char *name, void *host, const size_t size)
return;
}
auto update_launch_pointers =
[&](size_t offset, void *data, const size_t data_size, const size_t pointers_size) {
memcpy((uint8_t *)&launch_params + offset, data, data_size);
auto update_launch_pointers = [&](size_t offset, void *data, const size_t pointers_size) {
uint64_t *addresses = (uint64_t *)((uint8_t *)launch_params + offset);
MetalMem **mmem = (MetalMem **)data;
int pointer_count = pointers_size / sizeof(device_ptr);
int pointer_index = offset / sizeof(device_ptr);
for (int i = 0; i < pointer_count; i++) {
if (mmem[i]) {
mmem[i]->pointer_index = pointer_index + i;
MetalMem **mmem = (MetalMem **)data;
int pointer_count = pointers_size / sizeof(device_ptr);
int pointer_index = offset / sizeof(device_ptr);
for (int i = 0; i < pointer_count; i++) {
addresses[i] = 0;
if (mmem[i]) {
mmem[i]->pointer_index = pointer_index + i;
if (mmem[i]->mtlBuffer) {
if (@available(macOS 13.0, *)) {
addresses[i] = metal_gpuAddress(mmem[i]->mtlBuffer);
}
}
};
}
}
};
/* Update data storage pointers in launch parameters. */
if (strcmp(name, "integrator_state") == 0) {
/* IntegratorStateGPU is contiguous pointers */
/* IntegratorStateGPU is contiguous pointers up until sort_partition_divisor. */
const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
update_launch_pointers(
offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
offsetof(KernelParamsMetal, integrator_state), host, pointer_block_size);
/* Ensure the non-pointers part of IntegratorStateGPU is copied (this is the proper fix for
* #144713). */
memcpy((uint8_t *)&launch_params->integrator_state + pointer_block_size,
(uint8_t *)host + pointer_block_size,
sizeof(IntegratorStateGPU) - pointer_block_size);
}
# define KERNEL_DATA_ARRAY(data_type, tex_name) \
else if (strcmp(name, #tex_name) == 0) { \
update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size); \
}
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
@@ -1096,12 +979,7 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
}
texture_info[slot] = mem.info;
uint64_t offset = slot * sizeof(void *);
[mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
[mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
texture_slot_map[slot] = nil;
need_texture_info = true;
texture_slot_map[slot] = mmem->mtlBuffer;
if (is_nanovdb_type(mem.info.data_type)) {
using_nanovdb = true;
@@ -1245,34 +1123,21 @@ void MetalDevice::tex_alloc(device_texture &mem)
texture_slot_map.resize(slot + 128);
ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
if (texture_bindings_2d) {
delayed_free_list.push_back(buffer_bindings_1d);
delayed_free_list.push_back(texture_bindings_2d);
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
if (!texture_bindings || (texture_bindings.length < min_buffer_length)) {
if (texture_bindings) {
delayed_free_list.push_back(texture_bindings);
stats.mem_free(texture_bindings.allocatedSize);
}
buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
options:MTLResourceStorageModeShared];
texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
options:MTLResourceStorageModeShared];
texture_bindings = [mtlDevice newBufferWithLength:min_buffer_length
options:MTLResourceStorageModeShared];
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
stats.mem_alloc(texture_bindings.allocatedSize);
}
}
/* Optimize the texture for GPU access. */
id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
[blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
[blitCommandEncoder endEncoding];
[commandBuffer commit];
/* Set Mapping and tag that we need to (re-)upload to device */
/* Set Mapping. */
texture_slot_map[slot] = mtlTexture;
texture_info[slot] = mem.info;
need_texture_info = true;
texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
if (max_working_set_exceeded()) {
@@ -1305,27 +1170,20 @@ void MetalDevice::tex_copy_to(device_texture &mem)
void MetalDevice::tex_free(device_texture &mem)
{
int slot = mem.slot;
if (mem.data_height == 0) {
generic_free(mem);
return;
}
if (metal_mem_map.count(&mem)) {
else if (metal_mem_map.count(&mem)) {
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
if (texture_slot_map[mem.slot] == mmem.mtlTexture) {
texture_slot_map[mem.slot] = nil;
}
if (mmem.mtlTexture) {
/* Free bindless texture. */
delayed_free_list.push_back(mmem.mtlTexture);
mmem.mtlTexture = nil;
}
/* Free bindless texture. */
delayed_free_list.push_back(mmem.mtlTexture);
mmem.mtlTexture = nil;
erase_allocation(mem);
}
texture_slot_map[slot] = nil;
}
unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
@@ -1388,6 +1246,7 @@ void MetalDevice::free_bvh()
[blas release];
}
unique_blas_array.clear();
blas_array.clear();
if (blas_buffer) {
[blas_buffer release];
@@ -1410,6 +1269,7 @@ void MetalDevice::update_bvh(BVHMetal *bvh_metal)
accel_struct = bvh_metal->accel_struct;
unique_blas_array = bvh_metal->unique_blas_array;
blas_array = bvh_metal->blas_array;
[accel_struct retain];
for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
@@ -1417,17 +1277,9 @@ void MetalDevice::update_bvh(BVHMetal *bvh_metal)
}
// Allocate required buffers for BLAS array.
uint64_t count = bvh_metal->blas_array.size();
uint64_t buffer_size = mtlBlasArgEncoder.encodedLength * count;
uint64_t buffer_size = blas_array.size() * sizeof(uint64_t);
blas_buffer = [mtlDevice newBufferWithLength:buffer_size options:MTLResourceStorageModeShared];
stats.mem_alloc(blas_buffer.allocatedSize);
for (uint64_t i = 0; i < count; ++i) {
if (bvh_metal->blas_array[i]) {
[mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
[mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
}
}
}
CCL_NAMESPACE_END

View File

@@ -340,7 +340,7 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
* to be active. */
pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
pipeline->originating_device_id = device->device_id;
pipeline->kernel_data_ = device->launch_params.data;
pipeline->kernel_data_ = device->launch_params->data;
pipeline->pso_type = pso_type;
pipeline->mtlDevice = mtlDevice;
pipeline->kernels_md5 = device->kernels_md5[pso_type];

View File

@@ -15,6 +15,9 @@
# define MAX_SAMPLE_BUFFER_LENGTH 4096
/* The number of resources to be contiguously encoded into the MetalAncillaries struct. */
# define ANCILLARY_SLOT_COUNT 11
CCL_NAMESPACE_BEGIN
class MetalDevice;
@@ -57,7 +60,6 @@ class MetalDeviceQueue : public DeviceQueue {
id<MTLBlitCommandEncoder> get_blit_encoder();
MetalDevice *metal_device_;
MetalBufferPool temp_buffer_pool_;
API_AVAILABLE(macos(11.0), ios(14.0))
MTLCommandBufferDescriptor *command_buffer_desc_ = nullptr;

View File

@@ -317,11 +317,71 @@ bool MetalDeviceQueue::supports_local_atomic_sort() const
return metal_device_->use_local_atomic_sort();
}
static void zero_resource(void *address_in_arg_buffer, int index = 0)
{
uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
pptr[index] = 0;
}
template<class T> void write_resource(void *address_in_arg_buffer, T resource, int index = 0)
{
zero_resource(address_in_arg_buffer, index);
uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
if (resource) {
pptr[index] = metal_gpuResourceID(resource);
}
}
template<> void write_resource(void *address_in_arg_buffer, id<MTLBuffer> buffer, int index)
{
zero_resource(address_in_arg_buffer, index);
uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
if (buffer) {
pptr[index] = metal_gpuAddress(buffer);
}
}
static id<MTLBuffer> patch_resource(void *address_in_arg_buffer, int index = 0)
{
uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
if (MetalDevice::MetalMem *mmem = (MetalDevice::MetalMem *)pptr[index]) {
write_resource<id<MTLBuffer>>(address_in_arg_buffer, mmem->mtlBuffer, index);
return mmem->mtlBuffer;
}
return nil;
}
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
metal_device_->load_texture_info();
/* Populate blas_array. */
uint64_t *blas_array = (uint64_t *)metal_device_->blas_buffer.contents;
for (uint64_t slot = 0; slot < metal_device_->blas_array.size(); ++slot) {
write_resource(blas_array, metal_device_->blas_array[slot], slot);
}
device_vector<TextureInfo> &texture_info = metal_device_->texture_info;
id<MTLBuffer> &texture_bindings = metal_device_->texture_bindings;
std::vector<id<MTLResource>> &texture_slot_map = metal_device_->texture_slot_map;
/* Ensure texture_info is allocated before populating. */
texture_info.copy_to_device();
/* Populate texture bindings. */
uint64_t *bindings = (uint64_t *)texture_bindings.contents;
memset(bindings, 0, texture_bindings.length);
for (int slot = 0; slot < texture_info.size(); ++slot) {
if (texture_slot_map[slot]) {
if (metal_device_->is_texture(texture_info[slot])) {
write_resource(bindings, id<MTLTexture>(texture_slot_map[slot]), slot);
}
else {
/* The GPU address of a 1D buffer texture is written into the slot data field. */
write_resource(&texture_info[slot].data, id<MTLBuffer>(texture_slot_map[slot]), 0);
}
}
}
/* Synchronize memory copies. */
synchronize();
}
@@ -346,83 +406,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (profiling_enabled_) {
command_encoder_labels_.push_back({kernel, work_size, current_encoder_idx_});
}
/* Determine size requirement for argument buffer. */
size_t arg_buffer_length = 0;
for (size_t i = 0; i < args.count; i++) {
size_t size_in_bytes = args.sizes[i];
arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
}
/* 256 is the Metal offset alignment for constant address space bindings */
arg_buffer_length = round_up(arg_buffer_length, 256);
/* Globals placed after "vanilla" arguments. */
size_t globals_offsets = arg_buffer_length;
arg_buffer_length += sizeof(KernelParamsMetal);
arg_buffer_length = round_up(arg_buffer_length, 256);
/* Metal ancillary bindless pointers. */
size_t metal_offsets = arg_buffer_length;
arg_buffer_length += metal_device_->mtlAncillaryArgEncoder.encodedLength;
arg_buffer_length = round_up(arg_buffer_length,
metal_device_->mtlAncillaryArgEncoder.alignment);
/* Temporary buffer used to prepare arg_buffer */
uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length);
memset(init_arg_buffer, 0, arg_buffer_length);
/* Prepare the non-pointer "enqueue" arguments */
size_t bytes_written = 0;
for (size_t i = 0; i < args.count; i++) {
size_t size_in_bytes = args.sizes[i];
bytes_written = round_up(bytes_written, size_in_bytes);
if (args.types[i] != DeviceKernelArguments::POINTER) {
memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
}
bytes_written += size_in_bytes;
}
/* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
/* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
offsetof(IntegratorStateGPU, sort_partition_divisor);
size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
(uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
plain_old_launch_data_size);
/* Allocate an argument buffer. */
id<MTLBuffer> arg_buffer = temp_buffer_pool_.get_buffer(
mtlDevice_, mtlCommandBuffer_, arg_buffer_length, init_arg_buffer, stats_);
/* Encode the pointer "enqueue" arguments */
bytes_written = 0;
for (size_t i = 0; i < args.count; i++) {
size_t size_in_bytes = args.sizes[i];
bytes_written = round_up(bytes_written, size_in_bytes);
if (args.types[i] == DeviceKernelArguments::POINTER) {
[metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
offset:bytes_written];
if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
[mtlComputeCommandEncoder useResource:mmem->mtlBuffer
usage:MTLResourceUsageRead | MTLResourceUsageWrite];
[metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
offset:0
atIndex:0];
}
else {
if (@available(macos 12.0, *)) {
[metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
}
}
}
bytes_written += size_in_bytes;
}
/* Encode KernelParamsMetal buffers */
[metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
offset:globals_offsets];
if (label_command_encoders_) {
/* Add human-readable labels if we're doing any form of debugging / profiling. */
mtlComputeCommandEncoder.label = [NSString
@@ -431,28 +414,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
work_size];
}
/* this relies on IntegratorStateGPU layout being contiguous device_ptrs. */
const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
offsetof(IntegratorStateGPU, sort_partition_divisor);
for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
int pointer_index = int(offset / sizeof(device_ptr));
MetalDevice::MetalMem *mmem = *(
MetalDevice::MetalMem **)((uint8_t *)&metal_device_->launch_params + offset);
if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
[metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
offset:0
atIndex:pointer_index];
}
else {
if (@available(macos 12.0, *)) {
[metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil
offset:0
atIndex:pointer_index];
}
}
}
bytes_written = globals_offsets + sizeof(KernelParamsMetal);
if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
metal_device_->set_error(
string_printf("Could not activate pipeline for %s\n", device_kernel_as_string(kernel)));
@@ -460,47 +421,66 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];
/* Encode ancillaries */
[metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_2d
offset:0
atIndex:0];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
offset:0
atIndex:1];
uint8_t dynamic_args[512] = {0};
if (@available(macos 12.0, *)) {
if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:(METALRT_TABLE_NUM + 3)];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
if (active_pipeline.intersection_func_table[table]) {
[active_pipeline.intersection_func_table[table] setBuffer:arg_buffer
offset:globals_offsets
atIndex:1];
[metal_device_->mtlAncillaryArgEncoder
setIntersectionFunctionTable:active_pipeline.intersection_func_table[table]
atIndex:3 + table];
[mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
usage:MTLResourceUsageRead];
}
else {
[metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
atIndex:3 + table];
}
/* Prepare the dynamic "enqueue" arguments */
size_t dynamic_bytes_written = 0;
size_t max_size_in_bytes = 0;
for (size_t i = 0; i < args.count; i++) {
size_t size_in_bytes = args.sizes[i];
max_size_in_bytes = max(max_size_in_bytes, size_in_bytes);
dynamic_bytes_written = round_up(dynamic_bytes_written, size_in_bytes);
memcpy(dynamic_args + dynamic_bytes_written, args.values[i], size_in_bytes);
if (args.types[i] == DeviceKernelArguments::POINTER) {
if (id<MTLBuffer> buffer = patch_resource(dynamic_args + dynamic_bytes_written)) {
[mtlComputeCommandEncoder useResource:buffer
usage:MTLResourceUsageRead | MTLResourceUsageWrite];
}
}
bytes_written = metal_offsets + metal_device_->mtlAncillaryArgEncoder.encodedLength;
dynamic_bytes_written += size_in_bytes;
}
/* Apply conventional struct alignment (stops asserts firing when API validation is enabled).
*/
dynamic_bytes_written = round_up(dynamic_bytes_written, max_size_in_bytes);
/* Check that the dynamic args didn't overflow. */
assert(dynamic_bytes_written <= sizeof(dynamic_args));
uint64_t ancillary_args[ANCILLARY_SLOT_COUNT] = {0};
/* Encode ancillaries */
int ancillary_index = 0;
write_resource(ancillary_args, metal_device_->texture_bindings, ancillary_index++);
if (metal_device_->use_metalrt) {
write_resource(ancillary_args, metal_device_->accel_struct, ancillary_index++);
write_resource(ancillary_args, metal_device_->blas_buffer, ancillary_index++);
/* Write the intersection function table. */
for (int table_idx = 0; table_idx < METALRT_TABLE_NUM; table_idx++) {
write_resource(
ancillary_args, active_pipeline.intersection_func_table[table_idx], ancillary_index++);
}
assert(ancillary_index == ANCILLARY_SLOT_COUNT);
}
[mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
[mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
[mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];
/* Encode ancillaries */
if (metal_device_->use_metalrt) {
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
if (active_pipeline.intersection_func_table[table]) {
[active_pipeline.intersection_func_table[table]
setBuffer:metal_device_->launch_params_buffer
offset:0
atIndex:1];
[mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
usage:MTLResourceUsageRead];
}
}
}
[mtlComputeCommandEncoder setBytes:dynamic_args length:dynamic_bytes_written atIndex:0];
[mtlComputeCommandEncoder setBuffer:metal_device_->launch_params_buffer offset:0 atIndex:1];
[mtlComputeCommandEncoder setBytes:ancillary_args length:sizeof(ancillary_args) atIndex:2];
if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
if (@available(macos 12.0, *)) {
@@ -542,7 +522,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
int key_count = metal_device_->launch_params.data.max_shaders;
int key_count = metal_device_->launch_params->data.max_shaders;
shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
break;
}
@@ -674,7 +654,6 @@ bool MetalDeviceQueue::synchronize()
[mtlCommandBuffer_ release];
temp_buffer_pool_.process_command_buffer_completion(mtlCommandBuffer_);
metal_device_->flush_delayed_free_list();
mtlCommandBuffer_ = nil;
@@ -771,8 +750,7 @@ void MetalDeviceQueue::prepare_resources(DeviceKernel /*kernel*/)
}
/* ancillaries */
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
[mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings usage:MTLResourceUsageRead];
}
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)

View File

@@ -38,26 +38,12 @@ struct MetalInfo {
static string get_device_name(id<MTLDevice> device);
};
/* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
class MetalBufferPool {
struct MetalBufferListEntry {
id<MTLBuffer> buffer;
id<MTLCommandBuffer> command_buffer;
};
std::vector<MetalBufferListEntry> temp_buffers;
thread_mutex buffer_mutex;
size_t total_temp_mem_size = 0;
void metal_gpu_address_helper_init(id<MTLDevice> device);
public:
~MetalBufferPool();
id<MTLBuffer> get_buffer(id<MTLDevice> device,
id<MTLCommandBuffer> command_buffer,
NSUInteger length,
const void *pointer,
Stats &stats);
void process_command_buffer_completion(id<MTLCommandBuffer> command_buffer);
};
uint64_t metal_gpuAddress(id<MTLBuffer> buffer);
uint64_t metal_gpuResourceID(id<MTLTexture> texture);
uint64_t metal_gpuResourceID(id<MTLAccelerationStructure> accel_struct);
uint64_t metal_gpuResourceID(id<MTLIntersectionFunctionTable> ift);
CCL_NAMESPACE_END

View File

@@ -18,6 +18,9 @@
CCL_NAMESPACE_BEGIN
/* Comment this out to test workaround for getting gpuAddress and gpuResourceID on macOS < 13.0. */
# define CYCLES_USE_TIER2D_BINDLESS
string MetalInfo::get_device_name(id<MTLDevice> device)
{
string device_name = [device.name UTF8String];
@@ -118,63 +121,110 @@ const vector<id<MTLDevice>> &MetalInfo::get_usable_devices()
return usable_devices;
}
id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
id<MTLCommandBuffer> command_buffer,
NSUInteger length,
const void *pointer,
Stats &stats)
{
id<MTLBuffer> buffer = nil;
struct GPUAddressHelper {
id<MTLBuffer> resource_buffer = nil;
id<MTLArgumentEncoder> address_encoder = nil;
/* One time setup of arg encoder. */
void init(id<MTLDevice> device)
{
thread_scoped_lock lock(buffer_mutex);
/* Find an unused buffer with matching size and storage mode. */
for (MetalBufferListEntry &bufferEntry : temp_buffers) {
if (bufferEntry.buffer.length == length && bufferEntry.command_buffer == nil) {
buffer = bufferEntry.buffer;
bufferEntry.command_buffer = command_buffer;
break;
}
if (resource_buffer) {
/* No setup required - already initialised. */
return;
}
if (!buffer) {
/* Create a new buffer and add it to the pool. Typically this pool will only grow to a
* handful of entries. */
buffer = [device newBufferWithLength:length options:MTLResourceStorageModeShared];
stats.mem_alloc(buffer.allocatedSize);
total_temp_mem_size += buffer.allocatedSize;
temp_buffers.push_back(MetalBufferListEntry{buffer, command_buffer});
# ifdef CYCLES_USE_TIER2D_BINDLESS
if (@available(macos 13.0, *)) {
/* No setup required - there's an API now! */
return;
}
# endif
/* Setup a tiny buffer to encode the GPU address / resourceID into. */
resource_buffer = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
/* Create an encoder to extract a gpuAddress from a MTLBuffer. */
MTLArgumentDescriptor *encoder_params = [[MTLArgumentDescriptor alloc] init];
encoder_params.arrayLength = 1;
encoder_params.access = MTLBindingAccessReadWrite;
encoder_params.dataType = MTLDataTypePointer;
address_encoder = [device newArgumentEncoderWithArguments:@[ encoder_params ]];
[address_encoder setArgumentBuffer:resource_buffer offset:0];
};
uint64_t gpuAddress(id<MTLBuffer> buffer)
{
# ifdef CYCLES_USE_TIER2D_BINDLESS
if (@available(macos 13.0, *)) {
return buffer.gpuAddress;
}
# endif
[address_encoder setBuffer:buffer offset:0 atIndex:0];
return *(uint64_t *)[resource_buffer contents];
}
/* Copy over data */
if (pointer) {
memcpy(buffer.contents, pointer, length);
uint64_t gpuResourceID(id<MTLTexture> texture)
{
# ifdef CYCLES_USE_TIER2D_BINDLESS
if (@available(macos 13.0, *)) {
MTLResourceID resourceID = texture.gpuResourceID;
return (uint64_t &)resourceID;
}
# endif
[address_encoder setTexture:texture atIndex:0];
return *(uint64_t *)[resource_buffer contents];
}
return buffer;
uint64_t gpuResourceID(id<MTLAccelerationStructure> accel_struct)
{
# ifdef CYCLES_USE_TIER2D_BINDLESS
if (@available(macos 13.0, *)) {
MTLResourceID resourceID = accel_struct.gpuResourceID;
return (uint64_t &)resourceID;
}
# endif
[address_encoder setAccelerationStructure:accel_struct atIndex:0];
return *(uint64_t *)[resource_buffer contents];
}
uint64_t gpuResourceID(id<MTLIntersectionFunctionTable> ift)
{
# ifdef CYCLES_USE_TIER2D_BINDLESS
if (@available(macos 13.0, *)) {
MTLResourceID resourceID = ift.gpuResourceID;
return (uint64_t &)resourceID;
}
# endif
[address_encoder setIntersectionFunctionTable:ift atIndex:0];
return *(uint64_t *)[resource_buffer contents];
}
};
GPUAddressHelper g_gpu_address_helper;
void metal_gpu_address_helper_init(id<MTLDevice> device)
{
g_gpu_address_helper.init(device);
}
void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> command_buffer)
uint64_t metal_gpuAddress(id<MTLBuffer> buffer)
{
assert(command_buffer);
thread_scoped_lock lock(buffer_mutex);
/* Mark any temp buffers associated with command_buffer as unused. */
for (MetalBufferListEntry &buffer_entry : temp_buffers) {
if (buffer_entry.command_buffer == command_buffer) {
buffer_entry.command_buffer = nil;
}
}
return g_gpu_address_helper.gpuAddress(buffer);
}
MetalBufferPool::~MetalBufferPool()
uint64_t metal_gpuResourceID(id<MTLTexture> texture)
{
thread_scoped_lock lock(buffer_mutex);
/* Release all buffers that have not been recently reused */
for (MetalBufferListEntry &buffer_entry : temp_buffers) {
total_temp_mem_size -= buffer_entry.buffer.allocatedSize;
[buffer_entry.buffer release];
buffer_entry.buffer = nil;
}
temp_buffers.clear();
return g_gpu_address_helper.gpuResourceID(texture);
}
uint64_t metal_gpuResourceID(id<MTLAccelerationStructure> accel_struct)
{
return g_gpu_address_helper.gpuResourceID(accel_struct);
}
uint64_t metal_gpuResourceID(id<MTLIntersectionFunctionTable> ift)
{
return g_gpu_address_helper.gpuResourceID(ift);
}
CCL_NAMESPACE_END

View File

@@ -330,10 +330,10 @@ typedef metal::raytracing::intersector<triangle_data, curve_data METALRT_LIMITS>
/* texture bindings and sampler setup */
struct Buffer1DParamsMetal {
device float *buf;
/* TextureParamsMetal is reinterpreted as Texture2DParamsMetal. */
struct TextureParamsMetal {
uint64_t tex;
};
struct Texture2DParamsMetal {
texture2d<float, access::sample> tex;
};
@@ -344,12 +344,15 @@ struct MetalRTBlasWrapper {
};
#endif
/* Additional Metal-specific resources which aren't encoded in KernelData.
* IMPORTANT: If this layout changes, ANCILLARY_SLOT_COUNT and the host-side encoding must change
* to match. */
struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Buffer1DParamsMetal *buffers;
device TextureParamsMetal *textures;
#ifdef __METALRT__
metalrt_as_type accel_struct;
constant MetalRTBlasWrapper *blas_accel_structs;
metalrt_ift_type ift_default;
metalrt_ift_type ift_shadow;
metalrt_ift_type ift_shadow_all;
@@ -358,7 +361,6 @@ struct MetalAncillaries {
metalrt_ift_type ift_local_mblur;
metalrt_blas_ift_type ift_local_single_hit;
metalrt_ift_type ift_local_single_hit_mblur;
constant MetalRTBlasWrapper *blas_accel_structs;
#endif
};

View File

@@ -39,14 +39,14 @@ class MetalKernelContext {
float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, const float x, float y) const {
const uint tid(tex);
const uint sid(tex >> 32);
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
return ((ccl_global Texture2DParamsMetal*)metal_ancillaries->textures)[tid].tex.sample(metal_samplers[sid], float2(x, y));
}
template<>
inline __attribute__((__always_inline__))
float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, const float x, float y) const {
const uint tid(tex);
const uint sid(tex >> 32);
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
return ((ccl_global Texture2DParamsMetal*)metal_ancillaries->textures)[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
}
# include "kernel/device/gpu/image.h"