diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 6fe30039886..ebccae7e2f3 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -24,12 +24,8 @@ class MetalDevice : public Device {
  public:
   id<MTLDevice> mtlDevice = nil;
   id<MTLLibrary> mtlLibrary[PSO_NUM] = {nil};
-  id<MTLArgumentEncoder> mtlBufferKernelParamsEncoder =
-      nil; /* encoder used for fetching device pointers from MTLBuffers */
   id<MTLCommandQueue> mtlComputeCommandQueue = nil;
   id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
-  id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
-      nil; /* encoder used for fetching device pointers from MTLBuffers */
   id<MTLCounterSampleBuffer> mtlCounterSampleBuffer = nil;
   string source[PSO_NUM];
   string kernels_md5[PSO_NUM];
@@ -37,22 +33,24 @@ class MetalDevice : public Device {
   bool capture_enabled = false;
 
-  KernelParamsMetal launch_params = {nullptr};
+  /* Argument buffer for static data. */
+  id<MTLBuffer> launch_params_buffer = nil;
+  KernelParamsMetal *launch_params = nullptr;
 
   /* MetalRT members ----------------------------------*/
   bool use_metalrt = false;
   bool use_metalrt_extended_limits = false;
   bool motion_blur = false;
   bool use_pcmi = false;
-  id<MTLArgumentEncoder> mtlASArgEncoder =
-      nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
-  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
   id<MTLBuffer> blas_buffer = nil;
 
   API_AVAILABLE(macos(11.0))
   vector<id<MTLAccelerationStructure>> unique_blas_array;
 
+  API_AVAILABLE(macos(11.0))
+  vector<id<MTLAccelerationStructure>> blas_array;
+
   API_AVAILABLE(macos(11.0))
   id<MTLAccelerationStructure> accel_struct = nil;
   /*---------------------------------------------------*/
@@ -81,12 +79,8 @@ class MetalDevice : public Device {
   /* Bindless Textures */
   bool is_texture(const TextureInfo &tex);
   device_vector<TextureInfo> texture_info;
-  bool need_texture_info = false;
-  id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
-  id<MTLArgumentEncoder> mtlBufferArgEncoder = nil;
-  id<MTLBuffer> buffer_bindings_1d = nil;
-  id<MTLBuffer> texture_bindings_2d = nil;
-  std::vector<id<MTLTexture>> texture_slot_map;
+  id<MTLBuffer> texture_bindings = nil;
+  std::vector<id<MTLResource>> texture_slot_map;
 
   MetalPipelineType kernel_specialization_level = PSO_GENERIC;
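
The header changes above set the pattern for the whole patch: every MTLArgumentEncoder member disappears, and kernel parameters move into a plain shared MTLBuffer (launch_params_buffer) that the host writes in place. The sketch below shows the Tier 2 argument-buffer idea this relies on; device, some_buffer and some_texture are placeholder names, not code from the patch.

    /* Sketch: a slot in a Tier 2 argument buffer is just a 64-bit handle, so
     * it can be written with plain stores instead of an MTLArgumentEncoder. */
    id<MTLBuffer> arg_buf = [device newBufferWithLength:16
                                                options:MTLResourceStorageModeShared];
    uint64_t *slots = (uint64_t *)arg_buf.contents;
    if (@available(macOS 13.0, *)) {
      slots[0] = some_buffer.gpuAddress; /* buffer slot: GPU virtual address */
      MTLResourceID rid = some_texture.gpuResourceID;
      slots[1] = (uint64_t &)rid; /* texture slot: resource ID as 64 bits */
    }
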
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index b511f9fee16..2e53a6ac47a 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -87,6 +87,9 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
   mtlDevice = usable_devices[mtlDevId];
   metal_printf("Creating new Cycles Metal device: %s", info.description.c_str());
 
+  /* Ensure that back-compatibility helpers for getting gpuAddress & gpuResourceID are set up. */
+  metal_gpu_address_helper_init(mtlDevice);
+
   /* Enable increased concurrent shader compiler limit.
    * This is also done by MTLContext::MTLContext, but only in GUI mode. */
   if (@available(macOS 13.3, *)) {
@@ -160,26 +163,15 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
                  kernel_type_as_string(
                      (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
 
-    MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
-    arg_desc_params.dataType = MTLDataTypePointer;
-    arg_desc_params.access = MTLArgumentAccessReadOnly;
-    arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr);
-    mtlBufferKernelParamsEncoder = [mtlDevice
-        newArgumentEncoderWithArguments:@[ arg_desc_params ]];
+    texture_bindings = [mtlDevice newBufferWithLength:8192 options:MTLResourceStorageModeShared];
+    stats.mem_alloc(texture_bindings.allocatedSize);
 
-    MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init];
-    arg_desc_texture.dataType = MTLDataTypeTexture;
-    arg_desc_texture.access = MTLArgumentAccessReadOnly;
-    mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
-    MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
-    arg_desc_buffer.dataType = MTLDataTypePointer;
-    arg_desc_buffer.access = MTLArgumentAccessReadOnly;
-    mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
+    launch_params_buffer = [mtlDevice newBufferWithLength:sizeof(KernelParamsMetal)
+                                                  options:MTLResourceStorageModeShared];
+    stats.mem_alloc(sizeof(KernelParamsMetal));
 
-    buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:MTLResourceStorageModeShared];
-    texture_bindings_2d = [mtlDevice newBufferWithLength:8192
-                                                 options:MTLResourceStorageModeShared];
-    stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
+    /* Cache unified pointer so we can write kernel params directly in place. */
+    launch_params = (KernelParamsMetal *)launch_params_buffer.contents;
 
     /* Command queue for path-tracing work on the GPU. In a situation where multiple
      * MetalDeviceQueues are spawned from one MetalDevice, they share the same MTLCommandQueue.
@@ -189,97 +181,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
 
     /* Command queue for non-tracing work on the GPU. */
     mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
-
-    /* Acceleration structure arg encoder, if needed */
-    if (@available(macos 12.0, *)) {
-      if (use_metalrt) {
-        MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
-        arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
-        arg_desc_as.access = MTLArgumentAccessReadOnly;
-        mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
-        [arg_desc_as release];
-      }
-    }
-
-    /* Build the arg encoder for the ancillary bindings */
-    {
-      NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init];
-
-      int index = 0;
-      MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init];
-      arg_desc_tex.dataType = MTLDataTypePointer;
-      arg_desc_tex.access = MTLArgumentAccessReadOnly;
-
-      arg_desc_tex.index = index++;
-      [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
-      arg_desc_tex.index = index++;
-      [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
-
-      [arg_desc_tex release];
-
-      if (@available(macos 12.0, *)) {
-        if (use_metalrt) {
-          MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
-          arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
-          arg_desc_as.access = MTLArgumentAccessReadOnly;
-
-          MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
-          arg_desc_ptrs.dataType = MTLDataTypePointer;
-          arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
-
-          MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
-          arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
-          arg_desc_ift.access = MTLArgumentAccessReadOnly;
-
-          arg_desc_as.index = index++;
-          [ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */
-
-          /* Intersection function tables */
-          arg_desc_ift.index = index++;
-          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */
-          arg_desc_ift.index = index++;
-          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
-          arg_desc_ift.index = index++;
-          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow_all */
-          arg_desc_ift.index = index++;
-          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_volume */
-          arg_desc_ift.index = index++;
-          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
-          arg_desc_ift.index = index++;
-          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_mblur */
-          arg_desc_ift.index = index++;
-          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit */
-          arg_desc_ift.index = index++;
-          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit_mblur */
-
-          arg_desc_ptrs.index = index++;
-          [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas_accel_structs */
-
-          [arg_desc_ift release];
-          [arg_desc_as release];
-          [arg_desc_ptrs release];
-        }
-      }
-
-      mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
-
-      // preparing the blas arg encoder
-
-      if (use_metalrt) {
-        MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
-        arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
-        arg_desc_blas.access = MTLArgumentAccessReadOnly;
-        mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
-        [arg_desc_blas release];
-      }
-
-      for (int i = 0; i < ancillary_desc.count; i++) {
-        [ancillary_desc[i] release];
-      }
-      [ancillary_desc release];
-    }
-    [arg_desc_params release];
-    [arg_desc_texture release];
   }
 }
 
@@ -292,27 +193,21 @@ MetalDevice::~MetalDevice()
    * existing_devices_mutex). */
   thread_scoped_lock lock(existing_devices_mutex);
 
-  int num_resources = texture_info.size();
-  for (int res = 0; res < num_resources; res++) {
-    if (is_texture(texture_info[res])) {
-      [texture_slot_map[res] release];
-      texture_slot_map[res] = nil;
-    }
+  /* Release textures that weren't already freed by tex_free. */
+  for (int res = 0; res < texture_info.size(); res++) {
+    [texture_slot_map[res] release];
+    texture_slot_map[res] = nil;
   }
 
   free_bvh();
   flush_delayed_free_list();
 
-  if (texture_bindings_2d) {
-    stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
-    [buffer_bindings_1d release];
-    [texture_bindings_2d release];
-  }
-  [mtlTextureArgEncoder release];
-  [mtlBufferKernelParamsEncoder release];
-  [mtlBufferArgEncoder release];
-  [mtlASArgEncoder release];
-  [mtlAncillaryArgEncoder release];
+  stats.mem_free(sizeof(KernelParamsMetal));
+  [launch_params_buffer release];
+
+  stats.mem_free(texture_bindings.allocatedSize);
+  [texture_bindings release];
+
   [mtlComputeCommandQueue release];
   [mtlGeneralCommandQueue release];
   if (mtlCounterSampleBuffer) {
@@ -511,7 +406,7 @@ void MetalDevice::refresh_source_and_kernels_md5(MetalPipelineType pso_type)
 #  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
     if (next_member_is_specialized) { \
       constant_values += string(#parent "." #name "=") + \
-                         to_string(_type(launch_params.data.parent.name)) + "\n"; \
+                         to_string(_type(launch_params->data.parent.name)) + "\n"; \
     } \
     else { \
       next_member_is_specialized = true; \
@@ -648,30 +543,7 @@ bool MetalDevice::is_texture(const TextureInfo &tex)
   return tex.height > 0;
 }
 
-void MetalDevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying. */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-
-    int num_textures = texture_info.size();
-
-    for (int tex = 0; tex < num_textures; tex++) {
-      uint64_t offset = tex * sizeof(void *);
-      if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
-        id<MTLTexture> metal_texture = texture_slot_map[tex];
-        MTLTextureType type = metal_texture.textureType;
-        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
-        [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
-      }
-      else {
-        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
-        [mtlTextureArgEncoder setTexture:nil atIndex:0];
-      }
-    }
-  }
-}
+void MetalDevice::load_texture_info() {}
 
 void MetalDevice::erase_allocation(device_memory &mem)
 {
@@ -683,9 +555,9 @@ void MetalDevice::erase_allocation(device_memory &mem)
   if (it != metal_mem_map.end()) {
     MetalMem *mmem = it->second.get();
 
-    /* blank out reference to MetalMem* in the launch params (fixes crash #94736) */
+    /* blank out reference to resource in the launch params (fixes crash #94736) */
     if (mmem->pointer_index >= 0) {
-      device_ptr *pointers = (device_ptr *)&launch_params;
+      device_ptr *pointers = (device_ptr *)launch_params;
       pointers[mmem->pointer_index] = 0;
     }
     metal_mem_map.erase(it);
@@ -1026,7 +898,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, const size_t size)
 {
   if (strcmp(name, "data") == 0) {
     assert(size == sizeof(KernelData));
-    memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));
+    memcpy((uint8_t *)&launch_params->data, host, sizeof(KernelData));
 
     /* Refresh the kernels_md5 checksums for specialized kernel sets. */
     for (int level = 1; level <= int(kernel_specialization_level); level++) {
@@ -1035,30 +907,41 @@ void MetalDevice::const_copy_to(const char *name, void *host, const size_t size)
     return;
   }
 
-  auto update_launch_pointers =
-      [&](size_t offset, void *data, const size_t data_size, const size_t pointers_size) {
-        memcpy((uint8_t *)&launch_params + offset, data, data_size);
+  auto update_launch_pointers = [&](size_t offset, void *data, const size_t pointers_size) {
+    uint64_t *addresses = (uint64_t *)((uint8_t *)launch_params + offset);
 
-        MetalMem **mmem = (MetalMem **)data;
-        int pointer_count = pointers_size / sizeof(device_ptr);
-        int pointer_index = offset / sizeof(device_ptr);
-        for (int i = 0; i < pointer_count; i++) {
-          if (mmem[i]) {
-            mmem[i]->pointer_index = pointer_index + i;
+    MetalMem **mmem = (MetalMem **)data;
+    int pointer_count = pointers_size / sizeof(device_ptr);
+    int pointer_index = offset / sizeof(device_ptr);
+    for (int i = 0; i < pointer_count; i++) {
+      addresses[i] = 0;
+      if (mmem[i]) {
+        mmem[i]->pointer_index = pointer_index + i;
+        if (mmem[i]->mtlBuffer) {
+          if (@available(macOS 13.0, *)) {
+            addresses[i] = metal_gpuAddress(mmem[i]->mtlBuffer);
           }
         }
-      };
+      }
+    }
+  };
 
   /* Update data storage pointers in launch parameters. */
   if (strcmp(name, "integrator_state") == 0) {
-    /* IntegratorStateGPU is contiguous pointers */
+    /* IntegratorStateGPU is contiguous pointers up until sort_partition_divisor. */
     const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
     update_launch_pointers(
-        offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
+        offsetof(KernelParamsMetal, integrator_state), host, pointer_block_size);
+
+    /* Ensure the non-pointers part of IntegratorStateGPU is copied (this is the proper fix for
+     * #144713). */
+    memcpy((uint8_t *)&launch_params->integrator_state + pointer_block_size,
+           (uint8_t *)host + pointer_block_size,
+           sizeof(IntegratorStateGPU) - pointer_block_size);
   }
 #  define KERNEL_DATA_ARRAY(data_type, tex_name) \
   else if (strcmp(name, #tex_name) == 0) { \
-    update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
+    update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size); \
   }
 #  include "kernel/data_arrays.h"
 #  undef KERNEL_DATA_ARRAY
@@ -1096,12 +979,7 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
   }
   texture_info[slot] = mem.info;
 
-  uint64_t offset = slot * sizeof(void *);
-  [mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
-  [mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
-  texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
-  texture_slot_map[slot] = nil;
-  need_texture_info = true;
+  texture_slot_map[slot] = mmem->mtlBuffer;
 
   if (is_nanovdb_type(mem.info.data_type)) {
     using_nanovdb = true;
@@ -1245,34 +1123,21 @@ void MetalDevice::tex_alloc(device_texture &mem)
       texture_slot_map.resize(slot + 128);
 
       ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
-      if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
-        if (texture_bindings_2d) {
-          delayed_free_list.push_back(buffer_bindings_1d);
-          delayed_free_list.push_back(texture_bindings_2d);
-
-          stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
+      if (!texture_bindings || (texture_bindings.length < min_buffer_length)) {
+        if (texture_bindings) {
+          delayed_free_list.push_back(texture_bindings);
+          stats.mem_free(texture_bindings.allocatedSize);
         }
-        buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
-                                                    options:MTLResourceStorageModeShared];
-        texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
-                                                     options:MTLResourceStorageModeShared];
+        texture_bindings = [mtlDevice newBufferWithLength:min_buffer_length
+                                                  options:MTLResourceStorageModeShared];
 
-        stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
+        stats.mem_alloc(texture_bindings.allocatedSize);
       }
     }
 
-    /* Optimize the texture for GPU access. */
-    id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
-    id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
-    [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
-    [blitCommandEncoder endEncoding];
-    [commandBuffer commit];
-
-    /* Set Mapping and tag that we need to (re-)upload to device */
+    /* Set Mapping. */
     texture_slot_map[slot] = mtlTexture;
     texture_info[slot] = mem.info;
-    need_texture_info = true;
-
     texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
 
     if (max_working_set_exceeded()) {
@@ -1305,27 +1170,20 @@ void MetalDevice::tex_copy_to(device_texture &mem)
 
 void MetalDevice::tex_free(device_texture &mem)
 {
+  int slot = mem.slot;
   if (mem.data_height == 0) {
     generic_free(mem);
-    return;
   }
-
-  if (metal_mem_map.count(&mem)) {
+  else if (metal_mem_map.count(&mem)) {
    std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
     MetalMem &mmem = *metal_mem_map.at(&mem);
 
-    assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
-    if (texture_slot_map[mem.slot] == mmem.mtlTexture) {
-      texture_slot_map[mem.slot] = nil;
-    }
-
-    if (mmem.mtlTexture) {
-      /* Free bindless texture. */
-      delayed_free_list.push_back(mmem.mtlTexture);
-      mmem.mtlTexture = nil;
-    }
+    /* Free bindless texture. */
+    delayed_free_list.push_back(mmem.mtlTexture);
+    mmem.mtlTexture = nil;
    erase_allocation(mem);
   }
+  texture_slot_map[slot] = nil;
 }
 
 unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
@@ -1388,6 +1246,7 @@ void MetalDevice::free_bvh()
       [blas release];
     }
     unique_blas_array.clear();
+    blas_array.clear();
 
     if (blas_buffer) {
       [blas_buffer release];
@@ -1410,6 +1269,7 @@ void MetalDevice::update_bvh(BVHMetal *bvh_metal)
 
   accel_struct = bvh_metal->accel_struct;
   unique_blas_array = bvh_metal->unique_blas_array;
+  blas_array = bvh_metal->blas_array;
   [accel_struct retain];
 
   for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
@@ -1417,17 +1277,9 @@ void MetalDevice::update_bvh(BVHMetal *bvh_metal)
   }
 
   // Allocate required buffers for BLAS array.
-  uint64_t count = bvh_metal->blas_array.size();
-  uint64_t buffer_size = mtlBlasArgEncoder.encodedLength * count;
+  uint64_t buffer_size = blas_array.size() * sizeof(uint64_t);
   blas_buffer = [mtlDevice newBufferWithLength:buffer_size options:MTLResourceStorageModeShared];
   stats.mem_alloc(blas_buffer.allocatedSize);
-
-  for (uint64_t i = 0; i < count; ++i) {
-    if (bvh_metal->blas_array[i]) {
-      [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
-      [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
-    }
-  }
 }
 
 CCL_NAMESPACE_END
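
A note on the const_copy_to() change above: it assumes IntegratorStateGPU is laid out as a contiguous run of device pointers followed by plain-old-data, with sort_partition_divisor marking the split. The snippet below restates that invariant; the static_assert is illustrative and not part of the patch.

    /* The two regions that update_launch_pointers() and the memcpy rely on. */
    const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
    const size_t pod_tail_size = sizeof(IntegratorStateGPU) - pointer_block_size;

    /* Every field before sort_partition_divisor must be a device_ptr, so a
     * pointer's slot index is simply its byte offset / sizeof(device_ptr). */
    static_assert(offsetof(IntegratorStateGPU, sort_partition_divisor) % sizeof(device_ptr) == 0,
                  "pointer block must be a whole number of device_ptr slots");
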
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index d0f44b4bd34..7fda8c14587 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -340,7 +340,7 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
    * to be active. */
   pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
   pipeline->originating_device_id = device->device_id;
-  pipeline->kernel_data_ = device->launch_params.data;
+  pipeline->kernel_data_ = device->launch_params->data;
   pipeline->pso_type = pso_type;
   pipeline->mtlDevice = mtlDevice;
   pipeline->kernels_md5 = device->kernels_md5[pso_type];
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index 193f3db9e87..7643e8f1902 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -15,6 +15,9 @@
 
 #  define MAX_SAMPLE_BUFFER_LENGTH 4096
 
+/* The number of resources to be contiguously encoded into the MetalAncillaries struct. */
+#  define ANCILLARY_SLOT_COUNT 11
+
 CCL_NAMESPACE_BEGIN
 
 class MetalDevice;
@@ -57,7 +60,6 @@ class MetalDeviceQueue : public DeviceQueue {
   id<MTLBlitCommandEncoder> get_blit_encoder();
 
   MetalDevice *metal_device_;
-  MetalBufferPool temp_buffer_pool_;
 
   API_AVAILABLE(macos(11.0), ios(14.0))
   MTLCommandBufferDescriptor *command_buffer_desc_ = nullptr;
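
ANCILLARY_SLOT_COUNT must stay in sync with the device-side MetalAncillaries struct in kernel/device/metal/compat.h: one slot for the texture-bindings pointer, one for the top-level acceleration structure, one for the BLAS pointer table, and one per intersection function table. The enum below is a hypothetical restatement of that accounting (it assumes METALRT_TABLE_NUM == 8, matching the eight ift_* members); it is not part of the patch.

    enum AncillarySlot {
      SLOT_TEXTURES = 0, /* gpuAddress of texture_bindings */
      SLOT_ACCEL_STRUCT, /* gpuResourceID of the TLAS */
      SLOT_BLAS_ARRAY,   /* gpuAddress of blas_buffer */
      SLOT_IFT_FIRST,    /* ift_default .. ift_local_single_hit_mblur */
      SLOT_COUNT = SLOT_IFT_FIRST + 8, /* == 11 == ANCILLARY_SLOT_COUNT */
    };
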
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 313835aa535..6103bd2207e 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -317,11 +317,71 @@ bool MetalDeviceQueue::supports_local_atomic_sort() const
   return metal_device_->use_local_atomic_sort();
 }
 
+static void zero_resource(void *address_in_arg_buffer, int index = 0)
+{
+  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
+  pptr[index] = 0;
+}
+
+template<typename T> void write_resource(void *address_in_arg_buffer, T resource, int index = 0)
+{
+  zero_resource(address_in_arg_buffer, index);
+  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
+  if (resource) {
+    pptr[index] = metal_gpuResourceID(resource);
+  }
+}
+
+template<> void write_resource(void *address_in_arg_buffer, id<MTLBuffer> buffer, int index)
+{
+  zero_resource(address_in_arg_buffer, index);
+  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
+  if (buffer) {
+    pptr[index] = metal_gpuAddress(buffer);
+  }
+}
+
+static id<MTLBuffer> patch_resource(void *address_in_arg_buffer, int index = 0)
+{
+  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
+  if (MetalDevice::MetalMem *mmem = (MetalDevice::MetalMem *)pptr[index]) {
+    write_resource<id<MTLBuffer>>(address_in_arg_buffer, mmem->mtlBuffer, index);
+    return mmem->mtlBuffer;
+  }
+  return nil;
+}
+
 void MetalDeviceQueue::init_execution()
 {
-  /* Synchronize all textures and memory copies before executing task. */
-  metal_device_->load_texture_info();
+  /* Populate blas_array. */
+  uint64_t *blas_array = (uint64_t *)metal_device_->blas_buffer.contents;
+  for (uint64_t slot = 0; slot < metal_device_->blas_array.size(); ++slot) {
+    write_resource(blas_array, metal_device_->blas_array[slot], slot);
+  }
 
+  device_vector<TextureInfo> &texture_info = metal_device_->texture_info;
+  id<MTLBuffer> &texture_bindings = metal_device_->texture_bindings;
+  std::vector<id<MTLResource>> &texture_slot_map = metal_device_->texture_slot_map;
+
+  /* Ensure texture_info is allocated before populating. */
+  texture_info.copy_to_device();
+
+  /* Populate texture bindings. */
+  uint64_t *bindings = (uint64_t *)texture_bindings.contents;
+  memset(bindings, 0, texture_bindings.length);
+  for (int slot = 0; slot < texture_info.size(); ++slot) {
+    if (texture_slot_map[slot]) {
+      if (metal_device_->is_texture(texture_info[slot])) {
+        write_resource(bindings, id<MTLTexture>(texture_slot_map[slot]), slot);
+      }
+      else {
+        /* The GPU address of a 1D buffer texture is written into the slot data field. */
+        write_resource(&texture_info[slot].data, id<MTLBuffer>(texture_slot_map[slot]), 0);
+      }
+    }
+  }
+
+  /* Synchronize memory copies. */
   synchronize();
 }
@@ -346,83 +406,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   if (profiling_enabled_) {
     command_encoder_labels_.push_back({kernel, work_size, current_encoder_idx_});
   }
-
-  /* Determine size requirement for argument buffer. */
-  size_t arg_buffer_length = 0;
-  for (size_t i = 0; i < args.count; i++) {
-    size_t size_in_bytes = args.sizes[i];
-    arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
-  }
-  /* 256 is the Metal offset alignment for constant address space bindings */
-  arg_buffer_length = round_up(arg_buffer_length, 256);
-
-  /* Globals placed after "vanilla" arguments. */
-  size_t globals_offsets = arg_buffer_length;
-  arg_buffer_length += sizeof(KernelParamsMetal);
-  arg_buffer_length = round_up(arg_buffer_length, 256);
-
-  /* Metal ancillary bindless pointers. */
-  size_t metal_offsets = arg_buffer_length;
-  arg_buffer_length += metal_device_->mtlAncillaryArgEncoder.encodedLength;
-  arg_buffer_length = round_up(arg_buffer_length,
-                               metal_device_->mtlAncillaryArgEncoder.alignment);
-
-  /* Temporary buffer used to prepare arg_buffer */
-  uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length);
-  memset(init_arg_buffer, 0, arg_buffer_length);
-
-  /* Prepare the non-pointer "enqueue" arguments */
-  size_t bytes_written = 0;
-  for (size_t i = 0; i < args.count; i++) {
-    size_t size_in_bytes = args.sizes[i];
-    bytes_written = round_up(bytes_written, size_in_bytes);
-    if (args.types[i] != DeviceKernelArguments::POINTER) {
-      memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
-    }
-    bytes_written += size_in_bytes;
-  }
-
-  /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
-  /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
-  size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
-                                        offsetof(IntegratorStateGPU, sort_partition_divisor);
-  size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
-  memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
-         (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
-         plain_old_launch_data_size);
-
-  /* Allocate an argument buffer. */
-  id<MTLBuffer> arg_buffer = temp_buffer_pool_.get_buffer(
-      mtlDevice_, mtlCommandBuffer_, arg_buffer_length, init_arg_buffer, stats_);
-
-  /* Encode the pointer "enqueue" arguments */
-  bytes_written = 0;
-  for (size_t i = 0; i < args.count; i++) {
-    size_t size_in_bytes = args.sizes[i];
-    bytes_written = round_up(bytes_written, size_in_bytes);
-    if (args.types[i] == DeviceKernelArguments::POINTER) {
-      [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
-                                                              offset:bytes_written];
-      if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
-        [mtlComputeCommandEncoder useResource:mmem->mtlBuffer
-                                        usage:MTLResourceUsageRead | MTLResourceUsageWrite];
-        [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
-                                                        offset:0
-                                                       atIndex:0];
-      }
-      else {
-        if (@available(macos 12.0, *)) {
-          [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
-        }
-      }
-    }
-    bytes_written += size_in_bytes;
-  }
-
-  /* Encode KernelParamsMetal buffers */
-  [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
-                                                          offset:globals_offsets];
-
   if (label_command_encoders_) {
     /* Add human-readable labels if we're doing any form of debugging / profiling. */
     mtlComputeCommandEncoder.label = [NSString
@@ -431,28 +414,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                                           work_size];
   }
 
-  /* this relies on IntegratorStateGPU layout being contiguous device_ptrs. */
-  const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
-                                   offsetof(IntegratorStateGPU, sort_partition_divisor);
-  for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
-    int pointer_index = int(offset / sizeof(device_ptr));
-    MetalDevice::MetalMem *mmem = *(
-        MetalDevice::MetalMem **)((uint8_t *)&metal_device_->launch_params + offset);
-    if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
-      [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
-                                                      offset:0
-                                                     atIndex:pointer_index];
-    }
-    else {
-      if (@available(macos 12.0, *)) {
-        [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil
-                                                        offset:0
-                                                       atIndex:pointer_index];
-      }
-    }
-  }
-  bytes_written = globals_offsets + sizeof(KernelParamsMetal);
-
   if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
     metal_device_->set_error(
         string_printf("Could not activate pipeline for %s\n", device_kernel_as_string(kernel)));
@@ -460,47 +421,66 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   }
   MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];
 
-  /* Encode ancillaries */
-  [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
-  [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_2d
-                                            offset:0
-                                           atIndex:0];
-  [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
-                                            offset:0
-                                           atIndex:1];
+  uint8_t dynamic_args[512] = {0};
 
-  if (@available(macos 12.0, *)) {
-    if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
-      if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
-        [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
-        [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
-                                                  offset:0
-                                                 atIndex:(METALRT_TABLE_NUM + 3)];
-      }
-
-      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-        if (active_pipeline.intersection_func_table[table]) {
-          [active_pipeline.intersection_func_table[table] setBuffer:arg_buffer
-                                                             offset:globals_offsets
-                                                            atIndex:1];
-          [metal_device_->mtlAncillaryArgEncoder
-              setIntersectionFunctionTable:active_pipeline.intersection_func_table[table]
-                                   atIndex:3 + table];
-          [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
-                                          usage:MTLResourceUsageRead];
-        }
-        else {
-          [metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
-                                                                      atIndex:3 + table];
-        }
+  /* Prepare the dynamic "enqueue" arguments */
+  size_t dynamic_bytes_written = 0;
+  size_t max_size_in_bytes = 0;
+  for (size_t i = 0; i < args.count; i++) {
+    size_t size_in_bytes = args.sizes[i];
+    max_size_in_bytes = max(max_size_in_bytes, size_in_bytes);
+    dynamic_bytes_written = round_up(dynamic_bytes_written, size_in_bytes);
+    memcpy(dynamic_args + dynamic_bytes_written, args.values[i], size_in_bytes);
+    if (args.types[i] == DeviceKernelArguments::POINTER) {
+      if (id<MTLBuffer> buffer = patch_resource(dynamic_args + dynamic_bytes_written)) {
+        [mtlComputeCommandEncoder useResource:buffer
                                         usage:MTLResourceUsageRead | MTLResourceUsageWrite];
       }
     }
-    bytes_written = metal_offsets + metal_device_->mtlAncillaryArgEncoder.encodedLength;
+    dynamic_bytes_written += size_in_bytes;
   }
+  /* Apply conventional struct alignment (stops asserts firing when API validation is enabled).
+   */
+  dynamic_bytes_written = round_up(dynamic_bytes_written, max_size_in_bytes);
 
+  /* Check that the dynamic args didn't overflow. */
+  assert(dynamic_bytes_written <= sizeof(dynamic_args));
+
+  uint64_t ancillary_args[ANCILLARY_SLOT_COUNT] = {0};
+
+  /* Encode ancillaries */
+  int ancillary_index = 0;
+  write_resource(ancillary_args, metal_device_->texture_bindings, ancillary_index++);
+
+  if (metal_device_->use_metalrt) {
+    write_resource(ancillary_args, metal_device_->accel_struct, ancillary_index++);
+    write_resource(ancillary_args, metal_device_->blas_buffer, ancillary_index++);
+
+    /* Write the intersection function tables. */
+    for (int table_idx = 0; table_idx < METALRT_TABLE_NUM; table_idx++) {
+      write_resource(
+          ancillary_args, active_pipeline.intersection_func_table[table_idx], ancillary_index++);
+    }
+    assert(ancillary_index == ANCILLARY_SLOT_COUNT);
+  }
 
-  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
-  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
-  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];
+  /* Bind the launch-params buffer to the intersection function tables. */
+  if (metal_device_->use_metalrt) {
+    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+      if (active_pipeline.intersection_func_table[table]) {
+        [active_pipeline.intersection_func_table[table]
+            setBuffer:metal_device_->launch_params_buffer
+               offset:0
+              atIndex:1];
+        [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
+                                        usage:MTLResourceUsageRead];
+      }
+    }
+  }
+
+  [mtlComputeCommandEncoder setBytes:dynamic_args length:dynamic_bytes_written atIndex:0];
+  [mtlComputeCommandEncoder setBuffer:metal_device_->launch_params_buffer offset:0 atIndex:1];
+  [mtlComputeCommandEncoder setBytes:ancillary_args length:sizeof(ancillary_args) atIndex:2];
 
   if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
     if (@available(macos 12.0, *)) {
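
The rewritten enqueue() above settles on three fixed bindings per dispatch: per-launch arguments packed into dynamic_args and bound with setBytes at index 0, the persistent launch-params buffer at index 1, and the ancillary handle table, also via setBytes, at index 2. A hypothetical kernel-side view of those bindings (the real kernel signatures are generated by macros elsewhere):

    /* Metal shading language, illustrative only. */
    kernel void cycles_kernel_sketch(constant uint8_t *args [[buffer(0)]],
                                     constant KernelParamsMetal &launch_params [[buffer(1)]],
                                     constant MetalAncillaries *metal_ancillaries [[buffer(2)]])
    {
      /* args holds the per-enqueue values; POINTER arguments have already been
       * patched from MetalMem* to raw gpuAddress values by patch_resource(). */
    }
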
@@ -542,7 +522,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
 
     case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
     case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
-      int key_count = metal_device_->launch_params.data.max_shaders;
+      int key_count = metal_device_->launch_params->data.max_shaders;
       shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
       break;
     }
@@ -674,7 +654,6 @@ bool MetalDeviceQueue::synchronize()
 
     [mtlCommandBuffer_ release];
 
-    temp_buffer_pool_.process_command_buffer_completion(mtlCommandBuffer_);
     metal_device_->flush_delayed_free_list();
 
     mtlCommandBuffer_ = nil;
@@ -771,8 +750,7 @@ void MetalDeviceQueue::prepare_resources(DeviceKernel /*kernel*/)
   }
 
   /* ancillaries */
-  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
-  [mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
+  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings usage:MTLResourceUsageRead];
 }
 
 id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
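
A consequence of dropping argument encoders, visible in patch_resource() and prepare_resources() above: Metal can no longer infer residency, because resources are now referenced only by raw gpuAddress/gpuResourceID values inside argument buffers, so each one must be declared resident explicitly. A minimal illustration (encoder and data_buffer are placeholders):

    /* Without useResource, the GPU may fault when it dereferences the raw
     * 64-bit address stored in the argument buffer. */
    [encoder useResource:texture_bindings usage:MTLResourceUsageRead];
    [encoder useResource:data_buffer usage:MTLResourceUsageRead | MTLResourceUsageWrite];
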
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
index e1d0a1db548..3e3a6fc6d38 100644
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -38,26 +38,12 @@ struct MetalInfo {
   static string get_device_name(id<MTLDevice> device);
 };
 
-/* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
-class MetalBufferPool {
-  struct MetalBufferListEntry {
-    id<MTLBuffer> buffer;
-    id<MTLCommandBuffer> command_buffer;
-  };
-  std::vector<MetalBufferListEntry> temp_buffers;
-  thread_mutex buffer_mutex;
-  size_t total_temp_mem_size = 0;
+void metal_gpu_address_helper_init(id<MTLDevice> device);
 
- public:
-  ~MetalBufferPool();
-
-  id<MTLBuffer> get_buffer(id<MTLDevice> device,
-                           id<MTLCommandBuffer> command_buffer,
-                           NSUInteger length,
-                           const void *pointer,
-                           Stats &stats);
-  void process_command_buffer_completion(id<MTLCommandBuffer> command_buffer);
-};
+uint64_t metal_gpuAddress(id<MTLBuffer> buffer);
+uint64_t metal_gpuResourceID(id<MTLTexture> texture);
+uint64_t metal_gpuResourceID(id<MTLAccelerationStructure> accel_struct);
+uint64_t metal_gpuResourceID(id<MTLIntersectionFunctionTable> ift);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index 56b369dade9..de7dc425f94 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -18,6 +18,9 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Comment this out to test workaround for getting gpuAddress and gpuResourceID on macOS < 13.0. */
+#  define CYCLES_USE_TIER2D_BINDLESS
+
 string MetalInfo::get_device_name(id<MTLDevice> device)
 {
   string device_name = [device.name UTF8String];
@@ -118,63 +121,110 @@ const vector<id<MTLDevice>> &MetalInfo::get_usable_devices()
   return usable_devices;
 }
 
-id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
-                                          id<MTLCommandBuffer> command_buffer,
-                                          NSUInteger length,
-                                          const void *pointer,
-                                          Stats &stats)
-{
-  id<MTLBuffer> buffer = nil;
+struct GPUAddressHelper {
+  id<MTLBuffer> resource_buffer = nil;
+  id<MTLArgumentEncoder> address_encoder = nil;
+
+  /* One time setup of arg encoder. */
+  void init(id<MTLDevice> device)
   {
-    thread_scoped_lock lock(buffer_mutex);
-    /* Find an unused buffer with matching size and storage mode. */
-    for (MetalBufferListEntry &bufferEntry : temp_buffers) {
-      if (bufferEntry.buffer.length == length && bufferEntry.command_buffer == nil) {
-        buffer = bufferEntry.buffer;
-        bufferEntry.command_buffer = command_buffer;
-        break;
-      }
+    if (resource_buffer) {
+      /* No setup required - already initialised. */
+      return;
     }
-    if (!buffer) {
-      /* Create a new buffer and add it to the pool. Typically this pool will only grow to a
-       * handful of entries. */
-      buffer = [device newBufferWithLength:length options:MTLResourceStorageModeShared];
-      stats.mem_alloc(buffer.allocatedSize);
-      total_temp_mem_size += buffer.allocatedSize;
-      temp_buffers.push_back(MetalBufferListEntry{buffer, command_buffer});
+
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      /* No setup required - there's an API now! */
+      return;
     }
+#  endif
+
+    /* Setup a tiny buffer to encode the GPU address / resourceID into. */
+    resource_buffer = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
+
+    /* Create an encoder to extract a gpuAddress from a MTLBuffer. */
+    MTLArgumentDescriptor *encoder_params = [[MTLArgumentDescriptor alloc] init];
+    encoder_params.arrayLength = 1;
+    encoder_params.access = MTLBindingAccessReadWrite;
+    encoder_params.dataType = MTLDataTypePointer;
+    address_encoder = [device newArgumentEncoderWithArguments:@[ encoder_params ]];
+    [address_encoder setArgumentBuffer:resource_buffer offset:0];
+  }
+
+  uint64_t gpuAddress(id<MTLBuffer> buffer)
+  {
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      return buffer.gpuAddress;
+    }
+#  endif
+    [address_encoder setBuffer:buffer offset:0 atIndex:0];
+    return *(uint64_t *)[resource_buffer contents];
   }
 
-  /* Copy over data */
-  if (pointer) {
-    memcpy(buffer.contents, pointer, length);
+  uint64_t gpuResourceID(id<MTLTexture> texture)
+  {
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      MTLResourceID resourceID = texture.gpuResourceID;
+      return (uint64_t &)resourceID;
+    }
+#  endif
+    [address_encoder setTexture:texture atIndex:0];
+    return *(uint64_t *)[resource_buffer contents];
   }
 
-  return buffer;
+  uint64_t gpuResourceID(id<MTLAccelerationStructure> accel_struct)
+  {
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      MTLResourceID resourceID = accel_struct.gpuResourceID;
+      return (uint64_t &)resourceID;
+    }
+#  endif
+    [address_encoder setAccelerationStructure:accel_struct atIndex:0];
+    return *(uint64_t *)[resource_buffer contents];
+  }
+
+  uint64_t gpuResourceID(id<MTLIntersectionFunctionTable> ift)
+  {
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      MTLResourceID resourceID = ift.gpuResourceID;
+      return (uint64_t &)resourceID;
+    }
+#  endif
+    [address_encoder setIntersectionFunctionTable:ift atIndex:0];
+    return *(uint64_t *)[resource_buffer contents];
+  }
+};
+
+GPUAddressHelper g_gpu_address_helper;
+
+void metal_gpu_address_helper_init(id<MTLDevice> device)
+{
+  g_gpu_address_helper.init(device);
 }
 
-void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> command_buffer)
+uint64_t metal_gpuAddress(id<MTLBuffer> buffer)
 {
-  assert(command_buffer);
-  thread_scoped_lock lock(buffer_mutex);
-  /* Mark any temp buffers associated with command_buffer as unused. */
-  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
-    if (buffer_entry.command_buffer == command_buffer) {
-      buffer_entry.command_buffer = nil;
-    }
-  }
+  return g_gpu_address_helper.gpuAddress(buffer);
 }
 
-MetalBufferPool::~MetalBufferPool()
+uint64_t metal_gpuResourceID(id<MTLTexture> texture)
 {
-  thread_scoped_lock lock(buffer_mutex);
-  /* Release all buffers that have not been recently reused */
-  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
-    total_temp_mem_size -= buffer_entry.buffer.allocatedSize;
-    [buffer_entry.buffer release];
-    buffer_entry.buffer = nil;
-  }
-  temp_buffers.clear();
+  return g_gpu_address_helper.gpuResourceID(texture);
+}
+
+uint64_t metal_gpuResourceID(id<MTLAccelerationStructure> accel_struct)
+{
+  return g_gpu_address_helper.gpuResourceID(accel_struct);
+}
+
+uint64_t metal_gpuResourceID(id<MTLIntersectionFunctionTable> ift)
+{
+  return g_gpu_address_helper.gpuResourceID(ift);
 }
 
 CCL_NAMESPACE_END
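
The fallback above leans on a detail of the older API: MTLArgumentEncoder writes exactly the 64-bit handle that the macOS 13 gpuAddress/gpuResourceID properties expose, so encoding one resource into an 8-byte shared buffer and reading it back on the CPU recovers the handle on macOS 12.x. Typical use of the resulting helpers (device, buffer and texture are placeholders):

    metal_gpu_address_helper_init(device);           /* one-time, done in the MetalDevice ctor */
    uint64_t address = metal_gpuAddress(buffer);     /* any id<MTLBuffer> */
    uint64_t tex_rid = metal_gpuResourceID(texture); /* any id<MTLTexture> */
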
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 72dff1ceb76..0b198dfe265 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -330,10 +330,10 @@ typedef metal::raytracing::intersector
 
 /* texture bindings and sampler setup */
 
-struct Buffer1DParamsMetal {
-  device float *buf;
+/* TextureParamsMetal is reinterpreted as Texture2DParamsMetal. */
+struct TextureParamsMetal {
+  uint64_t tex;
 };
-
 struct Texture2DParamsMetal {
   texture2d<float, access::sample> tex;
 };
@@ -344,12 +344,15 @@ struct MetalRTBlasWrapper {
 };
 #endif
 
+/* Additional Metal-specific resources which aren't encoded in KernelData.
+ * IMPORTANT: If this layout changes, ANCILLARY_SLOT_COUNT and the host-side encoding must change
+ * to match. */
 struct MetalAncillaries {
-  device Texture2DParamsMetal *textures_2d;
-  device Buffer1DParamsMetal *buffers;
+  device TextureParamsMetal *textures;
 
 #ifdef __METALRT__
   metalrt_as_type accel_struct;
+  constant MetalRTBlasWrapper *blas_accel_structs;
   metalrt_ift_type ift_default;
   metalrt_ift_type ift_shadow;
   metalrt_ift_type ift_shadow_all;
   metalrt_ift_type ift_volume;
   metalrt_ift_type ift_local;
   metalrt_ift_type ift_local_mblur;
   metalrt_blas_ift_type ift_local_single_hit;
   metalrt_ift_type ift_local_single_hit_mblur;
-  constant MetalRTBlasWrapper *blas_accel_structs;
 #endif
 };
diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h
index 1e7863be826..cb6688867fc 100644
--- a/intern/cycles/kernel/device/metal/context_begin.h
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -39,14 +39,14 @@ class MetalKernelContext {
     float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, const float x, float y) const
     {
       const uint tid(tex);
       const uint sid(tex >> 32);
-      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+      return ((ccl_global Texture2DParamsMetal*)metal_ancillaries->textures)[tid].tex.sample(metal_samplers[sid], float2(x, y));
     }
 
     template<>
     inline __attribute__((__always_inline__))
    float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, const float x, float y) const
     {
       const uint tid(tex);
       const uint sid(tex >> 32);
-      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+      return ((ccl_global Texture2DParamsMetal*)metal_ancillaries->textures)[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
     }
 
 #  include "kernel/device/gpu/image.h"
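
Finally, the kernel-side picture: after this patch a bindless texture slot is one 64-bit value either way. For a 2D image it holds a gpuResourceID, read back as a texture2d through the Texture2DParamsMetal cast above; for a buffer-backed (1D) texture, init_execution() writes the gpuAddress straight into TextureInfo::data, so the fetch is a plain device-memory read. A sketch of that second path (illustrative only; the real lookup lives in kernel/device/gpu/image.h, and the kernel_data_fetch access shown here is an assumption about that code):

    /* Metal shading language, illustrative only. */
    ccl_global const TextureInfo &info = kernel_data_fetch(texture_info, id);
    ccl_global float *data = (ccl_global float *)info.data; /* raw gpuAddress */
    float value = data[x]; /* no texture object or sampler involved */
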