Refactor: Cycles: Simplify Metal backend with direct bindless resource encoding
This re-applies pull request #140671, but with a fix for #144713, where the non-pointer part of IntegratorStateGPU was not initialized.

This PR is a more extensive follow-up to #123551 (removal of AMD and Intel GPU support). All supported Apple GPUs have Metal 3 and tier 2 argument buffer support, so the invariant resource properties `gpuAddress` and `gpuResourceID` can be written directly into GPU structs once at setup time, rather than once per dispatch. More background can be found in this article: https://developer.apple.com/documentation/metal/improving-cpu-performance-by-using-argument-buffers?language=objc

Code changes:

- All code relating to `MTLArgumentEncoder` is removed.
- `KernelParamsMetal` updates are written directly into `id<MTLBuffer> launch_params_buffer`, which is used for the "static" dispatch arguments.
- Dynamic dispatch arguments are small enough to be encoded with `MTLComputeCommandEncoder.setBytes`, eliminating the need for cycling temporary arg buffers.

Fix #144713

Co-authored-by: Brecht Van Lommel <brecht@noreply.localhost>
Pull Request: https://projects.blender.org/blender/blender/pulls/145175
Commit 193e22ee7e by Michael Jones (Apple). Parent: 11afddc681
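The core idea, as a minimal standalone sketch (Objective-C++; the names `LaunchParams`, `make_launch_params_buffer`, `dispatch` and the buffer indices are invented for illustration and are not the actual Cycles types):

    #import <Metal/Metal.h>

    struct LaunchParams {
      uint64_t vertex_buffer; /* read as a device pointer in the shader */
      uint64_t index_buffer;
    };

    /* One-time setup: write invariant GPU addresses straight into a plain shared
     * buffer. On tier 2 argument buffer hardware no MTLArgumentEncoder is needed. */
    static id<MTLBuffer> make_launch_params_buffer(id<MTLDevice> device,
                                                   id<MTLBuffer> vertices,
                                                   id<MTLBuffer> indices)
    {
      id<MTLBuffer> params_buffer = [device newBufferWithLength:sizeof(LaunchParams)
                                                        options:MTLResourceStorageModeShared];
      LaunchParams *params = (LaunchParams *)params_buffer.contents;
      params->vertex_buffer = vertices.gpuAddress; /* invariant; macOS 13+ getter */
      params->index_buffer = indices.gpuAddress;
      return params_buffer;
    }

    /* Per dispatch: small dynamic arguments go through setBytes, static ones
     * through the prebuilt buffer, with no per-dispatch argument encoding. */
    static void dispatch(id<MTLComputeCommandEncoder> enc,
                         id<MTLComputePipelineState> pso,
                         id<MTLBuffer> params_buffer,
                         id<MTLBuffer> vertices,
                         id<MTLBuffer> indices,
                         uint32_t work_size)
    {
      [enc setComputePipelineState:pso];
      [enc setBytes:&work_size length:sizeof(work_size) atIndex:0];
      [enc setBuffer:params_buffer offset:0 atIndex:1];
      /* Resources referenced only by GPU address must still be made resident. */
      [enc useResource:vertices usage:MTLResourceUsageRead];
      [enc useResource:indices usage:MTLResourceUsageRead];
      [enc dispatchThreads:MTLSizeMake(work_size, 1, 1)
          threadsPerThreadgroup:MTLSizeMake(64, 1, 1)];
    }

Because `gpuAddress` and `gpuResourceID` never change for the lifetime of a resource, the setup half runs once, and the per-dispatch half reduces to two cheap bind calls — which is what the diff below does for `launch_params_buffer` and the ancillary slots.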
@@ -24,12 +24,8 @@ class MetalDevice : public Device {
  public:
   id<MTLDevice> mtlDevice = nil;
   id<MTLLibrary> mtlLibrary[PSO_NUM] = {nil};
-  id<MTLArgumentEncoder> mtlBufferKernelParamsEncoder =
-      nil; /* encoder used for fetching device pointers from MTLBuffers */
   id<MTLCommandQueue> mtlComputeCommandQueue = nil;
   id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
-  id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
-      nil; /* encoder used for fetching device pointers from MTLBuffers */
   id<MTLCounterSampleBuffer> mtlCounterSampleBuffer = nil;
   string source[PSO_NUM];
   string kernels_md5[PSO_NUM];
@@ -37,22 +33,24 @@ class MetalDevice : public Device {
 
   bool capture_enabled = false;
 
-  KernelParamsMetal launch_params = {nullptr};
+  /* Argument buffer for static data. */
+  id<MTLBuffer> launch_params_buffer = nil;
+  KernelParamsMetal *launch_params = nullptr;
 
   /* MetalRT members ----------------------------------*/
   bool use_metalrt = false;
   bool use_metalrt_extended_limits = false;
   bool motion_blur = false;
   bool use_pcmi = false;
-  id<MTLArgumentEncoder> mtlASArgEncoder =
-      nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
 
-  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
   id<MTLBuffer> blas_buffer = nil;
 
   API_AVAILABLE(macos(11.0))
   vector<id<MTLAccelerationStructure>> unique_blas_array;
 
+  API_AVAILABLE(macos(11.0))
+  vector<id<MTLAccelerationStructure>> blas_array;
+
   API_AVAILABLE(macos(11.0))
   id<MTLAccelerationStructure> accel_struct = nil;
   /*---------------------------------------------------*/
@@ -81,12 +79,8 @@ class MetalDevice : public Device {
   /* Bindless Textures */
   bool is_texture(const TextureInfo &tex);
   device_vector<TextureInfo> texture_info;
-  bool need_texture_info = false;
-  id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
-  id<MTLArgumentEncoder> mtlBufferArgEncoder = nil;
-  id<MTLBuffer> buffer_bindings_1d = nil;
-  id<MTLBuffer> texture_bindings_2d = nil;
-  std::vector<id<MTLTexture>> texture_slot_map;
+  id<MTLBuffer> texture_bindings = nil;
+  std::vector<id<MTLResource>> texture_slot_map;
 
   MetalPipelineType kernel_specialization_level = PSO_GENERIC;
@@ -87,6 +87,9 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
   mtlDevice = usable_devices[mtlDevId];
   metal_printf("Creating new Cycles Metal device: %s", info.description.c_str());
 
+  /* Ensure that back-compatability helpers for getting gpuAddress & gpuResourceID are set up. */
+  metal_gpu_address_helper_init(mtlDevice);
+
   /* Enable increased concurrent shader compiler limit.
    * This is also done by MTLContext::MTLContext, but only in GUI mode. */
   if (@available(macOS 13.3, *)) {
@@ -160,26 +163,15 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
                kernel_type_as_string(
                    (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
 
-  MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
-  arg_desc_params.dataType = MTLDataTypePointer;
-  arg_desc_params.access = MTLArgumentAccessReadOnly;
-  arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr);
-  mtlBufferKernelParamsEncoder = [mtlDevice
-      newArgumentEncoderWithArguments:@[ arg_desc_params ]];
+  texture_bindings = [mtlDevice newBufferWithLength:8192 options:MTLResourceStorageModeShared];
+  stats.mem_alloc(texture_bindings.allocatedSize);
 
-  MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init];
-  arg_desc_texture.dataType = MTLDataTypeTexture;
-  arg_desc_texture.access = MTLArgumentAccessReadOnly;
-  mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
-  MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
-  arg_desc_buffer.dataType = MTLDataTypePointer;
-  arg_desc_buffer.access = MTLArgumentAccessReadOnly;
-  mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
+  launch_params_buffer = [mtlDevice newBufferWithLength:sizeof(KernelParamsMetal)
+                                                options:MTLResourceStorageModeShared];
+  stats.mem_alloc(sizeof(KernelParamsMetal));
 
-  buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:MTLResourceStorageModeShared];
-  texture_bindings_2d = [mtlDevice newBufferWithLength:8192
-                                               options:MTLResourceStorageModeShared];
-  stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
+  /* Cache unified pointer so we can write kernel params directly in place. */
+  launch_params = (KernelParamsMetal *)launch_params_buffer.contents;
 
   /* Command queue for path-tracing work on the GPU. In a situation where multiple
    * MetalDeviceQueues are spawned from one MetalDevice, they share the same MTLCommandQueue.
@@ -189,97 +181,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
 
   /* Command queue for non-tracing work on the GPU. */
   mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
 
-  /* Acceleration structure arg encoder, if needed */
-  if (@available(macos 12.0, *)) {
-    if (use_metalrt) {
-      MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
-      arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
-      arg_desc_as.access = MTLArgumentAccessReadOnly;
-      mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
-      [arg_desc_as release];
-    }
-  }
-
-  /* Build the arg encoder for the ancillary bindings */
-  {
-    NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init];
-
-    int index = 0;
-    MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init];
-    arg_desc_tex.dataType = MTLDataTypePointer;
-    arg_desc_tex.access = MTLArgumentAccessReadOnly;
-
-    arg_desc_tex.index = index++;
-    [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
-    arg_desc_tex.index = index++;
-    [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
-
-    [arg_desc_tex release];
-
-    if (@available(macos 12.0, *)) {
-      if (use_metalrt) {
-        MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
-        arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
-        arg_desc_as.access = MTLArgumentAccessReadOnly;
-
-        MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
-        arg_desc_ptrs.dataType = MTLDataTypePointer;
-        arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
-
-        MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
-        arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
-        arg_desc_ift.access = MTLArgumentAccessReadOnly;
-
-        arg_desc_as.index = index++;
-        [ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */
-
-        /* Intersection function tables */
-        arg_desc_ift.index = index++;
-        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */
-        arg_desc_ift.index = index++;
-        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
-        arg_desc_ift.index = index++;
-        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow_all */
-        arg_desc_ift.index = index++;
-        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_volume */
-        arg_desc_ift.index = index++;
-        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
-        arg_desc_ift.index = index++;
-        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_mblur */
-        arg_desc_ift.index = index++;
-        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit */
-        arg_desc_ift.index = index++;
-        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit_mblur */
-
-        arg_desc_ptrs.index = index++;
-        [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas_accel_structs */
-
-        [arg_desc_ift release];
-        [arg_desc_as release];
-        [arg_desc_ptrs release];
-      }
-    }
-
-    mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
-
-    // preparing the blas arg encoder
-
-    if (use_metalrt) {
-      MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
-      arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
-      arg_desc_blas.access = MTLArgumentAccessReadOnly;
-      mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
-      [arg_desc_blas release];
-    }
-
-    for (int i = 0; i < ancillary_desc.count; i++) {
-      [ancillary_desc[i] release];
-    }
-    [ancillary_desc release];
-  }
-  [arg_desc_params release];
-  [arg_desc_texture release];
 }
 }
@@ -292,27 +193,21 @@ MetalDevice::~MetalDevice()
    * existing_devices_mutex). */
   thread_scoped_lock lock(existing_devices_mutex);
 
-  int num_resources = texture_info.size();
-  for (int res = 0; res < num_resources; res++) {
-    if (is_texture(texture_info[res])) {
-      [texture_slot_map[res] release];
-      texture_slot_map[res] = nil;
-    }
+  /* Release textures that weren't already freed by tex_free. */
+  for (int res = 0; res < texture_info.size(); res++) {
+    [texture_slot_map[res] release];
+    texture_slot_map[res] = nil;
   }
 
   free_bvh();
   flush_delayed_free_list();
 
-  if (texture_bindings_2d) {
-    stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
-    [buffer_bindings_1d release];
-    [texture_bindings_2d release];
-  }
-  [mtlTextureArgEncoder release];
-  [mtlBufferKernelParamsEncoder release];
-  [mtlBufferArgEncoder release];
-  [mtlASArgEncoder release];
-  [mtlAncillaryArgEncoder release];
+  stats.mem_free(sizeof(KernelParamsMetal));
+  [launch_params_buffer release];
+
+  stats.mem_free(texture_bindings.allocatedSize);
+  [texture_bindings release];
 
   [mtlComputeCommandQueue release];
   [mtlGeneralCommandQueue release];
   if (mtlCounterSampleBuffer) {
@@ -511,7 +406,7 @@ void MetalDevice::refresh_source_and_kernels_md5(MetalPipelineType pso_type)
 #  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
     if (next_member_is_specialized) { \
       constant_values += string(#parent "." #name "=") + \
-                         to_string(_type(launch_params.data.parent.name)) + "\n"; \
+                         to_string(_type(launch_params->data.parent.name)) + "\n"; \
     } \
     else { \
       next_member_is_specialized = true; \
@@ -648,30 +543,7 @@ bool MetalDevice::is_texture(const TextureInfo &tex)
   return tex.height > 0;
 }
 
-void MetalDevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying. */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-
-    int num_textures = texture_info.size();
-
-    for (int tex = 0; tex < num_textures; tex++) {
-      uint64_t offset = tex * sizeof(void *);
-      if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
-        id<MTLTexture> metal_texture = texture_slot_map[tex];
-        MTLTextureType type = metal_texture.textureType;
-        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
-        [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
-      }
-      else {
-        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
-        [mtlTextureArgEncoder setTexture:nil atIndex:0];
-      }
-    }
-  }
-}
+void MetalDevice::load_texture_info() {}
 
 void MetalDevice::erase_allocation(device_memory &mem)
 {
@@ -683,9 +555,9 @@ void MetalDevice::erase_allocation(device_memory &mem)
   if (it != metal_mem_map.end()) {
     MetalMem *mmem = it->second.get();
 
-    /* blank out reference to MetalMem* in the launch params (fixes crash #94736) */
+    /* blank out reference to resource in the launch params (fixes crash #94736) */
     if (mmem->pointer_index >= 0) {
-      device_ptr *pointers = (device_ptr *)&launch_params;
+      device_ptr *pointers = (device_ptr *)launch_params;
       pointers[mmem->pointer_index] = 0;
     }
     metal_mem_map.erase(it);
@@ -1026,7 +898,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, const size_t size)
 {
   if (strcmp(name, "data") == 0) {
     assert(size == sizeof(KernelData));
-    memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));
+    memcpy((uint8_t *)&launch_params->data, host, sizeof(KernelData));
 
     /* Refresh the kernels_md5 checksums for specialized kernel sets. */
     for (int level = 1; level <= int(kernel_specialization_level); level++) {
@@ -1035,30 +907,41 @@ void MetalDevice::const_copy_to(const char *name, void *host, const size_t size)
     return;
   }
 
-  auto update_launch_pointers =
-      [&](size_t offset, void *data, const size_t data_size, const size_t pointers_size) {
-        memcpy((uint8_t *)&launch_params + offset, data, data_size);
-
-        MetalMem **mmem = (MetalMem **)data;
-        int pointer_count = pointers_size / sizeof(device_ptr);
-        int pointer_index = offset / sizeof(device_ptr);
-        for (int i = 0; i < pointer_count; i++) {
-          if (mmem[i]) {
-            mmem[i]->pointer_index = pointer_index + i;
-          }
-        }
-      };
+  auto update_launch_pointers = [&](size_t offset, void *data, const size_t pointers_size) {
+    uint64_t *addresses = (uint64_t *)((uint8_t *)launch_params + offset);
+
+    MetalMem **mmem = (MetalMem **)data;
+    int pointer_count = pointers_size / sizeof(device_ptr);
+    int pointer_index = offset / sizeof(device_ptr);
+    for (int i = 0; i < pointer_count; i++) {
+      addresses[i] = 0;
+      if (mmem[i]) {
+        mmem[i]->pointer_index = pointer_index + i;
+        if (mmem[i]->mtlBuffer) {
+          if (@available(macOS 13.0, *)) {
+            addresses[i] = metal_gpuAddress(mmem[i]->mtlBuffer);
+          }
+        }
+      }
+    }
+  };
 
   /* Update data storage pointers in launch parameters. */
   if (strcmp(name, "integrator_state") == 0) {
-    /* IntegratorStateGPU is contiguous pointers */
+    /* IntegratorStateGPU is contiguous pointers up until sort_partition_divisor. */
     const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
     update_launch_pointers(
-        offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
+        offsetof(KernelParamsMetal, integrator_state), host, pointer_block_size);
+
+    /* Ensure the non-pointers part of IntegratorStateGPU is copied (this is the proper fix for
+     * #144713). */
+    memcpy((uint8_t *)&launch_params->integrator_state + pointer_block_size,
+           (uint8_t *)host + pointer_block_size,
+           sizeof(IntegratorStateGPU) - pointer_block_size);
   }
 #  define KERNEL_DATA_ARRAY(data_type, tex_name) \
   else if (strcmp(name, #tex_name) == 0) { \
-    update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
+    update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size); \
   }
 #  include "kernel/data_arrays.h"
 #  undef KERNEL_DATA_ARRAY
@@ -1096,12 +979,7 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
   }
 
   texture_info[slot] = mem.info;
-  uint64_t offset = slot * sizeof(void *);
-  [mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
-  [mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
-  texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
-  texture_slot_map[slot] = nil;
-  need_texture_info = true;
+  texture_slot_map[slot] = mmem->mtlBuffer;
 
   if (is_nanovdb_type(mem.info.data_type)) {
     using_nanovdb = true;
@@ -1245,34 +1123,21 @@ void MetalDevice::tex_alloc(device_texture &mem)
       texture_slot_map.resize(slot + 128);
 
     ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
-    if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
-      if (texture_bindings_2d) {
-        delayed_free_list.push_back(buffer_bindings_1d);
-        delayed_free_list.push_back(texture_bindings_2d);
-
-        stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
+    if (!texture_bindings || (texture_bindings.length < min_buffer_length)) {
+      if (texture_bindings) {
+        delayed_free_list.push_back(texture_bindings);
+        stats.mem_free(texture_bindings.allocatedSize);
       }
-      buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
-                                                  options:MTLResourceStorageModeShared];
-      texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
-                                                   options:MTLResourceStorageModeShared];
+      texture_bindings = [mtlDevice newBufferWithLength:min_buffer_length
+                                                options:MTLResourceStorageModeShared];
 
-      stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize);
+      stats.mem_alloc(texture_bindings.allocatedSize);
     }
   }
 
   /* Optimize the texture for GPU access. */
   id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
   id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
   [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
   [blitCommandEncoder endEncoding];
   [commandBuffer commit];
 
-  /* Set Mapping and tag that we need to (re-)upload to device */
+  /* Set Mapping. */
   texture_slot_map[slot] = mtlTexture;
   texture_info[slot] = mem.info;
-  need_texture_info = true;
 
   texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
 
   if (max_working_set_exceeded()) {
@@ -1305,27 +1170,20 @@ void MetalDevice::tex_copy_to(device_texture &mem)
 
 void MetalDevice::tex_free(device_texture &mem)
 {
+  int slot = mem.slot;
   if (mem.data_height == 0) {
     generic_free(mem);
-    return;
   }
-
-  if (metal_mem_map.count(&mem)) {
+  else if (metal_mem_map.count(&mem)) {
     std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
     MetalMem &mmem = *metal_mem_map.at(&mem);
 
-    assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
-    if (texture_slot_map[mem.slot] == mmem.mtlTexture) {
-      texture_slot_map[mem.slot] = nil;
-    }
-
-    if (mmem.mtlTexture) {
-      /* Free bindless texture. */
-      delayed_free_list.push_back(mmem.mtlTexture);
-      mmem.mtlTexture = nil;
-    }
+    /* Free bindless texture. */
+    delayed_free_list.push_back(mmem.mtlTexture);
+    mmem.mtlTexture = nil;
     erase_allocation(mem);
   }
+  texture_slot_map[slot] = nil;
 }
 
 unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
@@ -1388,6 +1246,7 @@ void MetalDevice::free_bvh()
     [blas release];
   }
   unique_blas_array.clear();
+  blas_array.clear();
 
   if (blas_buffer) {
     [blas_buffer release];
@@ -1410,6 +1269,7 @@ void MetalDevice::update_bvh(BVHMetal *bvh_metal)
 
   accel_struct = bvh_metal->accel_struct;
   unique_blas_array = bvh_metal->unique_blas_array;
+  blas_array = bvh_metal->blas_array;
 
   [accel_struct retain];
   for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
@@ -1417,17 +1277,9 @@ void MetalDevice::update_bvh(BVHMetal *bvh_metal)
   }
 
   // Allocate required buffers for BLAS array.
-  uint64_t count = bvh_metal->blas_array.size();
-  uint64_t buffer_size = mtlBlasArgEncoder.encodedLength * count;
+  uint64_t buffer_size = blas_array.size() * sizeof(uint64_t);
   blas_buffer = [mtlDevice newBufferWithLength:buffer_size options:MTLResourceStorageModeShared];
   stats.mem_alloc(blas_buffer.allocatedSize);
-
-  for (uint64_t i = 0; i < count; ++i) {
-    if (bvh_metal->blas_array[i]) {
-      [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
-      [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
-    }
-  }
 }
 
 CCL_NAMESPACE_END
@@ -340,7 +340,7 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
    * to be active. */
   pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
   pipeline->originating_device_id = device->device_id;
-  pipeline->kernel_data_ = device->launch_params.data;
+  pipeline->kernel_data_ = device->launch_params->data;
   pipeline->pso_type = pso_type;
   pipeline->mtlDevice = mtlDevice;
   pipeline->kernels_md5 = device->kernels_md5[pso_type];
@@ -15,6 +15,9 @@
 
 #  define MAX_SAMPLE_BUFFER_LENGTH 4096
 
+/* The number of resources to be contiguously encoded into the MetalAncillaries struct. */
+#  define ANCILLARY_SLOT_COUNT 11
+
 CCL_NAMESPACE_BEGIN
 
 class MetalDevice;
@@ -57,7 +60,6 @@ class MetalDeviceQueue : public DeviceQueue {
   id<MTLBlitCommandEncoder> get_blit_encoder();
 
   MetalDevice *metal_device_;
-  MetalBufferPool temp_buffer_pool_;
 
   API_AVAILABLE(macos(11.0), ios(14.0))
   MTLCommandBufferDescriptor *command_buffer_desc_ = nullptr;
@@ -317,11 +317,71 @@ bool MetalDeviceQueue::supports_local_atomic_sort() const
   return metal_device_->use_local_atomic_sort();
 }
 
+static void zero_resource(void *address_in_arg_buffer, int index = 0)
+{
+  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
+  pptr[index] = 0;
+}
+
+template<class T> void write_resource(void *address_in_arg_buffer, T resource, int index = 0)
+{
+  zero_resource(address_in_arg_buffer, index);
+  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
+  if (resource) {
+    pptr[index] = metal_gpuResourceID(resource);
+  }
+}
+
+template<> void write_resource(void *address_in_arg_buffer, id<MTLBuffer> buffer, int index)
+{
+  zero_resource(address_in_arg_buffer, index);
+  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
+  if (buffer) {
+    pptr[index] = metal_gpuAddress(buffer);
+  }
+}
+
+static id<MTLBuffer> patch_resource(void *address_in_arg_buffer, int index = 0)
+{
+  uint64_t *pptr = (uint64_t *)address_in_arg_buffer;
+  if (MetalDevice::MetalMem *mmem = (MetalDevice::MetalMem *)pptr[index]) {
+    write_resource<id<MTLBuffer>>(address_in_arg_buffer, mmem->mtlBuffer, index);
+    return mmem->mtlBuffer;
+  }
+  return nil;
+}
+
 void MetalDeviceQueue::init_execution()
 {
   /* Synchronize all textures and memory copies before executing task. */
   metal_device_->load_texture_info();
+
+  /* Populate blas_array. */
+  uint64_t *blas_array = (uint64_t *)metal_device_->blas_buffer.contents;
+  for (uint64_t slot = 0; slot < metal_device_->blas_array.size(); ++slot) {
+    write_resource(blas_array, metal_device_->blas_array[slot], slot);
+  }
+
+  device_vector<TextureInfo> &texture_info = metal_device_->texture_info;
+  id<MTLBuffer> &texture_bindings = metal_device_->texture_bindings;
+  std::vector<id<MTLResource>> &texture_slot_map = metal_device_->texture_slot_map;
+
+  /* Ensure texture_info is allocated before populating. */
+  texture_info.copy_to_device();
+
+  /* Populate texture bindings. */
+  uint64_t *bindings = (uint64_t *)texture_bindings.contents;
+  memset(bindings, 0, texture_bindings.length);
+  for (int slot = 0; slot < texture_info.size(); ++slot) {
+    if (texture_slot_map[slot]) {
+      if (metal_device_->is_texture(texture_info[slot])) {
+        write_resource(bindings, id<MTLTexture>(texture_slot_map[slot]), slot);
+      }
+      else {
+        /* The GPU address of a 1D buffer texture is written into the slot data field. */
+        write_resource(&texture_info[slot].data, id<MTLBuffer>(texture_slot_map[slot]), 0);
+      }
+    }
+  }
 
+  /* Synchronize memory copies. */
   synchronize();
 }
@@ -346,83 +406,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   if (profiling_enabled_) {
     command_encoder_labels_.push_back({kernel, work_size, current_encoder_idx_});
   }
 
-  /* Determine size requirement for argument buffer. */
-  size_t arg_buffer_length = 0;
-  for (size_t i = 0; i < args.count; i++) {
-    size_t size_in_bytes = args.sizes[i];
-    arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
-  }
-  /* 256 is the Metal offset alignment for constant address space bindings */
-  arg_buffer_length = round_up(arg_buffer_length, 256);
-
-  /* Globals placed after "vanilla" arguments. */
-  size_t globals_offsets = arg_buffer_length;
-  arg_buffer_length += sizeof(KernelParamsMetal);
-  arg_buffer_length = round_up(arg_buffer_length, 256);
-
-  /* Metal ancillary bindless pointers. */
-  size_t metal_offsets = arg_buffer_length;
-  arg_buffer_length += metal_device_->mtlAncillaryArgEncoder.encodedLength;
-  arg_buffer_length = round_up(arg_buffer_length,
-                               metal_device_->mtlAncillaryArgEncoder.alignment);
-
-  /* Temporary buffer used to prepare arg_buffer */
-  uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length);
-  memset(init_arg_buffer, 0, arg_buffer_length);
-
-  /* Prepare the non-pointer "enqueue" arguments */
-  size_t bytes_written = 0;
-  for (size_t i = 0; i < args.count; i++) {
-    size_t size_in_bytes = args.sizes[i];
-    bytes_written = round_up(bytes_written, size_in_bytes);
-    if (args.types[i] != DeviceKernelArguments::POINTER) {
-      memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
-    }
-    bytes_written += size_in_bytes;
-  }
-
-  /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
-  /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
-  size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
-                                        offsetof(IntegratorStateGPU, sort_partition_divisor);
-  size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
-  memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
-         (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
-         plain_old_launch_data_size);
-
-  /* Allocate an argument buffer. */
-  id<MTLBuffer> arg_buffer = temp_buffer_pool_.get_buffer(
-      mtlDevice_, mtlCommandBuffer_, arg_buffer_length, init_arg_buffer, stats_);
-
-  /* Encode the pointer "enqueue" arguments */
-  bytes_written = 0;
-  for (size_t i = 0; i < args.count; i++) {
-    size_t size_in_bytes = args.sizes[i];
-    bytes_written = round_up(bytes_written, size_in_bytes);
-    if (args.types[i] == DeviceKernelArguments::POINTER) {
-      [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
-                                                              offset:bytes_written];
-      if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
-        [mtlComputeCommandEncoder useResource:mmem->mtlBuffer
-                                        usage:MTLResourceUsageRead | MTLResourceUsageWrite];
-        [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
-                                                        offset:0
-                                                       atIndex:0];
-      }
-      else {
-        if (@available(macos 12.0, *)) {
-          [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
-        }
-      }
-    }
-    bytes_written += size_in_bytes;
-  }
-
-  /* Encode KernelParamsMetal buffers */
-  [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
-                                                          offset:globals_offsets];
 
   if (label_command_encoders_) {
     /* Add human-readable labels if we're doing any form of debugging / profiling. */
     mtlComputeCommandEncoder.label = [NSString
@@ -431,28 +414,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                                       work_size];
   }
 
-  /* this relies on IntegratorStateGPU layout being contiguous device_ptrs. */
-  const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
-                                   offsetof(IntegratorStateGPU, sort_partition_divisor);
-  for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
-    int pointer_index = int(offset / sizeof(device_ptr));
-    MetalDevice::MetalMem *mmem = *(
-        MetalDevice::MetalMem **)((uint8_t *)&metal_device_->launch_params + offset);
-    if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
-      [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
-                                                      offset:0
-                                                     atIndex:pointer_index];
-    }
-    else {
-      if (@available(macos 12.0, *)) {
-        [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil
-                                                        offset:0
-                                                       atIndex:pointer_index];
-      }
-    }
-  }
-  bytes_written = globals_offsets + sizeof(KernelParamsMetal);
 
   if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
     metal_device_->set_error(
         string_printf("Could not activate pipeline for %s\n", device_kernel_as_string(kernel)));
@@ -460,47 +421,66 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   }
   MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];
 
-  /* Encode ancillaries */
-  [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
-  [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_2d
-                                            offset:0
-                                           atIndex:0];
-  [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
-                                            offset:0
-                                           atIndex:1];
-  if (@available(macos 12.0, *)) {
-    if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
-      if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
-        [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
-        [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
-                                                  offset:0
-                                                 atIndex:(METALRT_TABLE_NUM + 3)];
-      }
-
-      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-        if (active_pipeline.intersection_func_table[table]) {
-          [active_pipeline.intersection_func_table[table] setBuffer:arg_buffer
-                                                             offset:globals_offsets
-                                                            atIndex:1];
-          [metal_device_->mtlAncillaryArgEncoder
-              setIntersectionFunctionTable:active_pipeline.intersection_func_table[table]
-                                   atIndex:3 + table];
-          [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
-                                          usage:MTLResourceUsageRead];
-        }
-        else {
-          [metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
-                                                                      atIndex:3 + table];
-        }
-      }
-    }
-  }
-  bytes_written = metal_offsets + metal_device_->mtlAncillaryArgEncoder.encodedLength;
+  uint8_t dynamic_args[512] = {0};
+
+  /* Prepare the dynamic "enqueue" arguments */
+  size_t dynamic_bytes_written = 0;
+  size_t max_size_in_bytes = 0;
+  for (size_t i = 0; i < args.count; i++) {
+    size_t size_in_bytes = args.sizes[i];
+    max_size_in_bytes = max(max_size_in_bytes, size_in_bytes);
+    dynamic_bytes_written = round_up(dynamic_bytes_written, size_in_bytes);
+    memcpy(dynamic_args + dynamic_bytes_written, args.values[i], size_in_bytes);
+    if (args.types[i] == DeviceKernelArguments::POINTER) {
+      if (id<MTLBuffer> buffer = patch_resource(dynamic_args + dynamic_bytes_written)) {
+        [mtlComputeCommandEncoder useResource:buffer
+                                        usage:MTLResourceUsageRead | MTLResourceUsageWrite];
+      }
+    }
+    dynamic_bytes_written += size_in_bytes;
+  }
+  /* Apply conventional struct alignment (stops asserts firing when API validation is enabled).
+   */
+  dynamic_bytes_written = round_up(dynamic_bytes_written, max_size_in_bytes);
+
+  /* Check that the dynamic args didn't overflow. */
+  assert(dynamic_bytes_written <= sizeof(dynamic_args));
+
+  uint64_t ancillary_args[ANCILLARY_SLOT_COUNT] = {0};
+
+  /* Encode ancillaries */
+  int ancillary_index = 0;
+  write_resource(ancillary_args, metal_device_->texture_bindings, ancillary_index++);
+
+  if (metal_device_->use_metalrt) {
+    write_resource(ancillary_args, metal_device_->accel_struct, ancillary_index++);
+    write_resource(ancillary_args, metal_device_->blas_buffer, ancillary_index++);
+
+    /* Write the intersection function table. */
+    for (int table_idx = 0; table_idx < METALRT_TABLE_NUM; table_idx++) {
+      write_resource(
+          ancillary_args, active_pipeline.intersection_func_table[table_idx], ancillary_index++);
+    }
+    assert(ancillary_index == ANCILLARY_SLOT_COUNT);
+  }
 
-  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
-  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
-  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];
+  if (metal_device_->use_metalrt) {
+    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+      if (active_pipeline.intersection_func_table[table]) {
+        [active_pipeline.intersection_func_table[table]
+            setBuffer:metal_device_->launch_params_buffer
+               offset:0
+              atIndex:1];
+        [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
+                                        usage:MTLResourceUsageRead];
+      }
+    }
+  }
+
+  [mtlComputeCommandEncoder setBytes:dynamic_args length:dynamic_bytes_written atIndex:0];
+  [mtlComputeCommandEncoder setBuffer:metal_device_->launch_params_buffer offset:0 atIndex:1];
+  [mtlComputeCommandEncoder setBytes:ancillary_args length:sizeof(ancillary_args) atIndex:2];
 
   if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
     if (@available(macos 12.0, *)) {
@@ -542,7 +522,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
 
     case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
     case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
-      int key_count = metal_device_->launch_params.data.max_shaders;
+      int key_count = metal_device_->launch_params->data.max_shaders;
       shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
       break;
     }
@@ -674,7 +654,6 @@ bool MetalDeviceQueue::synchronize()
 
   [mtlCommandBuffer_ release];
 
-  temp_buffer_pool_.process_command_buffer_completion(mtlCommandBuffer_);
   metal_device_->flush_delayed_free_list();
 
   mtlCommandBuffer_ = nil;
@@ -771,8 +750,7 @@ void MetalDeviceQueue::prepare_resources(DeviceKernel /*kernel*/)
   }
 
   /* ancillaries */
-  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
-  [mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
+  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings usage:MTLResourceUsageRead];
 }
 
 id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
@@ -38,26 +38,12 @@ struct MetalInfo {
   static string get_device_name(id<MTLDevice> device);
 };
 
-/* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
-class MetalBufferPool {
-  struct MetalBufferListEntry {
-    id<MTLBuffer> buffer;
-    id<MTLCommandBuffer> command_buffer;
-  };
-  std::vector<MetalBufferListEntry> temp_buffers;
-  thread_mutex buffer_mutex;
-  size_t total_temp_mem_size = 0;
-
- public:
-  ~MetalBufferPool();
-
-  id<MTLBuffer> get_buffer(id<MTLDevice> device,
-                           id<MTLCommandBuffer> command_buffer,
-                           NSUInteger length,
-                           const void *pointer,
-                           Stats &stats);
-  void process_command_buffer_completion(id<MTLCommandBuffer> command_buffer);
-};
+void metal_gpu_address_helper_init(id<MTLDevice> device);
+
+uint64_t metal_gpuAddress(id<MTLBuffer> buffer);
+uint64_t metal_gpuResourceID(id<MTLTexture> texture);
+uint64_t metal_gpuResourceID(id<MTLAccelerationStructure> accel_struct);
+uint64_t metal_gpuResourceID(id<MTLIntersectionFunctionTable> ift);
 
 CCL_NAMESPACE_END
@@ -18,6 +18,9 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Comment this out to test workaround for getting gpuAddress and gpuResourceID on macOS < 13.0. */
+#  define CYCLES_USE_TIER2D_BINDLESS
+
 string MetalInfo::get_device_name(id<MTLDevice> device)
 {
   string device_name = [device.name UTF8String];
@@ -118,63 +121,110 @@ const vector<id<MTLDevice>> &MetalInfo::get_usable_devices()
   return usable_devices;
 }
 
-id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
-                                          id<MTLCommandBuffer> command_buffer,
-                                          NSUInteger length,
-                                          const void *pointer,
-                                          Stats &stats)
-{
-  id<MTLBuffer> buffer = nil;
-  thread_scoped_lock lock(buffer_mutex);
-  /* Find an unused buffer with matching size and storage mode. */
-  for (MetalBufferListEntry &bufferEntry : temp_buffers) {
-    if (bufferEntry.buffer.length == length && bufferEntry.command_buffer == nil) {
-      buffer = bufferEntry.buffer;
-      bufferEntry.command_buffer = command_buffer;
-      break;
-    }
-  }
-  if (!buffer) {
-    /* Create a new buffer and add it to the pool. Typically this pool will only grow to a
-     * handful of entries. */
-    buffer = [device newBufferWithLength:length options:MTLResourceStorageModeShared];
-    stats.mem_alloc(buffer.allocatedSize);
-    total_temp_mem_size += buffer.allocatedSize;
-    temp_buffers.push_back(MetalBufferListEntry{buffer, command_buffer});
-  }
-
-  /* Copy over data */
-  if (pointer) {
-    memcpy(buffer.contents, pointer, length);
-  }
-
-  return buffer;
-}
-
-void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> command_buffer)
-{
-  assert(command_buffer);
-  thread_scoped_lock lock(buffer_mutex);
-  /* Mark any temp buffers associated with command_buffer as unused. */
-  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
-    if (buffer_entry.command_buffer == command_buffer) {
-      buffer_entry.command_buffer = nil;
-    }
-  }
-}
-
-MetalBufferPool::~MetalBufferPool()
-{
-  thread_scoped_lock lock(buffer_mutex);
-  /* Release all buffers that have not been recently reused */
-  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
-    total_temp_mem_size -= buffer_entry.buffer.allocatedSize;
-    [buffer_entry.buffer release];
-    buffer_entry.buffer = nil;
-  }
-  temp_buffers.clear();
-}
+struct GPUAddressHelper {
+  id<MTLBuffer> resource_buffer = nil;
+  id<MTLArgumentEncoder> address_encoder = nil;
+
+  /* One time setup of arg encoder. */
+  void init(id<MTLDevice> device)
+  {
+    if (resource_buffer) {
+      /* No setup required - already initialised. */
+      return;
+    }
+
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      /* No setup required - there's an API now! */
+      return;
+    }
+#  endif
+
+    /* Setup a tiny buffer to encode the GPU address / resourceID into. */
+    resource_buffer = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
+
+    /* Create an encoder to extract a gpuAddress from a MTLBuffer. */
+    MTLArgumentDescriptor *encoder_params = [[MTLArgumentDescriptor alloc] init];
+    encoder_params.arrayLength = 1;
+    encoder_params.access = MTLBindingAccessReadWrite;
+    encoder_params.dataType = MTLDataTypePointer;
+    address_encoder = [device newArgumentEncoderWithArguments:@[ encoder_params ]];
+    [address_encoder setArgumentBuffer:resource_buffer offset:0];
+  }
+
+  uint64_t gpuAddress(id<MTLBuffer> buffer)
+  {
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      return buffer.gpuAddress;
+    }
+#  endif
+    [address_encoder setBuffer:buffer offset:0 atIndex:0];
+    return *(uint64_t *)[resource_buffer contents];
+  }
+
+  uint64_t gpuResourceID(id<MTLTexture> texture)
+  {
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      MTLResourceID resourceID = texture.gpuResourceID;
+      return (uint64_t &)resourceID;
+    }
+#  endif
+    [address_encoder setTexture:texture atIndex:0];
+    return *(uint64_t *)[resource_buffer contents];
+  }
+
+  uint64_t gpuResourceID(id<MTLAccelerationStructure> accel_struct)
+  {
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      MTLResourceID resourceID = accel_struct.gpuResourceID;
+      return (uint64_t &)resourceID;
+    }
+#  endif
+    [address_encoder setAccelerationStructure:accel_struct atIndex:0];
+    return *(uint64_t *)[resource_buffer contents];
+  }
+
+  uint64_t gpuResourceID(id<MTLIntersectionFunctionTable> ift)
+  {
+#  ifdef CYCLES_USE_TIER2D_BINDLESS
+    if (@available(macos 13.0, *)) {
+      MTLResourceID resourceID = ift.gpuResourceID;
+      return (uint64_t &)resourceID;
+    }
+#  endif
+    [address_encoder setIntersectionFunctionTable:ift atIndex:0];
+    return *(uint64_t *)[resource_buffer contents];
+  }
+};
+
+GPUAddressHelper g_gpu_address_helper;
+
+void metal_gpu_address_helper_init(id<MTLDevice> device)
+{
+  g_gpu_address_helper.init(device);
+}
+
+uint64_t metal_gpuAddress(id<MTLBuffer> buffer)
+{
+  return g_gpu_address_helper.gpuAddress(buffer);
+}
+
+uint64_t metal_gpuResourceID(id<MTLTexture> texture)
+{
+  return g_gpu_address_helper.gpuResourceID(texture);
+}
+
+uint64_t metal_gpuResourceID(id<MTLAccelerationStructure> accel_struct)
+{
+  return g_gpu_address_helper.gpuResourceID(accel_struct);
+}
+
+uint64_t metal_gpuResourceID(id<MTLIntersectionFunctionTable> ift)
+{
+  return g_gpu_address_helper.gpuResourceID(ift);
+}
 
 CCL_NAMESPACE_END
@@ -330,10 +330,10 @@ typedef metal::raytracing::intersector<triangle_data, curve_data METALRT_LIMITS>
 
 /* texture bindings and sampler setup */
 
-struct Buffer1DParamsMetal {
-  device float *buf;
+/* TextureParamsMetal is reinterpreted as Texture2DParamsMetal. */
+struct TextureParamsMetal {
+  uint64_t tex;
 };
 
 struct Texture2DParamsMetal {
   texture2d<float, access::sample> tex;
 };
@@ -344,12 +344,15 @@ struct MetalRTBlasWrapper {
 };
 #endif
 
+/* Additional Metal-specific resources which aren't encoded in KernelData.
+ * IMPORTANT: If this layout changes, ANCILLARY_SLOT_COUNT and the host-side encoding must change
+ * to match. */
 struct MetalAncillaries {
-  device Texture2DParamsMetal *textures_2d;
-  device Buffer1DParamsMetal *buffers;
+  device TextureParamsMetal *textures;
 
 #ifdef __METALRT__
   metalrt_as_type accel_struct;
+  constant MetalRTBlasWrapper *blas_accel_structs;
   metalrt_ift_type ift_default;
   metalrt_ift_type ift_shadow;
   metalrt_ift_type ift_shadow_all;
@@ -358,7 +361,6 @@ struct MetalAncillaries {
   metalrt_ift_type ift_local_mblur;
   metalrt_blas_ift_type ift_local_single_hit;
   metalrt_ift_type ift_local_single_hit_mblur;
-  constant MetalRTBlasWrapper *blas_accel_structs;
 #endif
 };
@@ -39,14 +39,14 @@ class MetalKernelContext {
   float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, const float x, float y) const {
     const uint tid(tex);
     const uint sid(tex >> 32);
-    return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+    return ((ccl_global Texture2DParamsMetal*)metal_ancillaries->textures)[tid].tex.sample(metal_samplers[sid], float2(x, y));
   }
   template<>
   inline __attribute__((__always_inline__))
   float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, const float x, float y) const {
     const uint tid(tex);
     const uint sid(tex >> 32);
-    return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+    return ((ccl_global Texture2DParamsMetal*)metal_ancillaries->textures)[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
   }
 
 #  include "kernel/device/gpu/image.h"
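The `GPUAddressHelper` in the last file above packages a trick worth spelling out: on macOS releases that predate the `gpuAddress`/`gpuResourceID` getters, an argument encoder pointed at a tiny 8-byte shared buffer will write the 64-bit GPU address of whatever resource is encoded into slot 0, which the CPU can then read straight back. A condensed, self-contained restatement of that fallback (the function name `query_gpu_address` is ours, not from the diff):

    static uint64_t query_gpu_address(id<MTLDevice> device, id<MTLBuffer> buffer)
    {
      if (@available(macOS 13.0, *)) {
        return buffer.gpuAddress; /* Direct API on newer OS versions. */
      }
      /* Fallback: encode the buffer into an 8-byte argument buffer, then read
       * the 64-bit address back on the CPU. */
      id<MTLBuffer> scratch = [device newBufferWithLength:8
                                                  options:MTLResourceStorageModeShared];
      MTLArgumentDescriptor *desc = [[MTLArgumentDescriptor alloc] init];
      desc.dataType = MTLDataTypePointer;
      desc.access = MTLBindingAccessReadWrite;
      id<MTLArgumentEncoder> encoder = [device newArgumentEncoderWithArguments:@[ desc ]];
      [encoder setArgumentBuffer:scratch offset:0];
      [encoder setBuffer:buffer offset:0 atIndex:0];
      uint64_t address = *(const uint64_t *)scratch.contents;
      [encoder release];
      [desc release];
      [scratch release];
      return address;
    }

The real helper amortizes this by creating the scratch buffer and encoder once in `init()` and reusing them for every query, which is why `metal_gpu_address_helper_init` is called from the MetalDevice constructor.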