2023-06-14 16:52:36 +10:00
|
|
|
/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
|
|
|
|
|
*
|
|
|
|
|
* SPDX-License-Identifier: Apache-2.0 */
|
2021-12-07 15:11:35 +00:00
|
|
|
|
|
|
|
|
#ifdef WITH_METAL
|
|
|
|
|
|
2024-12-26 17:53:59 +01:00
|
|
|
# include <map>
|
|
|
|
|
# include <mutex>
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
# include "device/metal/device.h"
|
2024-12-26 17:53:59 +01:00
|
|
|
# include "device/metal/device_impl.h"
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2022-07-12 15:32:46 +02:00
|
|
|
# include "scene/scene.h"
|
|
|
|
|
|
2025-04-14 14:06:58 +02:00
|
|
|
# include "session/display_driver.h"
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
# include "util/debug.h"
|
|
|
|
|
# include "util/md5.h"
|
|
|
|
|
# include "util/path.h"
|
2022-06-22 22:32:34 +01:00
|
|
|
# include "util/time.h"
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2024-08-08 16:01:23 +02:00
|
|
|
# include <TargetConditionals.h>
|
2023-01-04 14:23:33 +00:00
|
|
|
# include <crt_externs.h>
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
|
|
class MetalDevice;
|
|
|
|
|
|
2023-01-04 14:23:33 +00:00
|
|
|
thread_mutex MetalDevice::existing_devices_mutex;
|
|
|
|
|
std::map<int, MetalDevice *> MetalDevice::active_device_ids;
|
|
|
|
|
|
2023-01-06 13:57:21 +11:00
|
|
|
/* Thread-safe device access for async work. Calling code must pass an appropriately scoped lock
 * to existing_devices_mutex to safeguard against destruction of the returned instance. */
MetalDevice *MetalDevice::get_device_by_ID(const int ID,
                                           thread_scoped_lock & /*existing_devices_mutex_lock*/)
{
  /* The lock argument is intentionally unused — requiring it forces callers to prove they hold
   * existing_devices_mutex for the lifetime of the returned pointer. */
  const auto entry = active_device_ids.find(ID);
  return (entry == active_device_ids.end()) ? nullptr : entry->second;
}
|
|
|
|
|
|
2025-01-01 18:15:54 +01:00
|
|
|
bool MetalDevice::is_device_cancelled(const int ID)
{
  /* A device counts as cancelled once it has been unregistered from active_device_ids
   * (i.e. lookup under the mutex no longer finds it). */
  thread_scoped_lock lock(existing_devices_mutex);
  MetalDevice *device = get_device_by_ID(ID, lock);
  return device == nullptr;
}
|
|
|
|
|
|
2023-04-06 12:16:13 +02:00
|
|
|
BVHLayoutMask MetalDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
{
  /* With hardware ray tracing enabled, use the native MetalRT acceleration-structure layout;
   * otherwise fall back to Cycles' software BVH2. */
  if (use_metalrt) {
    return BVH_LAYOUT_METAL;
  }
  return BVH_LAYOUT_BVH2;
}
|
|
|
|
|
|
|
|
|
|
void MetalDevice::set_error(const string &error)
{
  /* Serialize error reporting — set_error may be called from async compile threads. */
  static std::mutex s_error_mutex;
  std::lock_guard<std::mutex> guard(s_error_mutex);

  Device::set_error(error);

  if (has_error) {
    return;
  }

  /* On the first error only, point the user at the GPU rendering documentation. */
  fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
  fprintf(stderr,
          "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
  has_error = true;
}
|
|
|
|
|
|
2024-06-07 17:53:44 +02:00
|
|
|
/* Construct a Cycles Metal device: register it for async-compile lookup, pick the MTLDevice
 * chosen by `info`, create argument encoders / binding buffers / command queues, and read
 * debug configuration from CYCLES_* environment variables.
 *
 * NOTE: this file uses manual retain/release (MRC), so every alloc/init and `new…` object
 * must be balanced with an explicit release. */
MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : Device(info, stats, profiler, headless), texture_info(this, "texture_info", MEM_GLOBAL)
{
  @autoreleasepool {
    {
      /* Assign an ID for this device which we can use to query whether async shader compilation
       * requests are still relevant. */
      thread_scoped_lock lock(existing_devices_mutex);
      static int existing_devices_counter = 1;
      device_id = existing_devices_counter++;
      active_device_ids[device_id] = this;
    }

    mtlDevId = info.num;

    /* Select the chosen device from the list of usable Metal devices. */
    auto usable_devices = MetalInfo::get_usable_devices();
    assert(mtlDevId < usable_devices.size());
    mtlDevice = usable_devices[mtlDevId];
    metal_printf("Creating new Cycles Metal device: %s\n", info.description.c_str());

    /* Enable increased concurrent shader compiler limit.
     * This is also done by MTLContext::MTLContext, but only in GUI mode. */
    if (@available(macOS 13.3, *)) {
      [mtlDevice setShouldMaximizeConcurrentCompilation:YES];
    }

    max_threads_per_threadgroup = 512;

    /* Hardware ray tracing from device info, overridable via CYCLES_METALRT. */
    use_metalrt = info.use_hardware_raytracing;
    if (auto *metalrt = getenv("CYCLES_METALRT")) {
      use_metalrt = (atoi(metalrt) != 0);
    }

# if defined(MAC_OS_VERSION_15_0)
    /* Use "Ray tracing with per component motion interpolation" if available.
     * Requires Apple9 support (https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf). */
    if (use_metalrt && [mtlDevice supportsFamily:MTLGPUFamilyApple9]) {
      if (@available(macos 15.0, *)) {
        use_pcmi = DebugFlags().metal.use_metalrt_pcmi;
      }
    }
# endif

    if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
      capture_enabled = true;
    }

    /* Create a global counter sampling buffer when kernel profiling is enabled.
     * There's a limit to the number of concurrent counter sampling buffers per device, so we
     * create one that can be reused by successive device queues. */
    if (auto str = getenv("CYCLES_METAL_PROFILING")) {
      if (atoi(str) && [mtlDevice supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
      {
        NSArray<id<MTLCounterSet>> *counterSets = [mtlDevice counterSets];

        NSError *error = nil;
        MTLCounterSampleBufferDescriptor *desc = [[MTLCounterSampleBufferDescriptor alloc] init];
        [desc setStorageMode:MTLStorageModeShared];
        [desc setLabel:@"CounterSampleBuffer"];
        [desc setSampleCount:MAX_SAMPLE_BUFFER_LENGTH];
        [desc setCounterSet:counterSets[0]];
        mtlCounterSampleBuffer = [mtlDevice newCounterSampleBufferWithDescriptor:desc
                                                                           error:&error];
        [mtlCounterSampleBuffer retain];
      }
    }

    /* Set kernel_specialization_level based on user preferences. */
    switch (info.kernel_optimization_level) {
      case KERNEL_OPTIMIZATION_LEVEL_OFF:
        kernel_specialization_level = PSO_GENERIC;
        break;
      default:
      case KERNEL_OPTIMIZATION_LEVEL_INTERSECT:
        kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
        break;
      case KERNEL_OPTIMIZATION_LEVEL_FULL:
        kernel_specialization_level = PSO_SPECIALIZED_SHADE;
        break;
    }

    /* Environment override for the specialization level (debug aid). */
    if (auto *envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
      kernel_specialization_level = (MetalPipelineType)atoi(envstr);
    }
    metal_printf("kernel_specialization_level = %s\n",
                 kernel_type_as_string(
                     (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));

    /* Argument encoder for the kernel-parameter buffer (one pointer slot per member). */
    MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
    arg_desc_params.dataType = MTLDataTypePointer;
    arg_desc_params.access = MTLArgumentAccessReadOnly;
    arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr);
    mtlBufferKernelParamsEncoder = [mtlDevice
        newArgumentEncoderWithArguments:@[ arg_desc_params ]];

    /* Argument encoders for single texture / single buffer bindings. */
    MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init];
    arg_desc_texture.dataType = MTLDataTypeTexture;
    arg_desc_texture.access = MTLArgumentAccessReadOnly;
    mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
    MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
    arg_desc_buffer.dataType = MTLDataTypePointer;
    arg_desc_buffer.access = MTLArgumentAccessReadOnly;
    mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];

    /* Initial binding tables; grown on demand by the texture-upload path. */
    buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:MTLResourceStorageModeShared];
    texture_bindings_2d = [mtlDevice newBufferWithLength:8192
                                                 options:MTLResourceStorageModeShared];
    texture_bindings_3d = [mtlDevice newBufferWithLength:8192
                                                 options:MTLResourceStorageModeShared];
    stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
                    texture_bindings_3d.allocatedSize);

    /* Command queue for path-tracing work on the GPU. In a situation where multiple
     * MetalDeviceQueues are spawned from one MetalDevice, they share the same MTLCommandQueue.
     * This is thread safe and just as performant as each having their own instance. It also
     * adheres to best practices of maximizing the lifetime of each MTLCommandQueue. */
    mtlComputeCommandQueue = [mtlDevice newCommandQueue];

    /* Command queue for non-tracing work on the GPU. */
    mtlGeneralCommandQueue = [mtlDevice newCommandQueue];

    /* Acceleration structure arg encoder, if needed. */
    if (@available(macos 12.0, *)) {
      if (use_metalrt) {
        MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
        arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
        arg_desc_as.access = MTLArgumentAccessReadOnly;
        mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
        [arg_desc_as release];
      }
    }

    /* Build the arg encoder for the ancillary bindings. Slot order here must match the
     * MetalAncillaries layout consumed by the kernels. */
    {
      NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init];

      int index = 0;
      MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init];
      arg_desc_tex.dataType = MTLDataTypePointer;
      arg_desc_tex.access = MTLArgumentAccessReadOnly;

      arg_desc_tex.index = index++;
      [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
      arg_desc_tex.index = index++;
      [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
      arg_desc_tex.index = index++;
      [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_3d */

      [arg_desc_tex release];

      if (@available(macos 12.0, *)) {
        if (use_metalrt) {
          MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
          arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
          arg_desc_as.access = MTLArgumentAccessReadOnly;

          MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
          arg_desc_ptrs.dataType = MTLDataTypePointer;
          arg_desc_ptrs.access = MTLArgumentAccessReadOnly;

          MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
          arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
          arg_desc_ift.access = MTLArgumentAccessReadOnly;

          arg_desc_as.index = index++;
          [ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */

          /* Intersection function tables */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow_all */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_volume */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_mblur */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit */
          arg_desc_ift.index = index++;
          [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_single_hit_mblur */

          arg_desc_ptrs.index = index++;
          [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas_accel_structs */

          [arg_desc_ift release];
          [arg_desc_as release];
          [arg_desc_ptrs release];
        }
      }

      mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];

      /* Prepare the BLAS arg encoder. */
      if (use_metalrt) {
        MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
        arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
        arg_desc_blas.access = MTLArgumentAccessReadOnly;
        mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
        [arg_desc_blas release];
      }

      for (int i = 0; i < ancillary_desc.count; i++) {
        [ancillary_desc[i] release];
      }
      [ancillary_desc release];
    }
    [arg_desc_params release];
    [arg_desc_texture release];
    /* FIX: arg_desc_buffer was allocated above but never released, leaking one
     * MTLArgumentDescriptor per device under MRC. Balance it like its siblings. */
    [arg_desc_buffer release];
  }
}
|
|
|
|
|
|
|
|
|
|
/* Tear down the device: cancel in-flight async compiles, release all Metal textures,
 * BVHs, binding buffers, argument encoders and command queues created by the constructor.
 * Runs under existing_devices_mutex so async workers cannot observe a half-destroyed device. */
MetalDevice::~MetalDevice()
{
  /* Cancel any async shader compilations that are in flight. */
  cancel();

  /* This lock safeguards against destruction during use (see other uses of
   * existing_devices_mutex). */
  thread_scoped_lock lock(existing_devices_mutex);

  /* Release every Metal texture still referenced by the texture slot table. */
  int num_resources = texture_info.size();
  for (int res = 0; res < num_resources; res++) {
    if (is_texture(texture_info[res])) {
      [texture_slot_map[res] release];
      texture_slot_map[res] = nil;
    }
  }

  free_bvh();
  flush_delayed_free_list();

  /* The three binding buffers are created together, so one check suffices. */
  if (texture_bindings_2d) {
    stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
                   texture_bindings_3d.allocatedSize);
    [buffer_bindings_1d release];
    [texture_bindings_2d release];
    [texture_bindings_3d release];
  }
  [mtlTextureArgEncoder release];
  [mtlBufferKernelParamsEncoder release];
  [mtlBufferArgEncoder release];
  [mtlASArgEncoder release];
  [mtlAncillaryArgEncoder release];
  /* FIX: mtlBlasArgEncoder is created with a `new…` method in the constructor when MetalRT is
   * enabled but was never released here (messaging nil is a harmless no-op otherwise). */
  [mtlBlasArgEncoder release];
  [mtlComputeCommandQueue release];
  [mtlGeneralCommandQueue release];
  if (mtlCounterSampleBuffer) {
    [mtlCounterSampleBuffer release];
  }
  [mtlDevice release];

  texture_info.free();
}
|
|
|
|
|
|
2023-07-05 12:02:06 +02:00
|
|
|
bool MetalDevice::support_device(const uint /*kernel_features*/)
{
  /* Every device returned by MetalInfo::get_usable_devices() supports the full kernel
   * feature set, so no per-feature filtering is needed here. */
  return true;
}
|
|
|
|
|
|
2023-07-05 12:02:06 +02:00
|
|
|
bool MetalDevice::check_peer_access(Device * /*peer_device*/)
{
  /* Peer access is not implemented for Metal — it is unclear it even makes sense here,
   * so assert if anything ever asks. */
  assert(0);
  return false;
}
|
|
|
|
|
|
|
|
|
|
bool MetalDevice::use_adaptive_compilation()
{
  /* Adaptive kernel compilation is purely debug-flag driven. */
  return DebugFlags().metal.adaptive_compile;
}
|
|
|
|
|
|
2023-02-06 11:16:02 +00:00
|
|
|
bool MetalDevice::use_local_atomic_sort() const
{
  /* Threadgroup-local atomic sorting is purely debug-flag driven. */
  return DebugFlags().metal.use_local_atomic_sort;
}
|
|
|
|
|
|
2023-02-26 11:55:22 +13:00
|
|
|
/* Assemble the global #define block for a kernel source variant and (optionally) patch
 * `source` in place for specialization. Returns the MD5 of the defines, which identifies
 * the PSO variant. NOTE: the KERNEL_STRUCT_* macro expansion below is order-sensitive —
 * keep the define/include/undef sequence intact. */
string MetalDevice::preprocess_source(MetalPipelineType pso_type,
                                      const uint kernel_features,
                                      string *source)
{
  string global_defines;
  if (use_adaptive_compilation()) {
    global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
  }

  if (use_local_atomic_sort()) {
    global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
  }

  if (use_metalrt) {
    global_defines += "#define __METALRT__\n";
    if (motion_blur) {
      global_defines += "#define __METALRT_MOTION__\n";
    }
  }

# ifdef WITH_CYCLES_DEBUG
  global_defines += "#define WITH_CYCLES_DEBUG\n";
# endif

  global_defines += "#define __KERNEL_METAL_APPLE__\n";
  if (@available(macos 14.0, *)) {
    /* Use Program Scope Global Built-ins, when available. */
    global_defines += "#define __METAL_GLOBAL_BUILTINS__\n";
  }
# ifdef WITH_NANOVDB
  /* Compiling in NanoVDB results in a marginal drop in render performance,
   * so disable it for specialized PSOs when no textures are using it. */
  if ((pso_type == PSO_GENERIC || using_nanovdb) && DebugFlags().metal.use_nanovdb) {
    global_defines += "#define WITH_NANOVDB\n";
  }
# endif

  /* Bake the host macOS major version into the kernel defines. */
  NSProcessInfo *processInfo = [NSProcessInfo processInfo];
  NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
  global_defines += "#define __KERNEL_METAL_MACOS__ " + to_string(macos_ver.majorVersion) + "\n";

# if TARGET_CPU_ARM64
  global_defines += "#define __KERNEL_METAL_TARGET_CPU_ARM64__\n";
# endif

  /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
   * the same character length. Build a string of all active constant values which is then hashed
   * in order to identify the PSO.
   */
  if (pso_type != PSO_GENERIC) {
    if (source) {
      const double starttime = time_dt();

# define KERNEL_STRUCT_BEGIN(name, parent) \
  string_replace_same_length(*source, "kernel_data." #parent ".", "kernel_data_" #parent "_");

      bool next_member_is_specialized = true;

# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;

# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
  if (!next_member_is_specialized) { \
    string_replace( \
        *source, "kernel_data_" #parent "_" #name, "kernel_data." #parent ".__unused_" #name); \
    next_member_is_specialized = true; \
  }

# include "kernel/data_template.h"

# undef KERNEL_STRUCT_MEMBER
# undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
# undef KERNEL_STRUCT_BEGIN

      metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
    }

    /* Opt in to all of available specializations. This can be made more granular for the
     * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
     * but the overhead should be negligible as these are very quick to (re)build and aren't
     * serialized to disk via MTLBinaryArchives.
     */
    global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
  }

  if (source) {
    *source = global_defines + *source;
  }

  MD5Hash md5;
  md5.append(global_defines);
  return md5.get_hex();
}
|
|
|
|
|
|
|
|
|
|
/* Build the (unspecialized) kernel source for the given PSO type, expand includes,
 * then run specialization preprocessing and record the resulting defines hash. */
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
  string &src = this->source[pso_type];
  src = "\n#include \"kernel/device/metal/kernel.metal\"\n";
  src = path_source_replace_includes(src, path_get("source"));

  /* Perform any required specialization on the source.
   * With Metal function constants we can generate a single variant of the kernel source which can
   * be repeatedly respecialized.
   */
  global_defines_md5[pso_type] = preprocess_source(pso_type, kernel_features, &src);
}
|
|
|
|
|
|
|
|
|
|
/* Accumulate requested kernel features and, if the generic kernels are not already cached,
 * kick off an asynchronous background compile. Always returns true on a supported device. */
bool MetalDevice::load_kernels(const uint _kernel_features)
{
  @autoreleasepool {
    kernel_features |= _kernel_features;

    /* Check if GPU is supported. */
    if (!support_device(kernel_features)) {
      return false;
    }

    /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
     * This is necessary since objects may be reported to have motion if the Vector pass is
     * active, but may still need to be rendered without motion blur if that isn't active as well.
     */
    if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) {
      motion_blur = true;
    }

    /* Only request generic kernels if they aren't cached in memory. */
    refresh_source_and_kernels_md5(PSO_GENERIC);
    if (MetalDeviceKernels::should_load_kernels(this, PSO_GENERIC)) {
      /* If needed, load them asynchronously in order to responsively message progress to the user.
       * Capture the ID, not `this`, so the worker can detect device destruction. */
      int this_device_id = this->device_id;
      auto compile_kernels_fn = ^() {
        compile_and_load(this_device_id, PSO_GENERIC);
      };

      dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
                     compile_kernels_fn);
    }
  }
  return true;
}
|
2022-05-11 14:52:49 +01:00
|
|
|
|
2023-10-25 17:47:13 +02:00
|
|
|
/* Recompute the checksum that identifies the kernel set for `pso_type`, rebuilding the
 * kernel source first if the injected #define block changed. For specialized pipelines the
 * checksum also folds in the current KernelData specialization constants so that pipeline
 * lookup can distinguish between scenes. */
void MetalDevice::refresh_source_and_kernels_md5(MetalPipelineType pso_type)
{
  string defines_md5 = preprocess_source(pso_type, kernel_features);

  /* Rebuild the source string if the injected block of #defines has changed. */
  if (global_defines_md5[pso_type] != defines_md5) {
    make_source(pso_type, kernel_features);
  }

  string constant_values;
  if (pso_type != PSO_GENERIC) {
    /* Expand kernel/data_template.h: each KERNEL_STRUCT_MEMBER appends a "parent.name=value"
     * line, unless the preceding KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE marker opted it out. */
    bool next_member_is_specialized = true;

# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;

    /* Add specialization constants to md5 so that 'get_best_pipeline' is able to return a suitable
     * match. */
# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
  if (next_member_is_specialized) { \
    constant_values += string(#parent "." #name "=") + \
                       to_string(_type(launch_params.data.parent.name)) + "\n"; \
  } \
  else { \
    next_member_is_specialized = true; \
  }

# include "kernel/data_template.h"

# undef KERNEL_STRUCT_MEMBER
# undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
  }

  /* Checksum = specialization constants + full kernel source (+ MetalRT feature bits). */
  MD5Hash md5;
  md5.append(constant_values);
  md5.append(source[pso_type]);
  if (use_metalrt) {
    md5.append(string_printf("metalrt_features=%d", kernel_features & METALRT_FEATURE_MASK));
  }
  kernels_md5[pso_type] = md5.get_hex();
}
|
|
|
|
|
|
2025-01-01 18:15:54 +01:00
|
|
|
/* Compile the MSL source for `pso_type` into an MTLLibrary and kick off the AIR->PSO builds.
 * Runs on a background queue; identifies the device by integer ID and re-resolves it under
 * existing_devices_mutex on each side of the (slow) front-end compile, so the MetalDevice may
 * be freely destroyed while compilation is in flight. */
void MetalDevice::compile_and_load(const int device_id, MetalPipelineType pso_type)
{
  @autoreleasepool {
    /* Thread-safe front-end compilation. Typically the MSL->AIR compilation can take a few
     * seconds, so we avoid blocking device tear-down if the user cancels a render immediately. */

    id<MTLDevice> mtlDevice;
    string source;

    /* Safely gather any state required for the MSL->AIR compilation. */
    {
      thread_scoped_lock lock(existing_devices_mutex);

      /* Check whether the device still exists. */
      MetalDevice *instance = get_device_by_ID(device_id, lock);
      if (!instance) {
        metal_printf("Ignoring %s compilation request - device no longer exists\n",
                     kernel_type_as_string(pso_type));
        return;
      }

      if (!MetalDeviceKernels::should_load_kernels(instance, pso_type)) {
        /* We already have a full set of matching pipelines which are cached or queued. Return
         * early to avoid redundant MTLLibrary compilation. */
        /* Fixed typo in the log message ("Ignoreing" -> "Ignoring"). */
        metal_printf("Ignoring %s compilation request - kernels already requested\n",
                     kernel_type_as_string(pso_type));
        return;
      }

      /* Copy what the compile needs so the lock can be dropped. */
      mtlDevice = instance->mtlDevice;
      source = instance->source[pso_type];
    }

    /* Perform the actual compilation using our cached context. The MetalDevice can safely destruct
     * in this time. */

    MTLCompileOptions *options = [[MTLCompileOptions alloc] init];

    options.fastMathEnabled = YES;
    /* Pick the newest Metal language version this SDK / OS combination offers. */
    if (@available(macos 12.0, *)) {
      options.languageVersion = MTLLanguageVersion2_4;
    }
# if defined(MAC_OS_VERSION_13_0)
    if (@available(macos 13.0, *)) {
      options.languageVersion = MTLLanguageVersion3_0;
    }
# endif
# if defined(MAC_OS_VERSION_14_0)
    if (@available(macos 14.0, *)) {
      options.languageVersion = MTLLanguageVersion3_1;
    }
# endif

    /* Dump the generated source for debugging / profiling sessions. */
    if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
      path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
                      source);
    }

    double starttime = time_dt();

    NSError *error = nullptr;
    id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
                                                        options:options
                                                          error:&error];

    metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
                 time_dt() - starttime,
                 kernel_type_as_string(pso_type));

    [options release];

    /* When profiling or warming up a benchmark, serialize PSO builds to keep timings clean. */
    bool blocking_pso_build = (getenv("CYCLES_METAL_PROFILING") ||
                               MetalDeviceKernels::is_benchmark_warmup());
    if (blocking_pso_build) {
      MetalDeviceKernels::wait_for_all();
      /* Zero acts as a "no back-end build started" marker for the timing report below. */
      starttime = 0.0;
    }

    /* Save the compiled MTLLibrary and trigger the AIR->PSO builds (if the MetalDevice still
     * exists). */
    {
      thread_scoped_lock lock(existing_devices_mutex);
      if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
        if (mtlLibrary) {
          /* A non-nil error alongside a valid library carries compiler warnings. */
          if (error && [error localizedDescription]) {
            VLOG_WARNING << "MSL compilation messages: "
                         << [[error localizedDescription] UTF8String];
          }

          instance->mtlLibrary[pso_type] = mtlLibrary;

          starttime = time_dt();
          MetalDeviceKernels::load(instance, pso_type);
        }
        else {
          NSString *err = [error localizedDescription];
          instance->set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
        }
      }
    }

    if (starttime && blocking_pso_build) {
      MetalDeviceKernels::wait_for_all();

      metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
                   time_dt() - starttime,
                   kernel_type_as_string(pso_type));
    }
  }
}
|
|
|
|
|
|
2023-02-10 18:44:46 +01:00
|
|
|
/* A TextureInfo describes a real 2D/3D image when it has a nonzero height (and depth for 3D);
 * otherwise the data is stored as a plain 1D buffer. */
bool MetalDevice::is_texture(const TextureInfo &tex)
{
  if (tex.height > 0) {
    return true;
  }
  return tex.depth > 0;
}
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
/* Flush pending texture metadata to the device and rebind every slot in the 2D and 3D
 * argument buffers. A slot is bound into the argument buffer matching its texture type and
 * nil'd in the other one, so kernels can index either array safely. */
void MetalDevice::load_texture_info()
{
  if (need_texture_info) {
    /* Unset flag before copying. */
    need_texture_info = false;
    texture_info.copy_to_device();

    int num_textures = texture_info.size();

    for (int tex = 0; tex < num_textures; tex++) {
      /* Each slot in the argument buffers is one pointer wide. */
      uint64_t offset = tex * sizeof(void *);
      if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
        id<MTLTexture> metal_texture = texture_slot_map[tex];
        MTLTextureType type = metal_texture.textureType;
        /* Bind into the 2D table only if it is a 2D texture, else clear the entry. */
        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
        [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
        /* Likewise for the 3D table. */
        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
        [mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
      }
      else {
        /* Empty or buffer-backed slot: clear both tables. */
        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
        [mtlTextureArgEncoder setTexture:nil atIndex:0];
        [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
        [mtlTextureArgEncoder setTexture:nil atIndex:0];
      }
    }
  }
}
|
|
|
|
|
|
2022-01-19 17:57:24 +00:00
|
|
|
/* Drop all bookkeeping for `mem`: memory statistics, the device pointer, and the
 * MetalMem entry (including its reference inside the launch params). */
void MetalDevice::erase_allocation(device_memory &mem)
{
  stats.mem_free(mem.device_size);
  mem.device_pointer = 0;
  mem.device_size = 0;

  auto entry = metal_mem_map.find(&mem);
  if (entry == metal_mem_map.end()) {
    return;
  }

  /* blank out reference to MetalMem* in the launch params (fixes crash #94736) */
  MetalMem *metal_mem = entry->second.get();
  if (metal_mem->pointer_index >= 0) {
    device_ptr *launch_pointers = (device_ptr *)&launch_params;
    launch_pointers[metal_mem->pointer_index] = 0;
  }
  metal_mem_map.erase(entry);
}
|
|
|
|
|
|
2025-01-01 18:15:54 +01:00
|
|
|
/* Report whether total device allocations have passed the safe working-set limit.
 * We're allowed to allocate beyond the safe working set size, but then if all resources
 * are made resident we will get command buffer failures at render time. */
bool MetalDevice::max_working_set_exceeded(const size_t safety_margin) const
{
  const size_t safe_limit = [mtlDevice recommendedMaxWorkingSetSize] - safety_margin;
  return stats.mem_used > safe_limit;
}
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
/* Allocate an MTLBuffer for `mem` and register it in metal_mem_map. For shared-storage
 * buffers the host pointer is redirected to the buffer's own mapping (unified memory),
 * so later copies become no-ops. Returns nullptr on allocation failure or when the
 * working-set limit is exceeded (set_error is called in both cases). */
MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
{
  @autoreleasepool {
    size_t size = mem.memory_size();

    mem.device_pointer = 0;

    id<MTLBuffer> metal_buffer = nil;
    MTLResourceOptions options = MTLResourceStorageModeShared;

    if (size > 0) {
      /* Device-only memory can live in private storage, except when GPU capture needs
       * CPU visibility. */
      if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) {
        options = MTLResourceStorageModePrivate;
      }

      metal_buffer = [mtlDevice newBufferWithLength:size options:options];

      if (!metal_buffer) {
        set_error("System is out of GPU memory");
        return nullptr;
      }
    }

    if (mem.name) {
      VLOG_WORK << "Buffer allocate: " << mem.name << ", "
                << string_human_readable_number(mem.memory_size()) << " bytes. ("
                << string_human_readable_size(mem.memory_size()) << ")";
    }

    /* For a zero-size request metal_buffer is nil; messaging nil yields 0 here. */
    mem.device_size = metal_buffer.allocatedSize;
    stats.mem_alloc(mem.device_size);

    /* Label the buffer for GPU debuggers / captures. */
    metal_buffer.label = [NSString stringWithFormat:@"%s", mem.name];

    std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);

    assert(metal_mem_map.count(&mem) == 0); /* assert against double-alloc */
    unique_ptr<MetalMem> mmem = make_unique<MetalMem>();

    mmem->mem = &mem;
    mmem->mtlBuffer = metal_buffer;
    mmem->offset = 0;
    mmem->size = size;
    /* Private-storage buffers have no CPU-visible contents. */
    if (options != MTLResourceStorageModePrivate) {
      mmem->hostPtr = [metal_buffer contents];
    }
    else {
      mmem->hostPtr = nullptr;
    }

    /* encode device_pointer as (MetalMem*) in order to handle resource relocation and device
     * pointer recalculation */
    mem.device_pointer = device_ptr(mmem.get());

    if (metal_buffer.storageMode == MTLStorageModeShared) {
      /* Replace host pointer with our host allocation. */
      if (mem.host_pointer && mem.host_pointer != mmem->hostPtr) {
        memcpy(mmem->hostPtr, mem.host_pointer, size);

        host_free(mem.type, mem.host_pointer, mem.memory_size());
        mem.host_pointer = mmem->hostPtr;
      }
      /* Shared mappings are reference counted; see generic_free for the matching decrement. */
      mem.shared_pointer = mmem->hostPtr;
      mem.shared_counter++;
    }

    /* Grab the raw pointer before the map takes ownership of the unique_ptr. */
    MetalMem *mmem_ptr = mmem.get();
    metal_mem_map[&mem] = std::move(mmem);

    if (max_working_set_exceeded()) {
      set_error("System is out of GPU memory");
      return nullptr;
    }

    return mmem_ptr;
  }
}
|
|
|
|
|
|
2025-03-19 12:53:01 +01:00
|
|
|
/* Intentionally a no-op: generic_alloc maps shared buffers so host writes land directly in
 * the MTLBuffer's storage. */
void MetalDevice::generic_copy_to(device_memory &)
{
  /* No need to copy - Apple Silicon has Unified Memory Architecture. */
}
|
|
|
|
|
|
|
|
|
|
/* Release the device allocation behind `mem`. Shared mappings are reference counted
 * (see generic_alloc); the MTLBuffer is only queued for release when the last reference
 * goes away. The actual buffer release is deferred via delayed_free_list. */
void MetalDevice::generic_free(device_memory &mem)
{
  if (!mem.device_pointer) {
    return;
  }

  /* Host pointer should already have been freed at this point. If not we might
   * end up freeing shared memory and can't recover original host memory. */
  assert(mem.host_pointer == nullptr);

  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
  MetalMem &mmem = *metal_mem_map.at(&mem);
  size_t size = mmem.size;

  bool free_mtlBuffer = true;

  /* If this is shared, reference counting is used to safely free memory. */
  if (mem.shared_pointer) {
    assert(mem.shared_counter > 0);
    if (--mem.shared_counter > 0) {
      free_mtlBuffer = false;
    }
  }

  if (free_mtlBuffer) {
    if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
      /* Safely move the device-side data back to the host before it is freed.
       * We should actually never reach this code as it is inefficient, but
       * better than to crash if there is a bug. */
      assert(!"Metal device should not copy memory back to host");
      mem.host_pointer = mem.host_alloc(size);
      memcpy(mem.host_pointer, mem.shared_pointer, size);
    }

    mem.shared_pointer = nullptr;

    /* Free device memory. */
    delayed_free_list.push_back(mmem.mtlBuffer);
    mmem.mtlBuffer = nil;
  }

  /* Always drop the bookkeeping entry, even when a shared buffer stays alive. */
  erase_allocation(mem);
}
|
|
|
|
|
|
|
|
|
|
/* Allocate device storage for `mem`. Textures must go through tex_alloc instead;
 * everything else (including MEM_GLOBAL) uses the generic buffer path. */
void MetalDevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
    return;
  }
  generic_alloc(mem);
}
|
|
|
|
|
|
|
|
|
|
/* Upload `mem` to the device, allocating first when no device pointer exists yet.
 * Non-resident (peer) memory that is already allocated is left untouched. */
void MetalDevice::mem_copy_to(device_memory &mem)
{
  if (!mem.device_pointer) {
    /* First use: route to the type-specific allocation path. */
    switch (mem.type) {
      case MEM_GLOBAL:
        global_alloc(mem);
        break;
      case MEM_TEXTURE:
        tex_alloc((device_texture &)mem);
        break;
      default:
        generic_alloc(mem);
        generic_copy_to(mem);
        break;
    }
    return;
  }

  if (mem.is_resident(this)) {
    switch (mem.type) {
      case MEM_TEXTURE:
        tex_copy_to((device_texture &)mem);
        break;
      default:
        /* MEM_GLOBAL and plain buffers share the same (unified-memory no-op) copy path. */
        generic_copy_to(mem);
        break;
    }
  }
}
|
|
|
|
|
|
|
|
|
|
/* Unsupported on Metal by design; shared-memory buffers make an explicit move redundant. */
void MetalDevice::mem_move_to_host(device_memory & /*mem*/)
{
  /* Metal implements own mechanism for moving host memory. */
  assert(!"Metal does not support mem_move_to_host");
}
|
|
|
|
|
|
2025-03-19 12:53:01 +01:00
|
|
|
/* Intentionally a no-op: with shared storage the host already sees device-side writes. */
void MetalDevice::mem_copy_from(device_memory &, const size_t, size_t, const size_t, size_t)
{
  /* No need to copy - Apple Silicon has Unified Memory Architecture. */
}
|
|
|
|
|
|
|
|
|
|
/* Zero-fill `mem`, allocating it on first use. Zeroing goes through the shared host
 * mapping set up by generic_alloc. */
void MetalDevice::mem_zero(device_memory &mem)
{
  if (mem.device_pointer == 0) {
    mem_alloc(mem);
  }

  assert(mem.shared_pointer);
  const size_t num_bytes = mem.memory_size();
  memset(mem.shared_pointer, 0, num_bytes);
}
|
|
|
|
|
|
|
|
|
|
/* Free `mem` through the path matching its memory type. */
void MetalDevice::mem_free(device_memory &mem)
{
  switch (mem.type) {
    case MEM_GLOBAL:
      global_free(mem);
      break;
    case MEM_TEXTURE:
      tex_free((device_texture &)mem);
      break;
    default:
      generic_free(mem);
      break;
  }
}
|
|
|
|
|
|
2023-07-05 12:02:06 +02:00
|
|
|
/* Sub-pointer allocation is not implemented for the Metal backend.
 * METAL_WIP - revive if necessary. */
device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory & /*mem*/,
                                          size_t /*offset*/,
                                          size_t /*size*/)
{
  assert(0);
  return 0;
}
|
|
|
|
|
|
2023-01-04 14:23:33 +00:00
|
|
|
/* Remove this device's ID from the list of active devices. Any pending compilation requests
 * originating from this session will be cancelled (they re-resolve the ID and bail out). */
void MetalDevice::cancel()
{
  thread_scoped_lock lock(existing_devices_mutex);
  if (device_id == 0) {
    return;
  }
  active_device_ids.erase(device_id);
  device_id = 0;
}
|
2022-07-12 15:32:46 +02:00
|
|
|
|
2023-01-04 14:23:33 +00:00
|
|
|
/* Report whether the generic kernel set is fully loaded. `status` receives a user-facing
 * progress/summary string. Returns true immediately on error so callers don't hang. */
bool MetalDevice::is_ready(string &status) const
{
  /* Avoid hanging if we had an error. */
  if (!error_msg.empty()) {
    return true;
  }

  const int loaded_count = MetalDeviceKernels::get_loaded_kernel_count(this, PSO_GENERIC);
  if (loaded_count < DEVICE_KERNEL_NUM) {
    status = string_printf("%d / %d render kernels loaded (may take a few minutes the first time)",
                           loaded_count,
                           DEVICE_KERNEL_NUM);
    return false;
  }

  /* Generic kernels are ready; describe the specialization state. */
  if (int num_requests = MetalDeviceKernels::num_incomplete_specialization_requests()) {
    status = string_printf("%d kernels to optimize", num_requests);
  }
  else if (kernel_specialization_level == PSO_SPECIALIZED_INTERSECT) {
    status = "Using optimized intersection kernels";
  }
  else if (kernel_specialization_level == PSO_SPECIALIZED_SHADE) {
    status = "Using optimized kernels";
  }

  metal_printf("MetalDevice::is_ready(...) --> true\n");
  return true;
}
|
2022-07-12 15:32:46 +02:00
|
|
|
|
2023-01-04 14:23:33 +00:00
|
|
|
/* Request scene-specialized kernel builds up to the configured specialization level.
 * Usually runs asynchronously; profiling and benchmark warm-up force a blocking build. */
void MetalDevice::optimize_for_scene(Scene *scene)
{
  MetalPipelineType specialization_level = kernel_specialization_level;

  if (!scene->params.background) {
    /* In live viewport, don't specialize beyond intersection kernels for responsiveness. */
    specialization_level = (MetalPipelineType)min(specialization_level, PSO_SPECIALIZED_INTERSECT);
  }

  /* For responsive rendering, specialize the kernels in the background, and only if there isn't an
   * existing "optimize_for_scene" request in flight. */
  /* Capture the device ID by value: compile_and_load re-resolves the device, so this
   * block is safe even if the MetalDevice is destroyed before it runs. */
  int this_device_id = this->device_id;
  auto specialize_kernels_fn = ^() {
    /* Build every specialization level up to the requested one, in order. */
    for (int level = 1; level <= int(specialization_level); level++) {
      compile_and_load(this_device_id, MetalPipelineType(level));
    }
  };

  /* In normal use, we always compile the specialized kernels in the background. */
  bool specialize_in_background = true;

  /* Block if a per-kernel profiling is enabled (ensure steady rendering rate). */
  if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
    specialize_in_background = false;
  }

  /* Block during benchmark warm-up to ensure kernels are cached prior to the observed run. */
  if (MetalDeviceKernels::is_benchmark_warmup()) {
    specialize_in_background = false;
  }

  if (specialize_in_background) {
    if (MetalDeviceKernels::num_incomplete_specialization_requests() == 0) {
      dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
                     specialize_kernels_fn);
    }
    else {
      metal_printf("\"optimize_for_scene\" request already in flight - dropping request\n");
    }
  }
  else {
    specialize_kernels_fn();
  }
}
|
|
|
|
|
|
2025-01-01 18:15:54 +01:00
|
|
|
/* Copy a named constant block into launch_params. "data" updates KernelData (and refreshes
 * the specialized-kernel checksums); "integrator_state" and the KERNEL_DATA_ARRAY names
 * update device-pointer slots, also recording each pointer's index so erase_allocation can
 * blank it later. */
void MetalDevice::const_copy_to(const char *name, void *host, const size_t size)
{
  if (strcmp(name, "data") == 0) {
    assert(size == sizeof(KernelData));
    memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));

    /* Refresh the kernels_md5 checksums for specialized kernel sets. */
    for (int level = 1; level <= int(kernel_specialization_level); level++) {
      refresh_source_and_kernels_md5(MetalPipelineType(level));
    }
    return;
  }

  /* Copies `data_size` bytes into launch_params at `offset`, then records the slot index
   * into each MetalMem within the first `pointers_size` bytes (which hold device_ptr values
   * encoded as MetalMem*; see generic_alloc). */
  auto update_launch_pointers =
      [&](size_t offset, void *data, const size_t data_size, const size_t pointers_size) {
        memcpy((uint8_t *)&launch_params + offset, data, data_size);

        MetalMem **mmem = (MetalMem **)data;
        int pointer_count = pointers_size / sizeof(device_ptr);
        int pointer_index = offset / sizeof(device_ptr);
        for (int i = 0; i < pointer_count; i++) {
          if (mmem[i]) {
            mmem[i]->pointer_index = pointer_index + i;
          }
        }
      };

  /* Update data storage pointers in launch parameters. */
  if (strcmp(name, "integrator_state") == 0) {
    /* IntegratorStateGPU is contiguous pointers */
    const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
    update_launch_pointers(
        offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
  }
/* Expand one `else if` branch per data array declared in kernel/data_arrays.h. */
# define KERNEL_DATA_ARRAY(data_type, tex_name) \
  else if (strcmp(name, #tex_name) == 0) { \
    update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
  }
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}
|
|
|
|
|
|
|
|
|
|
/* Allocate a global data array. Resident memory gets a device buffer; in all cases the
 * device pointer is published into launch_params under the array's name. */
void MetalDevice::global_alloc(device_memory &mem)
{
  const bool resident = mem.is_resident(this);
  if (resident) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }

  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}
|
|
|
|
|
|
|
|
|
|
/* Free a global data array, but only when this device owns the allocation. */
void MetalDevice::global_free(device_memory &mem)
{
  if (!mem.is_resident(this)) {
    return;
  }
  if (mem.device_pointer) {
    generic_free(mem);
  }
}
|
|
|
|
|
|
|
|
|
|
/* Store a 1D texture as a plain buffer, binding it into the 1D buffer argument table and
 * recording its metadata in texture_info. Also flags NanoVDB usage for kernel setup. */
void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
{
  MetalDevice::MetalMem *mmem = generic_alloc(mem);
  /* generic_alloc returns nullptr on allocation failure or working-set overflow (it has
   * already called set_error); bail out instead of dereferencing it below. */
  if (!mmem) {
    return;
  }
  generic_copy_to(mem);

  /* Resize once */
  const uint slot = mem.slot;
  if (slot >= texture_info.size()) {
    /* Allocate some slots in advance, to reduce amount
     * of re-allocations. */
    texture_info.resize(round_up(slot + 1, 128));
    texture_slot_map.resize(round_up(slot + 1, 128));
  }

  texture_info[slot] = mem.info;
  /* Encode the buffer's GPU address into this slot of the 1D bindings table, and mirror
   * that address into texture_info so kernels can fetch it directly. */
  uint64_t offset = slot * sizeof(void *);
  [mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
  [mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
  texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
  /* No MTLTexture for buffer-backed slots. */
  texture_slot_map[slot] = nil;
  need_texture_info = true;

  if (mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
      mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 ||
      mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN ||
      mem.info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16)
  {
    using_nanovdb = true;
  }
}
|
|
|
|
|
|
|
|
|
|
void MetalDevice::tex_alloc(device_texture &mem)
|
|
|
|
|
{
|
2023-10-24 23:20:16 +01:00
|
|
|
@autoreleasepool {
|
|
|
|
|
/* Check that dimensions fit within maximum allowable size.
|
|
|
|
|
* If 1D texture is allocated, use 1D buffer.
|
|
|
|
|
* See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
|
|
|
|
|
if (mem.data_height > 0) {
|
|
|
|
|
if (mem.data_width > 16384 || mem.data_height > 16384) {
|
|
|
|
|
set_error(string_printf(
|
|
|
|
|
"Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
|
|
|
|
|
mem.data_width,
|
|
|
|
|
mem.data_height));
|
|
|
|
|
return;
|
|
|
|
|
}
|
2023-02-10 18:44:46 +01:00
|
|
|
}
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
/* General variables for both architectures */
|
|
|
|
|
size_t size = mem.memory_size();
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
/* sampler_index maps into the GPU's constant 'metal_samplers' array */
|
|
|
|
|
uint64_t sampler_index = mem.info.extension;
|
|
|
|
|
if (mem.info.interpolation != INTERPOLATION_CLOSEST) {
|
|
|
|
|
sampler_index += 4;
|
|
|
|
|
}
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
/* Image Texture Storage */
|
|
|
|
|
MTLPixelFormat format;
|
|
|
|
|
switch (mem.data_type) {
|
|
|
|
|
case TYPE_UCHAR: {
|
|
|
|
|
MTLPixelFormat formats[] = {MTLPixelFormatR8Unorm,
|
|
|
|
|
MTLPixelFormatRG8Unorm,
|
|
|
|
|
MTLPixelFormatInvalid,
|
|
|
|
|
MTLPixelFormatRGBA8Unorm};
|
|
|
|
|
format = formats[mem.data_elements - 1];
|
|
|
|
|
} break;
|
|
|
|
|
case TYPE_UINT16: {
|
|
|
|
|
MTLPixelFormat formats[] = {MTLPixelFormatR16Unorm,
|
|
|
|
|
MTLPixelFormatRG16Unorm,
|
|
|
|
|
MTLPixelFormatInvalid,
|
|
|
|
|
MTLPixelFormatRGBA16Unorm};
|
|
|
|
|
format = formats[mem.data_elements - 1];
|
|
|
|
|
} break;
|
|
|
|
|
case TYPE_UINT: {
|
|
|
|
|
MTLPixelFormat formats[] = {MTLPixelFormatR32Uint,
|
|
|
|
|
MTLPixelFormatRG32Uint,
|
|
|
|
|
MTLPixelFormatInvalid,
|
|
|
|
|
MTLPixelFormatRGBA32Uint};
|
|
|
|
|
format = formats[mem.data_elements - 1];
|
|
|
|
|
} break;
|
|
|
|
|
case TYPE_INT: {
|
|
|
|
|
MTLPixelFormat formats[] = {MTLPixelFormatR32Sint,
|
|
|
|
|
MTLPixelFormatRG32Sint,
|
|
|
|
|
MTLPixelFormatInvalid,
|
|
|
|
|
MTLPixelFormatRGBA32Sint};
|
|
|
|
|
format = formats[mem.data_elements - 1];
|
|
|
|
|
} break;
|
|
|
|
|
case TYPE_FLOAT: {
|
|
|
|
|
MTLPixelFormat formats[] = {MTLPixelFormatR32Float,
|
|
|
|
|
MTLPixelFormatRG32Float,
|
|
|
|
|
MTLPixelFormatInvalid,
|
|
|
|
|
MTLPixelFormatRGBA32Float};
|
|
|
|
|
format = formats[mem.data_elements - 1];
|
|
|
|
|
} break;
|
|
|
|
|
case TYPE_HALF: {
|
|
|
|
|
MTLPixelFormat formats[] = {MTLPixelFormatR16Float,
|
|
|
|
|
MTLPixelFormatRG16Float,
|
|
|
|
|
MTLPixelFormatInvalid,
|
|
|
|
|
MTLPixelFormatRGBA16Float};
|
|
|
|
|
format = formats[mem.data_elements - 1];
|
|
|
|
|
} break;
|
|
|
|
|
default:
|
|
|
|
|
assert(0);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
assert(format != MTLPixelFormatInvalid);
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
id<MTLTexture> mtlTexture = nil;
|
2025-01-09 12:04:08 +01:00
|
|
|
size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
if (mem.data_depth > 1) {
|
|
|
|
|
/* 3D texture using array */
|
|
|
|
|
MTLTextureDescriptor *desc;
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
|
|
|
|
|
width:mem.data_width
|
|
|
|
|
height:mem.data_height
|
|
|
|
|
mipmapped:NO];
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2025-03-19 12:53:01 +01:00
|
|
|
desc.storageMode = MTLStorageModeShared;
|
2023-10-24 23:20:16 +01:00
|
|
|
desc.usage = MTLTextureUsageShaderRead;
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
desc.textureType = MTLTextureType3D;
|
|
|
|
|
desc.depth = mem.data_depth;
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
VLOG_WORK << "Texture 3D allocate: " << mem.name << ", "
|
|
|
|
|
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
|
|
|
|
<< string_human_readable_size(mem.memory_size()) << ")";
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
|
|
|
|
|
if (!mtlTexture) {
|
|
|
|
|
set_error("System is out of GPU memory");
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const size_t imageBytes = src_pitch * mem.data_height;
|
|
|
|
|
for (size_t d = 0; d < mem.data_depth; d++) {
|
|
|
|
|
const size_t offset = d * imageBytes;
|
|
|
|
|
[mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
|
|
|
|
|
mipmapLevel:0
|
|
|
|
|
slice:0
|
|
|
|
|
withBytes:(uint8_t *)mem.host_pointer + offset
|
|
|
|
|
bytesPerRow:src_pitch
|
|
|
|
|
bytesPerImage:0];
|
|
|
|
|
}
|
2021-12-07 15:11:35 +00:00
|
|
|
}
|
2023-10-24 23:20:16 +01:00
|
|
|
else if (mem.data_height > 0) {
|
|
|
|
|
/* 2D texture */
|
|
|
|
|
MTLTextureDescriptor *desc;
|
|
|
|
|
|
|
|
|
|
desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
|
|
|
|
|
width:mem.data_width
|
|
|
|
|
height:mem.data_height
|
|
|
|
|
mipmapped:NO];
|
|
|
|
|
|
2025-03-19 12:53:01 +01:00
|
|
|
desc.storageMode = MTLStorageModeShared;
|
2023-10-24 23:20:16 +01:00
|
|
|
desc.usage = MTLTextureUsageShaderRead;
|
|
|
|
|
|
|
|
|
|
VLOG_WORK << "Texture 2D allocate: " << mem.name << ", "
|
|
|
|
|
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
|
|
|
|
<< string_human_readable_size(mem.memory_size()) << ")";
|
|
|
|
|
|
|
|
|
|
mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
|
|
|
|
|
if (!mtlTexture) {
|
|
|
|
|
set_error("System is out of GPU memory");
|
|
|
|
|
return;
|
|
|
|
|
}
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
[mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
|
2021-12-07 15:11:35 +00:00
|
|
|
mipmapLevel:0
|
2023-10-24 23:20:16 +01:00
|
|
|
withBytes:mem.host_pointer
|
|
|
|
|
bytesPerRow:src_pitch];
|
2021-12-07 15:11:35 +00:00
|
|
|
}
|
2023-10-24 23:20:16 +01:00
|
|
|
else {
|
|
|
|
|
/* 1D texture, using linear memory. */
|
|
|
|
|
tex_alloc_as_buffer(mem);
|
2022-12-07 13:28:59 +00:00
|
|
|
return;
|
|
|
|
|
}
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
mem.device_pointer = (device_ptr)mtlTexture;
|
|
|
|
|
mem.device_size = size;
|
|
|
|
|
stats.mem_alloc(size);
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
|
2024-12-29 23:13:45 +01:00
|
|
|
unique_ptr<MetalMem> mmem = make_unique<MetalMem>();
|
2023-10-24 23:20:16 +01:00
|
|
|
mmem->mem = &mem;
|
|
|
|
|
mmem->mtlTexture = mtlTexture;
|
2024-12-29 23:13:45 +01:00
|
|
|
metal_mem_map[&mem] = std::move(mmem);
|
2023-10-24 23:20:16 +01:00
|
|
|
|
|
|
|
|
/* Resize once */
|
|
|
|
|
const uint slot = mem.slot;
|
|
|
|
|
if (slot >= texture_info.size()) {
|
|
|
|
|
/* Allocate some slots in advance, to reduce amount
|
|
|
|
|
* of re-allocations. */
|
|
|
|
|
texture_info.resize(slot + 128);
|
|
|
|
|
texture_slot_map.resize(slot + 128);
|
|
|
|
|
|
|
|
|
|
ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
|
|
|
|
|
if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
|
|
|
|
|
if (texture_bindings_2d) {
|
|
|
|
|
delayed_free_list.push_back(buffer_bindings_1d);
|
|
|
|
|
delayed_free_list.push_back(texture_bindings_2d);
|
|
|
|
|
delayed_free_list.push_back(texture_bindings_3d);
|
|
|
|
|
|
|
|
|
|
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
|
|
|
|
|
texture_bindings_3d.allocatedSize);
|
|
|
|
|
}
|
|
|
|
|
buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
|
2025-03-19 12:53:01 +01:00
|
|
|
options:MTLResourceStorageModeShared];
|
2023-10-24 23:20:16 +01:00
|
|
|
texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
|
2025-03-19 12:53:01 +01:00
|
|
|
options:MTLResourceStorageModeShared];
|
2023-10-24 23:20:16 +01:00
|
|
|
texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
|
2025-03-19 12:53:01 +01:00
|
|
|
options:MTLResourceStorageModeShared];
|
2023-10-24 23:20:16 +01:00
|
|
|
|
|
|
|
|
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
|
|
|
|
|
texture_bindings_3d.allocatedSize);
|
2021-12-07 15:11:35 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-16 19:03:23 +01:00
|
|
|
/* Optimize the texture for GPU access. */
|
|
|
|
|
id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
|
|
|
|
|
id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
|
|
|
|
|
[blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
|
|
|
|
|
[blitCommandEncoder endEncoding];
|
|
|
|
|
[commandBuffer commit];
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
/* Set Mapping and tag that we need to (re-)upload to device */
|
|
|
|
|
texture_slot_map[slot] = mtlTexture;
|
|
|
|
|
texture_info[slot] = mem.info;
|
|
|
|
|
need_texture_info = true;
|
2021-12-07 15:11:35 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
|
2022-12-07 13:28:59 +00:00
|
|
|
|
2023-10-24 23:20:16 +01:00
|
|
|
if (max_working_set_exceeded()) {
|
|
|
|
|
set_error("System is out of GPU memory");
|
|
|
|
|
}
|
2022-12-07 13:28:59 +00:00
|
|
|
}
|
2021-12-07 15:11:35 +00:00
|
|
|
}
|
|
|
|
|
|
2025-01-09 12:04:08 +01:00
|
|
|
/* Upload texture contents from the host to the device-side MTLTexture,
 * or through the generic path for 1D textures stored as linear buffers.
 * No-op when the texture is not resident on this device. */
void MetalDevice::tex_copy_to(device_texture &mem)
{
  if (!mem.is_resident(this)) {
    return;
  }

  /* Bytes per row of the source data on the host. */
  const size_t row_bytes = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;

  /* Fetch the Metal texture under the map lock; the upload itself runs unlocked. */
  auto locked_texture_lookup = [&]() -> id<MTLTexture> {
    std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
    return metal_mem_map.at(&mem)->mtlTexture;
  };

  if (mem.data_depth > 0) {
    /* 3D texture: upload one depth-slice at a time. */
    id<MTLTexture> texture = locked_texture_lookup();
    const size_t slice_bytes = row_bytes * mem.data_height;
    for (size_t z = 0; z < mem.data_depth; z++) {
      [texture replaceRegion:MTLRegionMake3D(0, 0, z, mem.data_width, mem.data_height, 1)
                 mipmapLevel:0
                       slice:0
                   withBytes:(uint8_t *)mem.host_pointer + z * slice_bytes
                 bytesPerRow:row_bytes
               bytesPerImage:0];
    }
  }
  else if (mem.data_height > 0) {
    /* 2D texture: single whole-image upload. */
    id<MTLTexture> texture = locked_texture_lookup();
    [texture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
               mipmapLevel:0
                 withBytes:mem.host_pointer
               bytesPerRow:row_bytes];
  }
  else {
    /* 1D texture uses linear buffer storage. */
    generic_copy_to(mem);
  }
}
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
/* Free a device texture: 1D textures go through the generic buffer path,
 * 2D/3D textures are unmapped from the bindless slot table and their
 * MTLTexture is queued on the delayed-free list (released after the
 * in-flight command buffer completes). */
void MetalDevice::tex_free(device_texture &mem)
{
  if (mem.data_depth == 0 && mem.data_height == 0) {
    /* 1D texture was allocated as a linear buffer (tex_alloc_as_buffer). */
    generic_free(mem);
    return;
  }

  /* Acquire the lock BEFORE querying the map: the previous code evaluated
   * metal_mem_map.count() outside the lock, which is an unsynchronized read
   * of a mutex-protected container and races with concurrent insert/erase. */
  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
  auto it = metal_mem_map.find(&mem);
  if (it != metal_mem_map.end()) {
    MetalMem &mmem = *it->second;

    /* Clear the bindless slot entry, guarding against a stale slot index. */
    assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
    if (texture_slot_map[mem.slot] == mmem.mtlTexture) {
      texture_slot_map[mem.slot] = nil;
    }

    if (mmem.mtlTexture) {
      /* Free bindless texture. Deferred until the current command buffer has
       * completed, since the GPU may still reference it. */
      delayed_free_list.push_back(mmem.mtlTexture);
      mmem.mtlTexture = nil;
    }
    erase_allocation(mem);
  }
}
|
|
|
|
|
|
|
|
|
|
/* Create a new work queue bound to this device. */
unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
{
  unique_ptr<DeviceQueue> queue = make_unique<MetalDeviceQueue>(this);
  return queue;
}
|
|
|
|
|
|
2025-04-14 14:06:58 +02:00
|
|
|
/* Graphics interop is always available with unified memory, provided the
 * display side is also backed by Metal. */
bool MetalDevice::should_use_graphics_interop(const GraphicsInteropDevice &interop_device,
                                              const bool /*log*/)
{
  if (interop_device.type == GraphicsInteropDevice::METAL) {
    return true;
  }
  return false;
}
|
|
|
|
|
|
2024-02-06 21:13:23 +01:00
|
|
|
/* Return the underlying MTLBuffer for a device pointer owned by this device.
 * The device_ptr is a MetalMem handle for buffer-backed allocations. */
void *MetalDevice::get_native_buffer(device_ptr ptr)
{
  MetalMem *mmem = reinterpret_cast<MetalMem *>(ptr);
  return mmem->mtlBuffer;
}
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
/* Release any Metal resources that the host freed while a command buffer was
 * being generated. Must be called after each command buffer completes, so the
 * GPU can no longer be referencing the resources. */
void MetalDevice::flush_delayed_free_list()
{
  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
  for (auto &resource : delayed_free_list) {
    [resource release];
  }
  delayed_free_list.clear();
}
|
|
|
|
|
|
|
|
|
|
/* Build (or refit) the acceleration structure for this device.
 * BVH2 layouts fall back to the generic host-side builder; other layouts are
 * built as native Metal acceleration structures via BVHMetal. */
void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
{
  /* Autorelease pool scopes the temporary Metal objects created during the build. */
  @autoreleasepool {
    if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
      /* Host-side BVH2: delegate to the generic implementation. */
      Device::build_bvh(bvh, progress, refit);
      return;
    }

    BVHMetal *bvh_metal = static_cast<BVHMetal *>(bvh);
    /* Propagate device settings that affect how the BVH is constructed. */
    bvh_metal->motion_blur = motion_blur;
    bvh_metal->use_pcmi = use_pcmi;
    if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) {

      /* Only the top-level (TLAS) build updates the device's active BVH state. */
      if (bvh->params.top_level) {
        update_bvh(bvh_metal);
      }
    }

    /* The build may have pushed GPU memory usage over the working-set limit. */
    if (max_working_set_exceeded()) {
      set_error("System is out of GPU memory");
    }
  }
}
|
|
|
|
|
|
2024-07-08 16:18:34 +02:00
|
|
|
/* Release the acceleration structures currently held by this device:
 * all retained BLAS handles, the BLAS argument buffer, and the TLAS. */
void MetalDevice::free_bvh()
{
  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
    [blas release];
  }
  unique_blas_array.clear();

  /* Messaging nil is a no-op in Objective-C, so the releases below are safe
   * without explicit guards. */
  [blas_buffer release];
  blas_buffer = nil;

  [accel_struct release];
  accel_struct = nil;
}
|
|
|
|
|
|
|
|
|
|
/* Take ownership of the acceleration structures built by `bvh_metal` and
 * (re)build the argument buffer that exposes the BLAS array to kernels.
 * Passing null simply frees the current BVH state. */
void MetalDevice::update_bvh(BVHMetal *bvh_metal)
{
  /* Drop references to the previous BVH before adopting the new one. */
  free_bvh();

  if (!bvh_metal) {
    return;
  }

  accel_struct = bvh_metal->accel_struct;
  unique_blas_array = bvh_metal->unique_blas_array;

  /* Retain so the structures outlive bvh_metal; released again in free_bvh().
   * (If accel_struct is nil, retain is a harmless no-op.) */
  [accel_struct retain];
  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
    [blas retain];
  }

  // Allocate required buffers for BLAS array.
  uint64_t count = bvh_metal->blas_array.size();
  uint64_t buffer_size = mtlBlasArgEncoder.encodedLength * count;
  blas_buffer = [mtlDevice newBufferWithLength:buffer_size options:MTLResourceStorageModeShared];
  /* NOTE(review): this mem_alloc has no visible matching stats.mem_free when
   * free_bvh() releases blas_buffer — confirm the accounting is balanced
   * elsewhere, otherwise the stats counter drifts on each rebuild. */
  stats.mem_alloc(blas_buffer.allocatedSize);

  /* Encode each BLAS into its slot of the argument buffer; null entries
   * (e.g. failed/empty geometry) are left unencoded. */
  for (uint64_t i = 0; i < count; ++i) {
    if (bvh_metal->blas_array[i]) {
      [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
      [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
    }
  }
}
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
CCL_NAMESPACE_END
|
|
|
|
|
|
|
|
|
|
#endif
|