/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */
|
2021-12-07 15:11:35 +00:00
|
|
|
|
|
|
|
|
#ifdef WITH_METAL
|
|
|
|
|
|
2024-12-26 17:53:59 +01:00
|
|
|
# include <algorithm>
|
|
|
|
|
# include <atomic>
|
|
|
|
|
# include <chrono>
|
|
|
|
|
# include <deque>
|
|
|
|
|
# include <thread>
|
|
|
|
|
# include <vector>
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
# include "device/metal/device_impl.h"
|
2024-12-26 17:53:59 +01:00
|
|
|
# include "device/metal/kernel.h"
|
|
|
|
|
|
2022-07-12 15:32:46 +02:00
|
|
|
# include "kernel/device/metal/function_constants.h"
|
2024-12-26 17:53:59 +01:00
|
|
|
|
|
|
|
|
# include "util/debug.h"
|
2021-12-07 15:11:35 +00:00
|
|
|
# include "util/md5.h"
|
|
|
|
|
# include "util/path.h"
|
|
|
|
|
# include "util/tbb.h"
|
|
|
|
|
# include "util/time.h"
|
2022-05-11 14:52:49 +01:00
|
|
|
# include "util/unique_ptr.h"
|
2021-12-07 15:11:35 +00:00
|
|
|
|
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
|
|
2022-07-12 15:32:46 +02:00
|
|
|
/* Return a human-readable name for a pipeline specialization level, for logging.
 * Returns "" (after asserting in debug builds) for unknown values. */
const char *kernel_type_as_string(MetalPipelineType pso_type)
{
  if (pso_type == PSO_GENERIC) {
    return "PSO_GENERIC";
  }
  if (pso_type == PSO_SPECIALIZED_INTERSECT) {
    return "PSO_SPECIALIZED_INTERSECT";
  }
  if (pso_type == PSO_SPECIALIZED_SHADE) {
    return "PSO_SPECIALIZED_SHADE";
  }
  assert(0);
  return "";
}
|
|
|
|
|
|
2022-05-11 14:52:49 +01:00
|
|
|
/* Per-device cache of compiled Metal kernel pipelines.
 * Owns a pool of background compile threads which service a request queue, and an
 * occupancy-tuning LUT of per-kernel threadgroup sizes selected by GPU architecture. */
struct ShaderCache {
  ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
  {
    /* Initialize occupancy tuning LUT. */

    // TODO: Look into tuning for DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT and
    // DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT.

    switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
      default:
      /* NOTE: unknown/future architectures fall through to the M3 settings. */
      case APPLE_M3:
        /* Peak occupancy is achieved through Dynamic Caching on M3 GPUs. */
        for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
          occupancy_tuning[i] = {64, 64};
        }
        break;
      case APPLE_M2_BIG:
        /* Entries are {threads_per_threadgroup, num_threads_per_block} per kernel. */
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {384, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {640, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {1024, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {704, 704};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {640, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {896, 768};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {32, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {768, 576};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {896, 768};
        break;
      case APPLE_M2:
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
        break;
      case APPLE_M1:
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
        occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
        break;
    }

    /* Sort-pass tuning is shared by all architectures. */
    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
  }
  ~ShaderCache();

  /* Get the fastest available pipeline for the specified kernel. */
  MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device);

  /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
   * device. */
  void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);

  /* Whether a compile request for this kernel/specialization would be useful
   * (kernel is relevant to the scene and not already cached or requested). */
  bool should_load_kernel(DeviceKernel device_kernel,
                          const MetalDevice *device,
                          MetalPipelineType pso_type);

  /* Block until all outstanding compile requests have completed. */
  void wait_for_all();

  friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);

  /* Body of each background compile thread: pop requests and compile them. */
  void compile_thread_func();

  using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;

  /* Per-kernel threadgroup sizing; zero threads_per_threadgroup means "no tuning entry". */
  struct OccupancyTuningParameters {
    int threads_per_threadgroup = 0;
    int num_threads_per_block = 0;
  } occupancy_tuning[DEVICE_KERNEL_NUM];

  /* Guards `pipelines` and `request_queue`, and pairs with `cond_var`. */
  std::mutex cache_mutex;

  /* Compiled pipelines, per device kernel. */
  PipelineCollection pipelines[DEVICE_KERNEL_NUM];
  id<MTLDevice> mtlDevice;

  /* NOTE: static, so destruction of any ShaderCache stops all compile threads. */
  static bool running;
  std::condition_variable cond_var;
  std::deque<unique_ptr<MetalKernelPipeline>> request_queue;
  std::vector<std::thread> compile_threads;
  /* Outstanding request counters, decremented by compile threads as requests finish. */
  std::atomic_int incomplete_requests = 0;
  std::atomic_int incomplete_specialization_requests = 0;
};
|
|
|
|
|
|
2023-01-04 14:23:33 +00:00
|
|
|
bool ShaderCache::running = true;

/* Fixed-size registry mapping each MTLDevice to its ShaderCache. */
const int MAX_POSSIBLE_GPUS_ON_SYSTEM = 8;
using DeviceShaderCache = std::pair<id<MTLDevice>, unique_ptr<ShaderCache>>;
int g_shaderCacheCount = 0;
DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];

/* Next UID for associating a MetalDispatchPipeline with an originating MetalKernelPipeline. */
static std::atomic_int g_next_pipeline_id = 0;
|
|
|
|
|
|
2022-05-11 14:52:49 +01:00
|
|
|
/* Return the ShaderCache for the given device, creating it on first use.
 *
 * Fix: the original published the new slot (by incrementing g_shaderCacheCount) before
 * initializing it and outside any lock, so a concurrent caller could observe a
 * half-initialized entry or two threads could create duplicate caches for the same
 * device. The lookup and creation now both happen under one mutex, the bounds check
 * happens before any slot is claimed, and the count is published only after the entry
 * is fully initialized. */
ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
{
  static thread_mutex g_shaderCacheMutex;
  thread_scoped_lock lock(g_shaderCacheMutex);

  /* Re-check under the lock so concurrent first calls for the same device
   * cannot both miss and create duplicates. */
  for (int i = 0; i < g_shaderCacheCount; i++) {
    if (g_shaderCache[i].first == mtlDevice) {
      return g_shaderCache[i].second.get();
    }
  }

  assert(g_shaderCacheCount < MAX_POSSIBLE_GPUS_ON_SYSTEM);
  int index = g_shaderCacheCount;
  g_shaderCache[index].first = mtlDevice;
  g_shaderCache[index].second = make_unique<ShaderCache>(mtlDevice);
  /* Publish the entry only after it is fully initialized. */
  g_shaderCacheCount = index + 1;
  return g_shaderCache[index].second.get();
}
|
|
|
|
|
|
|
|
|
|
/* Stop all compile threads and wait for them to exit.
 * NOTE: `running` is static, so this halts compile threads of every ShaderCache. */
ShaderCache::~ShaderCache()
{
  running = false;
  /* Wake any threads blocked on the request queue so they can observe !running. */
  cond_var.notify_all();

  metal_printf("Waiting for ShaderCache threads... (incomplete_requests = %d)\n",
               int(incomplete_requests));
  for (auto &thread : compile_threads) {
    thread.join();
  }
  metal_printf("ShaderCache shut down.\n");
}
|
|
|
|
|
|
|
|
|
|
/* Block the calling thread until every outstanding compile request has completed.
 * Polls the atomic counter that the compile threads decrement as requests finish. */
void ShaderCache::wait_for_all()
{
  for (;;) {
    if (incomplete_requests <= 0) {
      break;
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
}
|
|
|
|
|
|
2024-03-25 11:36:15 +01:00
|
|
|
/* Main loop of a background compile thread.
 * Pops requests off `request_queue`, compiles them, and publishes the result into
 * `pipelines`, evicting the oldest same-specialization entry when over the cap. */
void ShaderCache::compile_thread_func()
{
  while (running) {
    /* wait for / acquire next request */
    unique_ptr<MetalKernelPipeline> pipeline;
    {
      thread_scoped_lock lock(cache_mutex);
      cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
      /* Re-check both conditions: another thread may have drained the queue between
       * the notify and this thread reacquiring the lock. */
      if (!running || request_queue.empty()) {
        continue;
      }

      pipeline = std::move(request_queue.front());
      request_queue.pop_front();
    }

    /* Service the request. */
    DeviceKernel device_kernel = pipeline->device_kernel;
    MetalPipelineType pso_type = pipeline->pso_type;

    if (MetalDevice::is_device_cancelled(pipeline->originating_device_id)) {
      /* The originating MetalDevice is no longer active, so this request is obsolete. */
      metal_printf("Cancelling compilation of %s (%s)\n",
                   device_kernel_as_string(device_kernel),
                   kernel_type_as_string(pso_type));
    }
    else {
      /* Do the actual compilation. */
      pipeline->compile();

      thread_scoped_lock lock(cache_mutex);
      auto &collection = pipelines[device_kernel];

      /* Cache up to 3 kernel variants with the same pso_type in memory, purging oldest first. */
      int max_entries_of_same_pso_type = 3;
      /* Walk newest-to-oldest; the third same-type hit (index closest to 0) is the oldest. */
      for (int i = (int)collection.size() - 1; i >= 0; i--) {
        if (collection[i]->pso_type == pso_type) {
          max_entries_of_same_pso_type -= 1;
          if (max_entries_of_same_pso_type == 0) {
            metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
                         kernel_type_as_string(pso_type),
                         device_kernel_as_string(device_kernel));
            collection.erase(collection.begin() + i);
            break;
          }
        }
      }
      collection.push_back(std::move(pipeline));
    }
    /* Counters are decremented for cancelled and compiled requests alike, so
     * wait_for_all() cannot hang on cancelled work. */
    incomplete_requests--;
    if (pso_type != PSO_GENERIC) {
      incomplete_specialization_requests--;
    }
  }
}
|
|
|
|
|
|
2022-07-12 15:32:46 +02:00
|
|
|
/* Decide whether a compile request for (kernel, specialization) is worthwhile.
 * Returns false for kernels the scene doesn't need, specializations that can't help,
 * or kernels already cached/requested with a matching source checksum. */
bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
                                     const MetalDevice *device,
                                     MetalPipelineType pso_type)
{
  /* Shutting down - don't accept new work. */
  if (!running) {
    return false;
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
    /* Skip megakernel. */
    return false;
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
    if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
      /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
      return false;
    }
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
    if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
      /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
      return false;
    }
  }

  if (pso_type != PSO_GENERIC) {
    /* Only specialize kernels where it can make an impact. */
    if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
        device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL)
    {
      return false;
    }

    /* Only specialize shading / intersection kernels as requested. */
    bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
    bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
    if (is_shade_pso != is_shade_kernel) {
      return false;
    }
  }

  {
    /* check whether the kernel has already been requested / cached */
    thread_scoped_lock lock(cache_mutex);
    for (auto &pipeline : pipelines[device_kernel]) {
      /* Checksum match means an identical pipeline already exists (or is compiling). */
      if (pipeline->kernels_md5 == device->kernels_md5[pso_type]) {
        return false;
      }
    }
  }

  return true;
}
|
|
|
|
|
|
|
|
|
|
/* Queue a non-blocking compile request for (kernel, specialization).
 * Spawns the compile-thread pool on first use; no-op if should_load_kernel() declines. */
void ShaderCache::load_kernel(DeviceKernel device_kernel,
                              MetalDevice *device,
                              MetalPipelineType pso_type)
{
  {
    /* create compiler threads on first run */
    thread_scoped_lock lock(cache_mutex);
    if (compile_threads.empty()) {
      /* Limit to 2 MTLCompiler instances by default. In macOS >= 13.3 we can query the upper
       * limit. */
      int max_mtlcompiler_threads = 2;

# if defined(MAC_OS_VERSION_13_3)
      if (@available(macOS 13.3, *)) {
        /* Subtract one to avoid contention with the real-time GPU module. */
        max_mtlcompiler_threads = max(2,
                                      int([mtlDevice maximumConcurrentCompilationTaskCount]) - 1);
      }
# endif

      metal_printf("Spawning %d Cycles kernel compilation threads\n", max_mtlcompiler_threads);
      for (int i = 0; i < max_mtlcompiler_threads; i++) {
        compile_threads.emplace_back([this] { this->compile_thread_func(); });
      }
    }
  }

  if (!should_load_kernel(device_kernel, device, pso_type)) {
    return;
  }

  /* Count the request before queueing so wait_for_all() can't miss it. */
  incomplete_requests++;
  if (pso_type != PSO_GENERIC) {
    incomplete_specialization_requests++;
  }

  /* Snapshot everything the compile thread will need from the device. */
  unique_ptr<MetalKernelPipeline> pipeline = make_unique<MetalKernelPipeline>();

  /* Keep track of the originating device's ID so that we can cancel requests if the device ceases
   * to be active. */
  pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
  pipeline->originating_device_id = device->device_id;
  pipeline->kernel_data_ = device->launch_params.data;
  pipeline->pso_type = pso_type;
  pipeline->mtlDevice = mtlDevice;
  pipeline->kernels_md5 = device->kernels_md5[pso_type];
  pipeline->mtlLibrary = device->mtlLibrary[pso_type];
  pipeline->device_kernel = device_kernel;
  pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;

  /* Apply per-architecture occupancy tuning when an entry exists for this kernel. */
  if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
    pipeline->threads_per_threadgroup = occupancy_tuning[device_kernel].threads_per_threadgroup;
    pipeline->num_threads_per_block = occupancy_tuning[device_kernel].num_threads_per_block;
  }

  /* metalrt options */
  pipeline->use_metalrt = device->use_metalrt;
  pipeline->kernel_features = device->kernel_features;

  {
    thread_scoped_lock lock(cache_mutex);
    request_queue.push_back(std::move(pipeline));
  }
  /* Wake one compile thread to service the new request. */
  cond_var.notify_one();
}
|
2022-04-26 19:00:35 +01:00
|
|
|
|
2022-05-11 14:52:49 +01:00
|
|
|
/* Return the most-specialized loaded pipeline matching the device's kernel checksums,
 * blocking (polling) until one is available. Returns nullptr on shutdown or device error. */
MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
{
  while (running && !device->has_error) {
    /* Search all loaded pipelines with matching kernels_md5 checksums. */
    MetalKernelPipeline *best_match = nullptr;
    {
      thread_scoped_lock lock(cache_mutex);
      for (auto &candidate : pipelines[kernel]) {
        if (candidate->loaded &&
            candidate->kernels_md5 == device->kernels_md5[candidate->pso_type])
        {
          /* Replace existing match if candidate is more specialized. */
          if (!best_match || candidate->pso_type > best_match->pso_type) {
            best_match = candidate.get();
          }
        }
      }
    }

    if (best_match) {
      /* Log the first use of a specialized pipeline. */
      if (best_match->usage_count == 0 && best_match->pso_type != PSO_GENERIC) {
        metal_printf("Swapping in %s version of %s\n",
                     kernel_type_as_string(best_match->pso_type),
                     device_kernel_as_string(kernel));
      }
      best_match->usage_count += 1;
      return best_match;
    }

    /* Spin until a matching kernel is loaded, or we're shutting down. */
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  return nullptr;
}
|
|
|
|
|
|
2022-07-12 15:32:46 +02:00
|
|
|
/* Decide whether this pipeline should be stored in / loaded from a MTLBinaryArchive.
 * Archives only the slow-to-compile kernels, and only on macOS versions where binary
 * archives are reliable. */
bool MetalKernelPipeline::should_use_binary_archive() const
{
  /* Issues with binary archives in older macOS versions. */
  if (@available(macOS 15.4, *)) {
    if (auto *str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
      if (atoi(str) != 0) {
        /* Don't archive if we have opted out by env var. */
        return false;
      }
    }

    if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
      /* Binary linked functions aren't supported in binary archives. */
      return false;
    }

    if (pso_type == PSO_GENERIC) {
      /* Archive the generic kernels. */
      return true;
    }

    if ((device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
         device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) ||
        (device_kernel >= DEVICE_KERNEL_SHADER_EVAL_DISPLACE &&
         device_kernel <= DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY))
    {
      /* Archive all shade kernels - they take a long time to compile. */
      return true;
    }

    /* The remaining kernels are all fast to compile. They may get cached by the system shader
     * cache, but will be quick to regenerate if not. */
  }
  return false;
}
|
|
|
|
|
|
2024-12-29 17:32:00 +01:00
|
|
|
/* Build the MTLFunctionConstantValues for a kernel compile.
 * When `data` is given (specialized pipelines), each KernelData member marked as
 * specializable is baked in as a function constant; with no argument (generic
 * pipelines) every constant is set from zeroed data. */
static MTLFunctionConstantValues *GetConstantValues(const KernelData *data = nullptr)
{
  MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];

  /* Aliases so the KERNEL_STRUCT_MEMBER macro can paste MTLDataType_<type>. */
  MTLDataType MTLDataType_int = MTLDataTypeInt;
  MTLDataType MTLDataType_float = MTLDataTypeFloat;
  MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
  KernelData zero_data = {0};
  if (!data) {
    data = &zero_data;
  }
  [constant_values setConstantValue:&zero_data type:MTLDataType_int atIndex:Kernel_DummyConstant];

  /* KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE clears this flag for the member that follows. */
  bool next_member_is_specialized = true;

# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;

  /* Expanded once per KernelData member by the data-template include below:
   * specialized members get their real value, others get zeroed data. */
# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
  [constant_values setConstantValue:next_member_is_specialized ? (void *)&data->parent.name : \
                                                                 (void *)&zero_data \
                               type:MTLDataType_##_type \
                            atIndex:KernelData_##parent##_##name]; \
  next_member_is_specialized = true;

# include "kernel/data_template.h"

  return constant_values;
}
|
|
|
|
|
|
2024-07-08 16:18:34 +02:00
|
|
|
/* Release and clear every intersection function table owned by this dispatch pipeline. */
void MetalDispatchPipeline::free_intersection_function_tables()
{
  for (int i = 0; i < METALRT_TABLE_NUM; i++) {
    if (intersection_func_table[i]) {
      /* Manual release: these tables are created outside ARC ownership. */
      [intersection_func_table[i] release];
      intersection_func_table[i] = nil;
    }
  }
}
|
|
|
|
|
|
|
|
|
|
/* Release any Metal intersection function tables still held by this pipeline. */
MetalDispatchPipeline::~MetalDispatchPipeline()
{
  free_intersection_function_tables();
}
|
|
|
|
|
|
|
|
|
|
/* Point this dispatch pipeline at the best available compiled pipeline for `kernel`,
 * rebuilding the MetalRT intersection function tables if the pipeline changed.
 * Returns false only when no pipeline is available. */
bool MetalDispatchPipeline::update(MetalDevice *metal_device, DeviceKernel kernel)
{
  const MetalKernelPipeline *best_pipeline = MetalDeviceKernels::get_best_pipeline(metal_device,
                                                                                   kernel);
  if (!best_pipeline) {
    return false;
  }

  if (pipeline_id == best_pipeline->pipeline_id) {
    /* The best pipeline is already active - nothing to do. */
    return true;
  }
  /* Adopt the new pipeline's state. */
  pipeline_id = best_pipeline->pipeline_id;
  pipeline = best_pipeline->pipeline;
  pso_type = best_pipeline->pso_type;
  num_threads_per_block = best_pipeline->num_threads_per_block;

  /* Create the MTLIntersectionFunctionTables if needed. */
  if (best_pipeline->use_metalrt && device_kernel_has_intersection(best_pipeline->device_kernel)) {
    /* Drop tables built for the previously-active pipeline. */
    free_intersection_function_tables();

    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
      @autoreleasepool {
        MTLIntersectionFunctionTableDescriptor *ift_desc =
            [[MTLIntersectionFunctionTableDescriptor alloc] init];
        ift_desc.functionCount = best_pipeline->table_functions[table].count;
        intersection_func_table[table] = [this->pipeline
            newIntersectionFunctionTableWithDescriptor:ift_desc];

        /* Finally write the function handles into this pipeline's table */
        int size = int([best_pipeline->table_functions[table] count]);
        for (int i = 0; i < size; i++) {
          id<MTLFunctionHandle> handle = [pipeline
              functionHandleWithFunction:best_pipeline->table_functions[table][i]];
          [intersection_func_table[table] setFunction:handle atIndex:i];
        }
      }
    }
  }

  return true;
}
|
|
|
|
|
|
Cycles: MetalRT optimisations (scene_intersect_shadow + random_walk)
This PR contains optimisations and a general tidy-up of the MetalRT backend.
- Currently `scene_intersect` is used for both normal and (opaque) shadow rays, however the usage patterns are different enough to warrant specialisation. Shadow intersection tests (flagged with `PATH_RAY_SHADOW_OPAQUE`) only need a bool result, but need a larger "self" payload in order to exclude hits against target lights. By specialising we can minimise the payload size in each case (which helps performance) and avoid some dynamic branching. This PR introduces a new `scene_intersect_shadow` function which is specialised in Metal, and currently redirects to `scene_intersect` in the other backends.
- Currently `scene_intersect_local` is implemented for worst-case payload requirements as demanded by `subsurface_disk` (where `max_hits` is 4). The random_walk case only demands 1 hit result which we can retrieve directly from the intersector object (rather than stashing it in the payload). By specialising, we significantly reduce the payload size for random_walk queries, which has a big impact on performance. Additionally, we only need to use a custom intersection function for the first ray test in a random walk (for self-primitive filtering), so this PR forces faster `opaque` intersection testing for all but the first random walk test.
- Currently `scene_intersect_volume` has a lot of redundant code to handle non-triangle primitives despite volumes only being enclosed by trimeshes. This PR removes this code.
Additionally, this PR tidies up the convoluted intersection function linking code, removes some redundant intersection handlers, and uses more consistent naming of intersection functions.
On a M3 MacBook Pro, these changes give 2-3% performance increase on typical scenes with opaque trimesh materials (e.g. barbershop, classroom junkshop), but can give over 15% performance increase for certain scenes using random walk SSS (e.g. monster).
Pull Request: https://projects.blender.org/blender/blender/pulls/121397
2024-05-10 16:38:02 +02:00
|
|
|
/* Create a named MetalRT intersection function from this pipeline's library,
 * applying the pipeline's function-constant specialization.
 * On failure, returns nil and records the error text in `error_str`. */
id<MTLFunction> MetalKernelPipeline::make_intersection_function(const char *function_name)
{
  MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
  desc.name = [@(function_name) copy];

  /* Specialized pipelines bake in the captured kernel data; generic ones use zeroed constants. */
  if (pso_type != PSO_GENERIC) {
    desc.constantValues = GetConstantValues(&kernel_data_);
  }
  else {
    desc.constantValues = GetConstantValues();
  }

  NSError *error = nullptr;
  id<MTLFunction> rt_intersection_function = [mtlLibrary newFunctionWithDescriptor:desc
                                                                             error:&error];

  if (rt_intersection_function == nil) {
    NSString *err = [error localizedDescription];
    string errors = [err UTF8String];

    error_str = string_printf(
        "Error getting intersection function \"%s\": %s", function_name, errors.c_str());
  }
  else {
    /* Label aids GPU-capture / debugger inspection. */
    rt_intersection_function.label = [@(function_name) copy];
  }
  return rt_intersection_function;
}
|
|
|
|
|
|
2022-07-12 15:32:46 +02:00
|
|
|
/* Compile the Metal compute pipeline for `device_kernel`.
 *
 * Creates the MTLFunction (optionally specialized via function constants), links any
 * MetalRT intersection functions the kernel needs, and builds the compute pipeline
 * state. When binary archives are enabled, the PSO is loaded from / saved to an
 * on-disk archive keyed by device name, kernel, PSO type and an MD5 of the inputs.
 * On failure, `pipeline` stays nil and a diagnostic is printed. */
void MetalKernelPipeline::compile()
{
  const std::string function_name = std::string("cycles_metal_") +
                                    device_kernel_as_string(device_kernel);

  NSError *error = nullptr;

  /* NOTE(review): an MTLIntersectionFunctionDescriptor is used for all kernels here,
   * including plain compute functions — confirm this is intentional. */
  MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
  func_desc.name = [@(function_name.c_str()) copy];

  /* Specialized PSOs bake the captured kernel data into function constants;
   * the generic PSO uses the default constant values. */
  if (pso_type != PSO_GENERIC) {
    func_desc.constantValues = GetConstantValues(&kernel_data_);
  }
  else {
    func_desc.constantValues = GetConstantValues();
  }

  function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];

  if (function == nil) {
    NSString *err = [error localizedDescription];
    string errors = [err UTF8String];
    metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
    return;
  }

  function.label = [@(function_name.c_str()) copy];

  NSArray *linked_functions = nil;

  if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
    NSMutableSet *unique_functions = [[NSMutableSet alloc] init];

    /* Populate one intersection-function table and accumulate the set of unique
     * functions to link. Curve and point entries are optional (the volume / local
     * tables only intersect triangles). */
    auto add_intersection_functions = [&](int table_index,
                                          const char *tri_fn,
                                          const char *curve_fn = nullptr,
                                          const char *point_fn = nullptr) {
      table_functions[table_index] = [NSArray
          arrayWithObjects:make_intersection_function(tri_fn),
                           curve_fn ? make_intersection_function(curve_fn) : nil,
                           point_fn ? make_intersection_function(point_fn) : nil,
                           nil];

      [unique_functions addObjectsFromArray:table_functions[table_index]];
    };

    add_intersection_functions(METALRT_TABLE_DEFAULT,
                               "__intersection__tri",
                               "__intersection__curve",
                               "__intersection__point");
    add_intersection_functions(METALRT_TABLE_SHADOW,
                               "__intersection__tri_shadow",
                               "__intersection__curve_shadow",
                               "__intersection__point_shadow");
    add_intersection_functions(METALRT_TABLE_SHADOW_ALL,
                               "__intersection__tri_shadow_all",
                               "__intersection__curve_shadow_all",
                               "__intersection__point_shadow_all");
    add_intersection_functions(METALRT_TABLE_VOLUME, "__intersection__volume_tri");
    add_intersection_functions(METALRT_TABLE_LOCAL, "__intersection__local_tri");
    add_intersection_functions(METALRT_TABLE_LOCAL_MBLUR, "__intersection__local_tri_mblur");
    add_intersection_functions(METALRT_TABLE_LOCAL_SINGLE_HIT,
                               "__intersection__local_tri_single_hit");
    add_intersection_functions(METALRT_TABLE_LOCAL_SINGLE_HIT_MBLUR,
                               "__intersection__local_tri_single_hit_mblur");

    /* Sort by label so the linked-function order is deterministic. */
    linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
        sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
          return [f1.label compare:f2.label];
        }];
    unique_functions = nil;
  }

  MTLComputePipelineDescriptor *computePipelineStateDescriptor =
      [[MTLComputePipelineDescriptor alloc] init];

  computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
  computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
  computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;

  computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
  computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;

  computePipelineStateDescriptor.computeFunction = function;

  /* Attach the additional functions to an MTLLinkedFunctions object. */
  if (linked_functions) {
    computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
    computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
  }
  /* Intersection functions need one extra level of call stack. */
  computePipelineStateDescriptor.maxCallStackDepth = 1;
  if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
    computePipelineStateDescriptor.maxCallStackDepth = 2;
  }

  MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;

  bool use_binary_archive = should_use_binary_archive();
  bool loading_existing_archive = false;
  bool creating_new_archive = false;

  id<MTLBinaryArchive> archive = nil;
  string metalbin_path;
  string metalbin_name;
  if (use_binary_archive) {
    /* Build a cache key from the kernel sources, OS version and threadgroup size. */
    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
    MD5Hash local_md5;
    local_md5.append(kernels_md5);
    local_md5.append(osVersion);
    local_md5.append((uint8_t *)&this->threads_per_threadgroup,
                     sizeof(this->threads_per_threadgroup));

    /* Replace non-alphanumerical characters with underscores. */
    string device_name = [mtlDevice.name UTF8String];
    for (char &c : device_name) {
      if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
        c = '_';
      }
    }

    metalbin_name = device_name;
    metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel));
    metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type));
    metalbin_name = path_join(metalbin_name, local_md5.get_hex() + ".bin");

    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
    path_create_directories(metalbin_path);

    /* Check if shader binary exists on disk, and if so, update the file timestamp for LRU purging
     * to work as intended. */
    loading_existing_archive = path_cache_kernel_exists_and_mark_used(metalbin_path);
    creating_new_archive = !loading_existing_archive;

    MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
    if (loading_existing_archive) {
      archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
    }
    NSError *error = nil;
    archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:&error];
    if (!archive) {
      const char *err = error ? [[error localizedDescription] UTF8String] : nullptr;
      metal_printf("newBinaryArchiveWithDescriptor failed: %s\n", err ? err : "nil");
    }
    [archiveDesc release];

    if (loading_existing_archive) {
      /* Fail fast if the PSO is missing from the archive, so we can fall back to a
       * full rebuild below. */
      pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
      computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
    }
  }

  bool recreate_archive = false;

  /* Lambda to do the actual pipeline compilation. */
  auto do_compilation = [&]() {
    __block bool compilation_finished = false;
    __block string error_str;

    if (loading_existing_archive || !DebugFlags().metal.use_async_pso_creation) {
      /* Use the blocking variant of newComputePipelineStateWithDescriptor if an archive exists on
       * disk. It should load almost instantaneously, and will fail gracefully when loading a
       * corrupt archive (unlike the async variant). */
      NSError *error = nil;
      pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
                                                          options:pipelineOptions
                                                       reflection:nullptr
                                                            error:&error];
      const char *err = error ? [[error localizedDescription] UTF8String] : nullptr;
      error_str = err ? err : "nil";
    }
    else {
      /* Use the async variant of newComputePipelineStateWithDescriptor if no archive exists on
       * disk. This allows us to respond to app shutdown. */
      [mtlDevice
          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
                                        options:pipelineOptions
                              completionHandler:^(id<MTLComputePipelineState> computePipelineState,
                                                  MTLComputePipelineReflection * /*reflection*/,
                                                  NSError *error) {
                                pipeline = computePipelineState;

                                /* Retain the pipeline so we can use it safely past the completion
                                 * handler. */
                                if (pipeline) {
                                  [pipeline retain];
                                }
                                const char *err = error ?
                                                      [[error localizedDescription] UTF8String] :
                                                      nullptr;
                                error_str = err ? err : "nil";

                                compilation_finished = true;
                              }];

      /* Immediately wait for either the compilation to finish or for app shutdown. */
      while (ShaderCache::running && !compilation_finished) {
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
      }
    }

    if (creating_new_archive && pipeline) {
      /* Add pipeline into the new archive. */
      NSError *error;
      if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
                                                        error:&error])
      {
        NSString *errStr = [error localizedDescription];
        metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
      }
    }

    if (!pipeline) {
      metal_printf(
          "newComputePipelineStateWithDescriptor failed for \"%s\"%s. "
          "Error:\n%s\n",
          device_kernel_as_string(device_kernel),
          (archive && !recreate_archive) ? " Archive may be incomplete or corrupt - attempting "
                                           "recreation.." :
                                           "",
          error_str.c_str());
    }
  };

  double starttime = time_dt();

  do_compilation();

  /* An archive might have a corrupt entry and fail to materialize the pipeline. This shouldn't
   * happen, but if it does we recreate it. */
  if (pipeline == nil && archive) {
    recreate_archive = true;
    pipelineOptions = MTLPipelineOptionNone;
    path_remove(metalbin_path);

    do_compilation();
  }

  double duration = time_dt() - starttime;

  if (pipeline == nil) {
    /* Cast the enum explicitly for the %2d varargs slot, matching the success-path
     * logging below. */
    metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
                 kernel_type_as_string(pso_type),
                 int(device_kernel),
                 device_kernel_as_string(device_kernel),
                 duration);
    return;
  }

  if (!num_threads_per_block) {
    /* Derive a default block size from the pipeline, but never below one SIMD width. */
    num_threads_per_block = round_down(pipeline.maxTotalThreadsPerThreadgroup,
                                       pipeline.threadExecutionWidth);
    num_threads_per_block = std::max(num_threads_per_block, (int)pipeline.threadExecutionWidth);
  }

  if (ShaderCache::running) {
    if (creating_new_archive || recreate_archive) {
      if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())] error:&error])
      {
        metal_printf("Failed to save binary archive to %s, error:\n%s\n",
                     metalbin_path.c_str(),
                     [[error localizedDescription] UTF8String]);
      }
      else {
        path_cache_kernel_mark_added_and_clear_old(metalbin_path);
      }
    }
  }

  this->loaded = true;
  [computePipelineStateDescriptor release];
  computePipelineStateDescriptor = nil;

  if (!use_binary_archive) {
    metal_printf("%16s | %2d | %-55s | %7.2fs\n",
                 kernel_type_as_string(pso_type),
                 int(device_kernel),
                 device_kernel_as_string(device_kernel),
                 duration);
  }
  else {
    metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s\n",
                 kernel_type_as_string(pso_type),
                 int(device_kernel),
                 device_kernel_as_string(device_kernel),
                 duration,
                 creating_new_archive ? " new" : "load",
                 metalbin_name.c_str());
  }
}
|
|
|
|
|
|
2022-07-12 15:32:46 +02:00
|
|
|
/* Request loading of every device kernel into this device's shader cache. */
bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
{
  auto *cache = get_shader_cache(device->mtlDevice);
  for (int kernel_index = 0; kernel_index < DEVICE_KERNEL_NUM; kernel_index++) {
    cache->load_kernel(static_cast<DeviceKernel>(kernel_index), device, pso_type);
  }
  return true;
}
|
2022-04-28 00:46:14 +02:00
|
|
|
|
2023-01-04 16:01:24 +00:00
|
|
|
/* Block until every registered shader cache has drained its outstanding work. */
void MetalDeviceKernels::wait_for_all()
{
  for (int cache_index = 0; cache_index < g_shaderCacheCount; cache_index++) {
    g_shaderCache[cache_index].second->wait_for_all();
  }
}
|
|
|
|
|
|
2023-02-28 11:42:08 +01:00
|
|
|
/* Sum the in-flight specialization requests across all ShaderCaches (typically
 * there will be only 1 cache). */
int MetalDeviceKernels::num_incomplete_specialization_requests()
{
  int num_requests = 0;
  for (int cache_index = 0; cache_index < g_shaderCacheCount; cache_index++) {
    num_requests += g_shaderCache[cache_index].second->incomplete_specialization_requests;
  }
  return num_requests;
}
|
|
|
|
|
|
2024-12-29 17:32:00 +01:00
|
|
|
/* Count how many device kernels are already loaded (i.e. do not need loading)
 * for the given device and pipeline type. */
int MetalDeviceKernels::get_loaded_kernel_count(const MetalDevice *device,
                                                MetalPipelineType pso_type)
{
  auto *cache = get_shader_cache(device->mtlDevice);
  int num_loaded = 0;
  for (int kernel_index = 0; kernel_index < DEVICE_KERNEL_NUM; kernel_index++) {
    if (!cache->should_load_kernel(static_cast<DeviceKernel>(kernel_index), device, pso_type)) {
      num_loaded += 1;
    }
  }
  return num_loaded;
}
|
|
|
|
|
|
2024-12-29 17:32:00 +01:00
|
|
|
/* True if at least one kernel still needs loading for this device / pipeline type. */
bool MetalDeviceKernels::should_load_kernels(const MetalDevice *device, MetalPipelineType pso_type)
{
  const int num_loaded = get_loaded_kernel_count(device, pso_type);
  return num_loaded != DEVICE_KERNEL_NUM;
}
|
|
|
|
|
|
2022-05-11 14:52:49 +01:00
|
|
|
/* Look up the best available pipeline for `kernel` in this device's shader cache. */
const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
                                                                 DeviceKernel kernel)
{
  auto *cache = get_shader_cache(device->mtlDevice);
  return cache->get_best_pipeline(kernel, device);
}
|
|
|
|
|
|
2023-01-04 16:01:24 +00:00
|
|
|
/* Detect the "--warm-up" command-line flag used by benchmark warm-up runs. */
bool MetalDeviceKernels::is_benchmark_warmup()
{
  NSArray *arguments = [[NSProcessInfo processInfo] arguments];
  for (NSString *argument in arguments) {
    /* cStringUsingEncoding can return nil for non-ASCII arguments; skip those. */
    const char *c_arg = [argument cStringUsingEncoding:NSASCIIStringEncoding];
    if (c_arg && strcmp(c_arg, "--warm-up") == 0) {
      return true;
    }
  }
  return false;
}
|
|
|
|
|
|
2024-09-27 14:39:49 +02:00
|
|
|
/* Release all shader caches at shutdown by resetting each registered entry to a
 * default-constructed (empty) DeviceShaderCache. */
void MetalDeviceKernels::static_deinitialize()
{
  for (int cache_index = 0; cache_index < g_shaderCacheCount; cache_index++) {
    g_shaderCache[cache_index] = DeviceShaderCache();
  }
}
|
|
|
|
|
|
2021-12-07 15:11:35 +00:00
|
|
|
CCL_NAMESPACE_END
|
|
|
|
|
|
2024-03-09 23:40:57 +11:00
|
|
|
#endif /* WITH_METAL */
|