diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index eff6384c85e..a27a75e48fa 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1543,6 +1543,17 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         default=False,
     )
 
+    kernel_optimization_level: EnumProperty(
+        name="Kernel Optimization",
+        description="Kernels can be optimized based on scene content. Optimized kernels are requested at the start of a render. If optimized kernels are not available, rendering will proceed using generic kernels until the optimized set is available in the cache. This can result in additional CPU usage for a brief time (tens of seconds).",
+        default='FULL',
+        items=(
+            ('OFF', "Off", "Disable kernel optimization. Slowest rendering, no extra background CPU usage"),
+            ('INTERSECT', "Intersection only", "Optimize only intersection kernels. Faster rendering, negligible extra background CPU usage"),
+            ('FULL', "Full", "Optimize all kernels. Fastest rendering, may result in extra background CPU usage"),
+        ),
+    )
+
     def find_existing_device_entry(self, device):
         for device_entry in self.devices:
             if device_entry.id == device[2] and device_entry.type == device[1]:
@@ -1711,10 +1722,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         if compute_device_type == 'METAL':
             import platform
             # MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
+            # Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
             if platform.machine() == 'arm64':
-                row = layout.row()
-                row.use_property_split = True
-                row.prop(self, "use_metalrt")
+                col = layout.column()
+                col.use_property_split = True
+                col.prop(self, "kernel_optimization_level")
+                col.prop(self, "use_metalrt")
 
     def draw(self, context):
         self.draw_impl(self.layout, context)
diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp
index 22beca898f1..96e7bdd03aa 100644
--- a/intern/cycles/blender/device.cpp
+++ b/intern/cycles/blender/device.cpp
@@ -30,7 +30,7 @@ int blender_device_threads(BL::Scene &b_scene)
   return 0;
 }
 
-DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scene, bool background)
+DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scene, bool background, bool preview)
 {
   PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
 
@@ -113,6 +113,18 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
     device.use_metalrt = true;
   }
 
+  if (preview) {
+    /* Disable specialization for preview renders. */
+    device.kernel_optimization_level = KERNEL_OPTIMIZATION_LEVEL_OFF;
+  }
+  else {
+    device.kernel_optimization_level = (KernelOptimizationLevel)get_enum(
+        cpreferences,
+        "kernel_optimization_level",
+        KERNEL_OPTIMIZATION_NUM_LEVELS,
+        KERNEL_OPTIMIZATION_LEVEL_FULL);
+  }
+
   return device;
 }
 
diff --git a/intern/cycles/blender/device.h b/intern/cycles/blender/device.h
index 7a762261829..08655743eeb 100644
--- a/intern/cycles/blender/device.h
+++ b/intern/cycles/blender/device.h
@@ -19,7 +19,8 @@ int blender_device_threads(BL::Scene &b_scene);
 
 /* Convert Blender settings to device specification. */
 DeviceInfo blender_device_info(BL::Preferences &b_preferences,
                                BL::Scene &b_scene,
-                               bool background);
+                               bool background,
+                               bool preview);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp
index cfc7a78143c..96cb204be4b 100644
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -754,7 +754,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
   RNA_id_pointer_create((ID *)PyLong_AsVoidPtr(pyscene), &sceneptr);
   BL::Scene b_scene(sceneptr);
 
-  DeviceInfo device = blender_device_info(b_preferences, b_scene, true);
+  DeviceInfo device = blender_device_info(b_preferences, b_scene, true, true);
 
   /* Get denoising parameters from view layer. */
   PointerRNA viewlayerptr;
diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp
index d87d094dc56..45fe4334f06 100644
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -866,7 +866,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
 
   /* Device */
   params.threads = blender_device_threads(b_scene);
-  params.device = blender_device_info(b_preferences, b_scene, params.background);
+  params.device = blender_device_info(b_preferences, b_scene, params.background, b_engine.is_preview());
 
   /* samples */
   int samples = get_int(cscene, "samples");
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ad625fc5a47..3923698b1cd 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -57,6 +57,14 @@ enum DeviceTypeMask {
 
 #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)
 
+enum KernelOptimizationLevel {
+  KERNEL_OPTIMIZATION_LEVEL_OFF = 0,
+  KERNEL_OPTIMIZATION_LEVEL_INTERSECT = 1,
+  KERNEL_OPTIMIZATION_LEVEL_FULL = 2,
+
+  KERNEL_OPTIMIZATION_NUM_LEVELS
+};
+
 class DeviceInfo {
  public:
   DeviceType type;
@@ -66,13 +74,15 @@ class DeviceInfo {
   bool display_device;        /* GPU is used as a display device. */
   bool has_nanovdb;           /* Support NanoVDB volumes. */
   bool has_light_tree;        /* Support light tree. */
-  bool has_osl;               /* Support Open Shading Language. */
-  bool has_guiding;           /* Support path guiding. */
-  bool has_profiling;         /* Supports runtime collection of profiling info. */
-  bool has_peer_memory;       /* GPU has P2P access to memory of another GPU. */
-  bool has_gpu_queue;         /* Device supports GPU queue. */
-  bool use_metalrt;           /* Use MetalRT to accelerate ray queries (Metal only). */
-  DenoiserTypeMask denoisers; /* Supported denoiser types. */
+  bool has_osl;               /* Support Open Shading Language. */
+  bool has_guiding;           /* Support path guiding. */
+  bool has_profiling;         /* Supports runtime collection of profiling info. */
+  bool has_peer_memory;       /* GPU has P2P access to memory of another GPU. */
+  bool has_gpu_queue;         /* Device supports GPU queue. */
+  bool use_metalrt;           /* Use MetalRT to accelerate ray queries (Metal only). */
+  KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing
+                                                        kernels (Metal only). */
+  DenoiserTypeMask denoisers; /* Supported denoiser types. */
   int cpu_threads;
   vector<DeviceInfo> multi_devices;
   string error_msg;
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index a6966bf167d..01578155931 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -110,10 +110,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
     case METAL_GPU_APPLE: {
       max_threads_per_threadgroup = 512;
       use_metalrt = info.use_metalrt;
-
-      /* Specialize the intersection kernels on Apple GPUs by default as these can be built very
-       * quickly. */
-      kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
       break;
     }
   }
@@ -126,6 +122,22 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
     capture_enabled = true;
   }
 
+  if (device_vendor == METAL_GPU_APPLE) {
+    /* Set kernel_specialization_level based on user prefs. */
+    switch (info.kernel_optimization_level) {
+      case KERNEL_OPTIMIZATION_LEVEL_OFF:
+        kernel_specialization_level = PSO_GENERIC;
+        break;
+      default:
+      case KERNEL_OPTIMIZATION_LEVEL_INTERSECT:
+        kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
+        break;
+      case KERNEL_OPTIMIZATION_LEVEL_FULL:
+        kernel_specialization_level = PSO_SPECIALIZED_SHADE;
+        break;
+    }
+  }
+
   if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
     kernel_specialization_level = (MetalPipelineType)atoi(envstr);
   }
@@ -444,7 +456,7 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
                  source);
   }
 
-  const double starttime = time_dt();
+  double starttime = time_dt();
 
   NSError *error = NULL;
   id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
@@ -457,6 +469,12 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
 
   [options release];
 
+  bool blocking_pso_build = (getenv("CYCLES_METAL_PROFILING") || MetalDeviceKernels::is_benchmark_warmup());
+  if (blocking_pso_build) {
+    MetalDeviceKernels::wait_for_all();
+    starttime = 0.0;
+  }
+
   /* Save the compiled MTLLibrary and trigger the AIR->PSO builds (if the MetalDevice still
    * exists). */
   {
@@ -464,6 +482,8 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
     if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
       if (mtlLibrary) {
         instance->mtlLibrary[pso_type] = mtlLibrary;
+
+        starttime = time_dt();
         MetalDeviceKernels::load(instance, pso_type);
       }
       else {
@@ -472,6 +492,14 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
       }
     }
   }
+
+  if (starttime && blocking_pso_build) {
+    MetalDeviceKernels::wait_for_all();
+
+    metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
+                 time_dt() - starttime,
+                 kernel_type_as_string(pso_type));
+  }
 }
 
 void MetalDevice::load_texture_info()
@@ -832,10 +860,8 @@ void MetalDevice::optimize_for_scene(Scene *scene)
   }
 
   /* Block during benchmark warm-up to ensure kernels are cached prior to the observed run. */
-  for (int i = 0; i < *_NSGetArgc(); i++) {
-    if (!strcmp((*_NSGetArgv())[i], "--warm-up")) {
-      specialize_in_background = false;
-    }
+  if (MetalDeviceKernels::is_benchmark_warmup()) {
+    specialize_in_background = false;
   }
 
   if (specialize_in_background) {
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index c2b9f073b12..212671f52a0 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -101,6 +101,8 @@ int get_loaded_kernel_count(MetalDevice const *device, MetalPipelineType pso_typ
 bool should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type);
 bool load(MetalDevice *device, MetalPipelineType pso_type);
 const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
+void wait_for_all();
+bool is_benchmark_warmup();
 
 } /* namespace MetalDeviceKernels */
 
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index ec2025e78fe..7b26085c75f 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -116,19 +116,29 @@ struct ShaderCache {
 };
 
 bool ShaderCache::running = true;
-std::mutex g_shaderCacheMutex;
-std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
+
+const int MAX_POSSIBLE_GPUS_ON_SYSTEM = 8;
+using DeviceShaderCache = std::pair<id<MTLDevice>, unique_ptr<ShaderCache>>;
+int g_shaderCacheCount = 0;
+DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];
 
 ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
 {
-  thread_scoped_lock lock(g_shaderCacheMutex);
-  auto it = g_shaderCache.find(mtlDevice);
-  if (it != g_shaderCache.end()) {
-    return it->second.get();
+  for (int i=0; i<g_shaderCacheCount; i++) {
+    if (g_shaderCache[i].first == mtlDevice) {
+      return g_shaderCache[i].second.get();
+    }
   }
-  g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice);
-  return g_shaderCache[mtlDevice].get();
+  static thread_mutex g_shaderCacheCountMutex;
+  g_shaderCacheCountMutex.lock();
+  int index = g_shaderCacheCount++;
+  g_shaderCacheCountMutex.unlock();
+
+  assert(index < MAX_POSSIBLE_GPUS_ON_SYSTEM);
+  g_shaderCache[index].first = mtlDevice;
+  g_shaderCache[index].second = make_unique<ShaderCache>(mtlDevice);
+  return g_shaderCache[index].second.get();
 }
 
 ShaderCache::~ShaderCache()
 {
@@ -145,7 +155,7 @@ ShaderCache::~ShaderCache()
     num_incomplete = int(incomplete_requests);
   }
 
-  if (num_incomplete) {
+  if (num_incomplete && !MetalDeviceKernels::is_benchmark_warmup()) {
     metal_printf("ShaderCache still busy (incomplete_requests = %d). Terminating...\n",
                  num_incomplete);
     std::terminate();
   }
@@ -332,12 +342,6 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
 
 MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
 {
-  thread_scoped_lock lock(cache_mutex);
-  auto &collection = pipelines[kernel];
-  if (collection.empty()) {
-    return nullptr;
-  }
-
   /* metalrt options */
   bool use_metalrt = device->use_metalrt;
   bool device_metalrt_hair = use_metalrt && device->kernel_features & KERNEL_FEATURE_HAIR;
@@ -349,34 +353,43 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
                                      device->kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
 
   MetalKernelPipeline *best_pipeline = nullptr;
-  for (auto &pipeline : collection) {
-    if (!pipeline->loaded) {
-      /* still loading - ignore */
-      continue;
-    }
+  while(!best_pipeline) {
+    {
+      thread_scoped_lock lock(cache_mutex);
+      for (auto &pipeline : pipelines[kernel]) {
+        if (!pipeline->loaded) {
+          /* still loading - ignore */
+          continue;
+        }
 
-    bool pipeline_metalrt_hair = pipeline->metalrt_features & KERNEL_FEATURE_HAIR;
-    bool pipeline_metalrt_hair_thick = pipeline->metalrt_features & KERNEL_FEATURE_HAIR_THICK;
-    bool pipeline_metalrt_pointcloud = pipeline->metalrt_features & KERNEL_FEATURE_POINTCLOUD;
-    bool pipeline_metalrt_motion = use_metalrt &&
-                                   pipeline->metalrt_features & KERNEL_FEATURE_OBJECT_MOTION;
+        bool pipeline_metalrt_hair = pipeline->metalrt_features & KERNEL_FEATURE_HAIR;
+        bool pipeline_metalrt_hair_thick = pipeline->metalrt_features & KERNEL_FEATURE_HAIR_THICK;
+        bool pipeline_metalrt_pointcloud = pipeline->metalrt_features & KERNEL_FEATURE_POINTCLOUD;
+        bool pipeline_metalrt_motion = use_metalrt &&
+                                       pipeline->metalrt_features & KERNEL_FEATURE_OBJECT_MOTION;
 
-    if (pipeline->use_metalrt != use_metalrt || pipeline_metalrt_hair != device_metalrt_hair ||
-        pipeline_metalrt_hair_thick != device_metalrt_hair_thick ||
-        pipeline_metalrt_pointcloud != device_metalrt_pointcloud ||
-        pipeline_metalrt_motion != device_metalrt_motion) {
-      /* wrong combination of metalrt options */
-      continue;
-    }
+        if (pipeline->use_metalrt != use_metalrt || pipeline_metalrt_hair != device_metalrt_hair ||
+            pipeline_metalrt_hair_thick != device_metalrt_hair_thick ||
+            pipeline_metalrt_pointcloud != device_metalrt_pointcloud ||
+            pipeline_metalrt_motion != device_metalrt_motion) {
+          /* wrong combination of metalrt options */
+          continue;
+        }
 
-    if (pipeline->pso_type != PSO_GENERIC) {
-      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
-          pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
-        best_pipeline = pipeline.get();
+        if (pipeline->pso_type != PSO_GENERIC) {
+          if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
+              pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
+            best_pipeline = pipeline.get();
+          }
+        }
+        else if (!best_pipeline) {
+          best_pipeline = pipeline.get();
+        }
       }
     }
-    else if (!best_pipeline) {
-      best_pipeline = pipeline.get();
+
+    if (!best_pipeline) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
     }
   }
@@ -802,28 +815,26 @@ void MetalKernelPipeline::compile()
 
 bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
 {
-  const double starttime = time_dt();
   auto shader_cache = get_shader_cache(device->mtlDevice);
   for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
     shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
   }
-
-  if (getenv("CYCLES_METAL_PROFILING")) {
-    shader_cache->wait_for_all();
-
-    metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
-                 time_dt() - starttime,
-                 kernel_type_as_string(pso_type));
-  }
   return true;
 }
 
+void MetalDeviceKernels::wait_for_all()
+{
+  for (int i=0; i<g_shaderCacheCount; i++) {
+    g_shaderCache[i].second->wait_for_all();
+  }
+}
+
 bool MetalDeviceKernels::any_specialization_happening_now()
 {
   /* Return true if any ShaderCaches have ongoing specialization requests (typically there will be
    * only 1). */
-  thread_scoped_lock lock(g_shaderCacheMutex);
-  for (auto &it : g_shaderCache) {
-    if (it.second->incomplete_specialization_requests > 0) {
+  for (int i=0; i<g_shaderCacheCount; i++) {
+    if (g_shaderCache[i].second->incomplete_specialization_requests > 0) {
       return true;
     }
   }
@@ -854,6 +865,19 @@ const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevi
   return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
 }
 
+bool MetalDeviceKernels::is_benchmark_warmup()
+{
+  NSArray *args = [[NSProcessInfo processInfo] arguments];
+  for (int i = 0; i < args.count; i++) {
+    if (!strcmp(args[i].UTF8String, "--warm-up")) {
+      return true;
+    }
+  }
+  return false;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ ... @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
   if (@available(macos 10.14, *)) {
     if (timing_shared_event_) {
       /* Close the current encoder to ensure we're able to capture per-encoder timing data. */
-      if (mtlComputeEncoder_) {
-        close_compute_encoder();
-      }
+      close_compute_encoder();
     }
 
     if (mtlComputeEncoder_) {
@@ -897,9 +896,7 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
     return mtlBlitEncoder_;
   }
 
-  if (mtlComputeEncoder_) {
-    close_compute_encoder();
-  }
+  close_compute_encoder();
 
   if (!mtlCommandBuffer_) {
     mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
@@ -913,12 +910,14 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
 
 void MetalDeviceQueue::close_compute_encoder()
 {
-  [mtlComputeEncoder_ endEncoding];
-  mtlComputeEncoder_ = nil;
+  if (mtlComputeEncoder_) {
+    [mtlComputeEncoder_ endEncoding];
+    mtlComputeEncoder_ = nil;
 
-  if (@available(macos 10.14, *)) {
-    if (timing_shared_event_) {
-      [mtlCommandBuffer_ encodeSignalEvent:timing_shared_event_ value:timing_shared_event_id_++];
+    if (@available(macos 10.14, *)) {
+      if (timing_shared_event_) {
+        [mtlCommandBuffer_ encodeSignalEvent:timing_shared_event_ value:timing_shared_event_id_++];
+      }
     }
   }
 }
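
For reference, a minimal usage sketch (not part of the patch) of the preference added in the properties.py hunk above, as it could be driven from Blender's Python console. It assumes the Cycles add-on is registered under the key "cycles" and that compute_device_type is the existing CyclesPreferences backend selector; only kernel_optimization_level is introduced by this patch.

    # Hypothetical illustration only: exercises the new kernel_optimization_level
    # EnumProperty ('OFF' / 'INTERSECT' / 'FULL') defined in properties.py above.
    import bpy

    prefs = bpy.context.preferences.addons["cycles"].preferences
    prefs.compute_device_type = 'METAL'            # existing backend selector (assumed available)
    prefs.kernel_optimization_level = 'INTERSECT'  # intersection-only specialization

Note that the preview/final split added to blender_device_info() means the preference only affects final renders; preview (viewport) sessions always run with KERNEL_OPTIMIZATION_LEVEL_OFF.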