Merge branch 'blender-v4.2-release'
@@ -57,6 +57,9 @@ class BVHMetal : public BVH {
                  Geometry *const geom,
                  bool refit);
   bool build_TLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+
+  API_AVAILABLE(macos(11.0))
+  void set_accel_struct(id<MTLAccelerationStructure> new_accel_struct);
 };
 
 CCL_NAMESPACE_END
@@ -119,17 +119,27 @@ BVHMetal::BVHMetal(const BVHParams &params_,
 
 BVHMetal::~BVHMetal()
 {
-  /* Clear point used by enqueueing. */
-  device->release_bvh(this);
-
   if (@available(macos 12.0, *)) {
-    if (accel_struct) {
-      device->stats.mem_free(accel_struct.allocatedSize);
-      [accel_struct release];
-    }
+    set_accel_struct(nil);
     if (null_BLAS) {
       [null_BLAS release];
     }
   }
 }
 
+API_AVAILABLE(macos(11.0))
+void BVHMetal::set_accel_struct(id<MTLAccelerationStructure> new_accel_struct)
+{
+  if (@available(macos 12.0, *)) {
+    if (accel_struct) {
+      device->stats.mem_free(accel_struct.allocatedSize);
+      [accel_struct release];
+      accel_struct = nil;
+    }
+
+    if (new_accel_struct) {
+      accel_struct = new_accel_struct;
+      device->stats.mem_alloc(accel_struct.allocatedSize);
+    }
+  }
+}
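
The new set_accel_struct() becomes the single owner of the acceleration-structure reference and of the memory-stats accounting: it un-counts and releases any previous structure before adopting the caller's reference (note that it takes ownership rather than retaining), and the destructor reduces to set_accel_struct(nil). The same setter pattern in isolation, as a minimal sketch under manual reference counting (the AccelHolder type and tracked_bytes counter are illustrative, not Cycles API):

#include <Metal/Metal.h>

struct AccelHolder {
  id<MTLAccelerationStructure> accel = nil;
  size_t tracked_bytes = 0;

  void set(id<MTLAccelerationStructure> new_accel)
  {
    if (accel) {
      tracked_bytes -= accel.allocatedSize; /* un-count the outgoing structure */
      [accel release];
      accel = nil;
    }
    if (new_accel) {
      accel = new_accel; /* adopt the caller's reference, as in the diff */
      tracked_bytes += accel.allocatedSize;
    }
  }

  ~AccelHolder()
  {
    set(nil); /* teardown is just "set to nothing" */
  }
};
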
@@ -325,9 +335,7 @@ bool BVHMetal::build_BLAS_mesh(Progress &progress,
                      toAccelerationStructure:accel];
         [accelEnc endEncoding];
         [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> /*command_buffer*/) {
-          uint64_t allocated_size = [accel allocatedSize];
-          device->stats.mem_alloc(allocated_size);
-          accel_struct = accel;
+          set_accel_struct(accel);
           [accel_uncompressed release];
 
           /* Signal that we've finished doing GPU acceleration struct build. */
@@ -338,10 +346,7 @@ bool BVHMetal::build_BLAS_mesh(Progress &progress,
       }
       else {
         /* set our acceleration structure to the uncompressed structure */
-        accel_struct = accel_uncompressed;
-
-        uint64_t allocated_size = [accel_struct allocatedSize];
-        device->stats.mem_alloc(allocated_size);
+        set_accel_struct(accel_uncompressed);
 
         /* Signal that we've finished doing GPU acceleration struct build. */
         g_bvh_build_throttler.release(wired_size);
@@ -663,9 +668,7 @@ bool BVHMetal::build_BLAS_hair(Progress &progress,
                      toAccelerationStructure:accel];
         [accelEnc endEncoding];
         [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> /*command_buffer*/) {
-          uint64_t allocated_size = [accel allocatedSize];
-          device->stats.mem_alloc(allocated_size);
-          accel_struct = accel;
+          set_accel_struct(accel);
           [accel_uncompressed release];
 
           /* Signal that we've finished doing GPU acceleration struct build. */
@@ -676,10 +679,7 @@ bool BVHMetal::build_BLAS_hair(Progress &progress,
       }
       else {
         /* set our acceleration structure to the uncompressed structure */
-        accel_struct = accel_uncompressed;
-
-        uint64_t allocated_size = [accel_struct allocatedSize];
-        device->stats.mem_alloc(allocated_size);
+        set_accel_struct(accel_uncompressed);
 
         /* Signal that we've finished doing GPU acceleration struct build. */
         g_bvh_build_throttler.release(wired_size);
@@ -910,9 +910,7 @@ bool BVHMetal::build_BLAS_pointcloud(Progress &progress,
                      toAccelerationStructure:accel];
         [accelEnc endEncoding];
         [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> /*command_buffer*/) {
-          uint64_t allocated_size = [accel allocatedSize];
-          device->stats.mem_alloc(allocated_size);
-          accel_struct = accel;
+          set_accel_struct(accel);
           [accel_uncompressed release];
 
           /* Signal that we've finished doing GPU acceleration struct build. */
@@ -923,10 +921,7 @@ bool BVHMetal::build_BLAS_pointcloud(Progress &progress,
       }
       else {
         /* set our acceleration structure to the uncompressed structure */
-        accel_struct = accel_uncompressed;
-
-        uint64_t allocated_size = [accel_struct allocatedSize];
-        device->stats.mem_alloc(allocated_size);
+        set_accel_struct(accel_uncompressed);
 
         /* Signal that we've finished doing GPU acceleration struct build. */
         g_bvh_build_throttler.release(wired_size);
@@ -1036,10 +1031,6 @@ bool BVHMetal::build_TLAS(Progress &progress,
   for (Object *ob : objects) {
     num_instances++;
 
-    /* Skip motion for non-traceable objects */
-    if (!ob->is_traceable())
-      continue;
-
     if (ob->use_motion()) {
       num_motion_transforms += max((size_t)1, ob->get_motion().size());
     }
@@ -1115,8 +1106,8 @@ bool BVHMetal::build_TLAS(Progress &progress,
     /* Skip non-traceable objects */
     Geometry const *geom = ob->get_geometry();
     BVHMetal const *blas = static_cast<BVHMetal const *>(geom->bvh);
-    if (!blas || !blas->accel_struct) {
-      /* Place a degenerate instance, to ensure [[instance_id]] equals ob->get_mtl_device_index()
+    if (!blas || !blas->accel_struct || !ob->is_traceable()) {
+      /* Place a degenerate instance, to ensure [[instance_id]] equals ob->get_device_index()
        * in our intersection functions */
       blas = nullptr;
 
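
Writing a degenerate instance for missing or non-traceable BLASes, instead of skipping the slot, keeps the TLAS instance list dense so that [[instance_id]] in the intersection functions stays equal to the object's device index. A sketch of what such a placeholder can look like with Metal's instance descriptors (the field usage here is illustrative, not lifted from the commit):

#include <Metal/Metal.h>

/* Illustrative: slot i stays occupied but can never be hit by any ray. */
static void write_null_instance(MTLAccelerationStructureInstanceDescriptor *instances, uint32_t i)
{
  MTLAccelerationStructureInstanceDescriptor desc = {};
  desc.accelerationStructureIndex = 0; /* points at a shared "null" BLAS */
  desc.mask = 0;                       /* (ray mask & 0) == 0, so it never intersects */
  instances[i] = desc;
}
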
@@ -1299,11 +1290,8 @@ bool BVHMetal::build_TLAS(Progress &progress,
       [instanceBuf release];
       [scratchBuf release];
 
-      uint64_t allocated_size = [accel allocatedSize];
-      device->stats.mem_alloc(allocated_size);
-
       /* Cache top and bottom-level acceleration structs */
-      accel_struct = accel;
+      set_accel_struct(accel);
 
       unique_blas_array.clear();
       unique_blas_array.reserve(all_blas.count);
@@ -1322,16 +1310,18 @@ bool BVHMetal::build(Progress &progress,
                      bool refit)
 {
   if (@available(macos 12.0, *)) {
-    if (refit && params.bvh_type != BVH_TYPE_STATIC) {
-      assert(accel_struct);
-    }
-    else {
-      if (accel_struct) {
-        device->stats.mem_free(accel_struct.allocatedSize);
-        [accel_struct release];
-        accel_struct = nil;
+    if (refit) {
+      /* It isn't valid to refit a non-existent BVH, or one which wasn't constructed as dynamic.
+       * In such cases, assert in development but try to recover in the wild. */
+      if (params.bvh_type != BVH_TYPE_DYNAMIC || !accel_struct) {
+        assert(false);
+        refit = false;
       }
     }
+
+    if (!refit) {
+      set_accel_struct(nil);
+    }
   }
 
   @autoreleasepool {
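
The rewritten prologue folds the old STATIC/DYNAMIC branching into one validation step: an invalid refit request asserts in development builds and degrades to a full rebuild in release builds. The shape of that recovery, reduced to a standalone helper (names hypothetical):

#include <cassert>

static bool validated_refit(bool refit, bool bvh_is_dynamic, bool bvh_exists)
{
  if (refit && (!bvh_is_dynamic || !bvh_exists)) {
    assert(false); /* loud during development... */
    refit = false; /* ...but recover with a full rebuild in the wild */
  }
  return refit;
}
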
@@ -39,10 +39,19 @@ class MetalDevice : public Device {
   KernelParamsMetal launch_params = {0};
 
   /* MetalRT members ----------------------------------*/
-  BVHMetal *bvhMetalRT = nullptr;
+  bool use_metalrt = false;
   bool motion_blur = false;
   id<MTLArgumentEncoder> mtlASArgEncoder =
       nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
+
+  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
+  id<MTLBuffer> blas_buffer = nil;
+
+  API_AVAILABLE(macos(11.0))
+  vector<id<MTLAccelerationStructure>> unique_blas_array;
+
+  API_AVAILABLE(macos(11.0))
+  id<MTLAccelerationStructure> accel_struct = nil;
   /*---------------------------------------------------*/
 
   uint kernel_features;
@@ -79,11 +88,6 @@ class MetalDevice : public Device {
   id<MTLBuffer> texture_bindings_3d = nil;
   std::vector<id<MTLTexture>> texture_slot_map;
 
-  /* BLAS encoding & lookup */
-  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
-  id<MTLBuffer> blas_buffer = nil;
-
-  bool use_metalrt = false;
   MetalPipelineType kernel_specialization_level = PSO_GENERIC;
 
   int device_id = 0;
@@ -138,8 +142,6 @@ class MetalDevice : public Device {
 
   virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
 
-  virtual void release_bvh(BVH *bvh) override;
-
   virtual void optimize_for_scene(Scene *scene) override;
 
   static void compile_and_load(int device_id, MetalPipelineType pso_type);
@@ -184,6 +186,10 @@ class MetalDevice : public Device {
   void tex_free(device_texture &mem);
 
   void flush_delayed_free_list();
+
+  void free_bvh();
+
+  void update_bvh(BVHMetal *bvh_metal);
 };
 
 CCL_NAMESPACE_END
@@ -267,6 +267,7 @@ MetalDevice::~MetalDevice()
     }
   }
 
+  free_bvh();
   flush_delayed_free_list();
 
   if (texture_bindings_2d) {
@@ -1372,24 +1373,7 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
   if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) {
 
     if (bvh->params.top_level) {
-      bvhMetalRT = bvh_metal;
-
-      // allocate required buffers for BLAS array
-      uint64_t count = bvhMetalRT->blas_array.size();
-      uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
-      blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
-      stats.mem_alloc(blas_buffer.allocatedSize);
-
-      for (uint64_t i = 0; i < count; ++i) {
-        if (bvhMetalRT->blas_array[i]) {
-          [mtlBlasArgEncoder setArgumentBuffer:blas_buffer
-                                        offset:i * mtlBlasArgEncoder.encodedLength];
-          [mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
-        }
-      }
-      if (default_storage_mode == MTLResourceStorageModeManaged) {
-        [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
-      }
+      update_bvh(bvh_metal);
     }
   }
 
@@ -1399,10 +1383,54 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
   }
 }
 
-void MetalDevice::release_bvh(BVH *bvh)
+void MetalDevice::free_bvh()
 {
-  if (bvhMetalRT == bvh) {
-    bvhMetalRT = nullptr;
+  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
+    [blas release];
+  }
+  unique_blas_array.clear();
+
+  if (blas_buffer) {
+    [blas_buffer release];
+    blas_buffer = nil;
+  }
+
+  if (accel_struct) {
+    [accel_struct release];
+    accel_struct = nil;
+  }
+}
+
+void MetalDevice::update_bvh(BVHMetal *bvh_metal)
+{
+  free_bvh();
+
+  if (!bvh_metal) {
+    return;
+  }
+
+  accel_struct = bvh_metal->accel_struct;
+  unique_blas_array = bvh_metal->unique_blas_array;
+
+  [accel_struct retain];
+  for (id<MTLAccelerationStructure> &blas : unique_blas_array) {
+    [blas retain];
+  }
+
+  // Allocate required buffers for BLAS array.
+  uint64_t count = bvh_metal->blas_array.size();
+  uint64_t buffer_size = mtlBlasArgEncoder.encodedLength * count;
+  blas_buffer = [mtlDevice newBufferWithLength:buffer_size options:default_storage_mode];
+  stats.mem_alloc(blas_buffer.allocatedSize);
+
+  for (uint64_t i = 0; i < count; ++i) {
+    if (bvh_metal->blas_array[i]) {
+      [mtlBlasArgEncoder setArgumentBuffer:blas_buffer offset:i * mtlBlasArgEncoder.encodedLength];
+      [mtlBlasArgEncoder setAccelerationStructure:bvh_metal->blas_array[i] atIndex:0];
+    }
   }
+  if (default_storage_mode == MTLResourceStorageModeManaged) {
+    [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
+  }
 }
 
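
update_bvh() copies raw Objective-C object pointers from the BVHMetal into std::vector members of the device, so under manual reference counting each copied pointer must be retained explicitly; free_bvh() is the matching release path. The idiom in isolation (helper name hypothetical, assuming a non-ARC build as in Cycles):

#include <Metal/Metal.h>

#include <vector>

static std::vector<id<MTLAccelerationStructure>> copy_retained(
    const std::vector<id<MTLAccelerationStructure>> &src)
{
  std::vector<id<MTLAccelerationStructure>> dst = src; /* copies pointers only */
  for (id<MTLAccelerationStructure> blas : dst) {
    [blas retain]; /* one retain per stored pointer; released when the copy is freed */
  }
  return dst;
}
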
@@ -54,10 +54,12 @@ enum MetalPipelineType {
 
 const char *kernel_type_as_string(MetalPipelineType pso_type);
 
-struct MetalKernelPipeline {
-
+/* A pipeline object that can be shared between multiple instances of MetalDeviceQueue. */
+class MetalKernelPipeline {
+ public:
   void compile();
 
+  int pipeline_id;
   int originating_device_id;
 
   id<MTLLibrary> mtlLibrary = nil;
@@ -83,6 +85,28 @@ struct MetalKernelPipeline {
 
   string error_str;
 
+  NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
+};
+
+/* An actively instanced pipeline that can only be used by a single instance of MetalDeviceQueue.
+ */
+class MetalDispatchPipeline {
+ public:
+  ~MetalDispatchPipeline();
+
+  bool update(MetalDevice *metal_device, DeviceKernel kernel);
+  void free_intersection_function_tables();
+
+ private:
+  friend class MetalDeviceQueue;
+  friend struct ShaderCache;
+
+  int pipeline_id = -1;
+
+  MetalPipelineType pso_type;
+  id<MTLComputePipelineState> pipeline = nil;
+  int num_threads_per_block = 0;
+
   API_AVAILABLE(macos(11.0))
   id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
 };
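
The split being introduced here is between a MetalKernelPipeline that many queues may share and a MetalDispatchPipeline that is private to one MetalDeviceQueue; the private side decides whether to rebind by comparing pipeline_id values stamped from a global atomic counter. That staleness check, reduced to a sketch (types hypothetical):

#include <atomic>

static std::atomic_int g_next_id{0};

struct SharedPipeline {
  const int id = g_next_id.fetch_add(1); /* stamped once, at creation */
};

struct PerQueuePipeline {
  int bound_id = -1;

  void bind(const SharedPipeline &best)
  {
    if (bound_id == best.id) {
      return; /* already bound to the best pipeline - nothing to do */
    }
    bound_id = best.id;
    /* ...rebuild per-queue state here, e.g. intersection function tables... */
  }
};
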
@@ -133,6 +133,9 @@ using DeviceShaderCache = std::pair<id<MTLDevice>, unique_ptr<ShaderCache>>;
 int g_shaderCacheCount = 0;
 DeviceShaderCache g_shaderCache[MAX_POSSIBLE_GPUS_ON_SYSTEM];
 
+/* Next UID for associating a MetalDispatchPipeline with an originating MetalKernelPipeline. */
+static std::atomic_int g_next_pipeline_id = 0;
+
 ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
 {
   for (int i = 0; i < g_shaderCacheCount; i++) {
@@ -325,6 +328,7 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
 
   /* Keep track of the originating device's ID so that we can cancel requests if the device ceases
    * to be active. */
+  pipeline->pipeline_id = g_next_pipeline_id.fetch_add(1);
   pipeline->originating_device_id = device->device_id;
   memcpy(&pipeline->kernel_data_, &device->launch_params.data, sizeof(pipeline->kernel_data_));
   pipeline->pso_type = pso_type;
@@ -450,6 +454,64 @@ static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nul
   return constant_values;
 }
 
+void MetalDispatchPipeline::free_intersection_function_tables()
+{
+  for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+    if (intersection_func_table[table]) {
+      [intersection_func_table[table] release];
+      intersection_func_table[table] = nil;
+    }
+  }
+}
+
+MetalDispatchPipeline::~MetalDispatchPipeline()
+{
+  free_intersection_function_tables();
+}
+
+bool MetalDispatchPipeline::update(MetalDevice *metal_device, DeviceKernel kernel)
+{
+  const MetalKernelPipeline *best_pipeline = MetalDeviceKernels::get_best_pipeline(metal_device,
+                                                                                   kernel);
+  if (!best_pipeline) {
+    return false;
+  }
+
+  if (pipeline_id == best_pipeline->pipeline_id) {
+    /* The best pipeline is already active - nothing to do. */
+    return true;
+  }
+  pipeline_id = best_pipeline->pipeline_id;
+  pipeline = best_pipeline->pipeline;
+  pso_type = best_pipeline->pso_type;
+  num_threads_per_block = best_pipeline->num_threads_per_block;
+
+  /* Create the MTLIntersectionFunctionTables if needed. */
+  if (best_pipeline->use_metalrt && device_kernel_has_intersection(best_pipeline->device_kernel)) {
+    free_intersection_function_tables();
+
+    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+      @autoreleasepool {
+        MTLIntersectionFunctionTableDescriptor *ift_desc =
+            [[MTLIntersectionFunctionTableDescriptor alloc] init];
+        ift_desc.functionCount = best_pipeline->table_functions[table].count;
+        intersection_func_table[table] = [this->pipeline
+            newIntersectionFunctionTableWithDescriptor:ift_desc];
+
+        /* Finally write the function handles into this pipeline's table */
+        int size = int([best_pipeline->table_functions[table] count]);
+        for (int i = 0; i < size; i++) {
+          id<MTLFunctionHandle> handle = [pipeline
+              functionHandleWithFunction:best_pipeline->table_functions[table][i]];
+          [intersection_func_table[table] setFunction:handle atIndex:i];
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
 id<MTLFunction> MetalKernelPipeline::make_intersection_function(const char *function_name)
 {
   MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
@@ -507,7 +569,6 @@ void MetalKernelPipeline::compile()
 
   function.label = [@(function_name.c_str()) copy];
 
-  NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
   NSArray *linked_functions = nil;
 
   if (use_metalrt && device_kernel_has_intersection(device_kernel)) {
@@ -754,24 +815,6 @@ void MetalKernelPipeline::compile()
   [computePipelineStateDescriptor release];
   computePipelineStateDescriptor = nil;
 
-  if (use_metalrt && linked_functions) {
-    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-      MTLIntersectionFunctionTableDescriptor *ift_desc =
-          [[MTLIntersectionFunctionTableDescriptor alloc] init];
-      ift_desc.functionCount = table_functions[table].count;
-      intersection_func_table[table] = [this->pipeline
-          newIntersectionFunctionTableWithDescriptor:ift_desc];
-
-      /* Finally write the function handles into this pipeline's table */
-      int size = (int)[table_functions[table] count];
-      for (int i = 0; i < size; i++) {
-        id<MTLFunctionHandle> handle = [pipeline
-            functionHandleWithFunction:table_functions[table][i]];
-        [intersection_func_table[table] setFunction:handle atIndex:i];
-      }
-    }
-  }
-
   if (!use_binary_archive) {
     metal_printf("%16s | %2d | %-55s | %7.2fs\n",
                  kernel_type_as_string(pso_type),
@@ -66,6 +66,7 @@ class MetalDeviceQueue : public DeviceQueue {
   id<MTLSharedEvent> shared_event_ = nil;
   API_AVAILABLE(macos(10.14), ios(14.0))
   MTLSharedEventListener *shared_event_listener_ = nil;
+  MetalDispatchPipeline active_pipelines_[DEVICE_KERNEL_NUM];
 
   dispatch_queue_t event_queue_;
   dispatch_semaphore_t wait_semaphore_;
@@ -465,13 +465,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   }
   bytes_written = globals_offsets + sizeof(KernelParamsMetal);
 
-  const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline(
-      metal_device_, kernel);
-  if (!metal_kernel_pso) {
+  if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
     metal_device_->set_error(
-        string_printf("No MetalKernelPipeline for %s\n", device_kernel_as_string(kernel)));
+        string_printf("Could not activate pipeline for %s\n", device_kernel_as_string(kernel)));
     return false;
   }
+  MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];
 
   /* Encode ancillaries */
   [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
@@ -487,8 +486,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
 
   if (@available(macos 12.0, *)) {
     if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
-      if (metal_device_->bvhMetalRT) {
-        id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
+      if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
         [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
         [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
                                                   offset:0
@@ -496,14 +494,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
       }
 
      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-        if (metal_kernel_pso->intersection_func_table[table]) {
-          [metal_kernel_pso->intersection_func_table[table] setBuffer:arg_buffer
-                                                               offset:globals_offsets
-                                                              atIndex:1];
+        if (active_pipeline.intersection_func_table[table]) {
+          [active_pipeline.intersection_func_table[table] setBuffer:arg_buffer
+                                                             offset:globals_offsets
+                                                            atIndex:1];
           [metal_device_->mtlAncillaryArgEncoder
-              setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
+              setIntersectionFunctionTable:active_pipeline.intersection_func_table[table]
                                    atIndex:4 + table];
-          [mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
+          [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
                                           usage:MTLResourceUsageRead];
         }
         else {
@@ -526,24 +524,22 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
     if (@available(macos 12.0, *)) {
 
-      BVHMetal *bvhMetalRT = metal_device_->bvhMetalRT;
-      if (bvhMetalRT && bvhMetalRT->accel_struct) {
+      if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
         /* Mark all Accelerations resources as used */
-        [mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct
-                                        usage:MTLResourceUsageRead];
+        [mtlComputeCommandEncoder useResource:accel_struct usage:MTLResourceUsageRead];
         [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
                                         usage:MTLResourceUsageRead];
-        [mtlComputeCommandEncoder useResources:bvhMetalRT->unique_blas_array.data()
-                                         count:bvhMetalRT->unique_blas_array.size()
+        [mtlComputeCommandEncoder useResources:metal_device_->unique_blas_array.data()
+                                         count:metal_device_->unique_blas_array.size()
                                          usage:MTLResourceUsageRead];
       }
     }
   }
 
-  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso->pipeline];
+  [mtlComputeCommandEncoder setComputePipelineState:active_pipeline.pipeline];
 
   /* Compute kernel launch parameters. */
-  const int num_threads_per_block = metal_kernel_pso->num_threads_per_block;
+  const int num_threads_per_block = active_pipeline.num_threads_per_block;
 
   int shared_mem_bytes = 0;
 
@@ -594,7 +590,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
         const char *errCStr = [[NSString stringWithFormat:@"%@", command_buffer.error]
             UTF8String];
         str += string_printf("(%s.%s):\n%s\n",
-                             kernel_type_as_string(metal_kernel_pso->pso_type),
+                             kernel_type_as_string(active_pipeline.pso_type),
                              device_kernel_as_string(kernel),
                              errCStr);
       }
@@ -41,23 +41,14 @@ struct MetalInfo {
 /* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
 class MetalBufferPool {
   struct MetalBufferListEntry {
-    MetalBufferListEntry(id<MTLBuffer> buffer, id<MTLCommandBuffer> command_buffer)
-        : buffer(buffer), command_buffer(command_buffer)
-    {
-    }
-
-    MetalBufferListEntry() = delete;
-
     id<MTLBuffer> buffer;
     id<MTLCommandBuffer> command_buffer;
   };
-  std::vector<MetalBufferListEntry> buffer_free_list;
-  std::vector<MetalBufferListEntry> buffer_in_use_list;
+  std::vector<MetalBufferListEntry> temp_buffers;
   thread_mutex buffer_mutex;
   size_t total_temp_mem_size = 0;
 
  public:
-  MetalBufferPool() = default;
   ~MetalBufferPool();
 
   id<MTLBuffer> get_buffer(id<MTLDevice> device,
@@ -123,53 +123,42 @@ id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
                                           const void *pointer,
                                           Stats &stats)
 {
-  id<MTLBuffer> buffer;
+  id<MTLBuffer> buffer = nil;
 
   MTLStorageMode storageMode = MTLStorageMode((options & MTLResourceStorageModeMask) >>
                                               MTLResourceStorageModeShift);
   MTLCPUCacheMode cpuCacheMode = MTLCPUCacheMode((options & MTLResourceCPUCacheModeMask) >>
                                                  MTLResourceCPUCacheModeShift);
 
-  buffer_mutex.lock();
-  for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end(); entry++) {
-    MetalBufferListEntry bufferEntry = *entry;
-
-    /* Check if buffer matches size and storage mode and is old enough to reuse */
-    if (bufferEntry.buffer.length == length && storageMode == bufferEntry.buffer.storageMode &&
-        cpuCacheMode == bufferEntry.buffer.cpuCacheMode)
-    {
-      buffer = bufferEntry.buffer;
-      buffer_free_list.erase(entry);
-      bufferEntry.command_buffer = command_buffer;
-      buffer_in_use_list.push_back(bufferEntry);
-      buffer_mutex.unlock();
-
-      /* Copy over data */
-      if (pointer) {
-        memcpy(buffer.contents, pointer, length);
-        if (bufferEntry.buffer.storageMode == MTLStorageModeManaged) {
-          [buffer didModifyRange:NSMakeRange(0, length)];
-        }
-      }
-
-      return buffer;
-    }
-  }
-
-  // NSLog(@"Creating buffer of length %lu (%lu)", length, frameCount);
-  if (pointer) {
-    buffer = [device newBufferWithBytes:pointer length:length options:options];
-  }
-  else {
-    buffer = [device newBufferWithLength:length options:options];
-  }
-
-  MetalBufferListEntry buffer_entry(buffer, command_buffer);
-
-  stats.mem_alloc(buffer.allocatedSize);
-  total_temp_mem_size += buffer.allocatedSize;
-  buffer_in_use_list.push_back(buffer_entry);
-  buffer_mutex.unlock();
+  {
+    thread_scoped_lock lock(buffer_mutex);
+    /* Find an unused buffer with matching size and storage mode. */
+    for (MetalBufferListEntry &bufferEntry : temp_buffers) {
+      if (bufferEntry.buffer.length == length && storageMode == bufferEntry.buffer.storageMode &&
+          cpuCacheMode == bufferEntry.buffer.cpuCacheMode && bufferEntry.command_buffer == nil)
+      {
+        buffer = bufferEntry.buffer;
+        bufferEntry.command_buffer = command_buffer;
+        break;
+      }
+    }
+    if (!buffer) {
+      /* Create a new buffer and add it to the pool. Typically this pool will only grow to a
+       * handful of entries. */
+      buffer = [device newBufferWithLength:length options:options];
+      stats.mem_alloc(buffer.allocatedSize);
+      total_temp_mem_size += buffer.allocatedSize;
+      temp_buffers.push_back(MetalBufferListEntry{buffer, command_buffer});
+    }
+  }
+
+  /* Copy over data */
+  if (pointer) {
+    memcpy(buffer.contents, pointer, length);
+    if (buffer.storageMode == MTLStorageModeManaged) {
+      [buffer didModifyRange:NSMakeRange(0, length)];
+    }
+  }
 
   return buffer;
 }
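
With the free/in-use lists collapsed into a single temp_buffers vector, a pooled buffer's availability is encoded entirely in its command_buffer field: non-nil while the associated command buffer is in flight, nil once the completion callback has run. A usage sketch of that round trip (the get_buffer argument order is assumed from the surrounding hunks; this is not verbatim Cycles code):

#include <Metal/Metal.h>

static void run_with_temp_buffer(MetalBufferPool &pool,
                                 id<MTLDevice> device,
                                 id<MTLCommandQueue> queue,
                                 const void *host_data,
                                 size_t length,
                                 Stats &stats)
{
  id<MTLCommandBuffer> cmd = [queue commandBuffer];
  id<MTLBuffer> tmp = pool.get_buffer(
      device, cmd, length, MTLResourceStorageModeShared, host_data, stats);
  (void)tmp; /* ...encode GPU work that reads tmp here... */
  [cmd addCompletedHandler:^(id<MTLCommandBuffer> cb) {
    pool.process_command_buffer_completion(cb); /* marks the buffer reusable */
  }];
  [cmd commit];
}
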
@@ -178,16 +167,10 @@ void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> com
 {
   assert(command_buffer);
   thread_scoped_lock lock(buffer_mutex);
-  /* Release all buffers that have not been recently reused back into the free pool */
-  for (auto entry = buffer_in_use_list.begin(); entry != buffer_in_use_list.end();) {
-    MetalBufferListEntry buffer_entry = *entry;
+  /* Mark any temp buffers associated with command_buffer as unused. */
+  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
     if (buffer_entry.command_buffer == command_buffer) {
-      entry = buffer_in_use_list.erase(entry);
-      buffer_free_list.push_back(buffer_entry);
-    }
-    else {
-      entry++;
+      buffer_entry.command_buffer = nil;
     }
   }
-
 }
@@ -196,16 +179,12 @@ MetalBufferPool::~MetalBufferPool()
 {
   thread_scoped_lock lock(buffer_mutex);
   /* Release all buffers that have not been recently reused */
-  for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end();) {
-    MetalBufferListEntry buffer_entry = *entry;
-
-    id<MTLBuffer> buffer = buffer_entry.buffer;
-    // NSLog(@"Releasing buffer of length %lu (%lu) (%lu outstanding)", buffer.length, frameCount,
-    // bufferFreeList.size());
-    total_temp_mem_size -= buffer.allocatedSize;
-    [buffer release];
-    entry = buffer_free_list.erase(entry);
+  for (MetalBufferListEntry &buffer_entry : temp_buffers) {
+    total_temp_mem_size -= buffer_entry.buffer.allocatedSize;
+    [buffer_entry.buffer release];
+    buffer_entry.buffer = nil;
   }
+  temp_buffers.clear();
 }
 
 CCL_NAMESPACE_END
@@ -989,11 +989,12 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
 ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
                                                  IntegratorState state,
                                                  ccl_private Ray *ccl_restrict ray,
-                                                 const int object,
                                                  ccl_global float *ccl_restrict render_buffer)
 {
   ShaderData sd;
-  shader_setup_from_volume(kg, &sd, ray, object);
+  /* FIXME: `object` is used for light linking. We read the bottom of the stack for simplicity, but
+   * this does not work for overlapping volumes. */
+  shader_setup_from_volume(kg, &sd, ray, INTEGRATOR_STATE_ARRAY(state, volume_stack, 0, object));
 
   /* Load random number state. */
   RNGState rng_state;
@@ -1186,8 +1187,7 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
     volume_stack_clean(kg, state);
   }
 
-  const VolumeIntegrateEvent event = volume_integrate(
-      kg, state, &ray, isect.object, render_buffer);
+  const VolumeIntegrateEvent event = volume_integrate(kg, state, &ray, render_buffer);
   if (event == VOLUME_PATH_MISSED) {
     /* End path. */
     integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);