diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index 62ae8f652bc..29387ca849c 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -139,6 +139,11 @@ void CPUDevice::mem_copy_to(device_memory &mem)
   }
 }
 
+void CPUDevice::mem_move_to_host(device_memory & /*mem*/)
+{
+  /* no-op */
+}
+
 void CPUDevice::mem_copy_from(
     device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/)
 {
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
index e6008c8e28d..92b29f6be25 100644
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -66,6 +66,7 @@ class CPUDevice : public Device {
 
   void mem_alloc(device_memory &mem) override;
   void mem_copy_to(device_memory &mem) override;
+  void mem_move_to_host(device_memory &mem) override;
   void mem_copy_from(
       device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;
   void mem_zero(device_memory &mem) override;
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 7b1004b8ac3..fe25d753981 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -581,6 +581,25 @@ void CUDADevice::mem_alloc(device_memory &mem)
 }
 
 void CUDADevice::mem_copy_to(device_memory &mem)
+{
+  if (mem.type == MEM_GLOBAL) {
+    global_copy_to(mem);
+  }
+  else if (mem.type == MEM_TEXTURE) {
+    tex_copy_to((device_texture &)mem);
+  }
+  else {
+    if (!mem.device_pointer) {
+      generic_alloc(mem);
+      generic_copy_to(mem);
+    }
+    else if (mem.is_resident(this)) {
+      generic_copy_to(mem);
+    }
+  }
+}
+
+void CUDADevice::mem_move_to_host(device_memory &mem)
 {
   if (mem.type == MEM_GLOBAL) {
     global_free(mem);
@@ -591,10 +610,7 @@ void CUDADevice::mem_copy_to(device_memory &mem)
     tex_alloc((device_texture &)mem);
   }
   else {
-    if (!mem.device_pointer) {
-      generic_alloc(mem);
-    }
-    generic_copy_to(mem);
+    assert(0);
   }
 }
 
@@ -689,6 +705,19 @@ void CUDADevice::global_alloc(device_memory &mem)
   const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
 }
 
+void CUDADevice::global_copy_to(device_memory &mem)
+{
+  if (!mem.device_pointer) {
+    generic_alloc(mem);
+    generic_copy_to(mem);
+  }
+  else if (mem.is_resident(this)) {
+    generic_copy_to(mem);
+  }
+
+  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
 void CUDADevice::global_free(device_memory &mem)
 {
   if (mem.is_resident(this) && mem.device_pointer) {
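The behavioral change above is the crux of the patch: mem_copy_to() no longer frees and re-creates device memory on every update; it allocates on first use and afterwards only copies, while the old free-and-realloc path moves to the new mem_move_to_host() entry point. A standalone sketch of that contract (plain C++ with hypothetical names, not the Cycles API):

  #include <algorithm>
  #include <vector>

  struct Buffer {
    std::vector<float> host;      /* host-side copy always exists */
    float *device_ptr = nullptr;  /* stands in for a GPU allocation */
  };

  void mem_copy_to(Buffer &b)
  {
    if (!b.device_ptr) {
      b.device_ptr = new float[b.host.size()]; /* first call: allocate once */
    }
    std::copy(b.host.begin(), b.host.end(), b.device_ptr); /* every call: copy only */
  }

  void mem_move_to_host(Buffer &b)
  {
    delete[] b.device_ptr; /* drop the device copy; later updates would target
                            * host-mapped memory instead */
    b.device_ptr = nullptr;
  }
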
@@ -696,13 +725,53 @@ void CUDADevice::global_free(device_memory &mem)
   }
 }
 
+static size_t tex_src_pitch(const device_texture &mem)
+{
+  return mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
+}
+
+static CUDA_MEMCPY2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
+{
+  /* 2D texture using pitch aligned linear memory. */
+  const size_t src_pitch = tex_src_pitch(mem);
+  const size_t dst_pitch = align_up(src_pitch, pitch_alignment);
+
+  CUDA_MEMCPY2D param;
+  memset(&param, 0, sizeof(param));
+  param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+  param.dstDevice = mem.device_pointer;
+  param.dstPitch = dst_pitch;
+  param.srcMemoryType = CU_MEMORYTYPE_HOST;
+  param.srcHost = mem.host_pointer;
+  param.srcPitch = src_pitch;
+  param.WidthInBytes = param.srcPitch;
+  param.Height = mem.data_height;
+
+  return param;
+}
+
+static CUDA_MEMCPY3D tex_3d_copy_param(const device_texture &mem)
+{
+  const size_t src_pitch = tex_src_pitch(mem);
+
+  CUDA_MEMCPY3D param;
+  memset(&param, 0, sizeof(param));
+  param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+  param.dstArray = (CUarray)mem.device_pointer;
+  param.srcMemoryType = CU_MEMORYTYPE_HOST;
+  param.srcHost = mem.host_pointer;
+  param.srcPitch = src_pitch;
+  param.WidthInBytes = param.srcPitch;
+  param.Height = mem.data_height;
+  param.Depth = mem.data_depth;
+
+  return param;
+}
+
 void CUDADevice::tex_alloc(device_texture &mem)
 {
   CUDAContextScope scope(this);
 
-  size_t dsize = datatype_size(mem.data_type);
-  size_t size = mem.memory_size();
-
   CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
   switch (mem.info.extension) {
     case EXTENSION_REPEAT:
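The two helpers above centralize the pitch math that tex_alloc() and the new tex_copy_to() both need. Worked numbers: a 1001-texel-wide half4 texture has src_pitch = 1001 * 2 * 4 = 8008 bytes, and with a 32-byte pitch alignment (a typical value of CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, assumed here for illustration) the destination pitch rounds up to 8032 bytes. A self-contained check of the arithmetic:

  #include <cassert>
  #include <cstddef>

  /* Same rounding as align_up() for power-of-two alignments. */
  static size_t align_up_sketch(size_t size, size_t alignment)
  {
    return (size + alignment - 1) & ~(alignment - 1);
  }

  int main()
  {
    assert(align_up_sketch(8008, 32) == 8032); /* 1001 * 2 * 4 bytes, rounded up */
    assert(align_up_sketch(8000, 32) == 8000); /* already a multiple of 32 */
    return 0;
  }
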
@@ -761,8 +830,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
 
   Mem *cmem = nullptr;
   CUarray array_3d = nullptr;
-  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
-  size_t dst_pitch = src_pitch;
 
   if (!mem.is_resident(this)) {
     thread_scoped_lock lock(device_mem_map_mutex);
@@ -773,9 +840,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
       array_3d = (CUarray)mem.device_pointer;
       cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
     }
-    else if (mem.data_height > 0) {
-      dst_pitch = align_up(src_pitch, pitch_alignment);
-    }
   }
   else if (mem.data_depth > 1) {
     /* 3D texture using array, there is no API for linear memory. */
@@ -798,22 +862,12 @@ void CUDADevice::tex_alloc(device_texture &mem)
       return;
     }
 
-    CUDA_MEMCPY3D param;
-    memset(&param, 0, sizeof(param));
-    param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-    param.dstArray = array_3d;
-    param.srcMemoryType = CU_MEMORYTYPE_HOST;
-    param.srcHost = mem.host_pointer;
-    param.srcPitch = src_pitch;
-    param.WidthInBytes = param.srcPitch;
-    param.Height = mem.data_height;
-    param.Depth = mem.data_depth;
-
-    cuda_assert(cuMemcpy3D(&param));
-
     mem.device_pointer = (device_ptr)array_3d;
-    mem.device_size = size;
-    stats.mem_alloc(size);
+    mem.device_size = mem.memory_size();
+    stats.mem_alloc(mem.memory_size());
+
+    const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
+    cuda_assert(cuMemcpy3D(&param));
 
     thread_scoped_lock lock(device_mem_map_mutex);
     cmem = &device_mem_map[&mem];
@@ -822,25 +876,15 @@ void CUDADevice::tex_alloc(device_texture &mem)
   }
   else if (mem.data_height > 0) {
     /* 2D texture, using pitch aligned linear memory. */
-    dst_pitch = align_up(src_pitch, pitch_alignment);
-    size_t dst_size = dst_pitch * mem.data_height;
+    const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
+    const size_t dst_size = dst_pitch * mem.data_height;
 
     cmem = generic_alloc(mem, dst_size - mem.memory_size());
     if (!cmem) {
       return;
     }
 
-    CUDA_MEMCPY2D param;
-    memset(&param, 0, sizeof(param));
-    param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-    param.dstDevice = mem.device_pointer;
-    param.dstPitch = dst_pitch;
-    param.srcMemoryType = CU_MEMORYTYPE_HOST;
-    param.srcHost = mem.host_pointer;
-    param.srcPitch = src_pitch;
-    param.WidthInBytes = param.srcPitch;
-    param.Height = mem.data_height;
-
+    const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
     cuda_assert(cuMemcpy2DUnaligned(&param));
   }
   else {
@@ -850,7 +894,7 @@ void CUDADevice::tex_alloc(device_texture &mem)
       return;
     }
 
-    cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+    cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, mem.memory_size()));
   }
 
   /* Resize once */
@@ -879,6 +923,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
     resDesc.flags = 0;
   }
   else if (mem.data_height > 0) {
+    const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
+
     resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
     resDesc.res.pitch2D.devPtr = mem.device_pointer;
     resDesc.res.pitch2D.format = format;
@@ -917,39 +963,76 @@ void CUDADevice::tex_alloc(device_texture &mem)
   }
 }
 
-void CUDADevice::tex_free(device_texture &mem)
+void CUDADevice::tex_copy_to(device_texture &mem)
 {
-  if (mem.device_pointer) {
-    CUDAContextScope scope(this);
-    thread_scoped_lock lock(device_mem_map_mutex);
-    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
-    const Mem &cmem = device_mem_map[&mem];
-
-    if (cmem.texobject) {
-      /* Free bindless texture. */
-      cuTexObjectDestroy(cmem.texobject);
+  if (!mem.device_pointer) {
+    /* Not yet allocated on device. */
+    tex_alloc(mem);
+  }
+  else if (!mem.is_resident(this)) {
+    /* Peering with another device, may still need to create texture info and object. */
+    if (texture_info[mem.slot].data == 0) {
+      tex_alloc(mem);
     }
-
-    if (!mem.is_resident(this)) {
-      /* Do not free memory here, since it was allocated on a different device. */
-      device_mem_map.erase(device_mem_map.find(&mem));
+  }
+  else {
+    /* Resident and fully allocated, only copy. */
+    if (mem.data_depth > 0) {
+      CUDAContextScope scope(this);
+      const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
+      cuda_assert(cuMemcpy3D(&param));
     }
-    else if (cmem.array) {
-      /* Free array. */
-      cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
-      stats.mem_free(mem.device_size);
-      mem.device_pointer = 0;
-      mem.device_size = 0;
-
-      device_mem_map.erase(device_mem_map.find(&mem));
+    else if (mem.data_height > 0) {
+      CUDAContextScope scope(this);
+      const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
+      cuda_assert(cuMemcpy2DUnaligned(&param));
     }
     else {
-      lock.unlock();
-      generic_free(mem);
+      generic_copy_to(mem);
     }
   }
 }
 
+void CUDADevice::tex_free(device_texture &mem)
+{
+  CUDAContextScope scope(this);
+  thread_scoped_lock lock(device_mem_map_mutex);
+
+  /* Check if the memory was allocated for this device. */
+  auto it = device_mem_map.find(&mem);
+  if (it == device_mem_map.end()) {
+    return;
+  }
+
+  const Mem &cmem = it->second;
+
+  /* Always clear texture info and texture object, regardless of residency. */
+  texture_info[mem.slot] = TextureInfo();
+
+  if (cmem.texobject) {
+    /* Free bindless texture. */
+    cuTexObjectDestroy(cmem.texobject);
+  }
+
+  if (!mem.is_resident(this)) {
+    /* Do not free memory here, since it was allocated on a different device. */
+    device_mem_map.erase(device_mem_map.find(&mem));
+  }
+  else if (cmem.array) {
+    /* Free array. */
+    cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
+    stats.mem_free(mem.device_size);
+    mem.device_pointer = 0;
+    mem.device_size = 0;
+
+    device_mem_map.erase(device_mem_map.find(&mem));
+  }
+  else {
+    lock.unlock();
+    generic_free(mem);
+  }
+}
+
 unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
 {
   return make_unique<CUDADeviceQueue>(this);
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
index eb7a7538c28..3b551dcf55b 100644
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -74,6 +74,8 @@ class CUDADevice : public GPUDevice {
 
   void mem_copy_to(device_memory &mem) override;
 
+  void mem_move_to_host(device_memory &mem) override;
+
   void mem_copy_from(
       device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;
 
@@ -86,11 +88,11 @@ class CUDADevice : public GPUDevice {
   void const_copy_to(const char *name, void *host, const size_t size) override;
 
   void global_alloc(device_memory &mem);
-
+  void global_copy_to(device_memory &mem);
   void global_free(device_memory &mem);
 
   void tex_alloc(device_texture &mem);
-
+  void tex_copy_to(device_texture &mem);
   void tex_free(device_texture &mem);
 
   bool should_use_graphics_interop() override;
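Taken together, the CUDA changes collapse the old free-and-realloc behavior into three explicit outcomes. A condensed, standalone restatement of the dispatch rule (plain C++; the two flags mirror mem.device_pointer and mem.is_resident(this)):

  /* What a mem_copy_to() request does under the new contract. */
  const char *copy_to_action(bool has_device_pointer, bool is_resident)
  {
    if (!has_device_pointer) {
      return "alloc + copy"; /* first use */
    }
    if (is_resident) {
      return "copy only"; /* steady-state update, no realloc */
    }
    return "no-op"; /* memory is owned by a peered device */
  }

For textures the non-resident case is slightly richer: as tex_copy_to() above shows, a peered device may still need to create its own texture info and texture object.
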
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 141a8c4c09f..04768bd39f0 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -614,7 +614,7 @@ void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
      * devices as well, which is potentially dangerous when still in use (since
      * a thread rendering on another devices would only be caught in this mutex
      * if it so happens to do an allocation at the same time as well. */
-    max_mem->device_copy_to();
+    max_mem->device_move_to_host();
     size = (max_size >= size) ? 0 : size - max_size;
 
     any_device_moving_textures_to_host = false;
@@ -758,40 +758,42 @@ GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, const size_t pitch_
 
 void GPUDevice::generic_free(device_memory &mem)
 {
-  if (mem.device_pointer) {
-    const thread_scoped_lock lock(device_mem_map_mutex);
-    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
-    const Mem &cmem = device_mem_map[&mem];
-
-    /* If cmem.use_mapped_host is true, reference counting is used
-     * to safely free a mapped host memory. */
-
-    if (cmem.use_mapped_host) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = nullptr;
-          }
-          free_host(mem.shared_pointer);
-          mem.shared_pointer = nullptr;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-    else {
-      /* Free device memory. */
-      free_device((void *)mem.device_pointer);
-      device_mem_in_use -= mem.device_size;
-    }
-
-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
-
-    device_mem_map.erase(device_mem_map.find(&mem));
+  if (!(mem.device_pointer && mem.is_resident(this))) {
+    return;
   }
+
+  const thread_scoped_lock lock(device_mem_map_mutex);
+  DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+  const Mem &cmem = device_mem_map[&mem];
+
+  /* If cmem.use_mapped_host is true, reference counting is used
+   * to safely free a mapped host memory. */
+
+  if (cmem.use_mapped_host) {
+    assert(mem.shared_pointer);
+    if (mem.shared_pointer) {
+      assert(mem.shared_counter > 0);
+      if (--mem.shared_counter == 0) {
+        if (mem.host_pointer == mem.shared_pointer) {
+          mem.host_pointer = nullptr;
+        }
+        free_host(mem.shared_pointer);
+        mem.shared_pointer = nullptr;
+      }
+    }
+    map_host_used -= mem.device_size;
+  }
+  else {
+    /* Free device memory. */
+    free_device((void *)mem.device_pointer);
+    device_mem_in_use -= mem.device_size;
+  }
+
+  stats.mem_free(mem.device_size);
+  mem.device_pointer = 0;
+  mem.device_size = 0;
+
+  device_mem_map.erase(device_mem_map.find(&mem));
 }
 
 void GPUDevice::generic_copy_to(device_memory &mem)
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index b162a16816f..758054d76a7 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -315,6 +315,7 @@ class Device {
 
   virtual void mem_alloc(device_memory &mem) = 0;
   virtual void mem_copy_to(device_memory &mem) = 0;
+  virtual void mem_move_to_host(device_memory &mem) = 0;
   virtual void mem_copy_from(
       device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) = 0;
   virtual void mem_zero(device_memory &mem) = 0;
@@ -379,8 +380,10 @@ class GPUDevice : public Device {
   size_t device_mem_in_use = 0;
 
   virtual void init_host_memory(const size_t preferred_texture_headroom = 0,
-                                size_t preferred_working_headroom = 0);
-  virtual void move_textures_to_host(const size_t size, bool for_texture);
+                                const size_t preferred_working_headroom = 0);
+  virtual void move_textures_to_host(const size_t size,
+                                     const size_t headroom,
+                                     const bool for_texture);
 
   /* Allocation, deallocation and copy functions, with corresponding
    * support of device/host allocations. */
diff --git a/intern/cycles/device/dummy/device.cpp b/intern/cycles/device/dummy/device.cpp
index dbed884db25..c1e4f933ff0 100644
--- a/intern/cycles/device/dummy/device.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@@ -29,6 +29,8 @@ class DummyDevice : public Device {
 
   void mem_copy_to(device_memory & /*mem*/) override {}
 
+  void mem_move_to_host(device_memory & /*mem*/) override {}
+
   void mem_copy_from(
       device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/) override
   {
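The early-out rewrite of GPUDevice::generic_free() above keeps the existing reference counting for host-mapped memory: several device_memory instances can alias one host-mapped block via shared_pointer, and the block is only released when the last reference drops. A standalone illustration of that pattern (hypothetical types, not the Cycles ones):

  #include <cstddef>
  #include <cstdlib>

  struct SharedHostBlock {
    void *ptr = nullptr;
    int shared_counter = 0;
  };

  void acquire(SharedHostBlock &b, std::size_t size)
  {
    if (b.shared_counter++ == 0) {
      b.ptr = std::malloc(size); /* stands in for a pinned host allocation */
    }
  }

  void release(SharedHostBlock &b)
  {
    if (--b.shared_counter == 0) {
      std::free(b.ptr); /* last reference gone: actually free */
      b.ptr = nullptr;
    }
  }
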
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index fe88ed6d7ad..e1ea4ca4b20 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -4,7 +4,6 @@
 
 #ifdef WITH_HIP
 
-# include
 # include
 # include
 # include
@@ -544,6 +543,25 @@ void HIPDevice::mem_alloc(device_memory &mem)
 }
 
 void HIPDevice::mem_copy_to(device_memory &mem)
+{
+  if (mem.type == MEM_GLOBAL) {
+    global_copy_to(mem);
+  }
+  else if (mem.type == MEM_TEXTURE) {
+    tex_copy_to((device_texture &)mem);
+  }
+  else {
+    if (!mem.device_pointer) {
+      generic_alloc(mem);
+      generic_copy_to(mem);
+    }
+    else if (mem.is_resident(this)) {
+      generic_copy_to(mem);
+    }
+  }
+}
+
+void HIPDevice::mem_move_to_host(device_memory &mem)
 {
   if (mem.type == MEM_GLOBAL) {
     global_free(mem);
@@ -554,10 +572,7 @@ void HIPDevice::mem_copy_to(device_memory &mem)
     tex_alloc((device_texture &)mem);
   }
   else {
-    if (!mem.device_pointer) {
-      generic_alloc(mem);
-    }
-    generic_copy_to(mem);
+    assert(0);
   }
 }
 
@@ -652,6 +667,19 @@ void HIPDevice::global_alloc(device_memory &mem)
   const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
 }
 
+void HIPDevice::global_copy_to(device_memory &mem)
+{
+  if (!mem.device_pointer) {
+    generic_alloc(mem);
+    generic_copy_to(mem);
+  }
+  else if (mem.is_resident(this)) {
+    generic_copy_to(mem);
+  }
+
+  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
 void HIPDevice::global_free(device_memory &mem)
 {
   if (mem.is_resident(this) && mem.device_pointer) {
@@ -659,13 +687,52 @@ void HIPDevice::global_free(device_memory &mem)
   }
 }
 
+static size_t tex_src_pitch(const device_texture &mem)
+{
+  return mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
+}
+
+static hip_Memcpy2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
+{
+  /* 2D texture using pitch aligned linear memory. */
+  const size_t src_pitch = tex_src_pitch(mem);
+  const size_t dst_pitch = align_up(src_pitch, pitch_alignment);
+
+  hip_Memcpy2D param;
+  memset(&param, 0, sizeof(param));
+  param.dstMemoryType = hipMemoryTypeDevice;
+  param.dstDevice = mem.device_pointer;
+  param.dstPitch = dst_pitch;
+  param.srcMemoryType = hipMemoryTypeHost;
+  param.srcHost = mem.host_pointer;
+  param.srcPitch = src_pitch;
+  param.WidthInBytes = param.srcPitch;
+  param.Height = mem.data_height;
+
+  return param;
+}
+
+static HIP_MEMCPY3D tex_3d_copy_param(const device_texture &mem)
+{
+  const size_t src_pitch = tex_src_pitch(mem);
+
+  HIP_MEMCPY3D param;
+  memset(&param, 0, sizeof(HIP_MEMCPY3D));
+  param.dstMemoryType = hipMemoryTypeArray;
+  param.dstArray = (hArray)mem.device_pointer;
+  param.srcMemoryType = hipMemoryTypeHost;
+  param.srcHost = mem.host_pointer;
+  param.srcPitch = src_pitch;
+  param.WidthInBytes = param.srcPitch;
+  param.Height = mem.data_height;
+  param.Depth = mem.data_depth;
+  return param;
+}
+
 void HIPDevice::tex_alloc(device_texture &mem)
 {
   HIPContextScope scope(this);
 
-  size_t dsize = datatype_size(mem.data_type);
-  size_t size = mem.memory_size();
-
   hipTextureAddressMode address_mode = hipAddressModeWrap;
   switch (mem.info.extension) {
     case EXTENSION_REPEAT:
@@ -721,8 +788,6 @@ void HIPDevice::tex_alloc(device_texture &mem)
 
   Mem *cmem = nullptr;
   hArray array_3d = nullptr;
-  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
-  size_t dst_pitch = src_pitch;
 
   if (!mem.is_resident(this)) {
     thread_scoped_lock lock(device_mem_map_mutex);
@@ -733,9 +798,6 @@ void HIPDevice::tex_alloc(device_texture &mem)
       array_3d = (hArray)mem.device_pointer;
      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
     }
-    else if (mem.data_height > 0) {
-      dst_pitch = align_up(src_pitch, pitch_alignment);
-    }
   }
   else if (mem.data_depth > 1) {
     /* 3D texture using array, there is no API for linear memory. */
@@ -758,22 +820,12 @@ void HIPDevice::tex_alloc(device_texture &mem)
       return;
     }
 
-    HIP_MEMCPY3D param;
-    memset(&param, 0, sizeof(HIP_MEMCPY3D));
-    param.dstMemoryType = hipMemoryTypeArray;
-    param.dstArray = array_3d;
-    param.srcMemoryType = hipMemoryTypeHost;
-    param.srcHost = mem.host_pointer;
-    param.srcPitch = src_pitch;
-    param.WidthInBytes = param.srcPitch;
-    param.Height = mem.data_height;
-    param.Depth = mem.data_depth;
-
-    hip_assert(hipDrvMemcpy3D(&param));
-
     mem.device_pointer = (device_ptr)array_3d;
-    mem.device_size = size;
-    stats.mem_alloc(size);
+    mem.device_size = mem.memory_size();
+    stats.mem_alloc(mem.memory_size());
+
+    const HIP_MEMCPY3D param = tex_3d_copy_param(mem);
+    hip_assert(hipDrvMemcpy3D(&param));
 
     thread_scoped_lock lock(device_mem_map_mutex);
     cmem = &device_mem_map[&mem];
@@ -782,25 +834,15 @@ void HIPDevice::tex_alloc(device_texture &mem)
   }
   else if (mem.data_height > 0) {
     /* 2D texture, using pitch aligned linear memory. */
-    dst_pitch = align_up(src_pitch, pitch_alignment);
-    size_t dst_size = dst_pitch * mem.data_height;
+    const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
+    const size_t dst_size = dst_pitch * mem.data_height;
 
     cmem = generic_alloc(mem, dst_size - mem.memory_size());
     if (!cmem) {
       return;
     }
 
-    hip_Memcpy2D param;
-    memset(&param, 0, sizeof(param));
-    param.dstMemoryType = hipMemoryTypeDevice;
-    param.dstDevice = mem.device_pointer;
-    param.dstPitch = dst_pitch;
-    param.srcMemoryType = hipMemoryTypeHost;
-    param.srcHost = mem.host_pointer;
-    param.srcPitch = src_pitch;
-    param.WidthInBytes = param.srcPitch;
-    param.Height = mem.data_height;
-
+    const hip_Memcpy2D param = tex_2d_copy_param(mem, pitch_alignment);
     hip_assert(hipDrvMemcpy2DUnaligned(&param));
   }
   else {
@@ -810,7 +852,7 @@ void HIPDevice::tex_alloc(device_texture &mem)
       return;
     }
 
-    hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+    hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, mem.memory_size()));
   }
 
   /* Resize once */
@@ -840,6 +882,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
     resDesc.flags = 0;
   }
   else if (mem.data_height > 0) {
+    const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
+
     resDesc.resType = hipResourceTypePitch2D;
     resDesc.res.pitch2D.devPtr = mem.device_pointer;
     resDesc.res.pitch2D.format = format;
@@ -880,39 +924,76 @@ void HIPDevice::tex_alloc(device_texture &mem)
   }
 }
 
-void HIPDevice::tex_free(device_texture &mem)
+void HIPDevice::tex_copy_to(device_texture &mem)
 {
-  if (mem.device_pointer) {
-    HIPContextScope scope(this);
-    thread_scoped_lock lock(device_mem_map_mutex);
-    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
-    const Mem &cmem = device_mem_map[&mem];
-
-    if (cmem.texobject) {
-      /* Free bindless texture. */
-      hipTexObjectDestroy(cmem.texobject);
+  if (!mem.device_pointer) {
+    /* Not yet allocated on device. */
+    tex_alloc(mem);
+  }
+  else if (!mem.is_resident(this)) {
+    /* Peering with another device, may still need to create texture info and object. */
+    if (texture_info[mem.slot].data == 0) {
+      tex_alloc(mem);
     }
-
-    if (!mem.is_resident(this)) {
-      /* Do not free memory here, since it was allocated on a different device. */
-      device_mem_map.erase(device_mem_map.find(&mem));
+  }
+  else {
+    /* Resident and fully allocated, only copy. */
+    if (mem.data_depth > 0) {
+      HIPContextScope scope(this);
+      const HIP_MEMCPY3D param = tex_3d_copy_param(mem);
+      hip_assert(hipDrvMemcpy3D(&param));
     }
-    else if (cmem.array) {
-      /* Free array. */
-      hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
-      stats.mem_free(mem.device_size);
-      mem.device_pointer = 0;
-      mem.device_size = 0;
-
-      device_mem_map.erase(device_mem_map.find(&mem));
+    else if (mem.data_height > 0) {
+      HIPContextScope scope(this);
+      const hip_Memcpy2D param = tex_2d_copy_param(mem, pitch_alignment);
+      hip_assert(hipDrvMemcpy2DUnaligned(&param));
     }
     else {
-      lock.unlock();
-      generic_free(mem);
+      generic_copy_to(mem);
     }
   }
 }
 
+void HIPDevice::tex_free(device_texture &mem)
+{
+  HIPContextScope scope(this);
+  thread_scoped_lock lock(device_mem_map_mutex);
+
+  /* Check if the memory was allocated for this device. */
+  auto it = device_mem_map.find(&mem);
+  if (it == device_mem_map.end()) {
+    return;
+  }
+
+  const Mem &cmem = it->second;
+
+  /* Always clear texture info and texture object, regardless of residency. */
+  texture_info[mem.slot] = TextureInfo();
+
+  if (cmem.texobject) {
+    /* Free bindless texture. */
+    hipTexObjectDestroy(cmem.texobject);
+  }
+
+  if (!mem.is_resident(this)) {
+    /* Do not free memory here, since it was allocated on a different device. */
+    device_mem_map.erase(device_mem_map.find(&mem));
+  }
+  else if (cmem.array) {
+    /* Free array. */
+    hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
+    stats.mem_free(mem.device_size);
+    mem.device_pointer = 0;
+    mem.device_size = 0;
+
+    device_mem_map.erase(device_mem_map.find(&mem));
+  }
+  else {
+    lock.unlock();
+    generic_free(mem);
+  }
+}
+
 unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
 {
   return make_unique<HIPDeviceQueue>(this);
diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h
index 007095554c7..d40102e3ac9 100644
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -72,6 +72,8 @@ class HIPDevice : public GPUDevice {
 
   void mem_copy_to(device_memory &mem) override;
 
+  void mem_move_to_host(device_memory &mem) override;
+
   void mem_copy_from(
       device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;
 
@@ -84,11 +86,11 @@ class HIPDevice : public GPUDevice {
   void const_copy_to(const char *name, void *host, const size_t size) override;
 
   void global_alloc(device_memory &mem);
-
+  void global_copy_to(device_memory &mem);
   void global_free(device_memory &mem);
 
   void tex_alloc(device_texture &mem);
-
+  void tex_copy_to(device_texture &mem);
   void tex_free(device_texture &mem);
 
   /* Graphics resources interoperability. */
diff --git a/intern/cycles/device/memory.cpp b/intern/cycles/device/memory.cpp
index 687c7400bd1..b54f82e6339 100644
--- a/intern/cycles/device/memory.cpp
+++ b/intern/cycles/device/memory.cpp
@@ -82,6 +82,13 @@ void device_memory::device_copy_to()
   }
 }
 
+void device_memory::device_move_to_host()
+{
+  if (host_pointer) {
+    device->mem_move_to_host(*this);
+  }
+}
+
 void device_memory::device_copy_from(const size_t y, const size_t w, size_t h, const size_t elem)
 {
   assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);
diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h
index d267e1b39c4..8d6bea3d23d 100644
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -288,6 +288,7 @@ class device_memory {
   void device_alloc();
   void device_free();
   void device_copy_to();
+  void device_move_to_host();
   void device_copy_from(const size_t y, const size_t w, size_t h, const size_t elem);
   void device_zero();
 
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 0f2f76793d5..759644e7afc 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -161,6 +161,8 @@ class MetalDevice : public Device {
 
   void mem_copy_to(device_memory &mem) override;
 
+  void mem_move_to_host(device_memory &mem) override;
+
   void mem_copy_from(device_memory &mem)
   {
     mem_copy_from(mem, -1, -1, -1, -1);
@@ -177,13 +179,11 @@ class MetalDevice : public Device {
   void const_copy_to(const char *name, void *host, const size_t size) override;
 
   void global_alloc(device_memory &mem);
-
   void global_free(device_memory &mem);
 
   void tex_alloc(device_texture &mem);
-
   void tex_alloc_as_buffer(device_texture &mem);
-
+  void tex_copy_to(device_texture &mem);
   void tex_free(device_texture &mem);
 
   void flush_delayed_free_list();
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 0e877e69660..8a7a78f3960 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -772,46 +772,48 @@ void MetalDevice::generic_copy_to(device_memory &mem)
 
 void MetalDevice::generic_free(device_memory &mem)
 {
-  if (mem.device_pointer) {
-    std::lock_guard lock(metal_mem_map_mutex);
-    MetalMem &mmem = *metal_mem_map.at(&mem);
-    size_t size = mmem.size;
-
-    /* If mmem.use_uma is true, reference counting is used
-     * to safely free memory. */
-
-    bool free_mtlBuffer = false;
-
-    if (mmem.use_UMA) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          free_mtlBuffer = true;
-        }
-      }
-    }
-    else {
-      free_mtlBuffer = true;
-    }
-
-    if (free_mtlBuffer) {
-      if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
-        /* Safely move the device-side data back to the host before it is freed. */
-        mem.host_pointer = mem.host_alloc(size);
-        memcpy(mem.host_pointer, mem.shared_pointer, size);
-        mmem.use_UMA = false;
-      }
-
-      mem.shared_pointer = nullptr;
-
-      /* Free device memory. */
-      delayed_free_list.push_back(mmem.mtlBuffer);
-      mmem.mtlBuffer = nil;
-    }
-
-    erase_allocation(mem);
+  if (!mem.device_pointer) {
+    return;
   }
+
+  std::lock_guard lock(metal_mem_map_mutex);
+  MetalMem &mmem = *metal_mem_map.at(&mem);
+  size_t size = mmem.size;
+
+  /* If mmem.use_uma is true, reference counting is used
+   * to safely free memory. */
+
+  bool free_mtlBuffer = false;
+
+  if (mmem.use_UMA) {
+    assert(mem.shared_pointer);
+    if (mem.shared_pointer) {
+      assert(mem.shared_counter > 0);
+      if (--mem.shared_counter == 0) {
+        free_mtlBuffer = true;
+      }
+    }
+  }
+  else {
+    free_mtlBuffer = true;
+  }
+
+  if (free_mtlBuffer) {
+    if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
+      /* Safely move the device-side data back to the host before it is freed. */
+      mem.host_pointer = mem.host_alloc(size);
+      memcpy(mem.host_pointer, mem.shared_pointer, size);
+      mmem.use_UMA = false;
+    }
+
+    mem.shared_pointer = nullptr;
+
+    /* Free device memory. */
+    delayed_free_list.push_back(mmem.mtlBuffer);
+    mmem.mtlBuffer = nil;
+  }
+
+  erase_allocation(mem);
 }
 
 void MetalDevice::mem_alloc(device_memory &mem)
@@ -829,20 +831,35 @@ void MetalDevice::mem_copy_to(device_memory &mem)
 {
-  if (mem.type == MEM_GLOBAL) {
-    global_free(mem);
-    global_alloc(mem);
-  }
-  else if (mem.type == MEM_TEXTURE) {
-    tex_free((device_texture &)mem);
-    tex_alloc((device_texture &)mem);
-  }
-  else {
-    if (!mem.device_pointer) {
-      generic_alloc(mem);
+  if (!mem.device_pointer) {
+    if (mem.type == MEM_GLOBAL) {
+      global_alloc(mem);
+    }
+    else if (mem.type == MEM_TEXTURE) {
+      tex_alloc((device_texture &)mem);
+    }
+    else {
+      generic_alloc(mem);
+      generic_copy_to(mem);
     }
-    generic_copy_to(mem);
   }
+  else if (mem.is_resident(this)) {
+    if (mem.type == MEM_GLOBAL) {
+      generic_copy_to(mem);
+    }
+    else if (mem.type == MEM_TEXTURE) {
+      tex_copy_to((device_texture &)mem);
+    }
+    else {
+      generic_copy_to(mem);
+    }
+  }
+}
+
+void MetalDevice::mem_move_to_host(device_memory & /*mem*/)
+{
+  /* Metal implements its own mechanism for moving host memory. */
+  assert(0);
 }
 
 void MetalDevice::mem_copy_from(
@@ -1116,7 +1133,6 @@ void MetalDevice::tex_alloc(device_texture &mem)
   }
 
   /* General variables for both architectures */
-  size_t dsize = datatype_size(mem.data_type);
   size_t size = mem.memory_size();
 
   /* sampler_index maps into the GPU's constant 'metal_samplers' array */
@@ -1178,7 +1194,7 @@ void MetalDevice::tex_alloc(device_texture &mem)
   assert(format != MTLPixelFormatInvalid);
 
   id<MTLTexture> mtlTexture = nil;
-  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+  size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
 
   if (mem.data_depth > 1) {
     /* 3D texture using array */
@@ -1309,6 +1325,45 @@ void MetalDevice::tex_alloc(device_texture &mem)
   }
 }
 
+void MetalDevice::tex_copy_to(device_texture &mem)
+{
+  if (mem.is_resident(this)) {
+    const size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
+
+    if (mem.data_depth > 0) {
+      id<MTLTexture> mtlTexture;
+      {
+        std::lock_guard lock(metal_mem_map_mutex);
+        mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
+      }
+      const size_t imageBytes = src_pitch * mem.data_height;
+      for (size_t d = 0; d < mem.data_depth; d++) {
+        const size_t offset = d * imageBytes;
+        [mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
+                      mipmapLevel:0
+                            slice:0
+                        withBytes:(uint8_t *)mem.host_pointer + offset
+                      bytesPerRow:src_pitch
+                    bytesPerImage:0];
+      }
+    }
+    else if (mem.data_height > 0) {
+      id<MTLTexture> mtlTexture;
+      {
+        std::lock_guard lock(metal_mem_map_mutex);
+        mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
+      }
+      [mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
+                    mipmapLevel:0
+                      withBytes:mem.host_pointer
+                    bytesPerRow:src_pitch];
+    }
+    else {
+      generic_copy_to(mem);
+    }
+  }
+}
+
 void MetalDevice::tex_free(device_texture &mem)
 {
   if (mem.data_depth == 0 && mem.data_height == 0) {
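MultiDevice (next file) has to fan the new operation out across peer islands: one sub-device per island owns the allocation, and for globals and textures the remaining island members only refresh their texture objects and kernel-global pointers. A standalone sketch of that shape (hypothetical names; the real owner selection is find_suitable_mem_device()):

  #include <vector>

  struct Sub { /* stands in for SubDevice */
    void move_to_host() { /* owner migrates; peers only rebuild their views */ }
  };

  void fan_out(std::vector<std::vector<Sub *>> &islands, bool is_texture)
  {
    for (auto &island : islands) {
      Sub *owner = island.front(); /* simplified owner choice */
      owner->move_to_host();
      if (is_texture) {
        for (Sub *sub : island) {
          if (sub != owner) {
            sub->move_to_host();
          }
        }
      }
    }
  }
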
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index e57ed63b322..2a8c59a1c4b 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -340,7 +340,6 @@ class MultiDevice : public Device {
     device_ptr key = (existing_key) ? existing_key : unique_key++;
     size_t existing_size = mem.device_size;
 
-    /* The tile buffers are allocated on each device (see below), so copy to all of them */
     for (const vector<SubDevice *> &island : peer_islands) {
       SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
       mem.device = owner_sub->device.get();
@@ -365,6 +364,36 @@ class MultiDevice : public Device {
     stats.mem_alloc(mem.device_size - existing_size);
   }
 
+  void mem_move_to_host(device_memory &mem) override
+  {
+    device_ptr existing_key = mem.device_pointer;
+    device_ptr key = (existing_key) ? existing_key : unique_key++;
+    size_t existing_size = mem.device_size;
+
+    for (const vector<SubDevice *> &island : peer_islands) {
+      SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+      mem.device = owner_sub->device.get();
+      mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+      mem.device_size = existing_size;
+
+      owner_sub->device->mem_move_to_host(mem);
+      owner_sub->ptr_map[key] = mem.device_pointer;
+
+      if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
+        /* Need to create texture objects and update pointer in kernel globals on all devices */
+        for (SubDevice *island_sub : island) {
+          if (island_sub != owner_sub) {
+            island_sub->device->mem_move_to_host(mem);
+          }
+        }
+      }
+    }
+
+    mem.device = this;
+    mem.device_pointer = key;
+    stats.mem_alloc(mem.device_size - existing_size);
+  }
+
   void mem_copy_from(
       device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override
   {
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index 8e7a06ae228..cdf4d8c4f19 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -112,7 +112,6 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
   max_memory_on_device_ = get_memcapacity();
 
   init_host_memory();
-  move_texture_to_host = false;
   can_map_host = true;
 
   const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
@@ -417,6 +416,34 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
     return;
   }
 
+  if (mem.type == MEM_GLOBAL) {
+    global_copy_to(mem);
+  }
+  else if (mem.type == MEM_TEXTURE) {
+    tex_copy_to((device_texture &)mem);
+  }
+  else {
+    if (!mem.device_pointer) {
+      generic_alloc(mem);
+    }
+    generic_copy_to(mem);
+  }
+}
+
+void OneapiDevice::mem_move_to_host(device_memory &mem)
+{
+  if (mem.name) {
+    VLOG_DEBUG << "OneapiDevice::mem_move_to_host: \"" << mem.name << "\", "
+               << string_human_readable_number(mem.memory_size()) << " bytes. ("
+               << string_human_readable_size(mem.memory_size()) << ")";
+  }
+
+  /* After getting runtime errors we need to avoid performing oneAPI runtime operations
+   * because the associated GPU context may be in an invalid state at this point. */
+  if (have_error()) {
+    return;
+  }
+
   if (mem.type == MEM_GLOBAL) {
     global_free(mem);
     global_alloc(mem);
@@ -426,11 +453,7 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
     tex_alloc((device_texture &)mem);
   }
   else {
-    if (!mem.device_pointer) {
-      generic_alloc(mem);
-    }
-
-    generic_copy_to(mem);
+    assert(0);
   }
 }
 
@@ -596,6 +619,16 @@ void OneapiDevice::global_alloc(device_memory &mem)
   usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
 }
 
+void OneapiDevice::global_copy_to(device_memory &mem)
+{
+  if (!mem.device_pointer) {
+    global_alloc(mem);
+  }
+  else {
+    generic_copy_to(mem);
+  }
+}
+
 void OneapiDevice::global_free(device_memory &mem)
 {
   if (mem.device_pointer) {
@@ -620,6 +653,11 @@ void OneapiDevice::tex_alloc(device_texture &mem)
   texture_info[slot].data = (uint64_t)mem.device_pointer;
 }
 
+void OneapiDevice::tex_copy_to(device_texture &mem)
+{
+  generic_copy_to(mem);
+}
+
 void OneapiDevice::tex_free(device_texture &mem)
 {
   /* There is no texture memory in SYCL. */
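On oneAPI the texture path collapses to a plain copy because SYCL exposes no bindless texture objects: textures live in ordinary USM allocations, which is why tex_copy_to() above can simply call generic_copy_to(). A minimal standalone equivalent of such an update (SYCL 2020 API; not the Cycles wrapper):

  #include <cstddef>
  #include <sycl/sycl.hpp>

  void copy_to_device(sycl::queue &q, void *device_ptr, const void *host_ptr, std::size_t size)
  {
    /* USM copy, host to device; blocking for simplicity. */
    q.memcpy(device_ptr, host_ptr, size).wait();
  }
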
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index 8c70f448872..aa0ca698bfb 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -75,6 +75,8 @@ class OneapiDevice : public GPUDevice {
 
   void mem_copy_to(device_memory &mem) override;
 
+  void mem_move_to_host(device_memory &mem) override;
+
   void mem_copy_from(
       device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;
 
@@ -92,11 +94,11 @@ class OneapiDevice : public GPUDevice {
   void const_copy_to(const char *name, void *host, const size_t size) override;
 
   void global_alloc(device_memory &mem);
-
+  void global_copy_to(device_memory &mem);
   void global_free(device_memory &mem);
 
   void tex_alloc(device_texture &mem);
-
+  void tex_copy_to(device_texture &mem);
   void tex_free(device_texture &mem);
 
   /* Graphics resources interoperability. */