Fix: Cycles: Re-copy memory from host to device without realloc
This should be a bit more efficient, and it fixes host memory fallback bugs where host memory was incorrectly freed during a re-copy. For the case where memory should be reallocated on the host, a new mem_move_to_host was added.

Thanks to Jorn Visser for investigating and finding this problem.

Pull Request: https://projects.blender.org/blender/blender/pulls/132912
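To illustrate the intended behaviour, here is a minimal, self-contained C++ sketch; it is not Cycles code, and all names in it are illustrative assumptions. The point it demonstrates: a re-copy reuses the existing device allocation instead of freeing and reallocating it, and moving data back to the host is a separate, explicit operation that leaves the host copy intact.

/* Standalone sketch (hypothetical names, not the Cycles API). */
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct ExampleDeviceMemory {
  void *host_pointer = nullptr;   /* data prepared on the host */
  void *device_pointer = nullptr; /* simulated device allocation */
  size_t size = 0;
};

/* Re-copy: allocate only when no device allocation exists yet,
 * otherwise reuse it. No free + realloc on every copy. */
static void example_mem_copy_to(ExampleDeviceMemory &mem)
{
  if (!mem.device_pointer) {
    mem.device_pointer = std::malloc(mem.size);
  }
  std::memcpy(mem.device_pointer, mem.host_pointer, mem.size);
}

/* Explicit host fallback: drop the device copy, keep the host data.
 * The bug being fixed was that the old re-copy path could free host
 * memory instead of handling this case as a separate operation. */
static void example_mem_move_to_host(ExampleDeviceMemory &mem)
{
  std::free(mem.device_pointer);
  mem.device_pointer = nullptr; /* host_pointer stays valid */
}

int main()
{
  ExampleDeviceMemory mem;
  mem.size = 16;
  mem.host_pointer = std::malloc(mem.size);
  std::memset(mem.host_pointer, 0xAB, mem.size);

  example_mem_copy_to(mem);      /* first copy: allocates on the "device" */
  example_mem_copy_to(mem);      /* re-copy: reuses the same allocation */
  example_mem_move_to_host(mem); /* explicit fallback to host memory */

  std::printf("host data intact after move to host: %d\n",
              ((unsigned char *)mem.host_pointer)[0] == 0xAB);
  std::free(mem.host_pointer);
  return 0;
}

The actual per-backend changes follow in the diff below.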
@@ -139,6 +139,11 @@ void CPUDevice::mem_copy_to(device_memory &mem)
}
}

void CPUDevice::mem_move_to_host(device_memory & /*mem*/)
{
/* no-op */
}

void CPUDevice::mem_copy_from(
device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/)
{

@@ -66,6 +66,7 @@ class CPUDevice : public Device {

void mem_alloc(device_memory &mem) override;
void mem_copy_to(device_memory &mem) override;
void mem_move_to_host(device_memory &mem) override;
void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;
void mem_zero(device_memory &mem) override;
@@ -581,6 +581,25 @@ void CUDADevice::mem_alloc(device_memory &mem)
}

void CUDADevice::mem_copy_to(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_copy_to(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_copy_to((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
generic_copy_to(mem);
}
}
}

void CUDADevice::mem_move_to_host(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_free(mem);
@@ -591,10 +610,7 @@ void CUDADevice::mem_copy_to(device_memory &mem)
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
}
generic_copy_to(mem);
assert(0);
}
}

@@ -689,6 +705,19 @@ void CUDADevice::global_alloc(device_memory &mem)
const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void CUDADevice::global_copy_to(device_memory &mem)
{
if (!mem.device_pointer) {
generic_alloc(mem);
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
generic_copy_to(mem);
}

const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void CUDADevice::global_free(device_memory &mem)
{
if (mem.is_resident(this) && mem.device_pointer) {
@@ -696,13 +725,53 @@ void CUDADevice::global_free(device_memory &mem)
}
}
static size_t tex_src_pitch(const device_texture &mem)
{
return mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
}

static CUDA_MEMCPY2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
{
/* 2D texture using pitch aligned linear memory. */
const size_t src_pitch = tex_src_pitch(mem);
const size_t dst_pitch = align_up(src_pitch, pitch_alignment);

CUDA_MEMCPY2D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
param.dstDevice = mem.device_pointer;
param.dstPitch = dst_pitch;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;

return param;
}

static CUDA_MEMCPY3D tex_3d_copy_param(const device_texture &mem)
{
const size_t src_pitch = tex_src_pitch(mem);

CUDA_MEMCPY3D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
param.dstArray = (CUarray)mem.device_pointer;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;
param.Depth = mem.data_depth;

return param;
}

void CUDADevice::tex_alloc(device_texture &mem)
{
CUDAContextScope scope(this);

size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();

CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
switch (mem.info.extension) {
case EXTENSION_REPEAT:
@@ -761,8 +830,6 @@ void CUDADevice::tex_alloc(device_texture &mem)

Mem *cmem = nullptr;
CUarray array_3d = nullptr;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;

if (!mem.is_resident(this)) {
thread_scoped_lock lock(device_mem_map_mutex);
@@ -773,9 +840,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
array_3d = (CUarray)mem.device_pointer;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
}
}
else if (mem.data_depth > 1) {
/* 3D texture using array, there is no API for linear memory. */
@@ -798,22 +862,12 @@ void CUDADevice::tex_alloc(device_texture &mem)
return;
}

CUDA_MEMCPY3D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
param.dstArray = array_3d;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;
param.Depth = mem.data_depth;

cuda_assert(cuMemcpy3D(&param));

mem.device_pointer = (device_ptr)array_3d;
mem.device_size = size;
stats.mem_alloc(size);
mem.device_size = mem.memory_size();
stats.mem_alloc(mem.memory_size());

const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
cuda_assert(cuMemcpy3D(&param));

thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
@@ -822,25 +876,15 @@ void CUDADevice::tex_alloc(device_texture &mem)
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
dst_pitch = align_up(src_pitch, pitch_alignment);
size_t dst_size = dst_pitch * mem.data_height;
const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
const size_t dst_size = dst_pitch * mem.data_height;

cmem = generic_alloc(mem, dst_size - mem.memory_size());
if (!cmem) {
return;
}

CUDA_MEMCPY2D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
param.dstDevice = mem.device_pointer;
param.dstPitch = dst_pitch;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;

const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
cuda_assert(cuMemcpy2DUnaligned(&param));
}
else {
@@ -850,7 +894,7 @@ void CUDADevice::tex_alloc(device_texture &mem)
return;
}

cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, mem.memory_size()));
}

/* Resize once */
@@ -879,6 +923,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
resDesc.flags = 0;
}
else if (mem.data_height > 0) {
const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);

resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
resDesc.res.pitch2D.devPtr = mem.device_pointer;
resDesc.res.pitch2D.format = format;
@@ -917,39 +963,76 @@ void CUDADevice::tex_alloc(device_texture &mem)
}
}
void CUDADevice::tex_free(device_texture &mem)
void CUDADevice::tex_copy_to(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

if (cmem.texobject) {
/* Free bindless texture. */
cuTexObjectDestroy(cmem.texobject);
if (!mem.device_pointer) {
/* Not yet allocated on device. */
tex_alloc(mem);
}
else if (!mem.is_resident(this)) {
/* Peering with another device, may still need to create texture info and object. */
if (texture_info[mem.slot].data == 0) {
tex_alloc(mem);
}

if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
/* Resident and fully allocated, only copy. */
if (mem.data_depth > 0) {
CUDAContextScope scope(this);
const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
cuda_assert(cuMemcpy3D(&param));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
else if (mem.data_height > 0) {
CUDAContextScope scope(this);
const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
cuda_assert(cuMemcpy2DUnaligned(&param));
}
else {
lock.unlock();
generic_free(mem);
generic_copy_to(mem);
}
}
}

void CUDADevice::tex_free(device_texture &mem)
{
CUDAContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);

/* Check if the memory was allocated for this device. */
auto it = device_mem_map.find(&mem);
if (it == device_mem_map.end()) {
return;
}

const Mem &cmem = it->second;

/* Always clear texture info and texture object, regardless of residency. */
texture_info[mem.slot] = TextureInfo();

if (cmem.texobject) {
/* Free bindless texture. */
cuTexObjectDestroy(cmem.texobject);
}

if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();
generic_free(mem);
}
}

unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
{
return make_unique<CUDADeviceQueue>(this);
@@ -74,6 +74,8 @@ class CUDADevice : public GPUDevice {

void mem_copy_to(device_memory &mem) override;

void mem_move_to_host(device_memory &mem) override;

void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;

@@ -86,11 +88,11 @@ class CUDADevice : public GPUDevice {
void const_copy_to(const char *name, void *host, const size_t size) override;

void global_alloc(device_memory &mem);

void global_copy_to(device_memory &mem);
void global_free(device_memory &mem);

void tex_alloc(device_texture &mem);

void tex_copy_to(device_texture &mem);
void tex_free(device_texture &mem);

bool should_use_graphics_interop() override;

@@ -614,7 +614,7 @@ void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another devices would only be caught in this mutex
* if it so happens to do an allocation at the same time as well. */
max_mem->device_copy_to();
max_mem->device_move_to_host();
size = (max_size >= size) ? 0 : size - max_size;

any_device_moving_textures_to_host = false;
@@ -758,40 +758,42 @@ GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, const size_t pitch_

void GPUDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
const thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */

if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = nullptr;
}
free_host(mem.shared_pointer);
mem.shared_pointer = nullptr;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}

stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
if (!(mem.device_pointer && mem.is_resident(this))) {
return;
}

const thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */

if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = nullptr;
}
free_host(mem.shared_pointer);
mem.shared_pointer = nullptr;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}

stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
}

void GPUDevice::generic_copy_to(device_memory &mem)
@@ -315,6 +315,7 @@ class Device {

virtual void mem_alloc(device_memory &mem) = 0;
virtual void mem_copy_to(device_memory &mem) = 0;
virtual void mem_move_to_host(device_memory &mem) = 0;
virtual void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) = 0;
virtual void mem_zero(device_memory &mem) = 0;
@@ -379,8 +380,10 @@ class GPUDevice : public Device {
size_t device_mem_in_use = 0;

virtual void init_host_memory(const size_t preferred_texture_headroom = 0,
size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(const size_t size, bool for_texture);
const size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(const size_t size,
const size_t headroom,
const bool for_texture);

/* Allocation, deallocation and copy functions, with corresponding
* support of device/host allocations. */

@@ -29,6 +29,8 @@ class DummyDevice : public Device {

void mem_copy_to(device_memory & /*mem*/) override {}

void mem_move_to_host(device_memory & /*mem*/) override {}

void mem_copy_from(
device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/) override
{
@@ -4,7 +4,6 @@

#ifdef WITH_HIP

# include <climits>
# include <cstdio>
# include <cstdlib>
# include <cstring>
@@ -544,6 +543,25 @@ void HIPDevice::mem_alloc(device_memory &mem)
}

void HIPDevice::mem_copy_to(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_copy_to(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_copy_to((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
generic_copy_to(mem);
}
}
}

void HIPDevice::mem_move_to_host(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_free(mem);
@@ -554,10 +572,7 @@ void HIPDevice::mem_copy_to(device_memory &mem)
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
}
generic_copy_to(mem);
assert(0);
}
}

@@ -652,6 +667,19 @@ void HIPDevice::global_alloc(device_memory &mem)
const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void HIPDevice::global_copy_to(device_memory &mem)
{
if (!mem.device_pointer) {
generic_alloc(mem);
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
generic_copy_to(mem);
}

const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void HIPDevice::global_free(device_memory &mem)
{
if (mem.is_resident(this) && mem.device_pointer) {
@@ -659,13 +687,52 @@ void HIPDevice::global_free(device_memory &mem)
}
}
static size_t tex_src_pitch(const device_texture &mem)
{
return mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
}

static hip_Memcpy2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
{
/* 2D texture using pitch aligned linear memory. */
const size_t src_pitch = tex_src_pitch(mem);
const size_t dst_pitch = align_up(src_pitch, pitch_alignment);

hip_Memcpy2D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = hipMemoryTypeDevice;
param.dstDevice = mem.device_pointer;
param.dstPitch = dst_pitch;
param.srcMemoryType = hipMemoryTypeHost;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;

return param;
}

static HIP_MEMCPY3D tex_3d_copy_param(const device_texture &mem)
{
const size_t src_pitch = tex_src_pitch(mem);

HIP_MEMCPY3D param;
memset(&param, 0, sizeof(HIP_MEMCPY3D));
param.dstMemoryType = hipMemoryTypeArray;
param.dstArray = (hArray)mem.device_pointer;
param.srcMemoryType = hipMemoryTypeHost;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;
param.Depth = mem.data_depth;
return param;
}

void HIPDevice::tex_alloc(device_texture &mem)
{
HIPContextScope scope(this);

size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();

hipTextureAddressMode address_mode = hipAddressModeWrap;
switch (mem.info.extension) {
case EXTENSION_REPEAT:
@@ -721,8 +788,6 @@ void HIPDevice::tex_alloc(device_texture &mem)

Mem *cmem = nullptr;
hArray array_3d = nullptr;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;

if (!mem.is_resident(this)) {
thread_scoped_lock lock(device_mem_map_mutex);
@@ -733,9 +798,6 @@ void HIPDevice::tex_alloc(device_texture &mem)
array_3d = (hArray)mem.device_pointer;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
}
}
else if (mem.data_depth > 1) {
/* 3D texture using array, there is no API for linear memory. */
@@ -758,22 +820,12 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}

HIP_MEMCPY3D param;
memset(&param, 0, sizeof(HIP_MEMCPY3D));
param.dstMemoryType = hipMemoryTypeArray;
param.dstArray = array_3d;
param.srcMemoryType = hipMemoryTypeHost;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;
param.Depth = mem.data_depth;

hip_assert(hipDrvMemcpy3D(&param));

mem.device_pointer = (device_ptr)array_3d;
mem.device_size = size;
stats.mem_alloc(size);
mem.device_size = mem.memory_size();
stats.mem_alloc(mem.memory_size());

const HIP_MEMCPY3D param = tex_3d_copy_param(mem);
hip_assert(hipDrvMemcpy3D(&param));

thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
@@ -782,25 +834,15 @@ void HIPDevice::tex_alloc(device_texture &mem)
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
dst_pitch = align_up(src_pitch, pitch_alignment);
size_t dst_size = dst_pitch * mem.data_height;
const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
const size_t dst_size = dst_pitch * mem.data_height;

cmem = generic_alloc(mem, dst_size - mem.memory_size());
if (!cmem) {
return;
}

hip_Memcpy2D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = hipMemoryTypeDevice;
param.dstDevice = mem.device_pointer;
param.dstPitch = dst_pitch;
param.srcMemoryType = hipMemoryTypeHost;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;

const hip_Memcpy2D param = tex_2d_copy_param(mem, pitch_alignment);
hip_assert(hipDrvMemcpy2DUnaligned(&param));
}
else {
@@ -810,7 +852,7 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}

hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, mem.memory_size()));
}

/* Resize once */
@@ -840,6 +882,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
resDesc.flags = 0;
}
else if (mem.data_height > 0) {
const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);

resDesc.resType = hipResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = mem.device_pointer;
resDesc.res.pitch2D.format = format;
@@ -880,39 +924,76 @@ void HIPDevice::tex_alloc(device_texture &mem)
}
}
void HIPDevice::tex_free(device_texture &mem)
void HIPDevice::tex_copy_to(device_texture &mem)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

if (cmem.texobject) {
/* Free bindless texture. */
hipTexObjectDestroy(cmem.texobject);
if (!mem.device_pointer) {
/* Not yet allocated on device. */
tex_alloc(mem);
}
else if (!mem.is_resident(this)) {
/* Peering with another device, may still need to create texture info and object. */
if (texture_info[mem.slot].data == 0) {
tex_alloc(mem);
}

if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
/* Resident and fully allocated, only copy. */
if (mem.data_depth > 0) {
HIPContextScope scope(this);
const HIP_MEMCPY3D param = tex_3d_copy_param(mem);
hip_assert(hipDrvMemcpy3D(&param));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
else if (mem.data_height > 0) {
HIPContextScope scope(this);
const hip_Memcpy2D param = tex_2d_copy_param(mem, pitch_alignment);
hip_assert(hipDrvMemcpy2DUnaligned(&param));
}
else {
lock.unlock();
generic_free(mem);
generic_copy_to(mem);
}
}
}

void HIPDevice::tex_free(device_texture &mem)
{
HIPContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);

/* Check if the memory was allocated for this device. */
auto it = device_mem_map.find(&mem);
if (it == device_mem_map.end()) {
return;
}

const Mem &cmem = it->second;

/* Always clear texture info and texture object, regardless of residency. */
texture_info[mem.slot] = TextureInfo();

if (cmem.texobject) {
/* Free bindless texture. */
hipTexObjectDestroy(cmem.texobject);
}

if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();
generic_free(mem);
}
}

unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
{
return make_unique<HIPDeviceQueue>(this);
@@ -72,6 +72,8 @@ class HIPDevice : public GPUDevice {

void mem_copy_to(device_memory &mem) override;

void mem_move_to_host(device_memory &mem) override;

void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;

@@ -84,11 +86,11 @@ class HIPDevice : public GPUDevice {
void const_copy_to(const char *name, void *host, const size_t size) override;

void global_alloc(device_memory &mem);

void global_copy_to(device_memory &mem);
void global_free(device_memory &mem);

void tex_alloc(device_texture &mem);

void tex_copy_to(device_texture &mem);
void tex_free(device_texture &mem);

/* Graphics resources interoperability. */

@@ -82,6 +82,13 @@ void device_memory::device_copy_to()
}
}

void device_memory::device_move_to_host()
{
if (host_pointer) {
device->mem_move_to_host(*this);
}
}

void device_memory::device_copy_from(const size_t y, const size_t w, size_t h, const size_t elem)
{
assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);

@@ -288,6 +288,7 @@ class device_memory {
void device_alloc();
void device_free();
void device_copy_to();
void device_move_to_host();
void device_copy_from(const size_t y, const size_t w, size_t h, const size_t elem);
void device_zero();
@@ -161,6 +161,8 @@ class MetalDevice : public Device {

void mem_copy_to(device_memory &mem) override;

void mem_move_to_host(device_memory &mem) override;

void mem_copy_from(device_memory &mem)
{
mem_copy_from(mem, -1, -1, -1, -1);
@@ -177,13 +179,11 @@ class MetalDevice : public Device {
void const_copy_to(const char *name, void *host, const size_t size) override;

void global_alloc(device_memory &mem);

void global_free(device_memory &mem);

void tex_alloc(device_texture &mem);

void tex_alloc_as_buffer(device_texture &mem);

void tex_copy_to(device_texture &mem);
void tex_free(device_texture &mem);

void flush_delayed_free_list();
@@ -772,46 +772,48 @@ void MetalDevice::generic_copy_to(device_memory &mem)

void MetalDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
size_t size = mmem.size;

/* If mmem.use_uma is true, reference counting is used
* to safely free memory. */

bool free_mtlBuffer = false;

if (mmem.use_UMA) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
free_mtlBuffer = true;
}
}
}
else {
free_mtlBuffer = true;
}

if (free_mtlBuffer) {
if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
/* Safely move the device-side data back to the host before it is freed. */
mem.host_pointer = mem.host_alloc(size);
memcpy(mem.host_pointer, mem.shared_pointer, size);
mmem.use_UMA = false;
}

mem.shared_pointer = nullptr;

/* Free device memory. */
delayed_free_list.push_back(mmem.mtlBuffer);
mmem.mtlBuffer = nil;
}

erase_allocation(mem);
if (!mem.device_pointer) {
return;
}

std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
size_t size = mmem.size;

/* If mmem.use_uma is true, reference counting is used
* to safely free memory. */

bool free_mtlBuffer = false;

if (mmem.use_UMA) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
free_mtlBuffer = true;
}
}
}
else {
free_mtlBuffer = true;
}

if (free_mtlBuffer) {
if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
/* Safely move the device-side data back to the host before it is freed. */
mem.host_pointer = mem.host_alloc(size);
memcpy(mem.host_pointer, mem.shared_pointer, size);
mmem.use_UMA = false;
}

mem.shared_pointer = nullptr;

/* Free device memory. */
delayed_free_list.push_back(mmem.mtlBuffer);
mmem.mtlBuffer = nil;
}

erase_allocation(mem);
}

void MetalDevice::mem_alloc(device_memory &mem)
@@ -829,20 +831,35 @@ void MetalDevice::mem_alloc(device_memory &mem)

void MetalDevice::mem_copy_to(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_free(mem);
global_alloc(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_free((device_texture &)mem);
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
if (!mem.device_pointer) {
if (mem.type == MEM_GLOBAL) {
global_alloc(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_alloc((device_texture &)mem);
}
else {
generic_alloc(mem);
generic_copy_to(mem);
}
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
if (mem.type == MEM_GLOBAL) {
generic_copy_to(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_copy_to((device_texture &)mem);
}
else {
generic_copy_to(mem);
}
}
}

void MetalDevice::mem_move_to_host(device_memory & /*mem*/)
{
/* Metal implements own mechanism for moving host memory. */
assert(0);
}

void MetalDevice::mem_copy_from(
@@ -1116,7 +1133,6 @@ void MetalDevice::tex_alloc(device_texture &mem)
}

/* General variables for both architectures */
size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();

/* sampler_index maps into the GPU's constant 'metal_samplers' array */
@@ -1178,7 +1194,7 @@ void MetalDevice::tex_alloc(device_texture &mem)
assert(format != MTLPixelFormatInvalid);

id<MTLTexture> mtlTexture = nil;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;

if (mem.data_depth > 1) {
/* 3D texture using array */
@@ -1309,6 +1325,45 @@ void MetalDevice::tex_alloc(device_texture &mem)
}
}

void MetalDevice::tex_copy_to(device_texture &mem)
{
if (mem.is_resident(this)) {
const size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;

if (mem.data_depth > 0) {
id<MTLTexture> mtlTexture;
{
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
}
const size_t imageBytes = src_pitch * mem.data_height;
for (size_t d = 0; d < mem.data_depth; d++) {
const size_t offset = d * imageBytes;
[mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
mipmapLevel:0
slice:0
withBytes:(uint8_t *)mem.host_pointer + offset
bytesPerRow:src_pitch
bytesPerImage:0];
}
}
else if (mem.data_height > 0) {
id<MTLTexture> mtlTexture;
{
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
}
[mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
mipmapLevel:0
withBytes:mem.host_pointer
bytesPerRow:src_pitch];
}
else {
generic_copy_to(mem);
}
}
}

void MetalDevice::tex_free(device_texture &mem)
{
if (mem.data_depth == 0 && mem.data_height == 0) {
@@ -340,7 +340,6 @@ class MultiDevice : public Device {
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;

/* The tile buffers are allocated on each device (see below), so copy to all of them */
for (const vector<SubDevice *> &island : peer_islands) {
SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
mem.device = owner_sub->device.get();
@@ -365,6 +364,36 @@ class MultiDevice : public Device {
stats.mem_alloc(mem.device_size - existing_size);
}

void mem_move_to_host(device_memory &mem) override
{
device_ptr existing_key = mem.device_pointer;
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;

for (const vector<SubDevice *> &island : peer_islands) {
SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
mem.device = owner_sub->device.get();
mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
mem.device_size = existing_size;

owner_sub->device->mem_move_to_host(mem);
owner_sub->ptr_map[key] = mem.device_pointer;

if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
/* Need to create texture objects and update pointer in kernel globals on all devices */
for (SubDevice *island_sub : island) {
if (island_sub != owner_sub) {
island_sub->device->mem_move_to_host(mem);
}
}
}
}

mem.device = this;
mem.device_pointer = key;
stats.mem_alloc(mem.device_size - existing_size);
}

void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override
{
@@ -112,7 +112,6 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi

max_memory_on_device_ = get_memcapacity();
init_host_memory();
move_texture_to_host = false;
can_map_host = true;

const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
@@ -417,6 +416,34 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
return;
}

if (mem.type == MEM_GLOBAL) {
global_copy_to(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_copy_to((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
}
generic_copy_to(mem);
}
}

void OneapiDevice::mem_move_to_host(device_memory &mem)
{
if (mem.name) {
VLOG_DEBUG << "OneapiDevice::mem_move_to_host: \"" << mem.name << "\", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
}

/* After getting runtime errors we need to avoid performing oneAPI runtime operations
* because the associated GPU context may be in an invalid state at this point. */
if (have_error()) {
return;
}

if (mem.type == MEM_GLOBAL) {
global_free(mem);
global_alloc(mem);
@@ -426,11 +453,7 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
}

generic_copy_to(mem);
assert(0);
}
}

@@ -596,6 +619,16 @@ void OneapiDevice::global_alloc(device_memory &mem)
usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}

void OneapiDevice::global_copy_to(device_memory &mem)
{
if (!mem.device_pointer) {
global_alloc(mem);
}
else {
generic_copy_to(mem);
}
}

void OneapiDevice::global_free(device_memory &mem)
{
if (mem.device_pointer) {
@@ -620,6 +653,11 @@ void OneapiDevice::tex_alloc(device_texture &mem)
texture_info[slot].data = (uint64_t)mem.device_pointer;
}

void OneapiDevice::tex_copy_to(device_texture &mem)
{
generic_copy_to(mem);
}

void OneapiDevice::tex_free(device_texture &mem)
{
/* There is no texture memory in SYCL. */
|
||||
@@ -75,6 +75,8 @@ class OneapiDevice : public GPUDevice {
|
||||
|
||||
void mem_copy_to(device_memory &mem) override;
|
||||
|
||||
void mem_move_to_host(device_memory &mem) override;
|
||||
|
||||
void mem_copy_from(
|
||||
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;
|
||||
|
||||
@@ -92,11 +94,11 @@ class OneapiDevice : public GPUDevice {
|
||||
void const_copy_to(const char *name, void *host, const size_t size) override;
|
||||
|
||||
void global_alloc(device_memory &mem);
|
||||
|
||||
void global_copy_to(device_memory &mem);
|
||||
void global_free(device_memory &mem);
|
||||
|
||||
void tex_alloc(device_texture &mem);
|
||||
|
||||
void tex_copy_to(device_texture &mem);
|
||||
void tex_free(device_texture &mem);
|
||||
|
||||
/* Graphics resources interoperability. */
|
||||
|
||||