Fix: Cycles: Re-copy memory from host to device without realloc
This should be a bit more efficient, and it fixes host memory fallback bugs where host memory was incorrectly freed during a re-copy. For the case where memory should be reallocated on the host, a new mem_move_to_host was added.

Thanks to Jorn Visser for investigating and finding this problem.

Pull Request: https://projects.blender.org/blender/blender/pulls/132912
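To illustrate the intended behaviour, here is a minimal, self-contained C++ sketch; it is not Cycles code, and all names in it are illustrative assumptions. The point it demonstrates: a re-copy reuses the existing device allocation instead of freeing and reallocating it, and moving data back to the host is a separate, explicit operation that leaves the host copy intact.

/* Standalone sketch (hypothetical names, not the Cycles API). */
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct ExampleDeviceMemory {
  void *host_pointer = nullptr;   /* data prepared on the host */
  void *device_pointer = nullptr; /* simulated device allocation */
  size_t size = 0;
};

/* Re-copy: allocate only when no device allocation exists yet,
 * otherwise reuse it. No free + realloc on every copy. */
static void example_mem_copy_to(ExampleDeviceMemory &mem)
{
  if (!mem.device_pointer) {
    mem.device_pointer = std::malloc(mem.size);
  }
  std::memcpy(mem.device_pointer, mem.host_pointer, mem.size);
}

/* Explicit host fallback: drop the device copy, keep the host data.
 * The bug being fixed was that the old re-copy path could free host
 * memory instead of handling this case as a separate operation. */
static void example_mem_move_to_host(ExampleDeviceMemory &mem)
{
  std::free(mem.device_pointer);
  mem.device_pointer = nullptr; /* host_pointer stays valid */
}

int main()
{
  ExampleDeviceMemory mem;
  mem.size = 16;
  mem.host_pointer = std::malloc(mem.size);
  std::memset(mem.host_pointer, 0xAB, mem.size);

  example_mem_copy_to(mem);      /* first copy: allocates on the "device" */
  example_mem_copy_to(mem);      /* re-copy: reuses the same allocation */
  example_mem_move_to_host(mem); /* explicit fallback to host memory */

  std::printf("host data intact after move to host: %d\n",
              ((unsigned char *)mem.host_pointer)[0] == 0xAB);
  std::free(mem.host_pointer);
  return 0;
}

The actual per-backend changes follow in the diff below.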
@@ -139,6 +139,11 @@ void CPUDevice::mem_copy_to(device_memory &mem)
}
}

void CPUDevice::mem_move_to_host(device_memory & /*mem*/)
{
/* no-op */
}

void CPUDevice::mem_copy_from(
device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/)
{

@@ -66,6 +66,7 @@ class CPUDevice : public Device {

void mem_alloc(device_memory &mem) override;
void mem_copy_to(device_memory &mem) override;
void mem_move_to_host(device_memory &mem) override;
void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;
void mem_zero(device_memory &mem) override;
@@ -581,6 +581,25 @@ void CUDADevice::mem_alloc(device_memory &mem)
}

void CUDADevice::mem_copy_to(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_copy_to(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_copy_to((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
generic_copy_to(mem);
}
}
}

void CUDADevice::mem_move_to_host(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_free(mem);
@@ -591,10 +610,7 @@ void CUDADevice::mem_copy_to(device_memory &mem)
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
}
generic_copy_to(mem);
assert(0);
}
}

@@ -689,6 +705,19 @@ void CUDADevice::global_alloc(device_memory &mem)
const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void CUDADevice::global_copy_to(device_memory &mem)
{
if (!mem.device_pointer) {
generic_alloc(mem);
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
generic_copy_to(mem);
}

const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void CUDADevice::global_free(device_memory &mem)
{
if (mem.is_resident(this) && mem.device_pointer) {
@@ -696,13 +725,53 @@ void CUDADevice::global_free(device_memory &mem)
}
}
static size_t tex_src_pitch(const device_texture &mem)
{
return mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
}

static CUDA_MEMCPY2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
{
/* 2D texture using pitch aligned linear memory. */
const size_t src_pitch = tex_src_pitch(mem);
const size_t dst_pitch = align_up(src_pitch, pitch_alignment);

CUDA_MEMCPY2D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
param.dstDevice = mem.device_pointer;
param.dstPitch = dst_pitch;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;

return param;
}

static CUDA_MEMCPY3D tex_3d_copy_param(const device_texture &mem)
{
const size_t src_pitch = tex_src_pitch(mem);

CUDA_MEMCPY3D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
param.dstArray = (CUarray)mem.device_pointer;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;
param.Depth = mem.data_depth;

return param;
}

void CUDADevice::tex_alloc(device_texture &mem)
{
CUDAContextScope scope(this);

size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();

CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
switch (mem.info.extension) {
case EXTENSION_REPEAT:
@@ -761,8 +830,6 @@ void CUDADevice::tex_alloc(device_texture &mem)

Mem *cmem = nullptr;
CUarray array_3d = nullptr;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;

if (!mem.is_resident(this)) {
thread_scoped_lock lock(device_mem_map_mutex);
@@ -773,9 +840,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
array_3d = (CUarray)mem.device_pointer;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
}
}
else if (mem.data_depth > 1) {
/* 3D texture using array, there is no API for linear memory. */
@@ -798,22 +862,12 @@ void CUDADevice::tex_alloc(device_texture &mem)
return;
}

CUDA_MEMCPY3D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
param.dstArray = array_3d;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;
param.Depth = mem.data_depth;

cuda_assert(cuMemcpy3D(&param));

mem.device_pointer = (device_ptr)array_3d;
mem.device_size = size;
stats.mem_alloc(size);
mem.device_size = mem.memory_size();
stats.mem_alloc(mem.memory_size());

const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
cuda_assert(cuMemcpy3D(&param));

thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
@@ -822,25 +876,15 @@ void CUDADevice::tex_alloc(device_texture &mem)
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
dst_pitch = align_up(src_pitch, pitch_alignment);
size_t dst_size = dst_pitch * mem.data_height;
const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
const size_t dst_size = dst_pitch * mem.data_height;

cmem = generic_alloc(mem, dst_size - mem.memory_size());
if (!cmem) {
return;
}

CUDA_MEMCPY2D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
param.dstDevice = mem.device_pointer;
param.dstPitch = dst_pitch;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;

const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
cuda_assert(cuMemcpy2DUnaligned(&param));
}
else {
@@ -850,7 +894,7 @@ void CUDADevice::tex_alloc(device_texture &mem)
return;
}

cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, mem.memory_size()));
}

/* Resize once */
@@ -879,6 +923,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
resDesc.flags = 0;
}
else if (mem.data_height > 0) {
const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);

resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
resDesc.res.pitch2D.devPtr = mem.device_pointer;
resDesc.res.pitch2D.format = format;
@@ -917,39 +963,76 @@ void CUDADevice::tex_alloc(device_texture &mem)
}
}
void CUDADevice::tex_free(device_texture &mem)
void CUDADevice::tex_copy_to(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

if (cmem.texobject) {
/* Free bindless texture. */
cuTexObjectDestroy(cmem.texobject);
if (!mem.device_pointer) {
/* Not yet allocated on device. */
tex_alloc(mem);
}
else if (!mem.is_resident(this)) {
/* Peering with another device, may still need to create texture info and object. */
if (texture_info[mem.slot].data == 0) {
tex_alloc(mem);
}

if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
/* Resident and fully allocated, only copy. */
if (mem.data_depth > 0) {
CUDAContextScope scope(this);
const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
cuda_assert(cuMemcpy3D(&param));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
else if (mem.data_height > 0) {
CUDAContextScope scope(this);
const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
cuda_assert(cuMemcpy2DUnaligned(&param));
}
else {
lock.unlock();
generic_free(mem);
generic_copy_to(mem);
}
}
}

void CUDADevice::tex_free(device_texture &mem)
{
CUDAContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);

/* Check if the memory was allocated for this device. */
auto it = device_mem_map.find(&mem);
if (it == device_mem_map.end()) {
return;
}

const Mem &cmem = it->second;

/* Always clear texture info and texture object, regardless of residency. */
texture_info[mem.slot] = TextureInfo();

if (cmem.texobject) {
/* Free bindless texture. */
cuTexObjectDestroy(cmem.texobject);
}

if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();
generic_free(mem);
}
}

unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
{
return make_unique<CUDADeviceQueue>(this);
@@ -74,6 +74,8 @@ class CUDADevice : public GPUDevice {

void mem_copy_to(device_memory &mem) override;

void mem_move_to_host(device_memory &mem) override;

void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;

@@ -86,11 +88,11 @@ class CUDADevice : public GPUDevice {
void const_copy_to(const char *name, void *host, const size_t size) override;

void global_alloc(device_memory &mem);

void global_copy_to(device_memory &mem);
void global_free(device_memory &mem);

void tex_alloc(device_texture &mem);

void tex_copy_to(device_texture &mem);
void tex_free(device_texture &mem);

bool should_use_graphics_interop() override;

@@ -614,7 +614,7 @@ void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another devices would only be caught in this mutex
* if it so happens to do an allocation at the same time as well. */
max_mem->device_copy_to();
max_mem->device_move_to_host();
size = (max_size >= size) ? 0 : size - max_size;

any_device_moving_textures_to_host = false;
@@ -758,40 +758,42 @@ GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, const size_t pitch_

void GPUDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
const thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */

if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = nullptr;
}
free_host(mem.shared_pointer);
mem.shared_pointer = nullptr;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}

stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
if (!(mem.device_pointer && mem.is_resident(this))) {
return;
}

const thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */

if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = nullptr;
}
free_host(mem.shared_pointer);
mem.shared_pointer = nullptr;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}

stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
}

void GPUDevice::generic_copy_to(device_memory &mem)
@@ -315,6 +315,7 @@ class Device {

virtual void mem_alloc(device_memory &mem) = 0;
virtual void mem_copy_to(device_memory &mem) = 0;
virtual void mem_move_to_host(device_memory &mem) = 0;
virtual void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) = 0;
virtual void mem_zero(device_memory &mem) = 0;
@@ -379,8 +380,10 @@ class GPUDevice : public Device {
size_t device_mem_in_use = 0;

virtual void init_host_memory(const size_t preferred_texture_headroom = 0,
size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(const size_t size, bool for_texture);
const size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(const size_t size,
const size_t headroom,
const bool for_texture);

/* Allocation, deallocation and copy functions, with corresponding
* support of device/host allocations. */

@@ -29,6 +29,8 @@ class DummyDevice : public Device {

void mem_copy_to(device_memory & /*mem*/) override {}

void mem_move_to_host(device_memory & /*mem*/) override {}

void mem_copy_from(
device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/) override
{
@@ -4,7 +4,6 @@

#ifdef WITH_HIP

# include <climits>
# include <cstdio>
# include <cstdlib>
# include <cstring>
@@ -544,6 +543,25 @@ void HIPDevice::mem_alloc(device_memory &mem)
}

void HIPDevice::mem_copy_to(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_copy_to(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_copy_to((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
generic_copy_to(mem);
}
}
}

void HIPDevice::mem_move_to_host(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_free(mem);
@@ -554,10 +572,7 @@ void HIPDevice::mem_copy_to(device_memory &mem)
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
}
generic_copy_to(mem);
assert(0);
}
}

@@ -652,6 +667,19 @@ void HIPDevice::global_alloc(device_memory &mem)
const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void HIPDevice::global_copy_to(device_memory &mem)
{
if (!mem.device_pointer) {
generic_alloc(mem);
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
generic_copy_to(mem);
}

const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void HIPDevice::global_free(device_memory &mem)
{
if (mem.is_resident(this) && mem.device_pointer) {
@@ -659,13 +687,52 @@ void HIPDevice::global_free(device_memory &mem)
}
}
static size_t tex_src_pitch(const device_texture &mem)
{
return mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
}

static hip_Memcpy2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
{
/* 2D texture using pitch aligned linear memory. */
const size_t src_pitch = tex_src_pitch(mem);
const size_t dst_pitch = align_up(src_pitch, pitch_alignment);

hip_Memcpy2D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = hipMemoryTypeDevice;
param.dstDevice = mem.device_pointer;
param.dstPitch = dst_pitch;
param.srcMemoryType = hipMemoryTypeHost;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;

return param;
}

static HIP_MEMCPY3D tex_3d_copy_param(const device_texture &mem)
{
const size_t src_pitch = tex_src_pitch(mem);

HIP_MEMCPY3D param;
memset(&param, 0, sizeof(HIP_MEMCPY3D));
param.dstMemoryType = hipMemoryTypeArray;
param.dstArray = (hArray)mem.device_pointer;
param.srcMemoryType = hipMemoryTypeHost;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;
param.Depth = mem.data_depth;
return param;
}

void HIPDevice::tex_alloc(device_texture &mem)
{
HIPContextScope scope(this);

size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();

hipTextureAddressMode address_mode = hipAddressModeWrap;
switch (mem.info.extension) {
case EXTENSION_REPEAT:
@@ -721,8 +788,6 @@ void HIPDevice::tex_alloc(device_texture &mem)

Mem *cmem = nullptr;
hArray array_3d = nullptr;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;

if (!mem.is_resident(this)) {
thread_scoped_lock lock(device_mem_map_mutex);
@@ -733,9 +798,6 @@ void HIPDevice::tex_alloc(device_texture &mem)
array_3d = (hArray)mem.device_pointer;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
}
}
else if (mem.data_depth > 1) {
/* 3D texture using array, there is no API for linear memory. */
@@ -758,22 +820,12 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}

HIP_MEMCPY3D param;
memset(&param, 0, sizeof(HIP_MEMCPY3D));
param.dstMemoryType = hipMemoryTypeArray;
param.dstArray = array_3d;
param.srcMemoryType = hipMemoryTypeHost;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;
param.Depth = mem.data_depth;

hip_assert(hipDrvMemcpy3D(&param));

mem.device_pointer = (device_ptr)array_3d;
mem.device_size = size;
stats.mem_alloc(size);
mem.device_size = mem.memory_size();
stats.mem_alloc(mem.memory_size());

const HIP_MEMCPY3D param = tex_3d_copy_param(mem);
hip_assert(hipDrvMemcpy3D(&param));

thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
@@ -782,25 +834,15 @@ void HIPDevice::tex_alloc(device_texture &mem)
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
dst_pitch = align_up(src_pitch, pitch_alignment);
size_t dst_size = dst_pitch * mem.data_height;
const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
const size_t dst_size = dst_pitch * mem.data_height;

cmem = generic_alloc(mem, dst_size - mem.memory_size());
if (!cmem) {
return;
}

hip_Memcpy2D param;
memset(&param, 0, sizeof(param));
param.dstMemoryType = hipMemoryTypeDevice;
param.dstDevice = mem.device_pointer;
param.dstPitch = dst_pitch;
param.srcMemoryType = hipMemoryTypeHost;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;

const hip_Memcpy2D param = tex_2d_copy_param(mem, pitch_alignment);
hip_assert(hipDrvMemcpy2DUnaligned(&param));
}
else {
@@ -810,7 +852,7 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}

hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, mem.memory_size()));
}

/* Resize once */
@@ -840,6 +882,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
resDesc.flags = 0;
}
else if (mem.data_height > 0) {
const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);

resDesc.resType = hipResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = mem.device_pointer;
resDesc.res.pitch2D.format = format;
@@ -880,39 +924,76 @@ void HIPDevice::tex_alloc(device_texture &mem)
}
}
void HIPDevice::tex_free(device_texture &mem)
void HIPDevice::tex_copy_to(device_texture &mem)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

if (cmem.texobject) {
/* Free bindless texture. */
hipTexObjectDestroy(cmem.texobject);
if (!mem.device_pointer) {
/* Not yet allocated on device. */
tex_alloc(mem);
}
else if (!mem.is_resident(this)) {
/* Peering with another device, may still need to create texture info and object. */
if (texture_info[mem.slot].data == 0) {
tex_alloc(mem);
}

if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
/* Resident and fully allocated, only copy. */
if (mem.data_depth > 0) {
HIPContextScope scope(this);
const HIP_MEMCPY3D param = tex_3d_copy_param(mem);
hip_assert(hipDrvMemcpy3D(&param));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
else if (mem.data_height > 0) {
HIPContextScope scope(this);
const hip_Memcpy2D param = tex_2d_copy_param(mem, pitch_alignment);
hip_assert(hipDrvMemcpy2DUnaligned(&param));
}
else {
lock.unlock();
generic_free(mem);
generic_copy_to(mem);
}
}
}

void HIPDevice::tex_free(device_texture &mem)
{
HIPContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);

/* Check if the memory was allocated for this device. */
auto it = device_mem_map.find(&mem);
if (it == device_mem_map.end()) {
return;
}

const Mem &cmem = it->second;

/* Always clear texture info and texture object, regardless of residency. */
texture_info[mem.slot] = TextureInfo();

if (cmem.texobject) {
/* Free bindless texture. */
hipTexObjectDestroy(cmem.texobject);
}

if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();
generic_free(mem);
}
}

unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
{
return make_unique<HIPDeviceQueue>(this);
@@ -72,6 +72,8 @@ class HIPDevice : public GPUDevice {

void mem_copy_to(device_memory &mem) override;

void mem_move_to_host(device_memory &mem) override;

void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;

@@ -84,11 +86,11 @@ class HIPDevice : public GPUDevice {
void const_copy_to(const char *name, void *host, const size_t size) override;

void global_alloc(device_memory &mem);

void global_copy_to(device_memory &mem);
void global_free(device_memory &mem);

void tex_alloc(device_texture &mem);

void tex_copy_to(device_texture &mem);
void tex_free(device_texture &mem);

/* Graphics resources interoperability. */

@@ -82,6 +82,13 @@ void device_memory::device_copy_to()
}
}

void device_memory::device_move_to_host()
{
if (host_pointer) {
device->mem_move_to_host(*this);
}
}

void device_memory::device_copy_from(const size_t y, const size_t w, size_t h, const size_t elem)
{
assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);

@@ -288,6 +288,7 @@ class device_memory {
void device_alloc();
void device_free();
void device_copy_to();
void device_move_to_host();
void device_copy_from(const size_t y, const size_t w, size_t h, const size_t elem);
void device_zero();
@@ -161,6 +161,8 @@ class MetalDevice : public Device {

void mem_copy_to(device_memory &mem) override;

void mem_move_to_host(device_memory &mem) override;

void mem_copy_from(device_memory &mem)
{
mem_copy_from(mem, -1, -1, -1, -1);
@@ -177,13 +179,11 @@ class MetalDevice : public Device {
void const_copy_to(const char *name, void *host, const size_t size) override;

void global_alloc(device_memory &mem);

void global_free(device_memory &mem);

void tex_alloc(device_texture &mem);

void tex_alloc_as_buffer(device_texture &mem);

void tex_copy_to(device_texture &mem);
void tex_free(device_texture &mem);

void flush_delayed_free_list();
@@ -772,46 +772,48 @@ void MetalDevice::generic_copy_to(device_memory &mem)

void MetalDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
size_t size = mmem.size;

/* If mmem.use_uma is true, reference counting is used
* to safely free memory. */

bool free_mtlBuffer = false;

if (mmem.use_UMA) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
free_mtlBuffer = true;
}
}
}
else {
free_mtlBuffer = true;
}

if (free_mtlBuffer) {
if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
/* Safely move the device-side data back to the host before it is freed. */
mem.host_pointer = mem.host_alloc(size);
memcpy(mem.host_pointer, mem.shared_pointer, size);
mmem.use_UMA = false;
}

mem.shared_pointer = nullptr;

/* Free device memory. */
delayed_free_list.push_back(mmem.mtlBuffer);
mmem.mtlBuffer = nil;
}

erase_allocation(mem);
if (!mem.device_pointer) {
return;
}

std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
size_t size = mmem.size;

/* If mmem.use_uma is true, reference counting is used
* to safely free memory. */

bool free_mtlBuffer = false;

if (mmem.use_UMA) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
free_mtlBuffer = true;
}
}
}
else {
free_mtlBuffer = true;
}

if (free_mtlBuffer) {
if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
/* Safely move the device-side data back to the host before it is freed. */
mem.host_pointer = mem.host_alloc(size);
memcpy(mem.host_pointer, mem.shared_pointer, size);
mmem.use_UMA = false;
}

mem.shared_pointer = nullptr;

/* Free device memory. */
delayed_free_list.push_back(mmem.mtlBuffer);
mmem.mtlBuffer = nil;
}

erase_allocation(mem);
}

void MetalDevice::mem_alloc(device_memory &mem)
@@ -829,20 +831,35 @@ void MetalDevice::mem_alloc(device_memory &mem)

void MetalDevice::mem_copy_to(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_free(mem);
global_alloc(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_free((device_texture &)mem);
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
if (!mem.device_pointer) {
if (mem.type == MEM_GLOBAL) {
global_alloc(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_alloc((device_texture &)mem);
}
else {
generic_alloc(mem);
generic_copy_to(mem);
}
generic_copy_to(mem);
}
else if (mem.is_resident(this)) {
if (mem.type == MEM_GLOBAL) {
generic_copy_to(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_copy_to((device_texture &)mem);
}
else {
generic_copy_to(mem);
}
}
}

void MetalDevice::mem_move_to_host(device_memory & /*mem*/)
{
/* Metal implements own mechanism for moving host memory. */
assert(0);
}

void MetalDevice::mem_copy_from(
@@ -1116,7 +1133,6 @@ void MetalDevice::tex_alloc(device_texture &mem)
}

/* General variables for both architectures */
size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();

/* sampler_index maps into the GPU's constant 'metal_samplers' array */
@@ -1178,7 +1194,7 @@ void MetalDevice::tex_alloc(device_texture &mem)
assert(format != MTLPixelFormatInvalid);

id<MTLTexture> mtlTexture = nil;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;

if (mem.data_depth > 1) {
/* 3D texture using array */
@@ -1309,6 +1325,45 @@ void MetalDevice::tex_alloc(device_texture &mem)
}
}

void MetalDevice::tex_copy_to(device_texture &mem)
{
if (mem.is_resident(this)) {
const size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;

if (mem.data_depth > 0) {
id<MTLTexture> mtlTexture;
{
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
}
const size_t imageBytes = src_pitch * mem.data_height;
for (size_t d = 0; d < mem.data_depth; d++) {
const size_t offset = d * imageBytes;
[mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
mipmapLevel:0
slice:0
withBytes:(uint8_t *)mem.host_pointer + offset
bytesPerRow:src_pitch
bytesPerImage:0];
}
}
else if (mem.data_height > 0) {
id<MTLTexture> mtlTexture;
{
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
mtlTexture = metal_mem_map.at(&mem)->mtlTexture;
}
[mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
mipmapLevel:0
withBytes:mem.host_pointer
bytesPerRow:src_pitch];
}
else {
generic_copy_to(mem);
}
}
}

void MetalDevice::tex_free(device_texture &mem)
{
if (mem.data_depth == 0 && mem.data_height == 0) {
@@ -340,7 +340,6 @@ class MultiDevice : public Device {
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;

/* The tile buffers are allocated on each device (see below), so copy to all of them */
for (const vector<SubDevice *> &island : peer_islands) {
SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
mem.device = owner_sub->device.get();
@@ -365,6 +364,36 @@ class MultiDevice : public Device {
stats.mem_alloc(mem.device_size - existing_size);
}

void mem_move_to_host(device_memory &mem) override
{
device_ptr existing_key = mem.device_pointer;
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;

for (const vector<SubDevice *> &island : peer_islands) {
SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
mem.device = owner_sub->device.get();
mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
mem.device_size = existing_size;

owner_sub->device->mem_move_to_host(mem);
owner_sub->ptr_map[key] = mem.device_pointer;

if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
/* Need to create texture objects and update pointer in kernel globals on all devices */
for (SubDevice *island_sub : island) {
if (island_sub != owner_sub) {
island_sub->device->mem_move_to_host(mem);
}
}
}
}

mem.device = this;
mem.device_pointer = key;
stats.mem_alloc(mem.device_size - existing_size);
}

void mem_copy_from(
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override
{
@@ -112,7 +112,6 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi

max_memory_on_device_ = get_memcapacity();
init_host_memory();
move_texture_to_host = false;
can_map_host = true;

const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
@@ -417,6 +416,34 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
return;
}

if (mem.type == MEM_GLOBAL) {
global_copy_to(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_copy_to((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
}
generic_copy_to(mem);
}
}

void OneapiDevice::mem_move_to_host(device_memory &mem)
{
if (mem.name) {
VLOG_DEBUG << "OneapiDevice::mem_move_to_host: \"" << mem.name << "\", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
}

/* After getting runtime errors we need to avoid performing oneAPI runtime operations
* because the associated GPU context may be in an invalid state at this point. */
if (have_error()) {
return;
}

if (mem.type == MEM_GLOBAL) {
global_free(mem);
global_alloc(mem);
@@ -426,11 +453,7 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
}

generic_copy_to(mem);
assert(0);
}
}

@@ -596,6 +619,16 @@ void OneapiDevice::global_alloc(device_memory &mem)
usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
}

void OneapiDevice::global_copy_to(device_memory &mem)
{
if (!mem.device_pointer) {
global_alloc(mem);
}
else {
generic_copy_to(mem);
}
}

void OneapiDevice::global_free(device_memory &mem)
{
if (mem.device_pointer) {
@@ -620,6 +653,11 @@ void OneapiDevice::tex_alloc(device_texture &mem)
texture_info[slot].data = (uint64_t)mem.device_pointer;
}

void OneapiDevice::tex_copy_to(device_texture &mem)
{
generic_copy_to(mem);
}

void OneapiDevice::tex_free(device_texture &mem)
{
/* There is no texture memory in SYCL. */
|
||||
@@ -75,6 +75,8 @@ class OneapiDevice : public GPUDevice {
|
||||
|
||||
void mem_copy_to(device_memory &mem) override;
|
||||
|
||||
void mem_move_to_host(device_memory &mem) override;
|
||||
|
||||
void mem_copy_from(
|
||||
device_memory &mem, const size_t y, size_t w, const size_t h, size_t elem) override;
|
||||
|
||||
@@ -92,11 +94,11 @@ class OneapiDevice : public GPUDevice {
|
||||
void const_copy_to(const char *name, void *host, const size_t size) override;
|
||||
|
||||
void global_alloc(device_memory &mem);
|
||||
|
||||
void global_copy_to(device_memory &mem);
|
||||
void global_free(device_memory &mem);
|
||||
|
||||
void tex_alloc(device_texture &mem);
|
||||
|
||||
void tex_copy_to(device_texture &mem);
|
||||
void tex_free(device_texture &mem);
|
||||
|
||||
/* Graphics resources interoperability. */
|
||||
|
||||