test2/intern/cycles/device/queue.h

/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#pragma once

#include "device/kernel.h"

#include "device/graphics_interop.h"
#include "util/debug.h"
#include "util/log.h"
#include "util/map.h"
#include "util/string.h"
#include "util/unique_ptr.h"

CCL_NAMESPACE_BEGIN

class Device;
class device_memory;

struct KernelWorkTile;

/* Container for device kernel arguments with type correctness ensured by API. */
struct DeviceKernelArguments {

  enum Type {
    POINTER,
    INT32,
    FLOAT32,
    KERNEL_FILM_CONVERT,
    HIPRT_GLOBAL_STACK,
  };

  static const int MAX_ARGS = 18;
  Type types[MAX_ARGS];
  void *values[MAX_ARGS];
  size_t sizes[MAX_ARGS];
  size_t count = 0;

  DeviceKernelArguments() {}

  template<class T> DeviceKernelArguments(const T *arg)
  {
    add(arg);
  }

  template<class T, class... Args> DeviceKernelArguments(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }

  void add(const KernelFilmConvert *value)
  {
    add(KERNEL_FILM_CONVERT, value, sizeof(KernelFilmConvert));
  }
  void add(const device_ptr *value)
  {
    add(POINTER, value, sizeof(device_ptr));
  }
  void add(const int32_t *value)
  {
    add(INT32, value, sizeof(int32_t));
  }
  void add(const float *value)
  {
    add(FLOAT32, value, sizeof(float));
  }
  void add(const Type type, const void *value, size_t size)
  {
    assert(count < MAX_ARGS);

    types[count] = type;
    values[count] = (void *)value;
    sizes[count] = size;
    count++;
  }
  template<typename T, typename... Args> void add(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }
};

/* Abstraction of a command queue for a device.
 * Provides API to schedule kernel execution in a specific queue with minimal possible overhead
 * from driver side.
 *
 * This class encapsulates all properties needed for commands execution. */
class DeviceQueue {
 public:
  virtual ~DeviceQueue();

  /* Number of concurrent states to process for integrator,
   * based on number of cores and/or available memory. */
  virtual int num_concurrent_states(const size_t state_size) const = 0;

  /* Number of states which keeps the device occupied with work without losing performance.
   * The renderer will add more work (when available) when number of active paths falls below this
   * value. */
  virtual int num_concurrent_busy_states(const size_t state_size) const = 0;

  /* Number of elements in a partition of sorted shaders, that improves memory locality of
   * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
  virtual int num_sort_partition_elements() const
  {
    return 65536;
  }

  /* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
   * INTEGRATOR_SORT_WRITE_PASS)? */
  virtual bool supports_local_atomic_sort() const
  {
    return false;
  }

  /* Initialize execution of kernels on this queue.
   *
   * Will, for example, load all data required by the kernels from Device to global or path state.
   *
   * Use this method after device synchronization has finished before enqueueing any kernels. */
  virtual void init_execution() = 0;

  /* Enqueue kernel execution.
   *
   * Execute the kernel work_size times on the device.
   * Supported arguments types:
   * - int: pass pointer to the int
   * - device memory: pass pointer to device_memory.device_pointer
   * Return false if there was an error executing this or a previous kernel. */
  virtual bool enqueue(DeviceKernel kernel,
                       const int work_size,
                       DeviceKernelArguments const &args) = 0;

  /* Wait unit all enqueued kernels have finished execution.
   * Return false if there was an error executing any of the enqueued kernels. */
  virtual bool synchronize() = 0;

  /* Copy memory to/from device as part of the command queue, to ensure
   * operations are done in order without having to synchronize. */
  virtual void zero_to_device(device_memory &mem) = 0;
  virtual void copy_to_device(device_memory &mem) = 0;
  virtual void copy_from_device(device_memory &mem) = 0;

  /* Graphics resources interoperability.
   *
   * The interoperability comes here by the meaning that the device is capable of computing result
   * directly into an OpenGL (or other graphics library) buffer. */

  /* Create graphics interoperability context which will be taking care of mapping graphics
   * resource as a buffer writable by kernels of this device. */
  virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
  {
    LOG(FATAL) << "Request of GPU interop of a device which does not support it.";
    return nullptr;
  }

  /* Device this queue has been created for. */
  Device *device;

  virtual void *native_queue()
  {
    return nullptr;
  }

 protected:
  /* Hide construction so that allocation via `Device` API is enforced. */
  explicit DeviceQueue(Device *device);

  /* Implementations call these from the corresponding methods to generate debugging logs. */
  void debug_init_execution();
  void debug_enqueue_begin(DeviceKernel kernel, const int work_size);
  void debug_enqueue_end();
  void debug_synchronize();
  string debug_active_kernels();

  /* Combination of kernels enqueued together sync last synchronize. */
  DeviceKernelMask last_kernels_enqueued_;
  /* Time of synchronize call. */
  double last_sync_time_;
  /* Accumulated execution time for combinations of kernels launched together. */
  map<DeviceKernelMask, double> stats_kernel_time_;
  /* If it is true, then a performance statistics in the debugging logs will have focus on kernels
   * and an explicit queue synchronization will be added after each kernel execution. */
  bool is_per_kernel_performance_;
};

CCL_NAMESPACE_END