Refactor: Cycles: Make CPU kernel globals storage more sane

Pull Request: https://projects.blender.org/blender/blender/pulls/132361
2024-12-29 23:13:45 +01:00
parent 57ff24cb99
commit a8654a1dbe
32 changed files with 319 additions and 386 deletions
--- a/intern/cycles/app/cycles_precompute.cpp
+++ b/intern/cycles/app/cycles_precompute.cpp
@@ -19,8 +19,6 @@ CCL_NAMESPACE_BEGIN

 static float precompute_ggx_E(const float rough, const float mu, const float3 rand)
 {
-  KernelGlobalsCPU kg;
-
  MicrofacetBsdf bsdf;
  bsdf.weight = one_float3();
  bsdf.sample_weight = 1.0f;
@@ -36,7 +34,7 @@ static float precompute_ggx_E(const float rough, const float mu, const float3 ra
  float pdf = 0.0f;
  float sampled_eta;
  float2 sampled_roughness;
-  bsdf_microfacet_ggx_sample(&kg,
+  bsdf_microfacet_ggx_sample(nullptr,
                             (ShaderClosure *)&bsdf,
                             make_float3(0.0f, 0.0f, 1.0f),
                             make_float3(sqrtf(1.0f - sqr(mu)), 0.0f, mu),
@@ -57,8 +55,6 @@ static float precompute_ggx_glass_E(const float rough,
                                    const float eta,
                                    const float3 rand)
 {
-  KernelGlobalsCPU kg;
-
  MicrofacetBsdf bsdf;
  bsdf.weight = one_float3();
  bsdf.sample_weight = 1.0f;
@@ -74,7 +70,7 @@ static float precompute_ggx_glass_E(const float rough,
  float pdf = 0.0f;
  float sampled_eta;
  float2 sampled_roughness;
-  bsdf_microfacet_ggx_sample(&kg,
+  bsdf_microfacet_ggx_sample(nullptr,
                             (ShaderClosure *)&bsdf,
                             make_float3(0.0f, 0.0f, 1.0f),
                             make_float3(sqrtf(1.0f - sqr(mu)), 0.0f, mu),
@@ -93,8 +89,6 @@ static float precompute_ggx_glass_E(const float rough,
 static float precompute_ggx_gen_schlick_s(
    const float rough, const float mu, const float eta, const float exponent, const float3 rand)
 {
-  KernelGlobalsCPU kg;
-
  MicrofacetBsdf bsdf;
  bsdf.weight = one_float3();
  bsdf.sample_weight = 1.0f;
@@ -120,7 +114,7 @@ static float precompute_ggx_gen_schlick_s(
  float pdf = 0.0f;
  float sampled_eta;
  float2 sampled_roughness;
-  bsdf_microfacet_ggx_sample(&kg,
+  bsdf_microfacet_ggx_sample(nullptr,
                             (ShaderClosure *)&bsdf,
                             make_float3(0.0f, 0.0f, 1.0f),
                             make_float3(sqrtf(1.0f - sqr(mu)), 0.0f, mu),
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -33,8 +33,6 @@ set(SRC_CPU
  cpu/kernel.cpp
  cpu/kernel.h
  cpu/kernel_function.h
-  cpu/kernel_thread_globals.cpp
-  cpu/kernel_thread_globals.h
 )

 set(SRC_CUDA
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -25,7 +25,6 @@
 #endif

 #include "device/cpu/kernel.h"
-#include "device/cpu/kernel_thread_globals.h"

 #include "device/device.h"

@@ -56,9 +55,6 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
    info.cpu_threads = TaskScheduler::max_concurrency();
  }

-#ifdef WITH_OSL
-  kernel_globals.osl = &osl_globals;
-#endif
 #ifdef WITH_EMBREE
  embree_device = rtcNewDevice("verbose=0");
 #endif
@@ -296,19 +292,19 @@ void *CPUDevice::get_guiding_device() const
 }

 void CPUDevice::get_cpu_kernel_thread_globals(
-    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+    vector<ThreadKernelGlobalsCPU> &kernel_thread_globals)
 {
  /* Ensure latest texture info is loaded into kernel globals before returning. */
  load_texture_info();

  kernel_thread_globals.clear();
-  void *osl_memory = get_cpu_osl_memory();
+  OSLGlobals *osl_globals = get_cpu_osl_memory();
  for (int i = 0; i < info.cpu_threads; i++) {
-    kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler, i);
+    kernel_thread_globals.emplace_back(kernel_globals, osl_globals, profiler, i);
  }
 }

-void *CPUDevice::get_cpu_osl_memory()
+OSLGlobals *CPUDevice::get_cpu_osl_memory()
 {
 #ifdef WITH_OSL
  return &osl_globals;
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -85,8 +85,8 @@ class CPUDevice : public Device {
  void *get_guiding_device() const override;

  void get_cpu_kernel_thread_globals(
-      vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
-  void *get_cpu_osl_memory() override;
+      vector<ThreadKernelGlobalsCPU> &kernel_thread_globals) override;
+  OSLGlobals *get_cpu_osl_memory() override;

 protected:
  bool load_kernels(uint /*kernel_features*/) override;
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -9,7 +9,7 @@

 CCL_NAMESPACE_BEGIN

-struct KernelGlobalsCPU;
+struct ThreadKernelGlobalsCPU;
 struct KernelFilmConvert;
 struct IntegratorStateCPU;
 struct TileInfo;
@@ -19,10 +19,11 @@ class CPUKernels {
  /* Integrator. */

  using IntegratorFunction =
-      CPUKernelFunction<void (*)(const KernelGlobalsCPU *kg, IntegratorStateCPU *state)>;
-  using IntegratorShadeFunction = CPUKernelFunction<void (*)(
-      const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>;
-  using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobalsCPU *kg,
+      CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg, IntegratorStateCPU *state)>;
+  using IntegratorShadeFunction = CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
+                                                             IntegratorStateCPU *state,
+                                                             ccl_global float *render_buffer)>;
+  using IntegratorInitFunction = CPUKernelFunction<bool (*)(const ThreadKernelGlobalsCPU *kg,
                                                            IntegratorStateCPU *state,
                                                            KernelWorkTile *tile,
                                                            ccl_global float *render_buffer)>;
@@ -45,7 +46,7 @@ class CPUKernels {
  /* Shader evaluation. */

  using ShaderEvalFunction = CPUKernelFunction<void (*)(
-      const KernelGlobalsCPU *kg, const KernelShaderEvalInput *, float *, const int)>;
+      const ThreadKernelGlobalsCPU *kg, const KernelShaderEvalInput *, float *, const int)>;

  ShaderEvalFunction shader_eval_displace;
  ShaderEvalFunction shader_eval_background;
@@ -54,7 +55,7 @@ class CPUKernels {
  /* Adaptive stopping. */

  using AdaptiveSamplingConvergenceCheckFunction =
-      CPUKernelFunction<bool (*)(const KernelGlobalsCPU *kg,
+      CPUKernelFunction<bool (*)(const ThreadKernelGlobalsCPU *kg,
                                 ccl_global float *render_buffer,
                                 const int x,
                                 const int y,
@@ -64,7 +65,7 @@ class CPUKernels {
                                 int stride)>;

  using AdaptiveSamplingFilterXFunction =
-      CPUKernelFunction<void (*)(const KernelGlobalsCPU *kg,
+      CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
                                 ccl_global float *render_buffer,
                                 const int y,
                                 const int start_x,
@@ -73,7 +74,7 @@ class CPUKernels {
                                 int stride)>;

  using AdaptiveSamplingFilterYFunction =
-      CPUKernelFunction<void (*)(const KernelGlobalsCPU *kg,
+      CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
                                 ccl_global float *render_buffer,
                                 const int x,
                                 const int start_y,
@@ -89,7 +90,7 @@ class CPUKernels {
  /* Cryptomatte. */

  using CryptomattePostprocessFunction = CPUKernelFunction<void (*)(
-      const KernelGlobalsCPU *kg, ccl_global float *render_buffer, const int pixel_index)>;
+      const ThreadKernelGlobalsCPU *kg, ccl_global float *render_buffer, const int pixel_index)>;

  CryptomattePostprocessFunction cryptomatte_postprocess;

--- a/intern/cycles/device/cpu/kernel_thread_globals.cpp
+++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp
@@ -1,91 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#include "device/cpu/kernel_thread_globals.h"
-
-#include "kernel/osl/globals.h"
-
-#include "util/profiling.h"
-
-CCL_NAMESPACE_BEGIN
-
-CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobalsCPU &kernel_globals,
-                                               void *osl_globals_memory,
-                                               Profiler &cpu_profiler,
-                                               const int thread_index)
-    : KernelGlobalsCPU(kernel_globals), cpu_profiler_(cpu_profiler)
-{
-  clear_runtime_pointers();
-
-#ifdef WITH_OSL
-  OSLGlobals::thread_init(this, static_cast<OSLGlobals *>(osl_globals_memory), thread_index);
-#else
-  (void)thread_index;
-  (void)osl_globals_memory;
-#endif
-
-#ifdef WITH_PATH_GUIDING
-  opgl_path_segment_storage = new openpgl::cpp::PathSegmentStorage();
-#endif
-}
-
-CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept
-    : KernelGlobalsCPU(std::move(other)), cpu_profiler_(other.cpu_profiler_)
-{
-  other.clear_runtime_pointers();
-}
-
-CPUKernelThreadGlobals::~CPUKernelThreadGlobals()
-{
-#ifdef WITH_OSL
-  OSLGlobals::thread_free(this);
-#endif
-
-#ifdef WITH_PATH_GUIDING
-  delete opgl_path_segment_storage;
-  delete opgl_surface_sampling_distribution;
-  delete opgl_volume_sampling_distribution;
-#endif
-}
-
-CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other)
-{
-  if (this == &other) {
-    return *this;
-  }
-
-  *static_cast<KernelGlobalsCPU *>(this) = *static_cast<KernelGlobalsCPU *>(&other);
-
-  other.clear_runtime_pointers();
-
-  return *this;
-}
-
-void CPUKernelThreadGlobals::clear_runtime_pointers()
-{
-#ifdef WITH_OSL
-  osl = nullptr;
-#endif
-
-#ifdef WITH_PATH_GUIDING
-  opgl_sample_data_storage = nullptr;
-  opgl_guiding_field = nullptr;
-
-  opgl_path_segment_storage = nullptr;
-  opgl_surface_sampling_distribution = nullptr;
-  opgl_volume_sampling_distribution = nullptr;
-#endif
-}
-
-void CPUKernelThreadGlobals::start_profiling()
-{
-  cpu_profiler_.add_state(&profiler);
-}
-
-void CPUKernelThreadGlobals::stop_profiling()
-{
-  cpu_profiler_.remove_state(&profiler);
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/device/cpu/kernel_thread_globals.h
+++ b/intern/cycles/device/cpu/kernel_thread_globals.h
@@ -1,45 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#include "kernel/globals.h"
-
-CCL_NAMESPACE_BEGIN
-
-class Profiler;
-
-/* A special class which extends memory ownership of the `KernelGlobalsCPU` decoupling any resource
- * which is not thread-safe for access. Every worker thread which needs to operate on
- * `KernelGlobalsCPU` needs to initialize its own copy of this object.
- *
- * NOTE: Only minimal subset of objects are copied: `KernelData` is never copied. This means that
- * there is no unnecessary data duplication happening when using this object. */
-class CPUKernelThreadGlobals : public KernelGlobalsCPU {
- public:
-  /* TODO(sergey): Would be nice to have properly typed OSLGlobals even in the case when building
-   * without OSL support. Will avoid need to those unnamed pointers and casts. */
-  CPUKernelThreadGlobals(const KernelGlobalsCPU &kernel_globals,
-                         void *osl_globals_memory,
-                         Profiler &cpu_profiler,
-                         const int thread_index);
-
-  ~CPUKernelThreadGlobals();
-
-  CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete;
-  CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept;
-
-  CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete;
-  CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other);
-
-  void start_profiling();
-  void stop_profiling();
-
- protected:
-  void clear_runtime_pointers();
-
-  Profiler &cpu_profiler_;
-};
-
-CCL_NAMESPACE_END
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -483,12 +483,12 @@ const CPUKernels &Device::get_cpu_kernels()
 }

 void Device::get_cpu_kernel_thread_globals(
-    vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/)
+    vector<ThreadKernelGlobalsCPU> & /*kernel_thread_globals*/)
 {
  LOG(FATAL) << "Device does not support CPU kernels.";
 }

-void *Device::get_cpu_osl_memory()
+OSLGlobals *Device::get_cpu_osl_memory()
 {
  return nullptr;
 }
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -28,9 +28,11 @@ class BVH;
 class DeviceQueue;
 class Progress;
 class CPUKernels;
-class CPUKernelThreadGlobals;
 class Scene;

+struct OSLGlobals;
+struct ThreadKernelGlobalsCPU;
+
 /* Device Types */

 enum DeviceType {
@@ -216,9 +218,9 @@ class Device {
  static const CPUKernels &get_cpu_kernels();
  /* Get kernel globals to pass to kernels. */
  virtual void get_cpu_kernel_thread_globals(
-      vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
+      vector<ThreadKernelGlobalsCPU> & /*kernel_thread_globals*/);
  /* Get OpenShadingLanguage memory buffer. */
-  virtual void *get_cpu_osl_memory();
+  virtual OSLGlobals *get_cpu_osl_memory();

  /* Acceleration structure building. */
  virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -257,7 +257,7 @@ class MultiDevice : public Device {
    }
  }

-  void *get_cpu_osl_memory() override
+  OSLGlobals *get_cpu_osl_memory() override
  {
    /* Always return the OSL memory of the CPU device (this works since the constructor above
     * guarantees that CPU devices are always added to the back). */
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -1006,7 +1006,7 @@ bool OptiXDevice::load_osl_kernels()
 #  endif
 }

-void *OptiXDevice::get_cpu_osl_memory()
+OSLGlobals *OptiXDevice::get_cpu_osl_memory()
 {
 #  ifdef WITH_OSL
  return &osl_globals;
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -115,7 +115,7 @@ class OptiXDevice : public CUDADevice {

  unique_ptr<DeviceQueue> gpu_queue_create() override;

-  void *get_cpu_osl_memory() override;
+  OSLGlobals *get_cpu_osl_memory() override;
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -28,9 +28,9 @@ static inline tbb::task_arena local_tbb_arena_create(const Device *device)
  return tbb::task_arena(device->info.cpu_threads);
 }

-/* Get CPUKernelThreadGlobals for the current thread. */
-static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
-    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+/* Get ThreadKernelGlobalsCPU for the current thread. */
+static inline ThreadKernelGlobalsCPU *kernel_thread_globals_get(
+    vector<ThreadKernelGlobalsCPU> &kernel_thread_globals)
 {
  const int thread_index = tbb::this_task_arena::current_thread_index();
  DCHECK_GE(thread_index, 0);
@@ -65,7 +65,7 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
  const int64_t total_pixels_num = image_width * image_height;

  if (device_->profiler.active()) {
-    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+    for (ThreadKernelGlobalsCPU &kernel_globals : kernel_thread_globals_) {
      kernel_globals.start_profiling();
    }
  }
@@ -91,13 +91,13 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
      work_tile.offset = effective_buffer_params_.offset;
      work_tile.stride = effective_buffer_params_.stride;

-      CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);
+      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);

      render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
    });
  });
  if (device_->profiler.active()) {
-    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+    for (ThreadKernelGlobalsCPU &kernel_globals : kernel_thread_globals_) {
      kernel_globals.stop_profiling();
    }
  }
@@ -105,7 +105,7 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
  statistics.occupancy = 1.0f;
 }

-void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobalsCPU *kernel_globals,
+void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kernel_globals,
                                                    const KernelWorkTile &work_tile,
                                                    const int samples_num)
 {
@@ -230,7 +230,7 @@ int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(const float
  /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. */
  local_arena.execute([&]() {
    parallel_for(full_y, full_y + height, [&](int y) {
-      CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_.data();
+      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();

      bool row_converged = true;
      uint num_row_pixels_active = 0;
@@ -255,7 +255,7 @@ int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(const float
  if (num_active_pixels) {
    local_arena.execute([&]() {
      parallel_for(full_x, full_x + width, [&](int x) {
-        CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_.data();
+        ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
        kernels_.adaptive_sampling_filter_y(
            kernel_globals, render_buffer, x, full_y, height, offset, stride);
      });
@@ -277,7 +277,7 @@ void PathTraceWorkCPU::cryptomatte_postproces()
  /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. */
  local_arena.execute([&]() {
    parallel_for(0, height, [&](int y) {
-      CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_.data();
+      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
      int pixel_index = y * width;

      for (int x = 0; x < width; ++x, ++pixel_index) {
@@ -297,7 +297,7 @@ void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
  /* Linking the global guiding structures (e.g., Field and SampleStorage) to the per-thread
   * kernel globals. */
  for (int thread_index = 0; thread_index < kernel_thread_globals_.size(); thread_index++) {
-    CPUKernelThreadGlobals &kg = kernel_thread_globals_[thread_index];
+    ThreadKernelGlobalsCPU &kg = kernel_thread_globals_[thread_index];
    openpgl::cpp::Field *field = (openpgl::cpp::Field *)guiding_field;

    /* Allocate sampling distributions. */
@@ -305,17 +305,17 @@ void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,

 #  if PATH_GUIDING_LEVEL >= 4
    if (kg.opgl_surface_sampling_distribution) {
-      delete kg.opgl_surface_sampling_distribution;
-      kg.opgl_surface_sampling_distribution = nullptr;
+      kg.opgl_surface_sampling_distribution.reset();
    }
    if (kg.opgl_volume_sampling_distribution) {
-      delete kg.opgl_volume_sampling_distribution;
-      kg.opgl_volume_sampling_distribution = nullptr;
+      kg.opgl_volume_sampling_distribution.reset();
    }

    if (field) {
-      kg.opgl_surface_sampling_distribution = new openpgl::cpp::SurfaceSamplingDistribution(field);
-      kg.opgl_volume_sampling_distribution = new openpgl::cpp::VolumeSamplingDistribution(field);
+      kg.opgl_surface_sampling_distribution =
+          make_unique<openpgl::cpp::SurfaceSamplingDistribution>(field);
+      kg.opgl_volume_sampling_distribution = make_unique<openpgl::cpp::VolumeSamplingDistribution>(
+          field);
    }
 #  endif

@@ -332,7 +332,7 @@ void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
 }

 void PathTraceWorkCPU::guiding_push_sample_data_to_global_storage(
-    KernelGlobalsCPU *kg,
+    ThreadKernelGlobalsCPU *kg,
    IntegratorStateCPU *state,
    const ccl_global float *ccl_restrict render_buffer)
 {
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -4,9 +4,9 @@

 #pragma once

+#include "kernel/device/cpu/globals.h"
 #include "kernel/integrator/state.h"

-#include "device/cpu/kernel_thread_globals.h"
 #include "device/queue.h"

 #include "integrator/path_trace_work.h"
@@ -16,7 +16,7 @@
 CCL_NAMESPACE_BEGIN

 struct KernelWorkTile;
-struct KernelGlobalsCPU;
+struct ThreadKernelGlobalsCPU;
 struct IntegratorStateCPU;

 class CPUKernels;
@@ -63,7 +63,7 @@ class PathTraceWorkCPU : public PathTraceWork {

  /* Pushes the collected training data/samples of a path to the global sample storage.
   * This function is called at the end of a random walk/path generation. */
-  void guiding_push_sample_data_to_global_storage(KernelGlobalsCPU *kg,
+  void guiding_push_sample_data_to_global_storage(ThreadKernelGlobalsCPU *kg,
                                                  IntegratorStateCPU *state,
                                                  const ccl_global float *ccl_restrict
                                                      render_buffer);
@@ -71,7 +71,7 @@ class PathTraceWorkCPU : public PathTraceWork {

 protected:
  /* Core path tracing routine. Renders given work tile on the given queue. */
-  void render_samples_full_pipeline(KernelGlobalsCPU *kernel_globals,
+  void render_samples_full_pipeline(ThreadKernelGlobalsCPU *kernel_globals,
                                    const KernelWorkTile &work_tile,
                                    const int samples_num);

@@ -83,7 +83,7 @@ class PathTraceWorkCPU : public PathTraceWork {
   * More specifically, the `kernel_globals_` is local to each thread and nobody else is
   * accessing it, but some "localization" is required to decouple from kernel globals stored
   * on the device level. */
-  vector<CPUKernelThreadGlobals> kernel_thread_globals_;
+  vector<ThreadKernelGlobalsCPU> kernel_thread_globals_;
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -2,13 +2,14 @@
 *
 * SPDX-License-Identifier: Apache-2.0 */

+#include "kernel/device/cpu/globals.h"
+
 #include "integrator/shader_eval.h"

 #include "device/device.h"
 #include "device/queue.h"

 #include "device/cpu/kernel.h"
-#include "device/cpu/kernel_thread_globals.h"

 #include "util/log.h"
 #include "util/progress.h"
@@ -80,7 +81,7 @@ bool ShaderEval::eval_cpu(Device *device,
                          device_vector<float> &output,
                          const int64_t work_size)
 {
-  vector<CPUKernelThreadGlobals> kernel_thread_globals;
+  vector<ThreadKernelGlobalsCPU> kernel_thread_globals;
  device->get_cpu_kernel_thread_globals(kernel_thread_globals);

  /* Find required kernel function. */
@@ -101,7 +102,7 @@ bool ShaderEval::eval_cpu(Device *device,
      }

      const int thread_index = tbb::this_task_arena::current_thread_index();
-      const KernelGlobalsCPU *kg = &kernel_thread_globals[thread_index];
+      const ThreadKernelGlobalsCPU *kg = &kernel_thread_globals[thread_index];

      switch (type) {
        case SHADER_EVAL_DISPLACE:
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -13,6 +13,7 @@ set(INC_SYS
 )

 set(SRC_KERNEL_DEVICE_CPU
+  device/cpu/globals.cpp
  device/cpu/kernel.cpp
  device/cpu/kernel_sse42.cpp
  device/cpu/kernel_avx2.cpp
--- a/intern/cycles/kernel/device/cpu/bvh.h
+++ b/intern/cycles/kernel/device/cpu/bvh.h
@@ -284,7 +284,7 @@ ccl_device_forceinline void kernel_embree_filter_intersection_func_impl(
 #ifdef __KERNEL_ONEAPI__
  KernelGlobalsGPU *kg = nullptr;
 #else
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
 #endif
  const Ray *cray = ctx->ray;

@@ -324,7 +324,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_shadow_all_func_impl(
 #ifdef __KERNEL_ONEAPI__
  KernelGlobalsGPU *kg = nullptr;
 #else
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
 #endif
  const Ray *cray = ctx->ray;

@@ -438,7 +438,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func_impl(
 #ifdef __KERNEL_ONEAPI__
  KernelGlobalsGPU *kg = nullptr;
 #else
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
 #endif
  const Ray *cray = ctx->ray;

@@ -541,7 +541,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func_impl(
 #ifdef __KERNEL_ONEAPI__
  KernelGlobalsGPU *kg = nullptr;
 #else
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
 #endif
  const Ray *cray = ctx->ray;

@@ -622,7 +622,7 @@ ccl_device void kernel_embree_filter_func_backface_cull(const RTCFilterFunctionN
  }

  CCLIntersectContext *ctx = ((CCLIntersectContext *)args->context);
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
  const Ray *cray = ctx->ray;

  if (kernel_embree_is_self_intersection(
--- a/intern/cycles/kernel/device/cpu/globals.cpp
+++ b/intern/cycles/kernel/device/cpu/globals.cpp
@@ -0,0 +1,43 @@
+/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#include "kernel/device/cpu/globals.h"
+#include "kernel/osl/globals.h"
+
+#include "util/guiding.h"  // IWYU pragma: keep
+#include "util/profiling.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Build the globals of one worker thread.
+ *
+ * Copies the shared `kernel_globals` into the base class, keeps a reference to
+ * `cpu_profiler` and, depending on build options, sets up the per-thread OSL data
+ * (from `osl_globals` and `thread_index`) and the path guiding segment storage.
+ *
+ * NOTE(review): the header declares the `osl` and `opgl_*` members under `__OSL__` and
+ * `__PATH_GUIDING__`, while this file tests `WITH_OSL` and `WITH_PATH_GUIDING` -- confirm
+ * that each pair is always defined together. */
+ThreadKernelGlobalsCPU::ThreadKernelGlobalsCPU(const KernelGlobalsCPU &kernel_globals,
+                                               OSLGlobals *osl_globals,
+                                               Profiler &cpu_profiler,
+                                               const int thread_index)
+    : KernelGlobalsCPU(kernel_globals),
+#ifdef WITH_OSL
+      osl(osl_globals, thread_index),
+#endif
+      cpu_profiler_(cpu_profiler)
+{
+#ifdef WITH_PATH_GUIDING
+  /* Thread-local storage, released automatically through `unique_ptr`. */
+  opgl_path_segment_storage = make_unique<openpgl::cpp::PathSegmentStorage>();
+#endif
+
+#ifndef WITH_OSL
+  /* Only the OSL initializer above consumes these; avoid unused parameter warnings. */
+  static_cast<void>(osl_globals);
+  static_cast<void>(thread_index);
+#endif
+}
+
+/* Pass this thread's `profiler` state to `add_state()` of the profiler given at
+ * construction time. */
+void ThreadKernelGlobalsCPU::start_profiling()
+{
+  cpu_profiler_.add_state(&this->profiler);
+}
+
+/* Pass this thread's `profiler` state to `remove_state()` of the profiler given at
+ * construction time. */
+void ThreadKernelGlobalsCPU::stop_profiling()
+{
+  cpu_profiler_.remove_state(&this->profiler);
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/cpu/globals.h
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -9,22 +9,23 @@
 #include "kernel/types.h"
 #include "kernel/util/profiler.h"

+#ifdef __OSL__
+#  include "kernel/osl/globals.h"
+#endif
+
 #include "util/guiding.h"  // IWYU pragma: keep
 #include "util/texture.h"  // IWYU pragma: keep
+#include "util/unique_ptr.h"

 CCL_NAMESPACE_BEGIN

+struct OSLGlobals;
+
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
 * the kernel, to access constant data. These are all stored as flat arrays;
 * they are really just standard arrays. We can't use actual globals because
 * multiple renders may be running inside the same process. */

-#ifdef __OSL__
-struct OSLGlobals;
-struct OSLThreadData;
-struct OSLShadingSystem;
-#endif
-
 /* Array for kernel data, with size to be able to assert on invalid data access. */
 template<typename T> struct kernel_array {
  const ccl_always_inline T &fetch(const int index) const
@@ -37,38 +38,58 @@ template<typename T> struct kernel_array {
  int width = 0;
 };

+/* Constant globals shared between all threads.
+ *
+ * Each `KERNEL_DATA_ARRAY(type, name)` use in the included `kernel/data_arrays.h`
+ * expands, through the macro defined just below, to a `kernel_array<type> name;`
+ * member. In addition the struct carries the value-initialized `KernelData` block
+ * and a `ProfilingState`. */
 struct KernelGlobalsCPU {
 #define KERNEL_DATA_ARRAY(type, name) kernel_array<type> name;
 #include "kernel/data_arrays.h"

  KernelData data = {};

+  ProfilingState profiler;
+};
+
+/* Per-thread global state.
+ *
+ * To avoid pointer indirection, the constant globals are copied to each thread.
+ *
+ * This may not be ideal for cache pressure. Alternative would be to pass an
+ * additional thread index to every function, and potentially to make the shared
+ * part an actual global variable. That would match the GPU more closely, but
+ * also require mutex locks for multiple Cycles instances.
+ *
+ * Instances are move-constructible (so they can live in a `vector`) but can be
+ * neither copied nor assigned. */
+struct ThreadKernelGlobalsCPU : public KernelGlobalsCPU {
+  /* Copy the shared `kernel_globals` and set up the data of the thread with the
+   * given `thread_index`. `cpu_profiler` is stored by reference. */
+  ThreadKernelGlobalsCPU(const KernelGlobalsCPU &kernel_globals,
+                         OSLGlobals *osl_globals,
+                         Profiler &cpu_profiler,
+                         const int thread_index);
+
+  ThreadKernelGlobalsCPU(const ThreadKernelGlobalsCPU &other) = delete;
+  ThreadKernelGlobalsCPU(ThreadKernelGlobalsCPU &&other) noexcept = default;
+  ThreadKernelGlobalsCPU &operator=(const ThreadKernelGlobalsCPU &other) = delete;
+  ThreadKernelGlobalsCPU &operator=(ThreadKernelGlobalsCPU &&other) = delete;
+
+  /* Add/remove the `profiler` state of this thread to/from the profiler that was
+   * passed to the constructor. */
+  void start_profiling();
+  void stop_profiling();
+
 #ifdef __OSL__
-  /* On the CPU, we also have the OSL globals here. Most data structures are shared
-   * with SVM, the difference is in the shaders and object/mesh attributes. */
-  OSLGlobals *osl = nullptr;
-  OSLShadingSystem *osl_ss = nullptr;
-  OSLThreadData *osl_tdata = nullptr;
-  int osl_thread_index = 0;
+  /* Per-thread OSL data, constructed from the `OSLGlobals` pointer and thread index. */
+  OSLThreadData osl;
 #endif

 #ifdef __PATH_GUIDING__
-  /* Pointers to global data structures. */
+  /* Pointers to shared global data structures. */
  openpgl::cpp::SampleStorage *opgl_sample_data_storage = nullptr;
  openpgl::cpp::Field *opgl_guiding_field = nullptr;

  /* Local data structures owned by the thread. */
-  openpgl::cpp::PathSegmentStorage *opgl_path_segment_storage = nullptr;
-  openpgl::cpp::SurfaceSamplingDistribution *opgl_surface_sampling_distribution = nullptr;
-  openpgl::cpp::VolumeSamplingDistribution *opgl_volume_sampling_distribution = nullptr;
+  unique_ptr<openpgl::cpp::PathSegmentStorage> opgl_path_segment_storage;
+  unique_ptr<openpgl::cpp::SurfaceSamplingDistribution> opgl_surface_sampling_distribution;
+  unique_ptr<openpgl::cpp::VolumeSamplingDistribution> opgl_volume_sampling_distribution;
 #endif

-  /* **** Run-time data ****  */
-
-  ProfilingState profiler;
+ protected:
+  /* Profiler used by `start_profiling()` / `stop_profiling()`. */
+  Profiler &cpu_profiler_;
+};

-using KernelGlobals = const KernelGlobalsCPU *;
+using KernelGlobals = const ThreadKernelGlobalsCPU *;

 /* Abstraction macros */
 #define kernel_data_fetch(name, index) (kg->name.fetch(index))
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -43,6 +43,8 @@
 /* do nothing */
 #endif

+#include "kernel/device/cpu/globals.h"
+
 #include "kernel/device/cpu/kernel.h"
 #define KERNEL_ARCH cpu
 #include "kernel/device/cpu/kernel_arch_impl.h"
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -9,19 +9,21 @@
 */

 #define KERNEL_INTEGRATOR_FUNCTION(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
-                                                    IntegratorStateCPU *state)
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+      const ThreadKernelGlobalsCPU *ccl_restrict kg, IntegratorStateCPU *state)

 #define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
-                                                    IntegratorStateCPU *state, \
-                                                    ccl_global float *render_buffer)
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+      const ThreadKernelGlobalsCPU *ccl_restrict kg, \
+      IntegratorStateCPU *state, \
+      ccl_global float *render_buffer)

 #define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
-  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
-                                                    IntegratorStateCPU *state, \
-                                                    KernelWorkTile *tile, \
-                                                    ccl_global float *render_buffer)
+  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+      const ThreadKernelGlobalsCPU *ccl_restrict kg, \
+      IntegratorStateCPU *state, \
+      KernelWorkTile *tile, \
+      ccl_global float *render_buffer)

 KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
 KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
@@ -77,16 +79,16 @@ KERNEL_FILM_CONVERT_FUNCTION(float4)
 * Shader evaluation.
 */

-void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const ThreadKernelGlobalsCPU *kg,
                                                       const KernelShaderEvalInput *input,
                                                       float *output,
                                                       const int offset);
-void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const ThreadKernelGlobalsCPU *kg,
                                                     const KernelShaderEvalInput *input,
                                                     float *output,
                                                     const int offset);
 void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
-    const KernelGlobalsCPU *kg,
+    const ThreadKernelGlobalsCPU *kg,
    const KernelShaderEvalInput *input,
    float *output,
    const int offset);
@@ -96,7 +98,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
 */

 bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
-    const KernelGlobalsCPU *kg,
+    const ThreadKernelGlobalsCPU *kg,
    ccl_global float *render_buffer,
    const int x,
    const int y,
@@ -105,14 +107,14 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
    const int offset,
    int stride);

-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const ThreadKernelGlobalsCPU *kg,
                                                           ccl_global float *render_buffer,
                                                           const int y,
                                                           const int start_x,
                                                           const int width,
                                                           const int offset,
                                                           int stride);
-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const ThreadKernelGlobalsCPU *kg,
                                                           ccl_global float *render_buffer,
                                                           const int x,
                                                           const int start_y,
@@ -124,7 +126,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCP
 * Cryptomatte.
 */

-void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const ThreadKernelGlobalsCPU *kg,
                                                        ccl_global float *render_buffer,
                                                        int pixel_index);

--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -62,7 +62,7 @@ CCL_NAMESPACE_BEGIN
 /* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
 * that it does not contain unused fields. */
 #define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
-  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
                                                    IntegratorStateCPU *state, \
                                                    KernelWorkTile *tile, \
                                                    ccl_global float *render_buffer) \
@@ -72,29 +72,31 @@ CCL_NAMESPACE_BEGIN
  }

 #define DEFINE_INTEGRATOR_KERNEL(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
                                                    IntegratorStateCPU *state) \
  { \
    KERNEL_INVOKE(name, kg, state); \
  }

 #define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
-      const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
+                                                    IntegratorStateCPU *state, \
+                                                    ccl_global float *render_buffer) \
  { \
    KERNEL_INVOKE(name, kg, state, render_buffer); \
  }

 #define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
                                                    IntegratorStateCPU *state) \
  { \
    KERNEL_INVOKE(name, kg, &state->shadow); \
  }

 #define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
-      const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
+                                                    IntegratorStateCPU *state, \
+                                                    ccl_global float *render_buffer) \
  { \
    KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \
  }
@@ -118,7 +120,7 @@ DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow)
 * Shader evaluation.
 */

-void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const ThreadKernelGlobalsCPU *kg,
                                                     const KernelShaderEvalInput *input,
                                                     float *output,
                                                     const int offset)
@@ -130,7 +132,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
 #endif
 }

-void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const ThreadKernelGlobalsCPU *kg,
                                                       const KernelShaderEvalInput *input,
                                                       float *output,
                                                       const int offset)
@@ -143,7 +145,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *k
 }

 void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
-    const KernelGlobalsCPU *kg,
+    const ThreadKernelGlobalsCPU *kg,
    const KernelShaderEvalInput *input,
    float *output,
    const int offset)
@@ -160,7 +162,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
 */

 bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
-    const KernelGlobalsCPU *kg,
+    const ThreadKernelGlobalsCPU *kg,
    ccl_global float *render_buffer,
    const int x,
    const int y,
@@ -178,7 +180,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
 #endif
 }

-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const ThreadKernelGlobalsCPU *kg,
                                                           ccl_global float *render_buffer,
                                                           const int y,
                                                           const int start_x,
@@ -193,7 +195,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCP
 #endif
 }

-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const ThreadKernelGlobalsCPU *kg,
                                                           ccl_global float *render_buffer,
                                                           const int x,
                                                           const int start_y,
@@ -212,7 +214,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCP
 * Cryptomatte.
 */

-void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const ThreadKernelGlobalsCPU *kg,
                                                        ccl_global float *render_buffer,
                                                        const int pixel_index)
 {
--- a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -23,6 +23,7 @@
 #  endif
 #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */

+#include "kernel/device/cpu/globals.h"
 #include "kernel/device/cpu/kernel.h"
 #define KERNEL_ARCH cpu_avx2
 #include "kernel/device/cpu/kernel_arch_impl.h"
--- a/intern/cycles/kernel/device/cpu/kernel_sse42.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse42.cpp
@@ -21,6 +21,7 @@
 #  endif
 #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE42 */

+#include "kernel/device/cpu/globals.h"
 #include "kernel/device/cpu/kernel.h"
 #define KERNEL_ARCH cpu_sse42
 #include "kernel/device/cpu/kernel_arch_impl.h"
--- a/intern/cycles/kernel/osl/closures.cpp
+++ b/intern/cycles/kernel/osl/closures.cpp
@@ -76,18 +76,17 @@ void OSLRenderServices::register_closures(OSL::ShadingSystem *ss)
 /* Surface & Background */

 template<>
-void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
+void osl_eval_nodes<SHADER_TYPE_SURFACE>(const ThreadKernelGlobalsCPU *kg,
                                         const void *state,
                                         ShaderData *sd,
                                         const uint32_t path_flag)
 {
  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
  shaderdata_to_shaderglobals(
-      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&kg->osl.shader_globals));

  /* clear trace data */
-  tdata->tracedata.init = false;
+  kg->osl.tracedata.init = false;

  /* Used by render-services. */
  sd->osl_globals = kg;
@@ -101,30 +100,30 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
  }

  /* execute shader for this point */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl.ss;
+  OSL::ShaderGlobals *globals = &kg->osl.shader_globals;
+  OSL::ShadingContext *octx = kg->osl.context;
  const int shader = sd->shader & SHADER_MASK;

  if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
    /* background */
-    if (kg->osl->background_state) {
+    if (kg->osl.globals->background_state) {
 #if OSL_LIBRARY_VERSION_CODE >= 11304
      ss->execute(*octx,
-                  *(kg->osl->background_state),
-                  kg->osl_thread_index,
+                  *(kg->osl.globals->background_state),
+                  kg->osl.thread_index,
                  0,
                  *globals,
                  nullptr,
                  nullptr);
 #else
-      ss->execute(octx, *(kg->osl->background_state), *globals);
+      ss->execute(octx, *(kg->osl.globals->background_state), *globals);
 #endif
    }
  }
  else {
    /* automatic bump shader */
-    if (kg->osl->bump_state[shader]) {
+    if (kg->osl.globals->bump_state[shader]) {
      /* save state */
      const float3 P = sd->P;
      const float dP = sd->dP;
@@ -134,12 +133,13 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
      /* set state as if undisplaced */
      if (sd->flag & SD_HAS_DISPLACEMENT) {
        float data[9];
-        const bool found = kg->osl->services->get_attribute(sd,
-                                                            true,
-                                                            OSLRenderServices::u_empty,
-                                                            TypeVector,
-                                                            OSLRenderServices::u_geom_undisplaced,
-                                                            data);
+        const bool found = kg->osl.globals->services->get_attribute(
+            sd,
+            true,
+            OSLRenderServices::u_empty,
+            TypeVector,
+            OSLRenderServices::u_geom_undisplaced,
+            data);
        (void)found;
        assert(found);

@@ -162,14 +162,14 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
 /* execute bump shader */
 #if OSL_LIBRARY_VERSION_CODE >= 11304
      ss->execute(*octx,
-                  *(kg->osl->bump_state[shader]),
-                  kg->osl_thread_index,
+                  *(kg->osl.globals->bump_state[shader]),
+                  kg->osl.thread_index,
                  0,
                  *globals,
                  nullptr,
                  nullptr);
 #else
-      ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
+      ss->execute(octx, *(kg->osl.globals->bump_state[shader]), *globals);
 #endif

      /* reset state */
@@ -182,17 +182,17 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
    }

    /* surface shader */
-    if (kg->osl->surface_state[shader]) {
+    if (kg->osl.globals->surface_state[shader]) {
 #if OSL_LIBRARY_VERSION_CODE >= 11304
      ss->execute(*octx,
-                  *(kg->osl->surface_state[shader]),
-                  kg->osl_thread_index,
+                  *(kg->osl.globals->surface_state[shader]),
+                  kg->osl.thread_index,
                  0,
                  *globals,
                  nullptr,
                  nullptr);
 #else
-      ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+      ss->execute(octx, *(kg->osl.globals->surface_state[shader]), *globals);
 #endif
    }
  }
@@ -206,18 +206,17 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
 /* Volume */

 template<>
-void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
+void osl_eval_nodes<SHADER_TYPE_VOLUME>(const ThreadKernelGlobalsCPU *kg,
                                        const void *state,
                                        ShaderData *sd,
                                        const uint32_t path_flag)
 {
  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
  shaderdata_to_shaderglobals(
-      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&kg->osl.shader_globals));

  /* clear trace data */
-  tdata->tracedata.init = false;
+  kg->osl.tracedata.init = false;

  /* Used by render-services. */
  sd->osl_globals = kg;
@@ -231,22 +230,22 @@ void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
  }

  /* execute shader */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl.ss;
+  OSL::ShaderGlobals *globals = &kg->osl.shader_globals;
+  OSL::ShadingContext *octx = kg->osl.context;
  const int shader = sd->shader & SHADER_MASK;

-  if (kg->osl->volume_state[shader]) {
+  if (kg->osl.globals->volume_state[shader]) {
 #if OSL_LIBRARY_VERSION_CODE >= 11304
    ss->execute(*octx,
-                *(kg->osl->volume_state[shader]),
-                kg->osl_thread_index,
+                *(kg->osl.globals->volume_state[shader]),
+                kg->osl.thread_index,
                0,
                *globals,
                nullptr,
                nullptr);
 #else
-    ss->execute(octx, *(kg->osl->volume_state[shader]), *globals);
+    ss->execute(octx, *(kg->osl.globals->volume_state[shader]), *globals);
 #endif
  }

@@ -259,18 +258,17 @@ void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
 /* Displacement */

 template<>
-void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg,
+void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const ThreadKernelGlobalsCPU *kg,
                                              const void *state,
                                              ShaderData *sd,
                                              const uint32_t path_flag)
 {
  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
  shaderdata_to_shaderglobals(
-      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&kg->osl.shader_globals));

  /* clear trace data */
-  tdata->tracedata.init = false;
+  kg->osl.tracedata.init = false;

  /* Used by render-services. */
  sd->osl_globals = kg;
@@ -278,22 +276,22 @@ void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg,
  sd->osl_shadow_path_state = nullptr;

  /* execute shader */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl.ss;
+  OSL::ShaderGlobals *globals = &kg->osl.shader_globals;
+  OSL::ShadingContext *octx = kg->osl.context;
  const int shader = sd->shader & SHADER_MASK;

-  if (kg->osl->displacement_state[shader]) {
+  if (kg->osl.globals->displacement_state[shader]) {
 #if OSL_LIBRARY_VERSION_CODE >= 11304
    ss->execute(*octx,
-                *(kg->osl->displacement_state[shader]),
-                kg->osl_thread_index,
+                *(kg->osl.globals->displacement_state[shader]),
+                kg->osl.thread_index,
                0,
                *globals,
                nullptr,
                nullptr);
 #else
-    ss->execute(octx, *(kg->osl->displacement_state[shader]), *globals);
+    ss->execute(octx, *(kg->osl.globals->displacement_state[shader]), *globals);
 #endif
  }

--- a/intern/cycles/kernel/osl/globals.cpp
+++ b/intern/cycles/kernel/osl/globals.cpp
@@ -4,57 +4,63 @@
 
 #include <OSL/oslexec.h>
 
-#include "kernel/globals.h"
-#include "kernel/types.h"
-
 #include "kernel/osl/globals.h"
-#include "kernel/osl/services.h"
 
 CCL_NAMESPACE_BEGIN
 
-void OSLGlobals::thread_init(KernelGlobalsCPU *kg, OSLGlobals *osl_globals, const int thread_index)
+/* Set up per-thread OSL state. OSL/OIIO handles are only acquired when OSL is in use. */
+OSLThreadData::OSLThreadData(OSLGlobals *osl_globals, const int thread_index)
+    : globals(osl_globals), thread_index(thread_index)
 {
-  /* no osl used? */
-  if (!osl_globals->use) {
-    kg->osl = nullptr;
+  /* Zero the shading state before the early-out below, so that a disabled instance (and any
+   * object move-constructed from it) never holds indeterminate data. */
+  memset((void *)&shader_globals, 0, sizeof(shader_globals));
+  memset((void *)&tracedata, 0, sizeof(tracedata));
+  shader_globals.tracedata = &tracedata;
+
+  if (globals == nullptr || globals->use == false) {
     return;
   }
 
-  /* Per thread kernel data init. */
-  kg->osl = osl_globals;
+  ss = globals->ss;
 
-  OSL::ShadingSystem *ss = kg->osl->ss;
-  OSLThreadData *tdata = new OSLThreadData();
-
-  memset((void *)&tdata->globals, 0, sizeof(OSL::ShaderGlobals));
-  tdata->globals.tracedata = &tdata->tracedata;
-  tdata->osl_thread_info = ss->create_thread_info();
-  tdata->context = ss->get_context(tdata->osl_thread_info);
-
-  tdata->oiio_thread_info = osl_globals->ts->get_perthread_info();
-
-  kg->osl_ss = (OSLShadingSystem *)ss;
-  kg->osl_tdata = tdata;
-  kg->osl_thread_index = thread_index;
+  osl_thread_info = ss->create_thread_info();
+  context = ss->get_context(osl_thread_info);
+  oiio_thread_info = globals->ts->get_perthread_info();
 }
 
-void OSLGlobals::thread_free(KernelGlobalsCPU *kg)
+/* Hand the context and thread info back to the shading system, if any were acquired. */
+OSLThreadData::~OSLThreadData()
 {
-  if (!kg->osl) {
-    return;
+  if (context) {
+    ss->release_context(context);
   }
+  if (osl_thread_info) {
+    ss->destroy_thread_info(osl_thread_info);
+  }
+}
 
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSLThreadData *tdata = kg->osl_tdata;
-  ss->release_context(tdata->context);
+/* Take over the handles of `other`, leaving it with nothing to release in its destructor. */
+OSLThreadData::OSLThreadData(OSLThreadData &&other) noexcept
+    : globals(other.globals),
+      ss(other.ss),
+      thread_index(other.thread_index),
+      shader_globals(other.shader_globals),
+      tracedata(other.tracedata),
+      osl_thread_info(other.osl_thread_info),
+      context(other.context),
+      oiio_thread_info(other.oiio_thread_info)
+{
+  /* The copied pointer still refers to the trace data of `other`; point it at our own. */
+  shader_globals.tracedata = &tracedata;
 
-  ss->destroy_thread_info(tdata->osl_thread_info);
-
-  delete tdata;
-
-  kg->osl = nullptr;
-  kg->osl_ss = nullptr;
-  kg->osl_tdata = nullptr;
+  /* Reset the source so its destructor sees null handles. */
+  memset((void *)&other.shader_globals, 0, sizeof(other.shader_globals));
+  memset((void *)&other.tracedata, 0, sizeof(other.tracedata));
+  other.thread_index = -1;
+  other.context = nullptr;
+  other.osl_thread_info = nullptr;
+  other.oiio_thread_info = nullptr;
 }
 
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/osl/globals.h
+++ b/intern/cycles/kernel/osl/globals.h
@@ -23,6 +23,7 @@ CCL_NAMESPACE_BEGIN

 class OSLRenderServices;
 class ColorSpaceProcessor;
+struct ThreadKernelGlobalsCPU;

 /* OSL Globals
 *
@@ -39,12 +40,6 @@ struct OSLGlobals {
    use = false;
  }

-  /* per thread data */
-  static void thread_init(struct KernelGlobalsCPU *kg,
-                          OSLGlobals *osl_globals,
-                          const int thread_index);
-  static void thread_free(struct KernelGlobalsCPU *kg);
-
  bool use;

  /* shading system */
@@ -78,11 +73,31 @@ struct OSLTraceData {
 
 /* thread key for thread specific data lookup */
 struct OSLThreadData {
-  OSL::ShaderGlobals globals;
-  OSL::PerThreadInfo *osl_thread_info;
-  OSLTraceData tracedata;
-  OSL::ShadingContext *context;
-  OIIO::TextureSystem::Perthread *oiio_thread_info;
+  /* Global Data */
+  OSLGlobals *globals = nullptr;
+  OSL::ShadingSystem *ss = nullptr;
+
+  /* Per-thread data. */
+  int thread_index = -1;
+
+  /* Scratch state written during shader evaluation. Marked `mutable` because shader evaluation
+   * writes it through a const pointer to the thread globals. */
+  mutable OSL::ShaderGlobals shader_globals;
+  mutable OSLTraceData tracedata;
+
+  /* Handles acquired by the constructor when OSL is in use, null otherwise. */
+  OSL::PerThreadInfo *osl_thread_info = nullptr;
+  OSL::ShadingContext *context = nullptr;
+  OIIO::TextureSystem::Perthread *oiio_thread_info = nullptr;
+
+  OSLThreadData(OSLGlobals *globals, const int thread_index);
+  ~OSLThreadData();
+
+  /* Move-construct only: the context and thread info handles have a single owner. */
+  OSLThreadData(const OSLThreadData &other) = delete;
+  OSLThreadData(OSLThreadData &&other) noexcept;
+  OSLThreadData &operator=(const OSLThreadData &other) = delete;
+  OSLThreadData &operator=(OSLThreadData &&other) = delete;
 };
 
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/osl/osl.h
+++ b/intern/cycles/kernel/osl/osl.h
@@ -172,7 +172,7 @@ ccl_device void flatten_closure_tree(KernelGlobals kg,
 #ifndef __KERNEL_GPU__

 template<ShaderType type>
-void osl_eval_nodes(const KernelGlobalsCPU *kg,
+void osl_eval_nodes(const ThreadKernelGlobalsCPU *kg,
                    const void *state,
                    ShaderData *sd,
                    uint32_t path_flag);
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -148,7 +148,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
   * a concept of shader space, so we just use object space for both. */
  if (xform) {
    const ShaderData *sd = (const ShaderData *)xform;
-    const KernelGlobalsCPU *kg = sd->osl_globals;
+    const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
    const int object = sd->object;

    if (object != OBJECT_NONE) {
@@ -188,7 +188,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
   * a concept of shader space, so we just use object space for both. */
  if (xform) {
    const ShaderData *sd = (const ShaderData *)xform;
-    const KernelGlobalsCPU *kg = sd->osl_globals;
+    const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
    const int object = sd->object;

    if (object != OBJECT_NONE) {
@@ -225,7 +225,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
                                   const float time)
 {
  ShaderData *sd = (ShaderData *)(sg->renderstate);
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;

  if (from == u_ndc) {
    copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -257,7 +257,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
                                           const float time)
 {
  ShaderData *sd = (ShaderData *)(sg->renderstate);
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;

  if (to == u_ndc) {
    copy_matrix(result, kernel_data.cam.worldtondc);
@@ -291,7 +291,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
   * a concept of shader space, so we just use object space for both. */
  if (xform) {
    const ShaderData *sd = (const ShaderData *)xform;
-    const KernelGlobalsCPU *kg = sd->osl_globals;
+    const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
    const int object = sd->object;

    if (object != OBJECT_NONE) {
@@ -319,7 +319,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
   * a concept of shader space, so we just use object space for both. */
  if (xform) {
    const ShaderData *sd = (const ShaderData *)xform;
-    const KernelGlobalsCPU *kg = sd->osl_globals;
+    const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
    const int object = sd->object;

    if (object != OBJECT_NONE) {
@@ -344,7 +344,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
                                   OSLUStringHash from)
 {
  ShaderData *sd = (ShaderData *)(sg->renderstate);
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;

  if (from == u_ndc) {
    copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -371,7 +371,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
                                           OSLUStringHash to)
 {
  ShaderData *sd = (ShaderData *)(sg->renderstate);
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;

  if (to == u_ndc) {
    copy_matrix(result, kernel_data.cam.worldtondc);
@@ -727,7 +727,7 @@ static bool set_attribute_matrix(const Transform &tfm, const TypeDesc type, void
  return false;
 }

-static bool get_object_attribute(const KernelGlobalsCPU *kg,
+static bool get_object_attribute(const ThreadKernelGlobalsCPU *kg,
                                 ShaderData *sd,
                                 const AttributeDescriptor &desc,
                                 const TypeDesc &type,
@@ -803,7 +803,7 @@ static bool get_object_attribute(const KernelGlobalsCPU *kg,
  return false;
 }

-bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg,
+bool OSLRenderServices::get_object_standard_attribute(const ThreadKernelGlobalsCPU *kg,
                                                      ShaderData *sd,
                                                      OSLUStringHash name,
                                                      const TypeDesc type,
@@ -924,7 +924,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
    return set_attribute_float3_3(P, type, derivatives, val);
  }
  if (name == u_geom_name) {
-    const ustring object_name = kg->osl->object_names[sd->object];
+    const ustring object_name = kg->osl.globals->object_names[sd->object];
    return set_attribute_string(object_name, type, derivatives, val);
  }
  if (name == u_is_smooth) {
@@ -979,7 +979,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
  return get_background_attribute(kg, sd, name, type, derivatives, val);
 }

-bool OSLRenderServices::get_background_attribute(const KernelGlobalsCPU *kg,
+bool OSLRenderServices::get_background_attribute(const ThreadKernelGlobalsCPU *kg,
                                                 ShaderData *sd,
                                                 OSLUStringHash name,
                                                 const TypeDesc type,
@@ -1038,8 +1038,7 @@ bool OSLRenderServices::get_background_attribute(const KernelGlobalsCPU *kg,
  }
  if (name == u_ndc) {
    /* NDC coordinates with special exception for orthographic projection. */
-    OSLThreadData *tdata = kg->osl_tdata;
-    OSL::ShaderGlobals *globals = &tdata->globals;
+    OSL::ShaderGlobals *globals = &kg->osl.shader_globals;
    float3 ndc[3];

    if ((globals->raytype & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
@@ -1090,14 +1089,15 @@ bool OSLRenderServices::get_attribute(ShaderData *sd,
                                      OSLUStringHash name,
                                      void *val)
 {
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
  int object;

  /* lookup of attribute on another object */
  if (object_name != u_empty) {
-    const OSLGlobals::ObjectNameMap::iterator it = kg->osl->object_name_map.find(object_name);
+    const OSLGlobals::ObjectNameMap::iterator it = kg->osl.globals->object_name_map.find(
+        object_name);

-    if (it == kg->osl->object_name_map.end()) {
+    if (it == kg->osl.globals->object_name_map.end()) {
      return false;
    }

@@ -1246,7 +1246,7 @@ bool OSLRenderServices::texture(OSLUStringHash filename,
  OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
  const OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO;
  ShaderData *sd = (ShaderData *)(sg->renderstate);
-  KernelGlobals kernel_globals = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kernel_globals = sd->osl_globals;
  bool status = false;

  switch (texture_type) {
@@ -1351,8 +1351,7 @@ bool OSLRenderServices::texture(OSLUStringHash filename,

      if (handle && handle->oiio_handle) {
        if (texture_thread_info == nullptr) {
-          OSLThreadData *tdata = kernel_globals->osl_tdata;
-          texture_thread_info = tdata->oiio_thread_info;
+          texture_thread_info = kernel_globals->osl.oiio_thread_info;
        }

        status = ts->texture(handle->oiio_handle,
@@ -1460,9 +1459,8 @@ bool OSLRenderServices::texture3d(OSLUStringHash filename,
      if (handle && handle->oiio_handle) {
        if (texture_thread_info == nullptr) {
          ShaderData *sd = (ShaderData *)(sg->renderstate);
-          KernelGlobals kernel_globals = sd->osl_globals;
-          OSLThreadData *tdata = kernel_globals->osl_tdata;
-          texture_thread_info = tdata->oiio_thread_info;
+          const ThreadKernelGlobalsCPU *kernel_globals = sd->osl_globals;
+          texture_thread_info = kernel_globals->osl.oiio_thread_info;
        }

        status = ts->texture3d(handle->oiio_handle,
@@ -1546,9 +1544,8 @@ bool OSLRenderServices::environment(OSLUStringHash filename,
  if (handle && handle->oiio_handle) {
    if (thread_info == nullptr) {
      ShaderData *sd = (ShaderData *)(sg->renderstate);
-      KernelGlobals kernel_globals = sd->osl_globals;
-      OSLThreadData *tdata = kernel_globals->osl_tdata;
-      thread_info = tdata->oiio_thread_info;
+      const ThreadKernelGlobalsCPU *kernel_globals = sd->osl_globals;
+      thread_info = kernel_globals->osl.oiio_thread_info;
    }

    status = ts->environment(handle->oiio_handle,
@@ -1726,7 +1723,7 @@ bool OSLRenderServices::trace(TraceOpt &options,
  tracedata->hit = false;
  tracedata->sd.osl_globals = sd->osl_globals;

-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;

  /* Can't ray-trace from shaders like displacement, before BVH exists. */
  if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
@@ -1759,7 +1756,7 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg,
      }

      ShaderData *sd = &tracedata->sd;
-      const KernelGlobalsCPU *kg = sd->osl_globals;
+      const ThreadKernelGlobalsCPU *kg = sd->osl_globals;

      if (!tracedata->setup) {
        /* lazy shader data setup */
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -30,7 +30,7 @@ CCL_NAMESPACE_BEGIN

 class Scene;
 struct ShaderData;
-struct KernelGlobalsCPU;
+struct ThreadKernelGlobalsCPU;

 /* OSL Texture Handle
 *
@@ -276,13 +276,13 @@ class OSLRenderServices : public OSL::RendererServices {
                        void *data) override;
 #endif

-  static bool get_background_attribute(const KernelGlobalsCPU *kg,
+  static bool get_background_attribute(const ThreadKernelGlobalsCPU *kg,
                                       ShaderData *sd,
                                       OSLUStringHash name,
                                       const TypeDesc type,
                                       bool derivatives,
                                       void *val);
-  static bool get_object_standard_attribute(const KernelGlobalsCPU *kg,
+  static bool get_object_standard_attribute(const ThreadKernelGlobalsCPU *kg,
                                            ShaderData *sd,
                                            OSLUStringHash name,
                                            const TypeDesc type,
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -1191,7 +1191,7 @@ struct ccl_align(16) ShaderData
 #  ifdef __KERNEL_GPU__
  ccl_private uint8_t *osl_closure_pool;
 #  else
-  const struct KernelGlobalsCPU *osl_globals;
+  const struct ThreadKernelGlobalsCPU *osl_globals;
  const struct IntegratorStateCPU *osl_path_state;
  const struct IntegratorShadowStateCPU *osl_shadow_path_state;
 #  endif
--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -141,7 +141,7 @@ void OSLShaderManager::device_update_specific(Device *device,
  /* collect shader groups from all shaders */
  for (Shader *shader : scene->shaders) {
    device->foreach_device([shader, background_shader](Device *sub_device) {
-      OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+      OSLGlobals *og = sub_device->get_cpu_osl_memory();

      /* push state to array for lookup */
      og->surface_state.push_back(shader->osl_surface_ref);
@@ -161,7 +161,7 @@ void OSLShaderManager::device_update_specific(Device *device,

  /* setup shader engine */
  device->foreach_device([](Device *sub_device) {
-    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    OSLGlobals *og = sub_device->get_cpu_osl_memory();
    OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];

    og->ss = ss;
@@ -228,7 +228,7 @@ void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s

  /* clear shader engine */
  device->foreach_device([](Device *sub_device) {
-    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    OSLGlobals *og = sub_device->get_cpu_osl_memory();

    og->use = false;
    og->ss = nullptr;
@@ -712,7 +712,7 @@ void OSLShaderManager::osl_image_slots(Device *device,
 {
  set<OSLRenderServices *> services_shared;
  device->foreach_device([&services_shared](Device *sub_device) {
-    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    OSLGlobals *og = sub_device->get_cpu_osl_memory();
    services_shared.insert(og->services);
  });