From a8654a1dbea218e8e072b651f0987fbc584f693e Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Sun, 29 Dec 2024 23:13:45 +0100
Subject: [PATCH] Refactor: Cycles: Make CPU kernel globals storage more sane

Pull Request: https://projects.blender.org/blender/blender/pulls/132361
---
 intern/cycles/app/cycles_precompute.cpp       | 12 +--
 intern/cycles/device/CMakeLists.txt           |  2 -
 intern/cycles/device/cpu/device_impl.cpp      | 12 +--
 intern/cycles/device/cpu/device_impl.h        |  4 +-
 intern/cycles/device/cpu/kernel.h             | 21 +++--
 .../device/cpu/kernel_thread_globals.cpp      | 91 ------------------
 .../cycles/device/cpu/kernel_thread_globals.h | 45 ---------
 intern/cycles/device/device.cpp               |  4 +-
 intern/cycles/device/device.h                 |  8 +-
 intern/cycles/device/multi/device.cpp         |  2 +-
 intern/cycles/device/optix/device_impl.cpp    |  2 +-
 intern/cycles/device/optix/device_impl.h      |  2 +-
 .../cycles/integrator/path_trace_work_cpu.cpp | 36 ++++----
 .../cycles/integrator/path_trace_work_cpu.h   | 10 +-
 intern/cycles/integrator/shader_eval.cpp      |  7 +-
 intern/cycles/kernel/CMakeLists.txt           |  1 +
 intern/cycles/kernel/device/cpu/bvh.h         | 10 +-
 intern/cycles/kernel/device/cpu/globals.cpp   | 43 +++++++++
 intern/cycles/kernel/device/cpu/globals.h     | 61 ++++++++----
 intern/cycles/kernel/device/cpu/kernel.cpp    |  2 +
 intern/cycles/kernel/device/cpu/kernel_arch.h | 34 +++----
 .../kernel/device/cpu/kernel_arch_impl.h      | 30 +++---
 .../cycles/kernel/device/cpu/kernel_avx2.cpp  |  1 +
 .../cycles/kernel/device/cpu/kernel_sse42.cpp |  1 +
 intern/cycles/kernel/osl/closures.cpp         | 92 +++++++++----------
 intern/cycles/kernel/osl/globals.cpp          | 68 +++++++-------
 intern/cycles/kernel/osl/globals.h            | 33 ++++---
 intern/cycles/kernel/osl/osl.h                |  2 +-
 intern/cycles/kernel/osl/services.cpp         | 53 +++++------
 intern/cycles/kernel/osl/services.h           |  6 +-
 intern/cycles/kernel/types.h                  |  2 +-
 intern/cycles/scene/osl.cpp                   |  8 +-
 32 files changed, 319 insertions(+), 386 deletions(-)
 delete mode 100644 intern/cycles/device/cpu/kernel_thread_globals.cpp
 delete mode 100644 intern/cycles/device/cpu/kernel_thread_globals.h
 create mode 100644 intern/cycles/kernel/device/cpu/globals.cpp

diff --git a/intern/cycles/app/cycles_precompute.cpp b/intern/cycles/app/cycles_precompute.cpp
index e5b31771c4f..8e393db309a 100644
--- a/intern/cycles/app/cycles_precompute.cpp
+++ b/intern/cycles/app/cycles_precompute.cpp
@@ -19,8 +19,6 @@ CCL_NAMESPACE_BEGIN
 
 static float precompute_ggx_E(const float rough, const float mu, const float3 rand)
 {
-  KernelGlobalsCPU kg;
-
   MicrofacetBsdf bsdf;
   bsdf.weight = one_float3();
   bsdf.sample_weight = 1.0f;
@@ -36,7 +34,7 @@ static float precompute_ggx_E(const float rough, const float mu, const float3 ra
   float pdf = 0.0f;
   float sampled_eta;
   float2 sampled_roughness;
-  bsdf_microfacet_ggx_sample(&kg,
+  bsdf_microfacet_ggx_sample(nullptr,
                              (ShaderClosure *)&bsdf,
                              make_float3(0.0f, 0.0f, 1.0f),
                              make_float3(sqrtf(1.0f - sqr(mu)), 0.0f, mu),
@@ -57,8 +55,6 @@ static float precompute_ggx_glass_E(const float rough,
                                     const float eta,
                                     const float3 rand)
 {
-  KernelGlobalsCPU kg;
-
   MicrofacetBsdf bsdf;
   bsdf.weight = one_float3();
   bsdf.sample_weight = 1.0f;
@@ -74,7 +70,7 @@ static float precompute_ggx_glass_E(const float rough,
   float pdf = 0.0f;
   float sampled_eta;
   float2 sampled_roughness;
-  bsdf_microfacet_ggx_sample(&kg,
+  bsdf_microfacet_ggx_sample(nullptr,
                              (ShaderClosure *)&bsdf,
                              make_float3(0.0f, 0.0f, 1.0f),
                              make_float3(sqrtf(1.0f - sqr(mu)), 0.0f, mu),
@@ -93,8 +89,6 @@ static float precompute_ggx_glass_E(const float rough,
 static float precompute_ggx_gen_schlick_s(
     const float rough, const float mu, const float eta, const float exponent, const float3 rand)
 {
-  KernelGlobalsCPU kg;
-
   MicrofacetBsdf bsdf;
   bsdf.weight = one_float3();
   bsdf.sample_weight = 1.0f;
@@ -120,7 +114,7 @@ static float precompute_ggx_gen_schlick_s(
   float pdf = 0.0f;
   float sampled_eta;
   float2 sampled_roughness;
-  bsdf_microfacet_ggx_sample(&kg,
+  bsdf_microfacet_ggx_sample(nullptr,
                              (ShaderClosure *)&bsdf,
                              make_float3(0.0f, 0.0f, 1.0f),
                              make_float3(sqrtf(1.0f - sqr(mu)), 0.0f, mu),
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index e2939ef08da..431ae7a80d4 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -33,8 +33,6 @@ set(SRC_CPU
   cpu/kernel.cpp
   cpu/kernel.h
   cpu/kernel_function.h
-  cpu/kernel_thread_globals.cpp
-  cpu/kernel_thread_globals.h
 )
 
 set(SRC_CUDA
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index fef9dd27296..c912eeca5b7 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -25,7 +25,6 @@
 #endif
 
 #include "device/cpu/kernel.h"
-#include "device/cpu/kernel_thread_globals.h"
 
 #include "device/device.h"
 
@@ -56,9 +55,6 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
     info.cpu_threads = TaskScheduler::max_concurrency();
   }
 
-#ifdef WITH_OSL
-  kernel_globals.osl = &osl_globals;
-#endif
 #ifdef WITH_EMBREE
   embree_device = rtcNewDevice("verbose=0");
 #endif
@@ -296,19 +292,19 @@ void *CPUDevice::get_guiding_device() const
 }
 
 void CPUDevice::get_cpu_kernel_thread_globals(
-    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+    vector<ThreadKernelGlobalsCPU> &kernel_thread_globals)
 {
   /* Ensure latest texture info is loaded into kernel globals before returning. */
   load_texture_info();
 
   kernel_thread_globals.clear();
-  void *osl_memory = get_cpu_osl_memory();
+  OSLGlobals *osl_globals = get_cpu_osl_memory();
   for (int i = 0; i < info.cpu_threads; i++) {
-    kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler, i);
+    kernel_thread_globals.emplace_back(kernel_globals, osl_globals, profiler, i);
   }
 }
 
-void *CPUDevice::get_cpu_osl_memory()
+OSLGlobals *CPUDevice::get_cpu_osl_memory()
 {
 #ifdef WITH_OSL
   return &osl_globals;
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
index f0e242af237..e6008c8e28d 100644
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -85,8 +85,8 @@ class CPUDevice : public Device {
   void *get_guiding_device() const override;
 
   void get_cpu_kernel_thread_globals(
-      vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
-  void *get_cpu_osl_memory() override;
+      vector<ThreadKernelGlobalsCPU> &kernel_thread_globals) override;
+  OSLGlobals *get_cpu_osl_memory() override;
 
  protected:
   bool load_kernels(uint /*kernel_features*/) override;
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
index 6edca4eb724..bab7e898acf 100644
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -9,7 +9,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-struct KernelGlobalsCPU;
+struct ThreadKernelGlobalsCPU;
 struct KernelFilmConvert;
 struct IntegratorStateCPU;
 struct TileInfo;
@@ -19,10 +19,11 @@ class CPUKernels {
   /* Integrator. */
 
   using IntegratorFunction =
-      CPUKernelFunction<void (*)(const KernelGlobalsCPU *kg, IntegratorStateCPU *state)>;
-  using IntegratorShadeFunction = CPUKernelFunction<void (*)(
-      const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>;
-  using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobalsCPU *kg,
+      CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg, IntegratorStateCPU *state)>;
+  using IntegratorShadeFunction = CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
+                                                             IntegratorStateCPU *state,
+                                                             ccl_global float *render_buffer)>;
+  using IntegratorInitFunction = CPUKernelFunction<bool (*)(const ThreadKernelGlobalsCPU *kg,
                                                             IntegratorStateCPU *state,
                                                             KernelWorkTile *tile,
                                                             ccl_global float *render_buffer)>;
@@ -45,7 +46,7 @@ class CPUKernels {
   /* Shader evaluation. */
 
   using ShaderEvalFunction = CPUKernelFunction<void (*)(
-      const KernelGlobalsCPU *kg, const KernelShaderEvalInput *, float *, const int)>;
+      const ThreadKernelGlobalsCPU *kg, const KernelShaderEvalInput *, float *, const int)>;
 
   ShaderEvalFunction shader_eval_displace;
   ShaderEvalFunction shader_eval_background;
@@ -54,7 +55,7 @@ class CPUKernels {
   /* Adaptive stopping. */
 
   using AdaptiveSamplingConvergenceCheckFunction =
-      CPUKernelFunction<bool (*)(const KernelGlobalsCPU *kg,
+      CPUKernelFunction<bool (*)(const ThreadKernelGlobalsCPU *kg,
                                  ccl_global float *render_buffer,
                                  const int x,
                                  const int y,
@@ -64,7 +65,7 @@ class CPUKernels {
                                  int stride)>;
 
   using AdaptiveSamplingFilterXFunction =
-      CPUKernelFunction<void (*)(const KernelGlobalsCPU *kg,
+      CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
                                  ccl_global float *render_buffer,
                                  const int y,
                                  const int start_x,
@@ -73,7 +74,7 @@ class CPUKernels {
                                  int stride)>;
 
   using AdaptiveSamplingFilterYFunction =
-      CPUKernelFunction<void (*)(const KernelGlobalsCPU *kg,
+      CPUKernelFunction<void (*)(const ThreadKernelGlobalsCPU *kg,
                                  ccl_global float *render_buffer,
                                  const int x,
                                  const int start_y,
@@ -89,7 +90,7 @@ class CPUKernels {
   /* Cryptomatte. */
 
   using CryptomattePostprocessFunction = CPUKernelFunction<void (*)(
-      const KernelGlobalsCPU *kg, ccl_global float *render_buffer, const int pixel_index)>;
+      const ThreadKernelGlobalsCPU *kg, ccl_global float *render_buffer, const int pixel_index)>;
 
   CryptomattePostprocessFunction cryptomatte_postprocess;
 
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp
deleted file mode 100644
index 998a63aa334..00000000000
--- a/intern/cycles/device/cpu/kernel_thread_globals.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#include "device/cpu/kernel_thread_globals.h"
-
-#include "kernel/osl/globals.h"
-
-#include "util/profiling.h"
-
-CCL_NAMESPACE_BEGIN
-
-CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobalsCPU &kernel_globals,
-                                               void *osl_globals_memory,
-                                               Profiler &cpu_profiler,
-                                               const int thread_index)
-    : KernelGlobalsCPU(kernel_globals), cpu_profiler_(cpu_profiler)
-{
-  clear_runtime_pointers();
-
-#ifdef WITH_OSL
-  OSLGlobals::thread_init(this, static_cast<OSLGlobals *>(osl_globals_memory), thread_index);
-#else
-  (void)thread_index;
-  (void)osl_globals_memory;
-#endif
-
-#ifdef WITH_PATH_GUIDING
-  opgl_path_segment_storage = new openpgl::cpp::PathSegmentStorage();
-#endif
-}
-
-CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept
-    : KernelGlobalsCPU(std::move(other)), cpu_profiler_(other.cpu_profiler_)
-{
-  other.clear_runtime_pointers();
-}
-
-CPUKernelThreadGlobals::~CPUKernelThreadGlobals()
-{
-#ifdef WITH_OSL
-  OSLGlobals::thread_free(this);
-#endif
-
-#ifdef WITH_PATH_GUIDING
-  delete opgl_path_segment_storage;
-  delete opgl_surface_sampling_distribution;
-  delete opgl_volume_sampling_distribution;
-#endif
-}
-
-CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other)
-{
-  if (this == &other) {
-    return *this;
-  }
-
-  *static_cast<KernelGlobalsCPU *>(this) = *static_cast<KernelGlobalsCPU *>(&other);
-
-  other.clear_runtime_pointers();
-
-  return *this;
-}
-
-void CPUKernelThreadGlobals::clear_runtime_pointers()
-{
-#ifdef WITH_OSL
-  osl = nullptr;
-#endif
-
-#ifdef WITH_PATH_GUIDING
-  opgl_sample_data_storage = nullptr;
-  opgl_guiding_field = nullptr;
-
-  opgl_path_segment_storage = nullptr;
-  opgl_surface_sampling_distribution = nullptr;
-  opgl_volume_sampling_distribution = nullptr;
-#endif
-}
-
-void CPUKernelThreadGlobals::start_profiling()
-{
-  cpu_profiler_.add_state(&profiler);
-}
-
-void CPUKernelThreadGlobals::stop_profiling()
-{
-  cpu_profiler_.remove_state(&profiler);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h
deleted file mode 100644
index 64225d6d554..00000000000
--- a/intern/cycles/device/cpu/kernel_thread_globals.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
- *
- * SPDX-License-Identifier: Apache-2.0 */
-
-#pragma once
-
-#include "kernel/globals.h"
-
-CCL_NAMESPACE_BEGIN
-
-class Profiler;
-
-/* A special class which extends memory ownership of the `KernelGlobalsCPU` decoupling any resource
- * which is not thread-safe for access. Every worker thread which needs to operate on
- * `KernelGlobalsCPU` needs to initialize its own copy of this object.
- *
- * NOTE: Only minimal subset of objects are copied: `KernelData` is never copied. This means that
- * there is no unnecessary data duplication happening when using this object. */
-class CPUKernelThreadGlobals : public KernelGlobalsCPU {
- public:
-  /* TODO(sergey): Would be nice to have properly typed OSLGlobals even in the case when building
-   * without OSL support. Will avoid need to those unnamed pointers and casts. */
-  CPUKernelThreadGlobals(const KernelGlobalsCPU &kernel_globals,
-                         void *osl_globals_memory,
-                         Profiler &cpu_profiler,
-                         const int thread_index);
-
-  ~CPUKernelThreadGlobals();
-
-  CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete;
-  CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept;
-
-  CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete;
-  CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other);
-
-  void start_profiling();
-  void stop_profiling();
-
- protected:
-  void clear_runtime_pointers();
-
-  Profiler &cpu_profiler_;
-};
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 7a00aa83048..96ba33e3195 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -483,12 +483,12 @@ const CPUKernels &Device::get_cpu_kernels()
 }
 
 void Device::get_cpu_kernel_thread_globals(
-    vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/)
+    vector<ThreadKernelGlobalsCPU> & /*kernel_thread_globals*/)
 {
   LOG(FATAL) << "Device does not support CPU kernels.";
 }
 
-void *Device::get_cpu_osl_memory()
+OSLGlobals *Device::get_cpu_osl_memory()
 {
   return nullptr;
 }
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 85c0b48c392..8f9a2fca146 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -28,9 +28,11 @@ class BVH;
 class DeviceQueue;
 class Progress;
 class CPUKernels;
-class CPUKernelThreadGlobals;
 class Scene;
 
+struct OSLGlobals;
+struct ThreadKernelGlobalsCPU;
+
 /* Device Types */
 
 enum DeviceType {
@@ -216,9 +218,9 @@ class Device {
   static const CPUKernels &get_cpu_kernels();
   /* Get kernel globals to pass to kernels. */
   virtual void get_cpu_kernel_thread_globals(
-      vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
+      vector<ThreadKernelGlobalsCPU> & /*kernel_thread_globals*/);
   /* Get OpenShadingLanguage memory buffer. */
-  virtual void *get_cpu_osl_memory();
+  virtual OSLGlobals *get_cpu_osl_memory();
 
   /* Acceleration structure building. */
   virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index b72424cad89..0c6c62536ed 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -257,7 +257,7 @@ class MultiDevice : public Device {
     }
   }
 
-  void *get_cpu_osl_memory() override
+  OSLGlobals *get_cpu_osl_memory() override
   {
     /* Always return the OSL memory of the CPU device (this works since the constructor above
      * guarantees that CPU devices are always added to the back). */
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 0a21ff20b00..1a9a81b2d6a 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -1006,7 +1006,7 @@ bool OptiXDevice::load_osl_kernels()
 #  endif
 }
 
-void *OptiXDevice::get_cpu_osl_memory()
+OSLGlobals *OptiXDevice::get_cpu_osl_memory()
 {
 #  ifdef WITH_OSL
   return &osl_globals;
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index 9de13c0e0b9..5fe4bea3895 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -115,7 +115,7 @@ class OptiXDevice : public CUDADevice {
 
   unique_ptr<DeviceQueue> gpu_queue_create() override;
 
-  void *get_cpu_osl_memory() override;
+  OSLGlobals *get_cpu_osl_memory() override;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index 30dd8adc81d..8c323770fef 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -28,9 +28,9 @@ static inline tbb::task_arena local_tbb_arena_create(const Device *device)
   return tbb::task_arena(device->info.cpu_threads);
 }
 
-/* Get CPUKernelThreadGlobals for the current thread. */
-static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
-    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+/* Get ThreadKernelGlobalsCPU for the current thread. */
+static inline ThreadKernelGlobalsCPU *kernel_thread_globals_get(
+    vector<ThreadKernelGlobalsCPU> &kernel_thread_globals)
 {
   const int thread_index = tbb::this_task_arena::current_thread_index();
   DCHECK_GE(thread_index, 0);
@@ -65,7 +65,7 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
   const int64_t total_pixels_num = image_width * image_height;
 
   if (device_->profiler.active()) {
-    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+    for (ThreadKernelGlobalsCPU &kernel_globals : kernel_thread_globals_) {
       kernel_globals.start_profiling();
     }
   }
@@ -91,13 +91,13 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
       work_tile.offset = effective_buffer_params_.offset;
       work_tile.stride = effective_buffer_params_.stride;
 
-      CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);
+      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);
 
       render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
     });
   });
   if (device_->profiler.active()) {
-    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+    for (ThreadKernelGlobalsCPU &kernel_globals : kernel_thread_globals_) {
       kernel_globals.stop_profiling();
     }
   }
@@ -105,7 +105,7 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
   statistics.occupancy = 1.0f;
 }
 
-void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobalsCPU *kernel_globals,
+void PathTraceWorkCPU::render_samples_full_pipeline(ThreadKernelGlobalsCPU *kernel_globals,
                                                     const KernelWorkTile &work_tile,
                                                     const int samples_num)
 {
@@ -230,7 +230,7 @@ int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(const float
   /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. */
   local_arena.execute([&]() {
     parallel_for(full_y, full_y + height, [&](int y) {
-      CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_.data();
+      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
 
       bool row_converged = true;
       uint num_row_pixels_active = 0;
@@ -255,7 +255,7 @@ int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(const float
   if (num_active_pixels) {
     local_arena.execute([&]() {
       parallel_for(full_x, full_x + width, [&](int x) {
-        CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_.data();
+        ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
         kernels_.adaptive_sampling_filter_y(
             kernel_globals, render_buffer, x, full_y, height, offset, stride);
       });
@@ -277,7 +277,7 @@ void PathTraceWorkCPU::cryptomatte_postproces()
   /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. */
   local_arena.execute([&]() {
     parallel_for(0, height, [&](int y) {
-      CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_.data();
+      ThreadKernelGlobalsCPU *kernel_globals = kernel_thread_globals_.data();
       int pixel_index = y * width;
 
       for (int x = 0; x < width; ++x, ++pixel_index) {
@@ -297,7 +297,7 @@ void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
   /* Linking the global guiding structures (e.g., Field and SampleStorage) to the per-thread
    * kernel globals. */
   for (int thread_index = 0; thread_index < kernel_thread_globals_.size(); thread_index++) {
-    CPUKernelThreadGlobals &kg = kernel_thread_globals_[thread_index];
+    ThreadKernelGlobalsCPU &kg = kernel_thread_globals_[thread_index];
     openpgl::cpp::Field *field = (openpgl::cpp::Field *)guiding_field;
 
     /* Allocate sampling distributions. */
@@ -305,17 +305,17 @@ void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
 
 #  if PATH_GUIDING_LEVEL >= 4
     if (kg.opgl_surface_sampling_distribution) {
-      delete kg.opgl_surface_sampling_distribution;
-      kg.opgl_surface_sampling_distribution = nullptr;
+      kg.opgl_surface_sampling_distribution.reset();
     }
     if (kg.opgl_volume_sampling_distribution) {
-      delete kg.opgl_volume_sampling_distribution;
-      kg.opgl_volume_sampling_distribution = nullptr;
+      kg.opgl_volume_sampling_distribution.reset();
     }
 
     if (field) {
-      kg.opgl_surface_sampling_distribution = new openpgl::cpp::SurfaceSamplingDistribution(field);
-      kg.opgl_volume_sampling_distribution = new openpgl::cpp::VolumeSamplingDistribution(field);
+      kg.opgl_surface_sampling_distribution =
+          make_unique<openpgl::cpp::SurfaceSamplingDistribution>(field);
+      kg.opgl_volume_sampling_distribution = make_unique<openpgl::cpp::VolumeSamplingDistribution>(
+          field);
     }
 #  endif
 
@@ -332,7 +332,7 @@ void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
 }
 
 void PathTraceWorkCPU::guiding_push_sample_data_to_global_storage(
-    KernelGlobalsCPU *kg,
+    ThreadKernelGlobalsCPU *kg,
     IntegratorStateCPU *state,
     const ccl_global float *ccl_restrict render_buffer)
 {
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
index 6f35be8cab4..cffea461e9f 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -4,9 +4,9 @@
 
 #pragma once
 
+#include "kernel/device/cpu/globals.h"
 #include "kernel/integrator/state.h"
 
-#include "device/cpu/kernel_thread_globals.h"
 #include "device/queue.h"
 
 #include "integrator/path_trace_work.h"
@@ -16,7 +16,7 @@
 CCL_NAMESPACE_BEGIN
 
 struct KernelWorkTile;
-struct KernelGlobalsCPU;
+struct ThreadKernelGlobalsCPU;
 struct IntegratorStateCPU;
 
 class CPUKernels;
@@ -63,7 +63,7 @@ class PathTraceWorkCPU : public PathTraceWork {
 
   /* Pushes the collected training data/samples of a path to the global sample storage.
    * This function is called at the end of a random walk/path generation. */
-  void guiding_push_sample_data_to_global_storage(KernelGlobalsCPU *kg,
+  void guiding_push_sample_data_to_global_storage(ThreadKernelGlobalsCPU *kg,
                                                   IntegratorStateCPU *state,
                                                   const ccl_global float *ccl_restrict
                                                       render_buffer);
@@ -71,7 +71,7 @@ class PathTraceWorkCPU : public PathTraceWork {
 
  protected:
   /* Core path tracing routine. Renders given work time on the given queue. */
-  void render_samples_full_pipeline(KernelGlobalsCPU *kernel_globals,
+  void render_samples_full_pipeline(ThreadKernelGlobalsCPU *kernel_globals,
                                     const KernelWorkTile &work_tile,
                                     const int samples_num);
 
@@ -83,7 +83,7 @@ class PathTraceWorkCPU : public PathTraceWork {
    * More specifically, the `kernel_globals_` is local to each threads and nobody else is
    * accessing it, but some "localization" is required to decouple from kernel globals stored
    * on the device level. */
-  vector<CPUKernelThreadGlobals> kernel_thread_globals_;
+  vector<ThreadKernelGlobalsCPU> kernel_thread_globals_;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
index 1019f48d104..15d2bc75c13 100644
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -2,13 +2,14 @@
  *
  * SPDX-License-Identifier: Apache-2.0 */
 
+#include "kernel/device/cpu/globals.h"
+
 #include "integrator/shader_eval.h"
 
 #include "device/device.h"
 #include "device/queue.h"
 
 #include "device/cpu/kernel.h"
-#include "device/cpu/kernel_thread_globals.h"
 
 #include "util/log.h"
 #include "util/progress.h"
@@ -80,7 +81,7 @@ bool ShaderEval::eval_cpu(Device *device,
                           device_vector<float> &output,
                           const int64_t work_size)
 {
-  vector<CPUKernelThreadGlobals> kernel_thread_globals;
+  vector<ThreadKernelGlobalsCPU> kernel_thread_globals;
   device->get_cpu_kernel_thread_globals(kernel_thread_globals);
 
   /* Find required kernel function. */
@@ -101,7 +102,7 @@ bool ShaderEval::eval_cpu(Device *device,
       }
 
       const int thread_index = tbb::this_task_arena::current_thread_index();
-      const KernelGlobalsCPU *kg = &kernel_thread_globals[thread_index];
+      const ThreadKernelGlobalsCPU *kg = &kernel_thread_globals[thread_index];
 
       switch (type) {
         case SHADER_EVAL_DISPLACE:
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3057ec82bc4..5f56257b4b6 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -13,6 +13,7 @@ set(INC_SYS
 )
 
 set(SRC_KERNEL_DEVICE_CPU
+  device/cpu/globals.cpp
   device/cpu/kernel.cpp
   device/cpu/kernel_sse42.cpp
   device/cpu/kernel_avx2.cpp
diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h
index 1b8c2c5422d..070e222e4cb 100644
--- a/intern/cycles/kernel/device/cpu/bvh.h
+++ b/intern/cycles/kernel/device/cpu/bvh.h
@@ -284,7 +284,7 @@ ccl_device_forceinline void kernel_embree_filter_intersection_func_impl(
 #ifdef __KERNEL_ONEAPI__
   KernelGlobalsGPU *kg = nullptr;
 #else
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
 #endif
   const Ray *cray = ctx->ray;
 
@@ -324,7 +324,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_shadow_all_func_impl(
 #ifdef __KERNEL_ONEAPI__
   KernelGlobalsGPU *kg = nullptr;
 #else
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
 #endif
   const Ray *cray = ctx->ray;
 
@@ -438,7 +438,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func_impl(
 #ifdef __KERNEL_ONEAPI__
   KernelGlobalsGPU *kg = nullptr;
 #else
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
 #endif
   const Ray *cray = ctx->ray;
 
@@ -541,7 +541,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func_impl(
 #ifdef __KERNEL_ONEAPI__
   KernelGlobalsGPU *kg = nullptr;
 #else
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
 #endif
   const Ray *cray = ctx->ray;
 
@@ -622,7 +622,7 @@ ccl_device void kernel_embree_filter_func_backface_cull(const RTCFilterFunctionN
   }
 
   CCLIntersectContext *ctx = ((CCLIntersectContext *)args->context);
-  const KernelGlobalsCPU *kg = ctx->kg;
+  const ThreadKernelGlobalsCPU *kg = ctx->kg;
   const Ray *cray = ctx->ray;
 
   if (kernel_embree_is_self_intersection(
diff --git a/intern/cycles/kernel/device/cpu/globals.cpp b/intern/cycles/kernel/device/cpu/globals.cpp
new file mode 100644
index 00000000000..825233e47c4
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.cpp
@@ -0,0 +1,43 @@
+/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#include "kernel/device/cpu/globals.h"
+#include "kernel/osl/globals.h"
+
+#include "util/guiding.h"  // IWYU pragma: keep
+#include "util/profiling.h"
+
+CCL_NAMESPACE_BEGIN
+
+ThreadKernelGlobalsCPU::ThreadKernelGlobalsCPU(const KernelGlobalsCPU &kernel_globals,
+                                               OSLGlobals *osl_globals,
+                                               Profiler &cpu_profiler,
+                                               const int thread_index)
+    : KernelGlobalsCPU(kernel_globals), /* Per-thread copy of the shared constant globals. */
+#ifdef WITH_OSL
+      osl(osl_globals, thread_index), /* OSLThreadData built for this thread index. */
+#endif
+      cpu_profiler_(cpu_profiler) /* Used by start_profiling() / stop_profiling(). */
+{
+#ifndef WITH_OSL
+  (void)thread_index; /* Only read by the `osl` initializer above. */
+  (void)osl_globals;  /* Likewise not referenced in builds without OSL. */
+#endif
+
+#ifdef WITH_PATH_GUIDING /* This object owns its own path segment storage. */
+  opgl_path_segment_storage = make_unique<openpgl::cpp::PathSegmentStorage>();
+#endif
+}
+
+void ThreadKernelGlobalsCPU::start_profiling()
+{
+  cpu_profiler_.add_state(&profiler); /* Passes the inherited ProfilingState to add_state(). */
+}
+
+void ThreadKernelGlobalsCPU::stop_profiling()
+{
+  cpu_profiler_.remove_state(&profiler); /* Passes the same ProfilingState to remove_state(). */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
index 62bd989054b..2c6d5ad29d2 100644
--- a/intern/cycles/kernel/device/cpu/globals.h
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -9,22 +9,23 @@
 #include "kernel/types.h"
 #include "kernel/util/profiler.h"
 
+#ifdef __OSL__
+#  include "kernel/osl/globals.h"
+#endif
+
 #include "util/guiding.h"  // IWYU pragma: keep
 #include "util/texture.h"  // IWYU pragma: keep
+#include "util/unique_ptr.h"
 
 CCL_NAMESPACE_BEGIN
 
+struct OSLGlobals;
+
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
  * the kernel, to access constant data. These are all stored as flat arrays.
  * these are really just standard arrays. We can't use actually globals because
  * multiple renders may be running inside the same process. */
 
-#ifdef __OSL__
-struct OSLGlobals;
-struct OSLThreadData;
-struct OSLShadingSystem;
-#endif
-
 /* Array for kernel data, with size to be able to assert on invalid data access. */
 template<typename T> struct kernel_array {
   const ccl_always_inline T &fetch(const int index) const
@@ -37,38 +38,58 @@ template<typename T> struct kernel_array {
   int width = 0;
 };
 
+/* Constant globals shared between all threads. */
 struct KernelGlobalsCPU {
 #define KERNEL_DATA_ARRAY(type, name) kernel_array<type> name;
 #include "kernel/data_arrays.h"
 
   KernelData data = {};
 
+  ProfilingState profiler;
+};
+
+/* Per-thread global state.
+ *
+ * To avoid pointer indirection, the constant globals are copied to each thread.
+ *
+ * This may not be ideal for cache pressure. An alternative would be to pass an
+ * additional thread index to every function, and potentially to make the shared
+ * part an actual global variable. That would match the GPU more closely, but
+ * also require mutex locks for multiple Cycles instances. */
+struct ThreadKernelGlobalsCPU : public KernelGlobalsCPU {
+  ThreadKernelGlobalsCPU(const KernelGlobalsCPU &kernel_globals,
+                         OSLGlobals *osl_globals,
+                         Profiler &cpu_profiler,
+                         const int thread_index);
+
+  ThreadKernelGlobalsCPU(const ThreadKernelGlobalsCPU &other) = delete;
+  ThreadKernelGlobalsCPU(ThreadKernelGlobalsCPU &&other) noexcept = default;
+  ThreadKernelGlobalsCPU &operator=(const ThreadKernelGlobalsCPU &other) = delete;
+  ThreadKernelGlobalsCPU &operator=(ThreadKernelGlobalsCPU &&other) = delete;
+
+  void start_profiling();
+  void stop_profiling();
+
 #ifdef __OSL__
-  /* On the CPU, we also have the OSL globals here. Most data structures are shared
-   * with SVM, the difference is in the shaders and object/mesh attributes. */
-  OSLGlobals *osl = nullptr;
-  OSLShadingSystem *osl_ss = nullptr;
-  OSLThreadData *osl_tdata = nullptr;
-  int osl_thread_index = 0;
+  OSLThreadData osl;
 #endif
 
 #ifdef __PATH_GUIDING__
-  /* Pointers to global data structures. */
+  /* Pointers to shared global data structures. */
   openpgl::cpp::SampleStorage *opgl_sample_data_storage = nullptr;
   openpgl::cpp::Field *opgl_guiding_field = nullptr;
 
   /* Local data structures owned by the thread. */
-  openpgl::cpp::PathSegmentStorage *opgl_path_segment_storage = nullptr;
-  openpgl::cpp::SurfaceSamplingDistribution *opgl_surface_sampling_distribution = nullptr;
-  openpgl::cpp::VolumeSamplingDistribution *opgl_volume_sampling_distribution = nullptr;
+  unique_ptr<openpgl::cpp::PathSegmentStorage> opgl_path_segment_storage;
+  unique_ptr<openpgl::cpp::SurfaceSamplingDistribution> opgl_surface_sampling_distribution;
+  unique_ptr<openpgl::cpp::VolumeSamplingDistribution> opgl_volume_sampling_distribution;
 #endif
 
-  /* **** Run-time data ****  */
-
-  ProfilingState profiler;
+ protected:
+  Profiler &cpu_profiler_;
 };
 
-using KernelGlobals = const KernelGlobalsCPU *;
+using KernelGlobals = const ThreadKernelGlobalsCPU *;
 
 /* Abstraction macros */
 #define kernel_data_fetch(name, index) (kg->name.fetch(index))
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index a5a025c8997..f686867db2f 100644
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -43,6 +43,8 @@
 /* do nothing */
 #endif
 
+#include "kernel/device/cpu/globals.h"
+
 #include "kernel/device/cpu/kernel.h"
 #define KERNEL_ARCH cpu
 #include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
index 700fab9f988..c5b9ed9afa9 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -9,19 +9,21 @@
  */
 
 #define KERNEL_INTEGRATOR_FUNCTION(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
-                                                    IntegratorStateCPU *state)
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+      const ThreadKernelGlobalsCPU *ccl_restrict kg, IntegratorStateCPU *state)
 
 #define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
-                                                    IntegratorStateCPU *state, \
-                                                    ccl_global float *render_buffer)
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+      const ThreadKernelGlobalsCPU *ccl_restrict kg, \
+      IntegratorStateCPU *state, \
+      ccl_global float *render_buffer)
 
 #define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
-  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
-                                                    IntegratorStateCPU *state, \
-                                                    KernelWorkTile *tile, \
-                                                    ccl_global float *render_buffer)
+  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+      const ThreadKernelGlobalsCPU *ccl_restrict kg, \
+      IntegratorStateCPU *state, \
+      KernelWorkTile *tile, \
+      ccl_global float *render_buffer)
 
 KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
 KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
@@ -77,16 +79,16 @@ KERNEL_FILM_CONVERT_FUNCTION(float4)
  * Shader evaluation.
  */
 
-void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const ThreadKernelGlobalsCPU *kg,
                                                        const KernelShaderEvalInput *input,
                                                        float *output,
                                                        const int offset);
-void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const ThreadKernelGlobalsCPU *kg,
                                                      const KernelShaderEvalInput *input,
                                                      float *output,
                                                      const int offset);
 void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
-    const KernelGlobalsCPU *kg,
+    const ThreadKernelGlobalsCPU *kg,
     const KernelShaderEvalInput *input,
     float *output,
     const int offset);
@@ -96,7 +98,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
  */
 
 bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
-    const KernelGlobalsCPU *kg,
+    const ThreadKernelGlobalsCPU *kg,
     ccl_global float *render_buffer,
     const int x,
     const int y,
@@ -105,14 +107,14 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
     const int offset,
     int stride);
 
-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const ThreadKernelGlobalsCPU *kg,
                                                            ccl_global float *render_buffer,
                                                            const int y,
                                                            const int start_x,
                                                            const int width,
                                                            const int offset,
                                                            int stride);
-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const ThreadKernelGlobalsCPU *kg,
                                                            ccl_global float *render_buffer,
                                                            const int x,
                                                            const int start_y,
@@ -124,7 +126,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCP
  * Cryptomatte.
  */
 
-void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const ThreadKernelGlobalsCPU *kg,
                                                         ccl_global float *render_buffer,
                                                         int pixel_index);
 
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
index 80dd94f8d74..aa593a41dba 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -62,7 +62,7 @@ CCL_NAMESPACE_BEGIN
 /* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
  * that it does not contain unused fields. */
 #define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
-  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
                                                     IntegratorStateCPU *state, \
                                                     KernelWorkTile *tile, \
                                                     ccl_global float *render_buffer) \
@@ -72,29 +72,31 @@ CCL_NAMESPACE_BEGIN
   }
 
 #define DEFINE_INTEGRATOR_KERNEL(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
                                                     IntegratorStateCPU *state) \
   { \
     KERNEL_INVOKE(name, kg, state); \
   }
 
 #define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
-      const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
+                                                    IntegratorStateCPU *state, \
+                                                    ccl_global float *render_buffer) \
   { \
     KERNEL_INVOKE(name, kg, state, render_buffer); \
   }
 
 #define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
                                                     IntegratorStateCPU *state) \
   { \
     KERNEL_INVOKE(name, kg, &state->shadow); \
   }
 
 #define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \
-  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
-      const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const ThreadKernelGlobalsCPU *kg, \
+                                                    IntegratorStateCPU *state, \
+                                                    ccl_global float *render_buffer) \
   { \
     KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \
   }
@@ -118,7 +120,7 @@ DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow)
  * Shader evaluation.
  */
 
-void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const ThreadKernelGlobalsCPU *kg,
                                                      const KernelShaderEvalInput *input,
                                                      float *output,
                                                      const int offset)
@@ -130,7 +132,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
 #endif
 }
 
-void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const ThreadKernelGlobalsCPU *kg,
                                                        const KernelShaderEvalInput *input,
                                                        float *output,
                                                        const int offset)
@@ -143,7 +145,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *k
 }
 
 void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
-    const KernelGlobalsCPU *kg,
+    const ThreadKernelGlobalsCPU *kg,
     const KernelShaderEvalInput *input,
     float *output,
     const int offset)
@@ -160,7 +162,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
  */
 
 bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
-    const KernelGlobalsCPU *kg,
+    const ThreadKernelGlobalsCPU *kg,
     ccl_global float *render_buffer,
     const int x,
     const int y,
@@ -178,7 +180,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
 #endif
 }
 
-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const ThreadKernelGlobalsCPU *kg,
                                                            ccl_global float *render_buffer,
                                                            const int y,
                                                            const int start_x,
@@ -193,7 +195,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCP
 #endif
 }
 
-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const ThreadKernelGlobalsCPU *kg,
                                                            ccl_global float *render_buffer,
                                                            const int x,
                                                            const int start_y,
@@ -212,7 +214,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCP
  * Cryptomatte.
  */
 
-void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg,
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const ThreadKernelGlobalsCPU *kg,
                                                         ccl_global float *render_buffer,
                                                         const int pixel_index)
 {
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
index 097601e1950..0d0894c7607 100644
--- a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -23,6 +23,7 @@
 #  endif
 #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
 
+#include "kernel/device/cpu/globals.h"
 #include "kernel/device/cpu/kernel.h"
 #define KERNEL_ARCH cpu_avx2
 #include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse42.cpp b/intern/cycles/kernel/device/cpu/kernel_sse42.cpp
index d9a12e8a224..2970d84b27a 100644
--- a/intern/cycles/kernel/device/cpu/kernel_sse42.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse42.cpp
@@ -21,6 +21,7 @@
 #  endif
 #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE42 */
 
+#include "kernel/device/cpu/globals.h"
 #include "kernel/device/cpu/kernel.h"
 #define KERNEL_ARCH cpu_sse42
 #include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/osl/closures.cpp b/intern/cycles/kernel/osl/closures.cpp
index 4463280d38a..0300151088f 100644
--- a/intern/cycles/kernel/osl/closures.cpp
+++ b/intern/cycles/kernel/osl/closures.cpp
@@ -76,18 +76,17 @@ void OSLRenderServices::register_closures(OSL::ShadingSystem *ss)
 /* Surface & Background */
 
 template<>
-void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
+void osl_eval_nodes<SHADER_TYPE_SURFACE>(const ThreadKernelGlobalsCPU *kg,
                                          const void *state,
                                          ShaderData *sd,
                                          const uint32_t path_flag)
 {
   /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
   shaderdata_to_shaderglobals(
-      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&kg->osl.shader_globals));
 
   /* clear trace data */
-  tdata->tracedata.init = false;
+  kg->osl.tracedata.init = false;
 
   /* Used by render-services. */
   sd->osl_globals = kg;
@@ -101,30 +100,30 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
   }
 
   /* execute shader for this point */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl.ss;
+  OSL::ShaderGlobals *globals = &kg->osl.shader_globals;
+  OSL::ShadingContext *octx = kg->osl.context;
   const int shader = sd->shader & SHADER_MASK;
 
   if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
     /* background */
-    if (kg->osl->background_state) {
+    if (kg->osl.globals->background_state) {
 #if OSL_LIBRARY_VERSION_CODE >= 11304
       ss->execute(*octx,
-                  *(kg->osl->background_state),
-                  kg->osl_thread_index,
+                  *(kg->osl.globals->background_state),
+                  kg->osl.thread_index,
                   0,
                   *globals,
                   nullptr,
                   nullptr);
 #else
-      ss->execute(octx, *(kg->osl->background_state), *globals);
+      ss->execute(octx, *(kg->osl.globals->background_state), *globals);
 #endif
     }
   }
   else {
     /* automatic bump shader */
-    if (kg->osl->bump_state[shader]) {
+    if (kg->osl.globals->bump_state[shader]) {
       /* save state */
       const float3 P = sd->P;
       const float dP = sd->dP;
@@ -134,12 +133,13 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
       /* set state as if undisplaced */
       if (sd->flag & SD_HAS_DISPLACEMENT) {
         float data[9];
-        const bool found = kg->osl->services->get_attribute(sd,
-                                                            true,
-                                                            OSLRenderServices::u_empty,
-                                                            TypeVector,
-                                                            OSLRenderServices::u_geom_undisplaced,
-                                                            data);
+        const bool found = kg->osl.globals->services->get_attribute(
+            sd,
+            true,
+            OSLRenderServices::u_empty,
+            TypeVector,
+            OSLRenderServices::u_geom_undisplaced,
+            data);
         (void)found;
         assert(found);
 
@@ -162,14 +162,14 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
 /* execute bump shader */
 #if OSL_LIBRARY_VERSION_CODE >= 11304
       ss->execute(*octx,
-                  *(kg->osl->bump_state[shader]),
-                  kg->osl_thread_index,
+                  *(kg->osl.globals->bump_state[shader]),
+                  kg->osl.thread_index,
                   0,
                   *globals,
                   nullptr,
                   nullptr);
 #else
-      ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
+      ss->execute(octx, *(kg->osl.globals->bump_state[shader]), *globals);
 #endif
 
       /* reset state */
@@ -182,17 +182,17 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
     }
 
     /* surface shader */
-    if (kg->osl->surface_state[shader]) {
+    if (kg->osl.globals->surface_state[shader]) {
 #if OSL_LIBRARY_VERSION_CODE >= 11304
       ss->execute(*octx,
-                  *(kg->osl->surface_state[shader]),
-                  kg->osl_thread_index,
+                  *(kg->osl.globals->surface_state[shader]),
+                  kg->osl.thread_index,
                   0,
                   *globals,
                   nullptr,
                   nullptr);
 #else
-      ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+      ss->execute(octx, *(kg->osl.globals->surface_state[shader]), *globals);
 #endif
     }
   }
@@ -206,18 +206,17 @@ void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
 /* Volume */
 
 template<>
-void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
+void osl_eval_nodes<SHADER_TYPE_VOLUME>(const ThreadKernelGlobalsCPU *kg,
                                         const void *state,
                                         ShaderData *sd,
                                         const uint32_t path_flag)
 {
   /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
   shaderdata_to_shaderglobals(
-      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&kg->osl.shader_globals));
 
   /* clear trace data */
-  tdata->tracedata.init = false;
+  kg->osl.tracedata.init = false;
 
   /* Used by render-services. */
   sd->osl_globals = kg;
@@ -231,22 +230,22 @@ void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
   }
 
   /* execute shader */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl.ss;
+  OSL::ShaderGlobals *globals = &kg->osl.shader_globals;
+  OSL::ShadingContext *octx = kg->osl.context;
   const int shader = sd->shader & SHADER_MASK;
 
-  if (kg->osl->volume_state[shader]) {
+  if (kg->osl.globals->volume_state[shader]) {
 #if OSL_LIBRARY_VERSION_CODE >= 11304
     ss->execute(*octx,
-                *(kg->osl->volume_state[shader]),
-                kg->osl_thread_index,
+                *(kg->osl.globals->volume_state[shader]),
+                kg->osl.thread_index,
                 0,
                 *globals,
                 nullptr,
                 nullptr);
 #else
-    ss->execute(octx, *(kg->osl->volume_state[shader]), *globals);
+    ss->execute(octx, *(kg->osl.globals->volume_state[shader]), *globals);
 #endif
   }
 
@@ -259,18 +258,17 @@ void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
 /* Displacement */
 
 template<>
-void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg,
+void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const ThreadKernelGlobalsCPU *kg,
                                               const void *state,
                                               ShaderData *sd,
                                               const uint32_t path_flag)
 {
   /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
   shaderdata_to_shaderglobals(
-      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&kg->osl.shader_globals));
 
   /* clear trace data */
-  tdata->tracedata.init = false;
+  kg->osl.tracedata.init = false;
 
   /* Used by render-services. */
   sd->osl_globals = kg;
@@ -278,22 +276,22 @@ void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg,
   sd->osl_shadow_path_state = nullptr;
 
   /* execute shader */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl.ss;
+  OSL::ShaderGlobals *globals = &kg->osl.shader_globals;
+  OSL::ShadingContext *octx = kg->osl.context;
   const int shader = sd->shader & SHADER_MASK;
 
-  if (kg->osl->displacement_state[shader]) {
+  if (kg->osl.globals->displacement_state[shader]) {
 #if OSL_LIBRARY_VERSION_CODE >= 11304
     ss->execute(*octx,
-                *(kg->osl->displacement_state[shader]),
-                kg->osl_thread_index,
+                *(kg->osl.globals->displacement_state[shader]),
+                kg->osl.thread_index,
                 0,
                 *globals,
                 nullptr,
                 nullptr);
 #else
-    ss->execute(octx, *(kg->osl->displacement_state[shader]), *globals);
+    ss->execute(octx, *(kg->osl.globals->displacement_state[shader]), *globals);
 #endif
   }
 
diff --git a/intern/cycles/kernel/osl/globals.cpp b/intern/cycles/kernel/osl/globals.cpp
index 626036eb27d..52b408b1d69 100644
--- a/intern/cycles/kernel/osl/globals.cpp
+++ b/intern/cycles/kernel/osl/globals.cpp
@@ -4,57 +4,55 @@
 
 #include <OSL/oslexec.h>
 
-#include "kernel/globals.h"
-#include "kernel/types.h"
-
 #include "kernel/osl/globals.h"
-#include "kernel/osl/services.h"
 
 CCL_NAMESPACE_BEGIN
 
-void OSLGlobals::thread_init(KernelGlobalsCPU *kg, OSLGlobals *osl_globals, const int thread_index)
+OSLThreadData::OSLThreadData(OSLGlobals *osl_globals, const int thread_index)
+    : globals(osl_globals), thread_index(thread_index)
 {
-  /* no osl used? */
-  if (!osl_globals->use) {
-    kg->osl = nullptr;
+  if (globals == nullptr || globals->use == false) {
     return;
   }
 
-  /* Per thread kernel data init. */
-  kg->osl = osl_globals;
+  ss = globals->ss;
 
-  OSL::ShadingSystem *ss = kg->osl->ss;
-  OSLThreadData *tdata = new OSLThreadData();
+  memset((void *)&shader_globals, 0, sizeof(shader_globals));
+  shader_globals.tracedata = &tracedata;
 
-  memset((void *)&tdata->globals, 0, sizeof(OSL::ShaderGlobals));
-  tdata->globals.tracedata = &tdata->tracedata;
-  tdata->osl_thread_info = ss->create_thread_info();
-  tdata->context = ss->get_context(tdata->osl_thread_info);
-
-  tdata->oiio_thread_info = osl_globals->ts->get_perthread_info();
-
-  kg->osl_ss = (OSLShadingSystem *)ss;
-  kg->osl_tdata = tdata;
-  kg->osl_thread_index = thread_index;
+  osl_thread_info = ss->create_thread_info();
+  context = ss->get_context(osl_thread_info);
+  oiio_thread_info = globals->ts->get_perthread_info();
 }
 
-void OSLGlobals::thread_free(KernelGlobalsCPU *kg)
+OSLThreadData::~OSLThreadData()
 {
-  if (!kg->osl) {
-    return;
+  if (context) {
+    ss->release_context(context);
   }
+  if (osl_thread_info) {
+    ss->destroy_thread_info(osl_thread_info);
+  }
+}
 
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSLThreadData *tdata = kg->osl_tdata;
-  ss->release_context(tdata->context);
+OSLThreadData::OSLThreadData(OSLThreadData &&other) noexcept
+    : globals(other.globals),
+      ss(other.ss),
+      thread_index(other.thread_index),
+      shader_globals(other.shader_globals),
+      tracedata(other.tracedata),
+      osl_thread_info(other.osl_thread_info),
+      context(other.context),
+      oiio_thread_info(other.oiio_thread_info)
+{
+  shader_globals.tracedata = &tracedata;
 
-  ss->destroy_thread_info(tdata->osl_thread_info);
-
-  delete tdata;
-
-  kg->osl = nullptr;
-  kg->osl_ss = nullptr;
-  kg->osl_tdata = nullptr;
+  memset((void *)&other.shader_globals, 0, sizeof(other.shader_globals));
+  memset((void *)&other.tracedata, 0, sizeof(other.tracedata));
+  other.thread_index = -1;
+  other.context = nullptr;
+  other.osl_thread_info = nullptr;
+  other.oiio_thread_info = nullptr;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/globals.h b/intern/cycles/kernel/osl/globals.h
index 3308603642c..3a12b30fe47 100644
--- a/intern/cycles/kernel/osl/globals.h
+++ b/intern/cycles/kernel/osl/globals.h
@@ -23,6 +23,7 @@ CCL_NAMESPACE_BEGIN
 
 class OSLRenderServices;
 class ColorSpaceProcessor;
+struct ThreadKernelGlobalsCPU;
 
 /* OSL Globals
  *
@@ -39,12 +40,6 @@ struct OSLGlobals {
     use = false;
   }
 
-  /* per thread data */
-  static void thread_init(struct KernelGlobalsCPU *kg,
-                          OSLGlobals *osl_globals,
-                          const int thread_index);
-  static void thread_free(struct KernelGlobalsCPU *kg);
-
   bool use;
 
   /* shading system */
@@ -78,11 +73,27 @@ struct OSLTraceData {
 
 /* thread key for thread specific data lookup */
 struct OSLThreadData {
-  OSL::ShaderGlobals globals;
-  OSL::PerThreadInfo *osl_thread_info;
-  OSLTraceData tracedata;
-  OSL::ShadingContext *context;
-  OIIO::TextureSystem::Perthread *oiio_thread_info;
+  /* Global data. */
+  OSLGlobals *globals = nullptr;
+  OSL::ShadingSystem *ss = nullptr;
+
+  /* Per-thread data. */
+  int thread_index = -1;
+
+  mutable OSL::ShaderGlobals shader_globals;
+  mutable OSLTraceData tracedata;
+
+  OSL::PerThreadInfo *osl_thread_info = nullptr;
+  OSL::ShadingContext *context = nullptr;
+  OIIO::TextureSystem::Perthread *oiio_thread_info = nullptr;
+
+  OSLThreadData(OSLGlobals *globals, const int thread_index);
+  ~OSLThreadData();
+
+  OSLThreadData(const OSLThreadData &other) = delete;
+  OSLThreadData(OSLThreadData &&other) noexcept;
+  OSLThreadData &operator=(const OSLThreadData &other) = delete;
+  OSLThreadData &operator=(OSLThreadData &&other) = delete;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/osl.h b/intern/cycles/kernel/osl/osl.h
index 7f634f1b660..bbb909e3458 100644
--- a/intern/cycles/kernel/osl/osl.h
+++ b/intern/cycles/kernel/osl/osl.h
@@ -172,7 +172,7 @@ ccl_device void flatten_closure_tree(KernelGlobals kg,
 #ifndef __KERNEL_GPU__
 
 template<ShaderType type>
-void osl_eval_nodes(const KernelGlobalsCPU *kg,
+void osl_eval_nodes(const ThreadKernelGlobalsCPU *kg,
                     const void *state,
                     ShaderData *sd,
                     uint32_t path_flag);
diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp
index e733178aba0..7a60edf7519 100644
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -148,7 +148,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
    * a concept of shader space, so we just use object space for both. */
   if (xform) {
     const ShaderData *sd = (const ShaderData *)xform;
-    const KernelGlobalsCPU *kg = sd->osl_globals;
+    const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
     const int object = sd->object;
 
     if (object != OBJECT_NONE) {
@@ -188,7 +188,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
    * a concept of shader space, so we just use object space for both. */
   if (xform) {
     const ShaderData *sd = (const ShaderData *)xform;
-    const KernelGlobalsCPU *kg = sd->osl_globals;
+    const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
     const int object = sd->object;
 
     if (object != OBJECT_NONE) {
@@ -225,7 +225,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
                                    const float time)
 {
   ShaderData *sd = (ShaderData *)(sg->renderstate);
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
 
   if (from == u_ndc) {
     copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -257,7 +257,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
                                            const float time)
 {
   ShaderData *sd = (ShaderData *)(sg->renderstate);
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
 
   if (to == u_ndc) {
     copy_matrix(result, kernel_data.cam.worldtondc);
@@ -291,7 +291,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
    * a concept of shader space, so we just use object space for both. */
   if (xform) {
     const ShaderData *sd = (const ShaderData *)xform;
-    const KernelGlobalsCPU *kg = sd->osl_globals;
+    const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
     const int object = sd->object;
 
     if (object != OBJECT_NONE) {
@@ -319,7 +319,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
    * a concept of shader space, so we just use object space for both. */
   if (xform) {
     const ShaderData *sd = (const ShaderData *)xform;
-    const KernelGlobalsCPU *kg = sd->osl_globals;
+    const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
     const int object = sd->object;
 
     if (object != OBJECT_NONE) {
@@ -344,7 +344,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
                                    OSLUStringHash from)
 {
   ShaderData *sd = (ShaderData *)(sg->renderstate);
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
 
   if (from == u_ndc) {
     copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -371,7 +371,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
                                            OSLUStringHash to)
 {
   ShaderData *sd = (ShaderData *)(sg->renderstate);
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
 
   if (to == u_ndc) {
     copy_matrix(result, kernel_data.cam.worldtondc);
@@ -727,7 +727,7 @@ static bool set_attribute_matrix(const Transform &tfm, const TypeDesc type, void
   return false;
 }
 
-static bool get_object_attribute(const KernelGlobalsCPU *kg,
+static bool get_object_attribute(const ThreadKernelGlobalsCPU *kg,
                                  ShaderData *sd,
                                  const AttributeDescriptor &desc,
                                  const TypeDesc &type,
@@ -803,7 +803,7 @@ static bool get_object_attribute(const KernelGlobalsCPU *kg,
   return false;
 }
 
-bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg,
+bool OSLRenderServices::get_object_standard_attribute(const ThreadKernelGlobalsCPU *kg,
                                                       ShaderData *sd,
                                                       OSLUStringHash name,
                                                       const TypeDesc type,
@@ -924,7 +924,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
     return set_attribute_float3_3(P, type, derivatives, val);
   }
   if (name == u_geom_name) {
-    const ustring object_name = kg->osl->object_names[sd->object];
+    const ustring object_name = kg->osl.globals->object_names[sd->object];
     return set_attribute_string(object_name, type, derivatives, val);
   }
   if (name == u_is_smooth) {
@@ -979,7 +979,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
   return get_background_attribute(kg, sd, name, type, derivatives, val);
 }
 
-bool OSLRenderServices::get_background_attribute(const KernelGlobalsCPU *kg,
+bool OSLRenderServices::get_background_attribute(const ThreadKernelGlobalsCPU *kg,
                                                  ShaderData *sd,
                                                  OSLUStringHash name,
                                                  const TypeDesc type,
@@ -1038,8 +1038,7 @@ bool OSLRenderServices::get_background_attribute(const KernelGlobalsCPU *kg,
   }
   if (name == u_ndc) {
     /* NDC coordinates with special exception for orthographic projection. */
-    OSLThreadData *tdata = kg->osl_tdata;
-    OSL::ShaderGlobals *globals = &tdata->globals;
+    OSL::ShaderGlobals *globals = &kg->osl.shader_globals;
     float3 ndc[3];
 
     if ((globals->raytype & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
@@ -1090,14 +1089,15 @@ bool OSLRenderServices::get_attribute(ShaderData *sd,
                                       OSLUStringHash name,
                                       void *val)
 {
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
   int object;
 
   /* lookup of attribute on another object */
   if (object_name != u_empty) {
-    const OSLGlobals::ObjectNameMap::iterator it = kg->osl->object_name_map.find(object_name);
+    const OSLGlobals::ObjectNameMap::iterator it = kg->osl.globals->object_name_map.find(
+        object_name);
 
-    if (it == kg->osl->object_name_map.end()) {
+    if (it == kg->osl.globals->object_name_map.end()) {
       return false;
     }
 
@@ -1246,7 +1246,7 @@ bool OSLRenderServices::texture(OSLUStringHash filename,
   OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
   const OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO;
   ShaderData *sd = (ShaderData *)(sg->renderstate);
-  KernelGlobals kernel_globals = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kernel_globals = sd->osl_globals;
   bool status = false;
 
   switch (texture_type) {
@@ -1351,8 +1351,7 @@ bool OSLRenderServices::texture(OSLUStringHash filename,
 
       if (handle && handle->oiio_handle) {
         if (texture_thread_info == nullptr) {
-          OSLThreadData *tdata = kernel_globals->osl_tdata;
-          texture_thread_info = tdata->oiio_thread_info;
+          texture_thread_info = kernel_globals->osl.oiio_thread_info;
         }
 
         status = ts->texture(handle->oiio_handle,
@@ -1460,9 +1459,8 @@ bool OSLRenderServices::texture3d(OSLUStringHash filename,
       if (handle && handle->oiio_handle) {
         if (texture_thread_info == nullptr) {
           ShaderData *sd = (ShaderData *)(sg->renderstate);
-          KernelGlobals kernel_globals = sd->osl_globals;
-          OSLThreadData *tdata = kernel_globals->osl_tdata;
-          texture_thread_info = tdata->oiio_thread_info;
+          const ThreadKernelGlobalsCPU *kernel_globals = sd->osl_globals;
+          texture_thread_info = kernel_globals->osl.oiio_thread_info;
         }
 
         status = ts->texture3d(handle->oiio_handle,
@@ -1546,9 +1544,8 @@ bool OSLRenderServices::environment(OSLUStringHash filename,
   if (handle && handle->oiio_handle) {
     if (thread_info == nullptr) {
       ShaderData *sd = (ShaderData *)(sg->renderstate);
-      KernelGlobals kernel_globals = sd->osl_globals;
-      OSLThreadData *tdata = kernel_globals->osl_tdata;
-      thread_info = tdata->oiio_thread_info;
+      const ThreadKernelGlobalsCPU *kernel_globals = sd->osl_globals;
+      thread_info = kernel_globals->osl.oiio_thread_info;
     }
 
     status = ts->environment(handle->oiio_handle,
@@ -1726,7 +1723,7 @@ bool OSLRenderServices::trace(TraceOpt &options,
   tracedata->hit = false;
   tracedata->sd.osl_globals = sd->osl_globals;
 
-  const KernelGlobalsCPU *kg = sd->osl_globals;
+  const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
 
   /* Can't ray-trace from shaders like displacement, before BVH exists. */
   if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
@@ -1759,7 +1756,7 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg,
       }
 
       ShaderData *sd = &tracedata->sd;
-      const KernelGlobalsCPU *kg = sd->osl_globals;
+      const ThreadKernelGlobalsCPU *kg = sd->osl_globals;
 
       if (!tracedata->setup) {
         /* lazy shader data setup */
diff --git a/intern/cycles/kernel/osl/services.h b/intern/cycles/kernel/osl/services.h
index 0b063d894d0..28e6417d3e0 100644
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -30,7 +30,7 @@ CCL_NAMESPACE_BEGIN
 
 class Scene;
 struct ShaderData;
-struct KernelGlobalsCPU;
+struct ThreadKernelGlobalsCPU;
 
 /* OSL Texture Handle
  *
@@ -276,13 +276,13 @@ class OSLRenderServices : public OSL::RendererServices {
                         void *data) override;
 #endif
 
-  static bool get_background_attribute(const KernelGlobalsCPU *kg,
+  static bool get_background_attribute(const ThreadKernelGlobalsCPU *kg,
                                        ShaderData *sd,
                                        OSLUStringHash name,
                                        const TypeDesc type,
                                        bool derivatives,
                                        void *val);
-  static bool get_object_standard_attribute(const KernelGlobalsCPU *kg,
+  static bool get_object_standard_attribute(const ThreadKernelGlobalsCPU *kg,
                                             ShaderData *sd,
                                             OSLUStringHash name,
                                             const TypeDesc type,
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 5d7064f4987..07c82959aa0 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -1191,7 +1191,7 @@ struct ccl_align(16) ShaderData
 #  ifdef __KERNEL_GPU__
   ccl_private uint8_t *osl_closure_pool;
 #  else
-  const struct KernelGlobalsCPU *osl_globals;
+  const struct ThreadKernelGlobalsCPU *osl_globals;
   const struct IntegratorStateCPU *osl_path_state;
   const struct IntegratorShadowStateCPU *osl_shadow_path_state;
 #  endif
diff --git a/intern/cycles/scene/osl.cpp b/intern/cycles/scene/osl.cpp
index 302f2348179..13f4450a965 100644
--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -141,7 +141,7 @@ void OSLShaderManager::device_update_specific(Device *device,
   /* collect shader groups from all shaders */
   for (Shader *shader : scene->shaders) {
     device->foreach_device([shader, background_shader](Device *sub_device) {
-      OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+      OSLGlobals *og = sub_device->get_cpu_osl_memory();
 
       /* push state to array for lookup */
       og->surface_state.push_back(shader->osl_surface_ref);
@@ -161,7 +161,7 @@ void OSLShaderManager::device_update_specific(Device *device,
 
   /* setup shader engine */
   device->foreach_device([](Device *sub_device) {
-    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    OSLGlobals *og = sub_device->get_cpu_osl_memory();
     OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
 
     og->ss = ss;
@@ -228,7 +228,7 @@ void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s
 
   /* clear shader engine */
   device->foreach_device([](Device *sub_device) {
-    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    OSLGlobals *og = sub_device->get_cpu_osl_memory();
 
     og->use = false;
     og->ss = nullptr;
@@ -712,7 +712,7 @@ void OSLShaderManager::osl_image_slots(Device *device,
 {
   set<OSLRenderServices *> services_shared;
   device->foreach_device([&services_shared](Device *sub_device) {
-    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    OSLGlobals *og = sub_device->get_cpu_osl_memory();
     services_shared.insert(og->services);
   });