EEVEE Next: Optimize HiZ with fast image load store routines

Authored by Apple: Michael Parkin-White

Pull Request: https://projects.blender.org/blender/blender/pulls/116953
This commit is contained in:
Jason Fielder
2024-01-24 09:36:25 +01:00
committed by Clément Foucault
parent d16d2bbd3a
commit 190567f941
3 changed files with 86 additions and 11 deletions

View File

@@ -53,16 +53,16 @@ void main()
#endif
if (update_mip_0) {
imageStore(out_mip_0, src_px + ivec2(0, 1), samp.xxxx);
imageStore(out_mip_0, src_px + ivec2(1, 1), samp.yyyy);
imageStore(out_mip_0, src_px + ivec2(1, 0), samp.zzzz);
imageStore(out_mip_0, src_px + ivec2(0, 0), samp.wwww);
imageStoreFast(out_mip_0, src_px + ivec2(0, 1), samp.xxxx);
imageStoreFast(out_mip_0, src_px + ivec2(1, 1), samp.yyyy);
imageStoreFast(out_mip_0, src_px + ivec2(1, 0), samp.zzzz);
imageStoreFast(out_mip_0, src_px + ivec2(0, 0), samp.wwww);
}
/* Level 1. (No load) */
float max_depth = reduce_max(samp);
ivec2 dst_px = ivec2(kernel_origin + local_px);
imageStore(out_mip_1, dst_px, vec4(max_depth));
imageStoreFast(out_mip_1, dst_px, vec4(max_depth));
store_local_depth(local_px, max_depth);
/* Level 2-5. */
@@ -75,7 +75,7 @@ void main()
if (active_thread) { \
max_depth = reduce_max(load_local_depths(local_px)); \
dst_px = ivec2((kernel_origin >> mask_shift) + local_px); \
imageStore(out_mip__, dst_px, vec4(max_depth)); \
imageStoreFast(out_mip__, dst_px, vec4(max_depth)); \
} \
barrier(); /* Wait for previous reads to finish. */ \
if (active_thread) { \
@@ -105,14 +105,14 @@ void main()
kernel_origin = ivec2(gl_WorkGroupSize.xy) * ivec2(x, y);
src_px = ivec2(kernel_origin + local_px) * 2;
vec4 samp;
samp.x = imageLoad(out_mip_5, min(src_px + ivec2(0, 1), image_border)).x;
samp.y = imageLoad(out_mip_5, min(src_px + ivec2(1, 1), image_border)).x;
samp.z = imageLoad(out_mip_5, min(src_px + ivec2(1, 0), image_border)).x;
samp.w = imageLoad(out_mip_5, min(src_px + ivec2(0, 0), image_border)).x;
samp.x = imageLoadFast(out_mip_5, min(src_px + ivec2(0, 1), image_border)).x;
samp.y = imageLoadFast(out_mip_5, min(src_px + ivec2(1, 1), image_border)).x;
samp.z = imageLoadFast(out_mip_5, min(src_px + ivec2(1, 0), image_border)).x;
samp.w = imageLoadFast(out_mip_5, min(src_px + ivec2(0, 0), image_border)).x;
/* Level 6. */
float max_depth = reduce_max(samp);
ivec2 dst_px = ivec2(kernel_origin + local_px);
imageStore(out_mip_6, dst_px, vec4(max_depth));
imageStoreFast(out_mip_6, dst_px, vec4(max_depth));
store_local_depth(local_px, max_depth);
mask_shift = 1;

View File

@@ -359,6 +359,7 @@ struct SStruct {
#define texelFetchOffset(__tex, __texel, __lod, __offset) \
_texelFetch_internal(__tex, __texel, __lod, __offset)
#define imageLoad(__image, __coord) _texelFetch_internal(__image, __coord, 0)
/* Fast load variant: forwards to the `_texelFetch_internal_fast` overloads, always with LOD 0. */
#define imageLoadFast(__image, __coord) _texelFetch_internal_fast(__image, __coord, 0)
#define texture2(__tex, __uv) _texture_internal_samp(__tex, __uv)
#define texture3(__tex, __uv, _bias) _texture_internal_bias(__tex, __uv, bias(float(_bias)))
#define textureLod(__tex, __uv, __lod) _texture_internal_level(__tex, __uv, level(float(__lod)))
@@ -497,6 +498,14 @@ inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A
}
}
/* Fast texel fetch for 1D textures: the coordinate is converted to `uint` and read directly,
 * with no coordinate validation done in this function.
 * `lod` is accepted so callers can pass it, but it is not used here. */
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_1d<S, A> tex,
                                           T texel,
                                           uint lod = 0)
{
  const uint coord = uint(texel);
  return tex.texture->read(coord);
}
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal(
const thread _mtl_combined_image_sampler_buffer<S, access::read> tex, T texel, uint lod = 0)
@@ -510,6 +519,13 @@ inline vec<S, 4> _texelFetch_internal(
}
}
/* Fast texel fetch for read-only buffer textures: reads the element at `texel` directly,
 * with no index validation done in this function. `lod` is not used here. */
template<typename S, typename T>
inline vec<S, 4> _texelFetch_internal_fast(
    const thread _mtl_combined_image_sampler_buffer<S, access::read> tex, T texel, uint lod = 0)
{
  const uint index = uint(texel);
  return tex.texture->read(index);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
T texel,
@@ -526,6 +542,16 @@ inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A
}
}
/* Fast texel fetch for 1D textures with an explicit texel offset.
 * The offset is added to the coordinate before the direct read; `lod` is not used here. */
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_1d<S, A> tex,
                                           T texel,
                                           uint lod,
                                           T offset)
{
  const uint coord = uint(texel + offset);
  /* LODs not supported for 1d textures. This must be zero. */
  return tex.texture->read(coord, 0);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
vec<T, 2> texel,
@@ -546,6 +572,16 @@ inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d_arra
}
}
/* Fast texel fetch for 1D array textures.
 * `texel.x` (plus `offset.x`) is the texel coordinate and `texel.y` (plus `offset.y`) is passed
 * as the second read argument. `lod` is not used here. */
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_1d_array<S, A> tex,
                                           vec<T, 2> texel,
                                           uint lod,
                                           vec<T, 2> offset = vec<T, 2>(0, 0))
{
  const uint coord = uint(texel.x + offset.x);
  const uint layer = uint(texel.y + offset.y);
  /* LODs not supported for 1d textures. This must be zero. */
  return tex.texture->read(coord, layer, 0);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d<S, A> tex,
vec<T, 2> texel,
@@ -565,6 +601,15 @@ inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d<S, A
}
}
/* Fast texel fetch for 2D textures: reads `texel + offset` at mip level `lod` directly,
 * with no coordinate validation done in this function. */
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_2d<S, A> tex,
                                           vec<T, 2> texel,
                                           uint lod,
                                           vec<T, 2> offset = vec<T, 2>(0))
{
  const uint2 coord = uint2(texel + offset);
  return tex.texture->read(coord, lod);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
vec<T, 3> texel,
@@ -584,6 +629,15 @@ inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d_arra
}
}
/* Fast texel fetch for 2D array textures.
 * The `xy` components (plus offset) are the texel coordinate and the `z` component (plus offset)
 * is passed as the second read argument; the read uses mip level `lod`. No coordinate validation
 * is done in this function. */
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
                                           vec<T, 3> texel,
                                           uint lod,
                                           vec<T, 3> offset = vec<T, 3>(0))
{
  const uint2 coord = uint2(texel.xy + offset.xy);
  const uint layer = uint(texel.z + offset.z);
  return tex.texture->read(coord, layer, lod);
}
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_3d<S, A> tex,
vec<T, 3> texel,
@@ -604,6 +658,15 @@ inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_3d<S, A
}
}
/* Fast texel fetch for 3D textures: reads `texel + offset` at mip level `lod` directly,
 * with no coordinate validation done in this function. */
template<typename S, typename T, access A>
inline vec<S, 4> _texelFetch_internal_fast(thread _mtl_combined_image_sampler_3d<S, A> tex,
                                           vec<T, 3> texel,
                                           uint lod,
                                           vec<T, 3> offset = vec<T, 3>(0))
{
  const uint3 coord = uint3(texel + offset);
  return tex.texture->read(coord, lod);
}
template<typename T, access A>
inline _msl_return_float _texelFetch_internal(
thread _mtl_combined_image_sampler_depth_2d<float, A> tex,
@@ -626,6 +689,17 @@ inline _msl_return_float _texelFetch_internal(
}
}
/* Fast texel fetch for 2D depth textures: reads `texel + offset` at mip level `lod` directly
 * and wraps the value in `_msl_return_float`. No coordinate validation is done in this
 * function. */
template<typename T, access A>
inline _msl_return_float _texelFetch_internal_fast(
    thread _mtl_combined_image_sampler_depth_2d<float, A> tex,
    vec<T, 2> texel,
    uint lod,
    vec<T, 2> offset = vec<T, 2>(0))
{
  const uint2 coord = uint2(texel + offset);
  _msl_return_float result = {tex.texture->read(coord, lod)};
  return result;
}
template<typename S, typename T, access A>
inline vec<S, 4> _texture_internal_samp(thread _mtl_combined_image_sampler_2d_array<S, A> tex,
vec<T, 3> texel,

View File

@@ -12,6 +12,7 @@
/* Fast load/store variant macros. In GLSL these are the same as imageLoad/imageStore, but the
 * fast names assume no bounds checking. */
#define imageStoreFast imageStore
#define imageLoadFast imageLoad
/* Texture format tokens -- Type explicitness required by other Graphics APIs. */
#define depth2D sampler2D