Files
test2/intern/cycles/util/transform_inverse.h
Patrick Mours d0dd587b60 Fix #108372: GPU implementation of OSL matrix intrinsic functions
All the OSL matrix functions had been implemented using the
`Transform` utility of Cycles, but that's built around a 4x3 matrix,
when the OSL matrix functions are working with 4x4 matrices.
This resulted in them not producing results consistent with the
CPU implementation.

This fixes that by making use of the `ProjectionTransform` utility
of Cycles instead, because it's built around a 4x4 matrix. Since
matrix inversion is required, I had to make a few more utility
functions available on the GPU (except Metal, due to use of
references/pointers without specification) that were previously
CPU-only.

Co-authored-by: Brecht Van Lommel <brecht@blender.org>

Pull Request: https://projects.blender.org/blender/blender/pulls/110102
2024-11-04 17:59:29 +01:00

86 lines
2.9 KiB
C

/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#pragma once
CCL_NAMESPACE_BEGIN
/* Custom cross and dot implementations that match Embree bit for bit.
* Normally we don't use SSE41/AVX outside the kernel, but for this it's
* important to match exactly for ray tracing precision. */
ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const float3 b_)
{
#if defined(__AVX2__) && defined(__KERNEL_SSE2__)
const __m128 a = (const __m128 &)a_;
const __m128 b = (const __m128 &)b_;
const __m128 a_shuffle = _mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(3, 0, 2, 1)));
const __m128 b_shuffle = _mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(3, 0, 2, 1)));
const __m128 r = _mm_castsi128_ps(
_mm_shuffle_epi32(_mm_castps_si128(_mm_fmsub_ps(a, b_shuffle, _mm_mul_ps(a_shuffle, b))),
_MM_SHUFFLE(3, 0, 2, 1)));
return (const float3 &)r;
#endif
return cross(a_, b_);
}
ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
const __m128 a = (const __m128 &)a_;
const __m128 b = (const __m128 &)b_;
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
#endif
return dot(a_, b_);
}
ccl_device_forceinline Transform transform_inverse_impl(const Transform tfm)
{
/* This implementation matches the one in Embree exactly, to ensure consistent
* results with the ray intersection of instances. */
float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
/* Compute determinant. */
float det = transform_inverse_dot(x, transform_inverse_cross(y, z));
if (det == 0.0f) {
/* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
* never be in this situation, but try to invert it anyway with tweak.
*
* This logic does not match Embree which would just give an invalid
* matrix. A better solution would be to remove this and ensure any object
* matrix is valid. */
x.x += 1e-8f;
y.y += 1e-8f;
z.z += 1e-8f;
det = transform_inverse_dot(x, cross(y, z));
if (det == 0.0f) {
det = FLT_MAX;
}
}
/* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
const float3 inverse_x = transform_inverse_cross(y, z) / det;
const float3 inverse_y = transform_inverse_cross(z, x) / det;
const float3 inverse_z = transform_inverse_cross(x, y) / det;
/* Compute translation and fill transform. */
Transform itfm;
itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));
return itfm;
}
CCL_NAMESPACE_END