All the OSL matrix functions had been implemented using the `Transform` utility of Cycles, but that is built around a 4x3 matrix, whereas the OSL matrix functions work with 4x4 matrices. As a result, they did not produce results consistent with the CPU implementation. This change fixes that by using the `ProjectionTransform` utility of Cycles instead, which is built around a 4x4 matrix. Since matrix inversion is required, I had to make a few more utility functions that were previously CPU-only available on the GPU (except Metal, due to use of references/pointers without specification). Co-authored-by: Brecht Van Lommel <brecht@blender.org> Pull Request: https://projects.blender.org/blender/blender/pulls/110102
86 lines
2.9 KiB
C
86 lines
2.9 KiB
C
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0 */
|
|
|
|
#pragma once
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
/* Custom cross and dot implementations that match Embree bit for bit.
|
|
* Normally we don't use SSE41/AVX outside the kernel, but for this it's
|
|
* important to match exactly for ray tracing precision. */
|
|
|
|
/* Cross product of two 3D vectors for use by the transform inversion.
 *
 * When both __AVX2__ and __KERNEL_SSE2__ are defined, the product is evaluated
 * with a fused multiply-subtract plus lane shuffles; every other build defers
 * to the generic cross(). */
ccl_device_forceinline float3 transform_inverse_cross(const float3 a_, const float3 b_)
{
#if defined(__AVX2__) && defined(__KERNEL_SSE2__)
  /* Reinterpret the operands as 4-wide SSE registers. */
  const __m128 va = (const __m128 &)a_;
  const __m128 vb = (const __m128 &)b_;

  /* Rotate the lanes of each operand from (x, y, z, w) to (y, z, x, w). */
  const __m128 va_yzx = _mm_castsi128_ps(
      _mm_shuffle_epi32(_mm_castps_si128(va), _MM_SHUFFLE(3, 0, 2, 1)));
  const __m128 vb_yzx = _mm_castsi128_ps(
      _mm_shuffle_epi32(_mm_castps_si128(vb), _MM_SHUFFLE(3, 0, 2, 1)));

  /* a * b.yzx - a.yzx * b holds the cross product with its lanes ordered
   * (z, x, y), so one more rotation restores (x, y, z). */
  const __m128 rotated = _mm_fmsub_ps(va, vb_yzx, _mm_mul_ps(va_yzx, vb));
  const __m128 result = _mm_castsi128_ps(
      _mm_shuffle_epi32(_mm_castps_si128(rotated), _MM_SHUFFLE(3, 0, 2, 1)));

  return (const float3 &)result;
#else
  return cross(a_, b_);
#endif
}
|
|
|
|
/* Dot product of two 3D vectors for use by the transform inversion.
 *
 * When both __KERNEL_SSE__ and __KERNEL_SSE42__ are defined, the SSE4.1
 * dot-product instruction is used; every other build defers to the generic
 * dot(). */
ccl_device_forceinline float transform_inverse_dot(const float3 a_, const float3 b_)
{
#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE42__)
  /* Reinterpret the operands as 4-wide SSE registers. */
  const __m128 va = (const __m128 &)a_;
  const __m128 vb = (const __m128 &)b_;

  /* Mask 0x7F: the high nibble (0111) multiplies lanes 0-2 only, leaving the
   * fourth lane out of the sum; the low nibble (1111) writes the sum to every
   * output lane. */
  const __m128 sum = _mm_dp_ps(va, vb, 0x7F);

  return _mm_cvtss_f32(sum);
#else
  return dot(a_, b_);
#endif
}
|
|
|
|
/* Compute the inverse of a transform.
 *
 * The 3x3 part formed by the x/y/z components of the three rows is inverted
 * as adjoint / determinant, and the w components (translation) are mapped to
 * -inverse_3x3 * translation.
 *
 * A zero determinant is handled by nudging the diagonal by 1e-8 and, if that
 * still yields zero, by substituting FLT_MAX as determinant so no division by
 * zero takes place.
 *
 * \param tfm: Transform to invert.
 * \return The inverted transform. */
ccl_device_forceinline Transform transform_inverse_impl(const Transform tfm)
{
  /* This implementation matches the one in Embree exactly, to ensure consistent
   * results with the ray intersection of instances. */

  /* Gather the matrix columns: x, y, z span the 3x3 part, w is the translation. */
  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);

  /* cross(y, z) is needed both for the determinant and as the first adjoint
   * row. Computing it once guarantees both are derived from the same value. */
  float3 cross_yz = transform_inverse_cross(y, z);

  /* Compute determinant. */
  float det = transform_inverse_dot(x, cross_yz);

  if (det == 0.0f) {
    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
     * never be in this situation, but try to invert it anyway with tweak.
     *
     * This logic does not match Embree which would just give an invalid
     * matrix. A better solution would be to remove this and ensure any object
     * matrix is valid. */
    x.x += 1e-8f;
    y.y += 1e-8f;
    z.z += 1e-8f;

    /* Recompute with the same cross product implementation that produces the
     * adjoint rows below, so the determinant stays consistent with them. */
    cross_yz = transform_inverse_cross(y, z);
    det = transform_inverse_dot(x, cross_yz);
    if (det == 0.0f) {
      /* Still singular: use a huge determinant to avoid dividing by zero. */
      det = FLT_MAX;
    }
  }

  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
  const float3 inverse_x = cross_yz / det;
  const float3 inverse_y = transform_inverse_cross(z, x) / det;
  const float3 inverse_z = transform_inverse_cross(x, y) / det;

  /* Compute translation and fill transform. */
  Transform itfm;
  itfm.x = float3_to_float4(inverse_x, -transform_inverse_dot(inverse_x, w));
  itfm.y = float3_to_float4(inverse_y, -transform_inverse_dot(inverse_y, w));
  itfm.z = float3_to_float4(inverse_z, -transform_inverse_dot(inverse_z, w));

  return itfm;
}
|
|
|
|
CCL_NAMESPACE_END
|