Files
test2/intern/cycles/util/math_int4.h
Michael Jones f6c8a78ac6 Cycles: Fix bvh2 gen on Apple Silicon and use it to speed up renders
This patch fixes a correctness issue discovered in the `int4 select(...)` function on Apple Silicon machines, which causes bad bvh2 builds. Although the generated bvh2s give correct renders, the resulting runtime performance is terrible. This fix allows us to switch over to bvh2 on Apple Silicon giving a significant performance uplift for many of the standard benchmarking assets. It also fixes some unit test failures stemming from the use of MetalRT, and trivially enables the new pointcloud primitive.

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D13877
2022-01-20 15:37:49 +00:00

154 lines
4.3 KiB
C

/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __UTIL_MATH_INT4_H__
#define __UTIL_MATH_INT4_H__
/* Refuse direct inclusion: compilation stops unless __UTIL_MATH_H__ has
 * already been defined by the including header.
 * NOTE(review): the check is on __UTIL_MATH_H__ while the message below names
 * util/types.h -- presumably the macro is defined by util/math.h; confirm
 * which header the message should point users at. */
#ifndef __UTIL_MATH_H__
# error "Do not include this file directly, include util/types.h instead."
#endif
CCL_NAMESPACE_BEGIN
/*******************************************************************************
* Declaration.
*/
/* Forward declarations of the int4 helpers; they are only compiled when
 * __KERNEL_GPU__ is not defined. */
#ifndef __KERNEL_GPU__
/* Lane-wise sum of a and b. */
ccl_device_inline int4 operator+(const int4 &a, const int4 &b);
/* Adds b to a in place and returns the updated value by copy. */
ccl_device_inline int4 operator+=(int4 &a, const int4 &b);
/* Shifts every lane of a right by i bits. */
ccl_device_inline int4 operator>>(const int4 &a, int i);
/* Shifts every lane of a left by i bits. */
ccl_device_inline int4 operator<<(const int4 &a, int i);
/* Lane-wise a < b; a lane is non-zero where the comparison holds, zero otherwise. */
ccl_device_inline int4 operator<(const int4 &a, const int4 &b);
/* Lane-wise a >= b; a lane is non-zero where the comparison holds, zero otherwise. */
ccl_device_inline int4 operator>=(const int4 &a, const int4 &b);
/* Lane-wise bitwise AND of a and b. */
ccl_device_inline int4 operator&(const int4 &a, const int4 &b);
/* Lane-wise minimum of a and b. */
ccl_device_inline int4 min(int4 a, int4 b);
/* Lane-wise maximum of a and b. */
ccl_device_inline int4 max(int4 a, int4 b);
/* Lane-wise clamp, computed as min(max(a, mn), mx). */
ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx);
/* Per lane, picks the value from a where mask is set and from b otherwise. */
ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b);
/* NOTE(review): load_int4() is defined further down without a matching
 * forward declaration in this list. */
#endif /* __KERNEL_GPU__ */
/*******************************************************************************
* Definition.
*/
#ifndef __KERNEL_GPU__
/* Lane-wise sum of two int4 vectors. */
ccl_device_inline int4 operator+(const int4 &a, const int4 &b)
{
#  ifdef __KERNEL_SSE__
  /* A single packed 32-bit add handles all four lanes. */
  const __m128i sum = _mm_add_epi32(a.m128, b.m128);
  return int4(sum);
#  else
  const int x = a.x + b.x;
  const int y = a.y + b.y;
  const int z = a.z + b.z;
  const int w = a.w + b.w;
  return make_int4(x, y, z, w);
#  endif
}
/* Adds `b` onto `a` in place. The updated value is returned by copy, not by
 * reference. */
ccl_device_inline int4 operator+=(int4 &a, const int4 &b)
{
  a = a + b;
  return a;
}
/* Shifts each lane of `a` right by `i` bits. */
ccl_device_inline int4 operator>>(const int4 &a, int i)
{
#  ifdef __KERNEL_SSE__
  const __m128i shifted = _mm_srai_epi32(a.m128, i);
  return int4(shifted);
#  else
  const int x = a.x >> i;
  const int y = a.y >> i;
  const int z = a.z >> i;
  const int w = a.w >> i;
  return make_int4(x, y, z, w);
#  endif
}
/* Shifts each lane of `a` left by `i` bits.
 *
 * `i` is expected to be in the range [0, 31]; the scalar path does not guard
 * against larger or negative shift counts. */
ccl_device_inline int4 operator<<(const int4 &a, int i)
{
#  ifdef __KERNEL_SSE__
  return int4(_mm_slli_epi32(a.m128, i));
#  else
  /* Shift the unsigned representation: left-shifting a negative signed value
   * is undefined behavior before C++20, whereas the unsigned shift is always
   * well defined and produces the same bit pattern. */
  const int x = (int)((unsigned int)a.x << i);
  const int y = (int)((unsigned int)a.y << i);
  const int z = (int)((unsigned int)a.z << i);
  const int w = (int)((unsigned int)a.w << i);
  return make_int4(x, y, z, w);
#  endif
}
/* Lane-wise `a < b`.
 *
 * The scalar path stores the plain comparison result (0 or 1) in each lane,
 * while the SSE path stores whatever _mm_cmplt_epi32 produces (documented by
 * Intel as all bits set for lanes where the comparison holds). Only the
 * zero / non-zero distinction is common to both paths. */
ccl_device_inline int4 operator<(const int4 &a, const int4 &b)
{
#  ifdef __KERNEL_SSE__
  const __m128i less_than = _mm_cmplt_epi32(a.m128, b.m128);
  return int4(less_than);
#  else
  const int x = a.x < b.x;
  const int y = a.y < b.y;
  const int z = a.z < b.z;
  const int w = a.w < b.w;
  return make_int4(x, y, z, w);
#  endif
}
/* Lane-wise `a >= b`.
 *
 * As with operator<, the scalar path yields 0 or 1 per lane whereas the SSE
 * path yields a full-lane bit pattern, so only zero / non-zero is common to
 * both paths. */
ccl_device_inline int4 operator>=(const int4 &a, const int4 &b)
{
#  ifdef __KERNEL_SSE__
  /* Computed as the bitwise complement of (a < b): XOR with all bits set. */
  const __m128i less_than = _mm_cmplt_epi32(a.m128, b.m128);
  const __m128i all_bits = _mm_set1_epi32(0xffffffff);
  return int4(_mm_xor_si128(all_bits, less_than));
#  else
  const int x = a.x >= b.x;
  const int y = a.y >= b.y;
  const int z = a.z >= b.z;
  const int w = a.w >= b.w;
  return make_int4(x, y, z, w);
#  endif
}
/* Lane-wise bitwise AND of two int4 vectors. */
ccl_device_inline int4 operator&(const int4 &a, const int4 &b)
{
#  ifdef __KERNEL_SSE__
  const __m128i masked = _mm_and_si128(a.m128, b.m128);
  return int4(masked);
#  else
  const int x = a.x & b.x;
  const int y = a.y & b.y;
  const int z = a.z & b.z;
  const int w = a.w & b.w;
  return make_int4(x, y, z, w);
#  endif
}
/* Lane-wise minimum of two int4 vectors.
 *
 * The packed intrinsic is only used when both __KERNEL_SSE__ and
 * __KERNEL_SSE41__ are defined; otherwise each lane goes through the scalar
 * min(). */
ccl_device_inline int4 min(int4 a, int4 b)
{
#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
  const __m128i smallest = _mm_min_epi32(a.m128, b.m128);
  return int4(smallest);
#  else
  const int x = min(a.x, b.x);
  const int y = min(a.y, b.y);
  const int z = min(a.z, b.z);
  const int w = min(a.w, b.w);
  return make_int4(x, y, z, w);
#  endif
}
/* Lane-wise maximum of two int4 vectors.
 *
 * The packed intrinsic is only used when both __KERNEL_SSE__ and
 * __KERNEL_SSE41__ are defined; otherwise each lane goes through the scalar
 * max(). */
ccl_device_inline int4 max(int4 a, int4 b)
{
#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
  const __m128i largest = _mm_max_epi32(a.m128, b.m128);
  return int4(largest);
#  else
  const int x = max(a.x, b.x);
  const int y = max(a.y, b.y);
  const int z = max(a.z, b.z);
  const int w = max(a.w, b.w);
  return make_int4(x, y, z, w);
#  endif
}
/* Lane-wise clamp of `a`: the lower bound `mn` is applied first, then the
 * upper bound `mx`. */
ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx)
{
  const int4 raised = max(a, mn);
  return min(raised, mx);
}
/* Per-lane choice between `a` and `b`, driven by `mask`.
 *
 * The SSE path blends bit by bit, (mask & a) | (~mask & b), whereas the
 * scalar path picks the whole lane from `a` whenever the mask lane is
 * non-zero. The two agree only when every mask lane is either all zeros or
 * all ones. */
ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b)
{
#  ifdef __KERNEL_SSE__
  const __m128i from_a = _mm_and_si128(mask, a);
  const __m128i from_b = _mm_andnot_si128(mask, b);
  return int4(_mm_or_si128(from_a, from_b));
#  else
  const int x = (mask.x != 0) ? a.x : b.x;
  const int y = (mask.y != 0) ? a.y : b.y;
  const int z = (mask.z != 0) ? a.z : b.z;
  const int w = (mask.w != 0) ? a.w : b.w;
  return make_int4(x, y, z, w);
#  endif
}
/* Builds an int4 from four consecutive ints starting at `v`.
 *
 * `v` must point to at least four readable ints. The SSE path uses the
 * unaligned-load intrinsic. */
ccl_device_inline int4 load_int4(const int *v)
{
#  ifdef __KERNEL_SSE__
  /* Keep the const qualifier in the cast: the intrinsic only reads through
   * the pointer, so there is no reason to cast constness away. */
  return int4(_mm_loadu_si128((const __m128i *)v));
#  else
  return make_int4(v[0], v[1], v[2], v[3]);
#  endif
}
#endif /* __KERNEL_GPU__ */
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_INT4_H__ */