Merge remote-tracking branch 'origin/master' into blender2.8
This commit is contained in:
@@ -107,6 +107,67 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
|
||||
|
||||
/* Calculate vertices relative to ray origin. */
|
||||
const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);
|
||||
|
||||
#if defined(__KERNEL_AVX2__)
|
||||
const avxf avxf_P(P.m128, P.m128);
|
||||
|
||||
const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
|
||||
const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
|
||||
|
||||
const avxf AB = tri_ab - avxf_P;
|
||||
const avxf BC = tri_bc - avxf_P;
|
||||
|
||||
const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
|
||||
|
||||
const avxf AB_k = shuffle(AB, permuteMask);
|
||||
const avxf BC_k = shuffle(BC, permuteMask);
|
||||
|
||||
/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
|
||||
const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
|
||||
|
||||
/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
|
||||
const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
|
||||
|
||||
const avxf Sxy(Sy, Sx, Sy, Sx);
|
||||
|
||||
/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
|
||||
const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
|
||||
|
||||
float ABBC_kz_array[8];
|
||||
_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
|
||||
|
||||
const float A_kz = ABBC_kz_array[0];
|
||||
const float B_kz = ABBC_kz_array[2];
|
||||
const float C_kz = ABBC_kz_array[6];
|
||||
|
||||
/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
|
||||
const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
|
||||
|
||||
const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
|
||||
|
||||
/* W U V
|
||||
* (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
|
||||
*/
|
||||
const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
|
||||
|
||||
const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
|
||||
|
||||
/* Calculate scaled barycentric coordinates. */
|
||||
float WUVW_array[4];
|
||||
_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
|
||||
|
||||
const float W = WUVW_array[0];
|
||||
const float U = WUVW_array[1];
|
||||
const float V = WUVW_array[2];
|
||||
|
||||
const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
|
||||
const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
|
||||
_mm256_setzero_ps(), 0));
|
||||
|
||||
if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
|
||||
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
|
||||
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
|
||||
@@ -135,6 +196,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Calculate determinant. */
|
||||
float det = U + V + W;
|
||||
|
||||
@@ -71,6 +71,20 @@ template<typename T> struct texture {
|
||||
return data[index];
|
||||
}
|
||||
|
||||
#ifdef __KERNEL_AVX__
|
||||
/* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
|
||||
* compatibility with existing indicies and data structures.
|
||||
*/
|
||||
ccl_always_inline avxf fetch_avxf(const int index)
|
||||
{
|
||||
kernel_assert(index >= 0 && (index+1) < width);
|
||||
ssef *ssefData = (ssef*)data;
|
||||
ssef *ssefNodeData = &ssefData[index];
|
||||
return _mm256_loadu_ps((float *)ssefNodeData);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __KERNEL_SSE2__
|
||||
ccl_always_inline ssef fetch_ssef(int index)
|
||||
{
|
||||
@@ -506,6 +520,7 @@ typedef texture_image<half4> texture_image_half4;
|
||||
/* Macros to handle different memory storage on different devices */
|
||||
|
||||
#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
|
||||
#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
|
||||
#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
|
||||
#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
|
||||
#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
|
||||
|
||||
@@ -85,16 +85,11 @@ ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, Sha
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifndef __KERNEL_GPU__
|
||||
ccl_device_noinline
|
||||
#else
|
||||
ccl_device_inline
|
||||
#endif
|
||||
float3 subsurface_scatter_eval(ShaderData *sd,
|
||||
ShaderClosure *sc,
|
||||
float disk_r,
|
||||
float r,
|
||||
bool all)
|
||||
ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
|
||||
ShaderClosure *sc,
|
||||
float disk_r,
|
||||
float r,
|
||||
bool all)
|
||||
{
|
||||
#ifdef BSSRDF_MULTI_EVAL
|
||||
/* this is the veach one-sample model with balance heuristic, some pdf
|
||||
@@ -223,14 +218,9 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
|
||||
/* Subsurface scattering step, from a point on the surface to other
|
||||
* nearby points on the same object.
|
||||
*/
|
||||
#ifndef __KERNEL_CUDA__
|
||||
ccl_device
|
||||
#else
|
||||
ccl_device_inline
|
||||
#endif
|
||||
int subsurface_scatter_multi_intersect(
|
||||
ccl_device_inline int subsurface_scatter_multi_intersect(
|
||||
KernelGlobals *kg,
|
||||
SubsurfaceIntersection* ss_isect,
|
||||
SubsurfaceIntersection *ss_isect,
|
||||
ShaderData *sd,
|
||||
ShaderClosure *sc,
|
||||
uint *lcg_state,
|
||||
@@ -330,6 +320,10 @@ int subsurface_scatter_multi_intersect(
|
||||
verts);
|
||||
}
|
||||
#endif /* __OBJECT_MOTION__ */
|
||||
else {
|
||||
ss_isect->weight[hit] = make_float3(0.0f, 0.0f, 0.0f);
|
||||
continue;
|
||||
}
|
||||
|
||||
float3 hit_Ng = ss_isect->Ng[hit];
|
||||
if(ss_isect->hits[hit].object != OBJECT_NONE) {
|
||||
|
||||
@@ -45,6 +45,7 @@
|
||||
# define __KERNEL_AVX__
|
||||
# endif
|
||||
# ifdef __AVX2__
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_AVX2__
|
||||
# endif
|
||||
#endif
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||
# define __KERNEL_SSE__
|
||||
# define __KERNEL_SSE2__
|
||||
# define __KERNEL_SSE3__
|
||||
# define __KERNEL_SSSE3__
|
||||
|
||||
@@ -321,8 +321,8 @@ void ShaderGraph::finalize(Scene *scene,
|
||||
* modified afterwards. */
|
||||
|
||||
if(!finalized) {
|
||||
clean(scene);
|
||||
default_inputs(do_osl);
|
||||
clean(scene);
|
||||
refine_bump_nodes();
|
||||
|
||||
if(do_bump)
|
||||
|
||||
@@ -63,6 +63,7 @@ set(SRC_HEADERS
|
||||
util_sky_model.cpp
|
||||
util_sky_model.h
|
||||
util_sky_model_data.h
|
||||
util_avxf.h
|
||||
util_sseb.h
|
||||
util_ssef.h
|
||||
util_ssei.h
|
||||
|
||||
185
intern/cycles/util/util_avxf.h
Normal file
185
intern/cycles/util/util_avxf.h
Normal file
@@ -0,0 +1,185 @@
|
||||
/*
|
||||
* Copyright 2016 Intel Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0(the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __UTIL_AVXF_H__
|
||||
#define __UTIL_AVXF_H__
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifdef __KERNEL_AVX__
|
||||
struct avxf
|
||||
{
|
||||
typedef avxf Float;
|
||||
|
||||
enum { size = 8 }; /* Number of SIMD elements. */
|
||||
|
||||
union {
|
||||
__m256 m256;
|
||||
float f[8];
|
||||
int i[8];
|
||||
};
|
||||
|
||||
__forceinline avxf () {}
|
||||
__forceinline avxf (const avxf& other) { m256 = other.m256; }
|
||||
__forceinline avxf& operator=(const avxf& other) { m256 = other.m256; return *this; }
|
||||
|
||||
__forceinline avxf(const __m256 a) : m256(a) {}
|
||||
__forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps (a)) {}
|
||||
|
||||
__forceinline operator const __m256&(void) const { return m256; }
|
||||
__forceinline operator __m256&(void) { return m256; }
|
||||
|
||||
__forceinline avxf (float a) : m256(_mm256_set1_ps(a)) {}
|
||||
|
||||
__forceinline avxf(float high32x4, float low32x4) :
|
||||
m256(_mm256_set_ps(high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4)) {}
|
||||
|
||||
__forceinline avxf(float a3, float a2, float a1, float a0) :
|
||||
m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0)) {}
|
||||
|
||||
__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
|
||||
m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
|
||||
|
||||
|
||||
__forceinline avxf(int a3, int a2, int a1, int a0)
|
||||
{
|
||||
const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
|
||||
m256 = _mm256_castsi256_ps(foo);
|
||||
}
|
||||
|
||||
|
||||
__forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
|
||||
{
|
||||
const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
|
||||
m256 = _mm256_castsi256_ps(foo);
|
||||
}
|
||||
|
||||
__forceinline avxf(__m128 a, __m128 b)
|
||||
{
|
||||
const __m256 foo = _mm256_castps128_ps256(a);
|
||||
m256 = _mm256_insertf128_ps(foo, b, 1);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Unary Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline const avxf mm256_sqrt(const avxf& a) { return _mm256_sqrt_ps(a.m256); }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Binary Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline const avxf operator +(const avxf& a, const avxf& b) { return _mm256_add_ps(a.m256, b.m256); }
|
||||
__forceinline const avxf operator +(const avxf& a, const float& b) { return a + avxf(b); }
|
||||
__forceinline const avxf operator +(const float& a, const avxf& b) { return avxf(a) + b; }
|
||||
|
||||
__forceinline const avxf operator -(const avxf& a, const avxf& b) { return _mm256_sub_ps(a.m256, b.m256); }
|
||||
__forceinline const avxf operator -(const avxf& a, const float& b) { return a - avxf(b); }
|
||||
__forceinline const avxf operator -(const float& a, const avxf& b) { return avxf(a) - b; }
|
||||
|
||||
__forceinline const avxf operator *(const avxf& a, const avxf& b) { return _mm256_mul_ps(a.m256, b.m256); }
|
||||
__forceinline const avxf operator *(const avxf& a, const float& b) { return a * avxf(b); }
|
||||
__forceinline const avxf operator *(const float& a, const avxf& b) { return avxf(a) * b; }
|
||||
|
||||
__forceinline const avxf operator /(const avxf& a, const avxf& b) { return _mm256_div_ps(a.m256,b.m256); }
|
||||
__forceinline const avxf operator /(const avxf& a, const float& b) { return a/avxf(b); }
|
||||
__forceinline const avxf operator /(const float& a, const avxf& b) { return avxf(a)/b; }
|
||||
|
||||
__forceinline const avxf operator|(const avxf& a, const avxf& b) { return _mm256_or_ps(a.m256,b.m256); }
|
||||
|
||||
__forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256_xor_ps(a.m256,b.m256); }
|
||||
|
||||
__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Movement/Shifting/Shuffling Functions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__forceinline const avxf shuffle(const avxf& a, const __m256i &shuf) {
|
||||
return _mm256_permutevar_ps(a, shuf);
|
||||
}
|
||||
|
||||
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf shuffle(const avxf& a) {
|
||||
return _mm256_permutevar_ps(a, _mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));
|
||||
}
|
||||
|
||||
template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
|
||||
return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
|
||||
}
|
||||
template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a) {
|
||||
return shuffle<i0,i1,i2,i3>(a,a);
|
||||
}
|
||||
template<size_t i0> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
|
||||
return shuffle<i0,i0,i0,i0>(a, b);
|
||||
}
|
||||
template<size_t i0> __forceinline const avxf shuffle(const avxf& a) {
|
||||
return shuffle<i0>(a,a);
|
||||
}
|
||||
|
||||
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf permute(const avxf& a) {
|
||||
#ifdef __KERNEL_AVX2__
|
||||
return _mm256_permutevar8x32_ps(a,_mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));
|
||||
#else
|
||||
float temp[8];
|
||||
_mm256_storeu_ps((float*)&temp, a);
|
||||
return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<int S0, int S1, int S2, int S3,int S4,int S5,int S6, int S7>
|
||||
ccl_device_inline const avxf set_sign_bit(const avxf &a)
|
||||
{
|
||||
return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31,S2 << 31,S1 << 31,S0 << 31);
|
||||
}
|
||||
|
||||
template<size_t S0, size_t S1, size_t S2, size_t S3,size_t S4,size_t S5,size_t S6, size_t S7>
|
||||
ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
|
||||
{
|
||||
return _mm256_blend_ps(a,b,S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
|
||||
}
|
||||
|
||||
template<size_t S0, size_t S1, size_t S2, size_t S3 >
|
||||
ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
|
||||
{
|
||||
return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// Ternary Operators
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
__forceinline const avxf madd (const avxf& a, const avxf& b, const avxf& c) {
|
||||
#ifdef __KERNEL_AVX2__
|
||||
return _mm256_fmadd_ps(a,b,c);
|
||||
#else
|
||||
return c+(a*b);
|
||||
#endif
|
||||
}
|
||||
|
||||
__forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
|
||||
#ifdef __KERNEL_AVX2__
|
||||
return _mm256_fnmadd_ps(a, b, c);
|
||||
#else
|
||||
return c-(a*b);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
@@ -233,7 +233,7 @@ ccl_device_inline int mod(int x, int m)
|
||||
|
||||
#ifndef __KERNEL_OPENCL__
|
||||
|
||||
ccl_device_inline bool is_zero(const float2 a)
|
||||
ccl_device_inline bool is_zero(const float2& a)
|
||||
{
|
||||
return (a.x == 0.0f && a.y == 0.0f);
|
||||
}
|
||||
@@ -242,7 +242,7 @@ ccl_device_inline bool is_zero(const float2 a)
|
||||
|
||||
#ifndef __KERNEL_OPENCL__
|
||||
|
||||
ccl_device_inline float average(const float2 a)
|
||||
ccl_device_inline float average(const float2& a)
|
||||
{
|
||||
return (a.x + a.y)*(1.0f/2.0f);
|
||||
}
|
||||
@@ -251,58 +251,58 @@ ccl_device_inline float average(const float2 a)
|
||||
|
||||
#ifndef __KERNEL_OPENCL__
|
||||
|
||||
ccl_device_inline float2 operator-(const float2 a)
|
||||
ccl_device_inline float2 operator-(const float2& a)
|
||||
{
|
||||
return make_float2(-a.x, -a.y);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator*(const float2 a, const float2 b)
|
||||
ccl_device_inline float2 operator*(const float2& a, const float2& b)
|
||||
{
|
||||
return make_float2(a.x*b.x, a.y*b.y);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator*(const float2 a, float f)
|
||||
ccl_device_inline float2 operator*(const float2& a, float f)
|
||||
{
|
||||
return make_float2(a.x*f, a.y*f);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator*(float f, const float2 a)
|
||||
ccl_device_inline float2 operator*(float f, const float2& a)
|
||||
{
|
||||
return make_float2(a.x*f, a.y*f);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator/(float f, const float2 a)
|
||||
ccl_device_inline float2 operator/(float f, const float2& a)
|
||||
{
|
||||
return make_float2(f/a.x, f/a.y);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator/(const float2 a, float f)
|
||||
ccl_device_inline float2 operator/(const float2& a, float f)
|
||||
{
|
||||
float invf = 1.0f/f;
|
||||
return make_float2(a.x*invf, a.y*invf);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator/(const float2 a, const float2 b)
|
||||
ccl_device_inline float2 operator/(const float2& a, const float2& b)
|
||||
{
|
||||
return make_float2(a.x/b.x, a.y/b.y);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator+(const float2 a, const float2 b)
|
||||
ccl_device_inline float2 operator+(const float2& a, const float2& b)
|
||||
{
|
||||
return make_float2(a.x+b.x, a.y+b.y);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator-(const float2 a, const float2 b)
|
||||
ccl_device_inline float2 operator-(const float2& a, const float2& b)
|
||||
{
|
||||
return make_float2(a.x-b.x, a.y-b.y);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator+=(float2& a, const float2 b)
|
||||
ccl_device_inline float2 operator+=(float2& a, const float2& b)
|
||||
{
|
||||
return a = a + b;
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator*=(float2& a, const float2 b)
|
||||
ccl_device_inline float2 operator*=(float2& a, const float2& b)
|
||||
{
|
||||
return a = a * b;
|
||||
}
|
||||
@@ -312,7 +312,7 @@ ccl_device_inline float2 operator*=(float2& a, float f)
|
||||
return a = a * f;
|
||||
}
|
||||
|
||||
ccl_device_inline float2 operator/=(float2& a, const float2 b)
|
||||
ccl_device_inline float2 operator/=(float2& a, const float2& b)
|
||||
{
|
||||
return a = a / b;
|
||||
}
|
||||
@@ -324,12 +324,12 @@ ccl_device_inline float2 operator/=(float2& a, float f)
|
||||
}
|
||||
|
||||
|
||||
ccl_device_inline float dot(const float2 a, const float2 b)
|
||||
ccl_device_inline float dot(const float2& a, const float2& b)
|
||||
{
|
||||
return a.x*b.x + a.y*b.y;
|
||||
}
|
||||
|
||||
ccl_device_inline float cross(const float2 a, const float2 b)
|
||||
ccl_device_inline float cross(const float2& a, const float2& b)
|
||||
{
|
||||
return (a.x*b.y - a.y*b.x);
|
||||
}
|
||||
@@ -343,59 +343,59 @@ ccl_device_inline bool operator==(const int2 a, const int2 b)
|
||||
return (a.x == b.x && a.y == b.y);
|
||||
}
|
||||
|
||||
ccl_device_inline float len(const float2 a)
|
||||
ccl_device_inline float len(const float2& a)
|
||||
{
|
||||
return sqrtf(dot(a, a));
|
||||
}
|
||||
|
||||
ccl_device_inline float2 normalize(const float2 a)
|
||||
ccl_device_inline float2 normalize(const float2& a)
|
||||
{
|
||||
return a/len(a);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 normalize_len(const float2 a, float *t)
|
||||
ccl_device_inline float2 normalize_len(const float2& a, float *t)
|
||||
{
|
||||
*t = len(a);
|
||||
return a/(*t);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 safe_normalize(const float2 a)
|
||||
ccl_device_inline float2 safe_normalize(const float2& a)
|
||||
{
|
||||
float t = len(a);
|
||||
return (t != 0.0f)? a/t: a;
|
||||
}
|
||||
|
||||
ccl_device_inline bool operator==(const float2 a, const float2 b)
|
||||
ccl_device_inline bool operator==(const float2& a, const float2& b)
|
||||
{
|
||||
return (a.x == b.x && a.y == b.y);
|
||||
}
|
||||
|
||||
ccl_device_inline bool operator!=(const float2 a, const float2 b)
|
||||
ccl_device_inline bool operator!=(const float2& a, const float2& b)
|
||||
{
|
||||
return !(a == b);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 min(float2 a, float2 b)
|
||||
ccl_device_inline float2 min(const float2& a, const float2& b)
|
||||
{
|
||||
return make_float2(min(a.x, b.x), min(a.y, b.y));
|
||||
}
|
||||
|
||||
ccl_device_inline float2 max(float2 a, float2 b)
|
||||
ccl_device_inline float2 max(const float2& a, const float2& b)
|
||||
{
|
||||
return make_float2(max(a.x, b.x), max(a.y, b.y));
|
||||
}
|
||||
|
||||
ccl_device_inline float2 clamp(float2 a, float2 mn, float2 mx)
|
||||
ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
|
||||
{
|
||||
return min(max(a, mn), mx);
|
||||
}
|
||||
|
||||
ccl_device_inline float2 fabs(float2 a)
|
||||
ccl_device_inline float2 fabs(const float2& a)
|
||||
{
|
||||
return make_float2(fabsf(a.x), fabsf(a.y));
|
||||
}
|
||||
|
||||
ccl_device_inline float2 as_float2(const float4 a)
|
||||
ccl_device_inline float2 as_float2(const float4& a)
|
||||
{
|
||||
return make_float2(a.x, a.y);
|
||||
}
|
||||
@@ -413,7 +413,7 @@ ccl_device_inline void print_float2(const char *label, const float2& a)
|
||||
|
||||
#ifndef __KERNEL_OPENCL__
|
||||
|
||||
ccl_device_inline float2 interp(float2 a, float2 b, float t)
|
||||
ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
|
||||
{
|
||||
return a + t*(b - a);
|
||||
}
|
||||
@@ -424,58 +424,92 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)
|
||||
|
||||
#ifndef __KERNEL_OPENCL__
|
||||
|
||||
ccl_device_inline float3 operator-(const float3 a)
|
||||
ccl_device_inline float3 operator-(const float3& a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
|
||||
#else
|
||||
return make_float3(-a.x, -a.y, -a.z);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator*(const float3 a, const float3 b)
|
||||
ccl_device_inline float3 operator*(const float3& a, const float3& b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return float3(_mm_mul_ps(a.m128,b.m128));
|
||||
#else
|
||||
return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator*(const float3 a, float f)
|
||||
ccl_device_inline float3 operator*(const float3& a, const float f)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
|
||||
#else
|
||||
return make_float3(a.x*f, a.y*f, a.z*f);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator*(float f, const float3 a)
|
||||
ccl_device_inline float3 operator*(const float f, const float3& a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
|
||||
#else
|
||||
return make_float3(a.x*f, a.y*f, a.z*f);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator/(float f, const float3 a)
|
||||
ccl_device_inline float3 operator/(const float f, const float3& a)
|
||||
{
|
||||
return make_float3(f/a.x, f/a.y, f/a.z);
|
||||
#ifdef __KERNEL_SSE__
|
||||
__m128 rc = _mm_rcp_ps(a.m128);
|
||||
return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
|
||||
#else
|
||||
return make_float3(f / a.x, f / a.y, f / a.z);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator/(const float3 a, float f)
|
||||
ccl_device_inline float3 operator/(const float3& a, const float f)
|
||||
{
|
||||
float invf = 1.0f/f;
|
||||
return make_float3(a.x*invf, a.y*invf, a.z*invf);
|
||||
return a * invf;
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator/(const float3 a, const float3 b)
|
||||
ccl_device_inline float3 operator/(const float3& a, const float3& b)
|
||||
{
|
||||
return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
|
||||
#ifdef __KERNEL_SSE__
|
||||
__m128 rc = _mm_rcp_ps(b.m128);
|
||||
return float3(_mm_mul_ps(a, rc));
|
||||
#else
|
||||
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator+(const float3 a, const float3 b)
|
||||
ccl_device_inline float3 operator+(const float3& a, const float3& b)
|
||||
{
|
||||
return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
|
||||
#ifdef __KERNEL_SSE__
|
||||
return float3(_mm_add_ps(a.m128, b.m128));
|
||||
#else
|
||||
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator-(const float3 a, const float3 b)
|
||||
ccl_device_inline float3 operator-(const float3& a, const float3& b)
|
||||
{
|
||||
return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
|
||||
#ifdef __KERNEL_SSE__
|
||||
return float3(_mm_sub_ps(a.m128, b.m128));
|
||||
#else
|
||||
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator+=(float3& a, const float3 b)
|
||||
ccl_device_inline float3 operator+=(float3& a, const float3& b)
|
||||
{
|
||||
return a = a + b;
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator*=(float3& a, const float3 b)
|
||||
ccl_device_inline float3 operator*=(float3& a, const float3& b)
|
||||
{
|
||||
return a = a * b;
|
||||
}
|
||||
@@ -485,7 +519,7 @@ ccl_device_inline float3 operator*=(float3& a, float f)
|
||||
return a = a * f;
|
||||
}
|
||||
|
||||
ccl_device_inline float3 operator/=(float3& a, const float3 b)
|
||||
ccl_device_inline float3 operator/=(float3& a, const float3& b)
|
||||
{
|
||||
return a = a / b;
|
||||
}
|
||||
@@ -496,7 +530,7 @@ ccl_device_inline float3 operator/=(float3& a, float f)
|
||||
return a = a * invf;
|
||||
}
|
||||
|
||||
ccl_device_inline float dot(const float3 a, const float3 b)
|
||||
ccl_device_inline float dot(const float3& a, const float3& b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
|
||||
@@ -505,7 +539,16 @@ ccl_device_inline float dot(const float3 a, const float3 b)
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float dot(const float4 a, const float4 b)
|
||||
ccl_device_inline float dot_xy(const float3& a, const float3& b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
|
||||
#else
|
||||
return a.x*b.x + a.y*b.y;
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float dot(const float4& a, const float4& b)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
|
||||
@@ -514,7 +557,7 @@ ccl_device_inline float dot(const float4 a, const float4 b)
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 cross(const float3 a, const float3 b)
|
||||
ccl_device_inline float3 cross(const float3& a, const float3& b)
|
||||
{
|
||||
float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
|
||||
return r;
|
||||
@@ -538,12 +581,12 @@ ccl_device_inline float len_squared(const float3 a)
|
||||
|
||||
#ifndef __KERNEL_OPENCL__
|
||||
|
||||
ccl_device_inline float len_squared(const float4 a)
|
||||
ccl_device_inline float len_squared(const float4& a)
|
||||
{
|
||||
return dot(a, a);
|
||||
}
|
||||
|
||||
ccl_device_inline float3 normalize(const float3 a)
|
||||
ccl_device_inline float3 normalize(const float3& a)
|
||||
{
|
||||
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
|
||||
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
|
||||
@@ -563,13 +606,14 @@ ccl_device_inline float3 saturate3(float3 a)
|
||||
ccl_device_inline float3 normalize_len(const float3 a, float *t)
|
||||
{
|
||||
*t = len(a);
|
||||
return a/(*t);
|
||||
float x = 1.0f / *t;
|
||||
return a*x;
|
||||
}
|
||||
|
||||
ccl_device_inline float3 safe_normalize(const float3 a)
|
||||
{
|
||||
float t = len(a);
|
||||
return (t != 0.0f)? a/t: a;
|
||||
return (t != 0.0f)? a * (1.0f/t) : a;
|
||||
}
|
||||
|
||||
ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
|
||||
@@ -580,7 +624,7 @@ ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
|
||||
|
||||
#ifndef __KERNEL_OPENCL__
|
||||
|
||||
ccl_device_inline bool operator==(const float3 a, const float3 b)
|
||||
ccl_device_inline bool operator==(const float3& a, const float3& b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
|
||||
@@ -589,12 +633,12 @@ ccl_device_inline bool operator==(const float3 a, const float3 b)
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline bool operator!=(const float3 a, const float3 b)
|
||||
ccl_device_inline bool operator!=(const float3& a, const float3& b)
|
||||
{
|
||||
return !(a == b);
|
||||
}
|
||||
|
||||
ccl_device_inline float3 min(float3 a, float3 b)
|
||||
ccl_device_inline float3 min(const float3& a, const float3& b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return _mm_min_ps(a.m128, b.m128);
|
||||
@@ -603,7 +647,7 @@ ccl_device_inline float3 min(float3 a, float3 b)
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 max(float3 a, float3 b)
|
||||
ccl_device_inline float3 max(const float3& a, const float3& b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return _mm_max_ps(a.m128, b.m128);
|
||||
@@ -612,12 +656,12 @@ ccl_device_inline float3 max(float3 a, float3 b)
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float3 clamp(float3 a, float3 mn, float3 mx)
|
||||
ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
|
||||
{
|
||||
return min(max(a, mn), mx);
|
||||
}
|
||||
|
||||
ccl_device_inline float3 fabs(float3 a)
|
||||
ccl_device_inline float3 fabs(const float3& a)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
|
||||
@@ -670,7 +714,7 @@ ccl_device_inline float3 interp(float3 a, float3 b, float t)
|
||||
|
||||
#ifndef __KERNEL_OPENCL__
|
||||
|
||||
ccl_device_inline float3 mix(float3 a, float3 b, float t)
|
||||
ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
|
||||
{
|
||||
return a + t*(b - a);
|
||||
}
|
||||
@@ -833,7 +877,7 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b)
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline int4 operator>=(float4 a, float4 b)
|
||||
ccl_device_inline int4 operator>=(const float4& a, const float4& b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
|
||||
@@ -851,7 +895,7 @@ ccl_device_inline int4 operator<=(const float4& a, const float4& b)
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline bool operator==(const float4 a, const float4 b)
|
||||
ccl_device_inline bool operator==(const float4& a, const float4& b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
|
||||
@@ -893,23 +937,23 @@ ccl_device_inline float average(const float4& a)
|
||||
return reduce_add(a) * 0.25f;
|
||||
}
|
||||
|
||||
ccl_device_inline float len(const float4 a)
|
||||
ccl_device_inline float len(const float4& a)
|
||||
{
|
||||
return sqrtf(dot(a, a));
|
||||
}
|
||||
|
||||
ccl_device_inline float4 normalize(const float4 a)
|
||||
ccl_device_inline float4 normalize(const float4& a)
|
||||
{
|
||||
return a/len(a);
|
||||
}
|
||||
|
||||
ccl_device_inline float4 safe_normalize(const float4 a)
|
||||
ccl_device_inline float4 safe_normalize(const float4& a)
|
||||
{
|
||||
float t = len(a);
|
||||
return (t != 0.0f)? a/t: a;
|
||||
}
|
||||
|
||||
ccl_device_inline float4 min(float4 a, float4 b)
|
||||
ccl_device_inline float4 min(const float4& a, const float4& b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return _mm_min_ps(a.m128, b.m128);
|
||||
@@ -918,7 +962,7 @@ ccl_device_inline float4 min(float4 a, float4 b)
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device_inline float4 max(float4 a, float4 b)
|
||||
ccl_device_inline float4 max(const float4& a, const float4& b)
|
||||
{
|
||||
#ifdef __KERNEL_SSE__
|
||||
return _mm_max_ps(a.m128, b.m128);
|
||||
@@ -1190,7 +1234,7 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t)
|
||||
|
||||
/* Triangle */
|
||||
|
||||
ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3)
|
||||
ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3)
|
||||
{
|
||||
return len(cross(v3 - v2, v1 - v2))*0.5f;
|
||||
}
|
||||
|
||||
@@ -455,6 +455,7 @@ CCL_NAMESPACE_END
|
||||
#include "util_sseb.h"
|
||||
#include "util_ssei.h"
|
||||
#include "util_ssef.h"
|
||||
#include "util_avxf.h"
|
||||
|
||||
#endif /* __UTIL_SIMD_TYPES_H__ */
|
||||
|
||||
|
||||
@@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 {
|
||||
__forceinline int3(const __m128i a) : m128(a) {}
|
||||
__forceinline operator const __m128i&(void) const { return m128; }
|
||||
__forceinline operator __m128i&(void) { return m128; }
|
||||
|
||||
int3(const int3& a) { m128 = a.m128; }
|
||||
int3& operator =(const int3& a) { m128 = a.m128; return *this; }
|
||||
#else
|
||||
int x, y, z, w;
|
||||
#endif
|
||||
@@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 {
|
||||
__forceinline int4(const __m128i a) : m128(a) {}
|
||||
__forceinline operator const __m128i&(void) const { return m128; }
|
||||
__forceinline operator __m128i&(void) { return m128; }
|
||||
|
||||
int4(const int4& a) : m128(a.m128) {}
|
||||
int4& operator=(const int4& a) { m128 = a.m128; return *this; }
|
||||
#else
|
||||
int x, y, z, w;
|
||||
#endif
|
||||
@@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 {
|
||||
};
|
||||
|
||||
__forceinline float3() {}
|
||||
__forceinline float3(const __m128 a) : m128(a) {}
|
||||
__forceinline float3(const __m128& a) : m128(a) {}
|
||||
__forceinline operator const __m128&(void) const { return m128; }
|
||||
__forceinline operator __m128&(void) { return m128; }
|
||||
|
||||
__forceinline float3(const float3& a) : m128(a.m128) {}
|
||||
__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
|
||||
#else
|
||||
float x, y, z, w;
|
||||
#endif
|
||||
@@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 {
|
||||
__forceinline float4(const __m128 a) : m128(a) {}
|
||||
__forceinline operator const __m128&(void) const { return m128; }
|
||||
__forceinline operator __m128&(void) { return m128; }
|
||||
|
||||
__forceinline float4(const float4& a) : m128(a.m128) {}
|
||||
__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
|
||||
|
||||
#else
|
||||
float x, y, z, w;
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user