Merge remote-tracking branch 'origin/master' into blender2.8

This commit is contained in:
Dalai Felinto
2016-10-13 16:42:54 +00:00
31 changed files with 528 additions and 153 deletions

View File

@@ -107,6 +107,67 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
/* Calculate vertices relative to ray origin. */
const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);
#if defined(__KERNEL_AVX2__)
const avxf avxf_P(P.m128, P.m128);
const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
const avxf AB = tri_ab - avxf_P;
const avxf BC = tri_bc - avxf_P;
const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
const avxf AB_k = shuffle(AB, permuteMask);
const avxf BC_k = shuffle(BC, permuteMask);
/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
const avxf Sxy(Sy, Sx, Sy, Sx);
/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
float ABBC_kz_array[8];
_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
const float A_kz = ABBC_kz_array[0];
const float B_kz = ABBC_kz_array[2];
const float C_kz = ABBC_kz_array[6];
/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
/* W U V
* (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
*/
const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
/* Calculate scaled barycentric coordinates. */
float WUVW_array[4];
_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
const float W = WUVW_array[0];
const float U = WUVW_array[1];
const float V = WUVW_array[2];
const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
_mm256_setzero_ps(), 0));
if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
return false;
}
#else
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
@@ -135,6 +196,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
{
return false;
}
#endif
/* Calculate determinant. */
float det = U + V + W;

View File

@@ -71,6 +71,20 @@ template<typename T> struct texture {
return data[index];
}
#ifdef __KERNEL_AVX__
/* Fetch 256 bits (one avxf, i.e. two consecutive 128-bit entries) starting
 * at 128-bit entry `index`.  Indexing stays in 128-bit (ssef) blocks to
 * remain compatible with the existing indices and data structures.
 * An unaligned load is used because an odd index is only 16-byte aligned.
 */
ccl_always_inline avxf fetch_avxf(const int index)
{
kernel_assert(index >= 0 && (index+1) < width);
ssef *ssefData = (ssef*)data;
ssef *ssefNodeData = &ssefData[index];
return _mm256_loadu_ps((float *)ssefNodeData);
}
#endif
#ifdef __KERNEL_SSE2__
ccl_always_inline ssef fetch_ssef(int index)
{
@@ -506,6 +520,7 @@ typedef texture_image<half4> texture_image_half4;
/* Macros to handle different memory storage on different devices */
#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))

View File

@@ -85,16 +85,11 @@ ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, Sha
return NULL;
}
#ifndef __KERNEL_GPU__
ccl_device_noinline
#else
ccl_device_inline
#endif
float3 subsurface_scatter_eval(ShaderData *sd,
ShaderClosure *sc,
float disk_r,
float r,
bool all)
ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
ShaderClosure *sc,
float disk_r,
float r,
bool all)
{
#ifdef BSSRDF_MULTI_EVAL
/* this is the veach one-sample model with balance heuristic, some pdf
@@ -223,14 +218,9 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
/* Subsurface scattering step, from a point on the surface to other
* nearby points on the same object.
*/
#ifndef __KERNEL_CUDA__
ccl_device
#else
ccl_device_inline
#endif
int subsurface_scatter_multi_intersect(
ccl_device_inline int subsurface_scatter_multi_intersect(
KernelGlobals *kg,
SubsurfaceIntersection* ss_isect,
SubsurfaceIntersection *ss_isect,
ShaderData *sd,
ShaderClosure *sc,
uint *lcg_state,
@@ -330,6 +320,10 @@ int subsurface_scatter_multi_intersect(
verts);
}
#endif /* __OBJECT_MOTION__ */
else {
ss_isect->weight[hit] = make_float3(0.0f, 0.0f, 0.0f);
continue;
}
float3 hit_Ng = ss_isect->Ng[hit];
if(ss_isect->hits[hit].object != OBJECT_NONE) {

View File

@@ -45,6 +45,7 @@
# define __KERNEL_AVX__
# endif
# ifdef __AVX2__
# define __KERNEL_SSE__
# define __KERNEL_AVX2__
# endif
#endif

View File

@@ -20,6 +20,7 @@
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__

View File

@@ -321,8 +321,8 @@ void ShaderGraph::finalize(Scene *scene,
* modified afterwards. */
if(!finalized) {
clean(scene);
default_inputs(do_osl);
clean(scene);
refine_bump_nodes();
if(do_bump)

View File

@@ -63,6 +63,7 @@ set(SRC_HEADERS
util_sky_model.cpp
util_sky_model.h
util_sky_model_data.h
util_avxf.h
util_sseb.h
util_ssef.h
util_ssei.h

View File

@@ -0,0 +1,185 @@
/*
* Copyright 2016 Intel Corporation
*
* Licensed under the Apache License, Version 2.0(the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __UTIL_AVXF_H__
#define __UTIL_AVXF_H__
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_AVX__
/* 8-wide single-precision float vector backed by a 256-bit AVX register.
 * Element 0 is the least significant lane throughout. */
struct avxf
{
typedef avxf Float;
enum { size = 8 }; /* Number of SIMD elements. */
union {
__m256 m256; /* SIMD register view. */
float f[8];  /* Per-element float access. */
int i[8];    /* Per-element integer (bit pattern) access. */
};
__forceinline avxf () {}
__forceinline avxf (const avxf& other) { m256 = other.m256; }
__forceinline avxf& operator=(const avxf& other) { m256 = other.m256; return *this; }
__forceinline avxf(const __m256 a) : m256(a) {}
/* Reinterpret an integer register as float bits (no conversion). */
__forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps (a)) {}
__forceinline operator const __m256&(void) const { return m256; }
__forceinline operator __m256&(void) { return m256; }
/* Broadcast a single scalar to all 8 elements. */
__forceinline avxf (float a) : m256(_mm256_set1_ps(a)) {}
/* Broadcast low32x4 into elements 0-3 and high32x4 into elements 4-7. */
__forceinline avxf(float high32x4, float low32x4) :
m256(_mm256_set_ps(high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4)) {}
/* Same 4 values (a0 lowest) replicated into both 128-bit halves. */
__forceinline avxf(float a3, float a2, float a1, float a0) :
m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0)) {}
/* Full 8-element initialization; a0 is element 0, a7 element 7. */
__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
/* Integer bit patterns (e.g. sign masks), 4 values replicated to both halves. */
__forceinline avxf(int a3, int a2, int a1, int a0)
{
const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
m256 = _mm256_castsi256_ps(foo);
}
/* Integer bit patterns for all 8 elements; a0 is element 0. */
__forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
{
const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
m256 = _mm256_castsi256_ps(foo);
}
/* Concatenate two 128-bit halves: a becomes the low half, b the high half. */
__forceinline avxf(__m128 a, __m128 b)
{
const __m256 foo = _mm256_castps128_ps256(a);
m256 = _mm256_insertf128_ps(foo, b, 1);
}
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
/* Component-wise square root. */
__forceinline const avxf mm256_sqrt(const avxf& a) { return _mm256_sqrt_ps(a.m256); }
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
/* Component-wise arithmetic; scalar overloads broadcast the scalar first. */
__forceinline const avxf operator +(const avxf& a, const avxf& b) { return _mm256_add_ps(a.m256, b.m256); }
__forceinline const avxf operator +(const avxf& a, const float& b) { return a + avxf(b); }
__forceinline const avxf operator +(const float& a, const avxf& b) { return avxf(a) + b; }
__forceinline const avxf operator -(const avxf& a, const avxf& b) { return _mm256_sub_ps(a.m256, b.m256); }
__forceinline const avxf operator -(const avxf& a, const float& b) { return a - avxf(b); }
__forceinline const avxf operator -(const float& a, const avxf& b) { return avxf(a) - b; }
__forceinline const avxf operator *(const avxf& a, const avxf& b) { return _mm256_mul_ps(a.m256, b.m256); }
__forceinline const avxf operator *(const avxf& a, const float& b) { return a * avxf(b); }
__forceinline const avxf operator *(const float& a, const avxf& b) { return avxf(a) * b; }
__forceinline const avxf operator /(const avxf& a, const avxf& b) { return _mm256_div_ps(a.m256,b.m256); }
__forceinline const avxf operator /(const avxf& a, const float& b) { return a/avxf(b); }
__forceinline const avxf operator /(const float& a, const avxf& b) { return avxf(a)/b; }
/* Component-wise bitwise operations on the raw float bit patterns. */
__forceinline const avxf operator|(const avxf& a, const avxf& b) { return _mm256_or_ps(a.m256,b.m256); }
__forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256_xor_ps(a.m256,b.m256); }
__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
/* Variable permute: each control element's low 2 bits select a source
 * element from within its own 128-bit lane (_mm256_permutevar_ps
 * semantics); this is NOT a cross-lane shuffle. */
__forceinline const avxf shuffle(const avxf& a, const __m256i &shuf) {
return _mm256_permutevar_ps(a, shuf);
}
/* NOTE(review): _mm256_permutevar_ps operates per 128-bit lane, so i0..i3
 * select within the low half and i4..i7 within the high half (indices
 * taken mod 4).  Despite the 8-index signature this is not an arbitrary
 * 8-way reorder; use permute<>() for true cross-lane permutation. */
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf shuffle(const avxf& a) {
return _mm256_permutevar_ps(a, _mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));
}
/* In each 128-bit lane: elements 0,1 are taken from a (indices i0,i1) and
 * elements 2,3 from b (indices i2,i3). */
template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}
template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a) {
return shuffle<i0,i1,i2,i3>(a,a);
}
/* Broadcast element i0 of each 128-bit lane: per lane, elements 0,1 come
 * from a[i0] and elements 2,3 from b[i0]. */
template<size_t i0> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
return shuffle<i0,i0,i0,i0>(a, b);
}
template<size_t i0> __forceinline const avxf shuffle(const avxf& a) {
return shuffle<i0>(a,a);
}
/* True 8-wide cross-lane permutation: result element k = a[ik].
 * Uses AVX2 _mm256_permutevar8x32_ps when available; otherwise falls back
 * to spilling to memory and rebuilding the vector element by element. */
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf permute(const avxf& a) {
#ifdef __KERNEL_AVX2__
return _mm256_permutevar8x32_ps(a,_mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));
#else
float temp[8];
_mm256_storeu_ps((float*)&temp, a);
/* avxf(a7..a0) ctor: temp[i7] lands in element 7, temp[i0] in element 0. */
return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
#endif
}
/* Flip the sign bit of element k when the corresponding template argument
 * is non-zero; S0 controls element 0 up to S7 for element 7.
 * The shifts are performed on unsigned values: `1 << 31` on a signed int
 * overflows and is undefined behavior in C++, whereas `1u << 31` is the
 * well-defined sign-bit pattern 0x80000000. */
template<int S0, int S1, int S2, int S3,int S4,int S5,int S6, int S7>
ccl_device_inline const avxf set_sign_bit(const avxf &a)
{
	return a ^ avxf((int)((unsigned int)S7 << 31),
	                (int)((unsigned int)S6 << 31),
	                (int)((unsigned int)S5 << 31),
	                (int)((unsigned int)S4 << 31),
	                (int)((unsigned int)S3 << 31),
	                (int)((unsigned int)S2 << 31),
	                (int)((unsigned int)S1 << 31),
	                (int)((unsigned int)S0 << 31));
}
/* Per-element select: element k is taken from b when its mask bit is set,
 * otherwise from a (_mm256_blend_ps imm8 bit k selects element k from b).
 * Template arguments are consumed most-significant element first: S7 maps
 * to blend bit 0 (element 0) and S0 to bit 7 (element 7).
 * NOTE(review): this is the reverse ordering of set_sign_bit(), where S0
 * maps to element 0 — confirm which convention each caller expects. */
template<size_t S0, size_t S1, size_t S2, size_t S3,size_t S4,size_t S5,size_t S6, size_t S7>
ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
{
return _mm256_blend_ps(a,b,S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
}
/* 4-argument convenience form: the same selection pattern is applied to
 * both 128-bit halves. */
template<size_t S0, size_t S1, size_t S2, size_t S3 >
ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
{
return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
}
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
/* a*b + c, as a fused multiply-add on AVX2. */
__forceinline const avxf madd (const avxf& a, const avxf& b, const avxf& c) {
#ifdef __KERNEL_AVX2__
return _mm256_fmadd_ps(a,b,c);
#else
/* Non-FMA fallback; may differ from the fused path in the last ulp
 * because the intermediate product is rounded separately. */
return c+(a*b);
#endif
}
/* c - a*b (negated multiply-add), fused on AVX2. */
__forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
#ifdef __KERNEL_AVX2__
return _mm256_fnmadd_ps(a, b, c);
#else
return c-(a*b);
#endif
}
#endif
CCL_NAMESPACE_END
#endif

View File

@@ -233,7 +233,7 @@ ccl_device_inline int mod(int x, int m)
#ifndef __KERNEL_OPENCL__
ccl_device_inline bool is_zero(const float2 a)
ccl_device_inline bool is_zero(const float2& a)
{
return (a.x == 0.0f && a.y == 0.0f);
}
@@ -242,7 +242,7 @@ ccl_device_inline bool is_zero(const float2 a)
#ifndef __KERNEL_OPENCL__
ccl_device_inline float average(const float2 a)
ccl_device_inline float average(const float2& a)
{
return (a.x + a.y)*(1.0f/2.0f);
}
@@ -251,58 +251,58 @@ ccl_device_inline float average(const float2 a)
#ifndef __KERNEL_OPENCL__
ccl_device_inline float2 operator-(const float2 a)
ccl_device_inline float2 operator-(const float2& a)
{
return make_float2(-a.x, -a.y);
}
ccl_device_inline float2 operator*(const float2 a, const float2 b)
ccl_device_inline float2 operator*(const float2& a, const float2& b)
{
return make_float2(a.x*b.x, a.y*b.y);
}
ccl_device_inline float2 operator*(const float2 a, float f)
ccl_device_inline float2 operator*(const float2& a, float f)
{
return make_float2(a.x*f, a.y*f);
}
ccl_device_inline float2 operator*(float f, const float2 a)
ccl_device_inline float2 operator*(float f, const float2& a)
{
return make_float2(a.x*f, a.y*f);
}
ccl_device_inline float2 operator/(float f, const float2 a)
ccl_device_inline float2 operator/(float f, const float2& a)
{
return make_float2(f/a.x, f/a.y);
}
ccl_device_inline float2 operator/(const float2 a, float f)
ccl_device_inline float2 operator/(const float2& a, float f)
{
float invf = 1.0f/f;
return make_float2(a.x*invf, a.y*invf);
}
ccl_device_inline float2 operator/(const float2 a, const float2 b)
ccl_device_inline float2 operator/(const float2& a, const float2& b)
{
return make_float2(a.x/b.x, a.y/b.y);
}
ccl_device_inline float2 operator+(const float2 a, const float2 b)
ccl_device_inline float2 operator+(const float2& a, const float2& b)
{
return make_float2(a.x+b.x, a.y+b.y);
}
ccl_device_inline float2 operator-(const float2 a, const float2 b)
ccl_device_inline float2 operator-(const float2& a, const float2& b)
{
return make_float2(a.x-b.x, a.y-b.y);
}
ccl_device_inline float2 operator+=(float2& a, const float2 b)
ccl_device_inline float2 operator+=(float2& a, const float2& b)
{
return a = a + b;
}
ccl_device_inline float2 operator*=(float2& a, const float2 b)
ccl_device_inline float2 operator*=(float2& a, const float2& b)
{
return a = a * b;
}
@@ -312,7 +312,7 @@ ccl_device_inline float2 operator*=(float2& a, float f)
return a = a * f;
}
ccl_device_inline float2 operator/=(float2& a, const float2 b)
ccl_device_inline float2 operator/=(float2& a, const float2& b)
{
return a = a / b;
}
@@ -324,12 +324,12 @@ ccl_device_inline float2 operator/=(float2& a, float f)
}
ccl_device_inline float dot(const float2 a, const float2 b)
ccl_device_inline float dot(const float2& a, const float2& b)
{
return a.x*b.x + a.y*b.y;
}
ccl_device_inline float cross(const float2 a, const float2 b)
ccl_device_inline float cross(const float2& a, const float2& b)
{
return (a.x*b.y - a.y*b.x);
}
@@ -343,59 +343,59 @@ ccl_device_inline bool operator==(const int2 a, const int2 b)
return (a.x == b.x && a.y == b.y);
}
ccl_device_inline float len(const float2 a)
ccl_device_inline float len(const float2& a)
{
return sqrtf(dot(a, a));
}
ccl_device_inline float2 normalize(const float2 a)
ccl_device_inline float2 normalize(const float2& a)
{
return a/len(a);
}
ccl_device_inline float2 normalize_len(const float2 a, float *t)
ccl_device_inline float2 normalize_len(const float2& a, float *t)
{
*t = len(a);
return a/(*t);
}
ccl_device_inline float2 safe_normalize(const float2 a)
ccl_device_inline float2 safe_normalize(const float2& a)
{
float t = len(a);
return (t != 0.0f)? a/t: a;
}
ccl_device_inline bool operator==(const float2 a, const float2 b)
ccl_device_inline bool operator==(const float2& a, const float2& b)
{
return (a.x == b.x && a.y == b.y);
}
ccl_device_inline bool operator!=(const float2 a, const float2 b)
ccl_device_inline bool operator!=(const float2& a, const float2& b)
{
return !(a == b);
}
ccl_device_inline float2 min(float2 a, float2 b)
ccl_device_inline float2 min(const float2& a, const float2& b)
{
return make_float2(min(a.x, b.x), min(a.y, b.y));
}
ccl_device_inline float2 max(float2 a, float2 b)
ccl_device_inline float2 max(const float2& a, const float2& b)
{
return make_float2(max(a.x, b.x), max(a.y, b.y));
}
ccl_device_inline float2 clamp(float2 a, float2 mn, float2 mx)
ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float2 fabs(float2 a)
ccl_device_inline float2 fabs(const float2& a)
{
return make_float2(fabsf(a.x), fabsf(a.y));
}
ccl_device_inline float2 as_float2(const float4 a)
ccl_device_inline float2 as_float2(const float4& a)
{
return make_float2(a.x, a.y);
}
@@ -413,7 +413,7 @@ ccl_device_inline void print_float2(const char *label, const float2& a)
#ifndef __KERNEL_OPENCL__
ccl_device_inline float2 interp(float2 a, float2 b, float t)
ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
{
return a + t*(b - a);
}
@@ -424,58 +424,92 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)
#ifndef __KERNEL_OPENCL__
ccl_device_inline float3 operator-(const float3 a)
ccl_device_inline float3 operator-(const float3& a)
{
#ifdef __KERNEL_SSE__
return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
#else
return make_float3(-a.x, -a.y, -a.z);
#endif
}
ccl_device_inline float3 operator*(const float3 a, const float3 b)
ccl_device_inline float3 operator*(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128,b.m128));
#else
return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
#endif
}
ccl_device_inline float3 operator*(const float3 a, float f)
ccl_device_inline float3 operator*(const float3& a, const float f)
{
#ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
#else
return make_float3(a.x*f, a.y*f, a.z*f);
#endif
}
ccl_device_inline float3 operator*(float f, const float3 a)
ccl_device_inline float3 operator*(const float f, const float3& a)
{
#ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
#else
return make_float3(a.x*f, a.y*f, a.z*f);
#endif
}
ccl_device_inline float3 operator/(float f, const float3 a)
ccl_device_inline float3 operator/(const float f, const float3& a)
{
return make_float3(f/a.x, f/a.y, f/a.z);
#ifdef __KERNEL_SSE__
__m128 rc = _mm_rcp_ps(a.m128);
return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
#else
return make_float3(f / a.x, f / a.y, f / a.z);
#endif
}
ccl_device_inline float3 operator/(const float3 a, float f)
ccl_device_inline float3 operator/(const float3& a, const float f)
{
float invf = 1.0f/f;
return make_float3(a.x*invf, a.y*invf, a.z*invf);
return a * invf;
}
ccl_device_inline float3 operator/(const float3 a, const float3 b)
ccl_device_inline float3 operator/(const float3& a, const float3& b)
{
return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
#ifdef __KERNEL_SSE__
__m128 rc = _mm_rcp_ps(b.m128);
return float3(_mm_mul_ps(a, rc));
#else
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
#endif
}
ccl_device_inline float3 operator+(const float3 a, const float3 b)
ccl_device_inline float3 operator+(const float3& a, const float3& b)
{
return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
#ifdef __KERNEL_SSE__
return float3(_mm_add_ps(a.m128, b.m128));
#else
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
#endif
}
ccl_device_inline float3 operator-(const float3 a, const float3 b)
ccl_device_inline float3 operator-(const float3& a, const float3& b)
{
return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
#ifdef __KERNEL_SSE__
return float3(_mm_sub_ps(a.m128, b.m128));
#else
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
#endif
}
ccl_device_inline float3 operator+=(float3& a, const float3 b)
ccl_device_inline float3 operator+=(float3& a, const float3& b)
{
return a = a + b;
}
ccl_device_inline float3 operator*=(float3& a, const float3 b)
ccl_device_inline float3 operator*=(float3& a, const float3& b)
{
return a = a * b;
}
@@ -485,7 +519,7 @@ ccl_device_inline float3 operator*=(float3& a, float f)
return a = a * f;
}
ccl_device_inline float3 operator/=(float3& a, const float3 b)
ccl_device_inline float3 operator/=(float3& a, const float3& b)
{
return a = a / b;
}
@@ -496,7 +530,7 @@ ccl_device_inline float3 operator/=(float3& a, float f)
return a = a * invf;
}
ccl_device_inline float dot(const float3 a, const float3 b)
ccl_device_inline float dot(const float3& a, const float3& b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
@@ -505,7 +539,16 @@ ccl_device_inline float dot(const float3 a, const float3 b)
#endif
}
ccl_device_inline float dot(const float4 a, const float4 b)
ccl_device_inline float dot_xy(const float3& a, const float3& b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
#else
return a.x*b.x + a.y*b.y;
#endif
}
ccl_device_inline float dot(const float4& a, const float4& b)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
@@ -514,7 +557,7 @@ ccl_device_inline float dot(const float4 a, const float4 b)
#endif
}
ccl_device_inline float3 cross(const float3 a, const float3 b)
ccl_device_inline float3 cross(const float3& a, const float3& b)
{
float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
return r;
@@ -538,12 +581,12 @@ ccl_device_inline float len_squared(const float3 a)
#ifndef __KERNEL_OPENCL__
ccl_device_inline float len_squared(const float4 a)
ccl_device_inline float len_squared(const float4& a)
{
return dot(a, a);
}
ccl_device_inline float3 normalize(const float3 a)
ccl_device_inline float3 normalize(const float3& a)
{
#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
@@ -563,13 +606,14 @@ ccl_device_inline float3 saturate3(float3 a)
ccl_device_inline float3 normalize_len(const float3 a, float *t)
{
*t = len(a);
return a/(*t);
float x = 1.0f / *t;
return a*x;
}
ccl_device_inline float3 safe_normalize(const float3 a)
{
float t = len(a);
return (t != 0.0f)? a/t: a;
return (t != 0.0f)? a * (1.0f/t) : a;
}
ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
@@ -580,7 +624,7 @@ ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
#ifndef __KERNEL_OPENCL__
ccl_device_inline bool operator==(const float3 a, const float3 b)
ccl_device_inline bool operator==(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
@@ -589,12 +633,12 @@ ccl_device_inline bool operator==(const float3 a, const float3 b)
#endif
}
ccl_device_inline bool operator!=(const float3 a, const float3 b)
ccl_device_inline bool operator!=(const float3& a, const float3& b)
{
return !(a == b);
}
ccl_device_inline float3 min(float3 a, float3 b)
ccl_device_inline float3 min(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
return _mm_min_ps(a.m128, b.m128);
@@ -603,7 +647,7 @@ ccl_device_inline float3 min(float3 a, float3 b)
#endif
}
ccl_device_inline float3 max(float3 a, float3 b)
ccl_device_inline float3 max(const float3& a, const float3& b)
{
#ifdef __KERNEL_SSE__
return _mm_max_ps(a.m128, b.m128);
@@ -612,12 +656,12 @@ ccl_device_inline float3 max(float3 a, float3 b)
#endif
}
ccl_device_inline float3 clamp(float3 a, float3 mn, float3 mx)
ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
{
return min(max(a, mn), mx);
}
ccl_device_inline float3 fabs(float3 a)
ccl_device_inline float3 fabs(const float3& a)
{
#ifdef __KERNEL_SSE__
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
@@ -670,7 +714,7 @@ ccl_device_inline float3 interp(float3 a, float3 b, float t)
#ifndef __KERNEL_OPENCL__
ccl_device_inline float3 mix(float3 a, float3 b, float t)
ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
{
return a + t*(b - a);
}
@@ -833,7 +877,7 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b)
#endif
}
ccl_device_inline int4 operator>=(float4 a, float4 b)
ccl_device_inline int4 operator>=(const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
@@ -851,7 +895,7 @@ ccl_device_inline int4 operator<=(const float4& a, const float4& b)
#endif
}
ccl_device_inline bool operator==(const float4 a, const float4 b)
ccl_device_inline bool operator==(const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@@ -893,23 +937,23 @@ ccl_device_inline float average(const float4& a)
return reduce_add(a) * 0.25f;
}
ccl_device_inline float len(const float4 a)
ccl_device_inline float len(const float4& a)
{
return sqrtf(dot(a, a));
}
ccl_device_inline float4 normalize(const float4 a)
ccl_device_inline float4 normalize(const float4& a)
{
return a/len(a);
}
ccl_device_inline float4 safe_normalize(const float4 a)
ccl_device_inline float4 safe_normalize(const float4& a)
{
float t = len(a);
return (t != 0.0f)? a/t: a;
}
ccl_device_inline float4 min(float4 a, float4 b)
ccl_device_inline float4 min(const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
return _mm_min_ps(a.m128, b.m128);
@@ -918,7 +962,7 @@ ccl_device_inline float4 min(float4 a, float4 b)
#endif
}
ccl_device_inline float4 max(float4 a, float4 b)
ccl_device_inline float4 max(const float4& a, const float4& b)
{
#ifdef __KERNEL_SSE__
return _mm_max_ps(a.m128, b.m128);
@@ -1190,7 +1234,7 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t)
/* Triangle */
ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3)
ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3)
{
return len(cross(v3 - v2, v1 - v2))*0.5f;
}

View File

@@ -455,6 +455,7 @@ CCL_NAMESPACE_END
#include "util_sseb.h"
#include "util_ssei.h"
#include "util_ssef.h"
#include "util_avxf.h"
#endif /* __UTIL_SIMD_TYPES_H__ */

View File

@@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 {
__forceinline int3(const __m128i a) : m128(a) {}
__forceinline operator const __m128i&(void) const { return m128; }
__forceinline operator __m128i&(void) { return m128; }
int3(const int3& a) { m128 = a.m128; }
int3& operator =(const int3& a) { m128 = a.m128; return *this; }
#else
int x, y, z, w;
#endif
@@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 {
__forceinline int4(const __m128i a) : m128(a) {}
__forceinline operator const __m128i&(void) const { return m128; }
__forceinline operator __m128i&(void) { return m128; }
int4(const int4& a) : m128(a.m128) {}
int4& operator=(const int4& a) { m128 = a.m128; return *this; }
#else
int x, y, z, w;
#endif
@@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 {
};
__forceinline float3() {}
__forceinline float3(const __m128 a) : m128(a) {}
__forceinline float3(const __m128& a) : m128(a) {}
__forceinline operator const __m128&(void) const { return m128; }
__forceinline operator __m128&(void) { return m128; }
__forceinline float3(const float3& a) : m128(a.m128) {}
__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
#else
float x, y, z, w;
#endif
@@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 {
__forceinline float4(const __m128 a) : m128(a) {}
__forceinline operator const __m128&(void) const { return m128; }
__forceinline operator __m128&(void) { return m128; }
__forceinline float4(const float4& a) : m128(a.m128) {}
__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
#else
float x, y, z, w;
#endif