1 files changed, 678 insertions, 0 deletions
diff --git a/Runtime/Math/Simd/math.h b/Runtime/Math/Simd/math.h
new file mode 100644
index 0000000..43a8837
--- /dev/null
+++ b/Runtime/Math/Simd/math.h
@@ -0,0 +1,678 @@
+#ifndef SIMD_MATH_H
+#define SIMD_MATH_H
+
+#include <cmath>
+
+// Standard macro define in cmath
+#ifndef M_EPSF
+#define M_EPSF     1e-6f
+#endif
+#ifndef M_PIf
+#define M_PIf       3.14159265358979323846f
+#endif
+#ifndef M_PI_2f
+#define M_PI_2f     1.57079632679489661923f
+#endif
+#ifndef M_PI_4f
+#define M_PI_4f     0.785398163397448309616f
+#endif
+#ifndef M_1_PIf
+#define M_1_PIf     0.318309886183790671538f
+#endif
+#ifndef M_2_PIf
+#define M_2_PIf     0.636619772367581343076f
+#endif
+#ifndef M_DEG_2_RADf
+#define M_DEG_2_RADf  0.0174532925f
+#endif
+#ifndef M_RAD_2_DEGf
+#define M_RAD_2_DEGf  57.295779513f
+#endif
+
+#include "Runtime/Math/Simd/float4.h"
+#include "Runtime/Math/Simd/bool4.h"
+
+namespace math
+{
+
+static inline bool all(bool4 const& r);
+static inline bool any(bool4 const& r);
+template <typename T> static inline T clamp(T const& v, T const& a, T const& b);
+static inline float cond(bool c, float const& a, float const& b);
+static inline int cond(bool c, int const& a, int const& b);
+static inline float cubic(float const& a, float const& b, float const& c, float const& d, float const& u);
+static inline float4 cubic(float4 const& a, float4 const& b, float4 const& c, float4 const& d, float4 const& u);
+static inline float4 cross(float4 const& a, float4 const& b);
+static inline float degrees(float const& deg);
+static inline float4 degrees(float4 const& deg);
+static inline float1 dot(float4 const& l, float4 const& r);
+static inline float1 dot(float4 const& r);
+static inline float1 length(float4 const& r);
+static inline float lerp(float const& a, float const& b, float x);
+static inline float1 lerp(float1 const& a, float1 const& b, float1 const& x);
+static inline float4 lerp(float4 const& a, float4 const& b, float1 const& x);
+static inline float4 lerp(float4 const& a, float4 const& b, float4 const& x);
+template <typename T> static inline T maximum(T const& a, T const& b);
+template <typename T> static inline T minimum(T const& a, T const& b);
+static inline float1 maximum(float4 const& r);
+static inline float1 minimum(float4 const& r);
+static inline float4 normalize(float4 const& r);
+static inline float pow(float const& r, float const& e);
+static inline float4 pow(float4 const& r, float1 const& e);
+static inline float radians(float const& deg);
+static inline float4 radians(float4 const& deg);
+static inline float4 rcp(float4 const& r);
+static inline float1 rcp(float1 const& r);
+static inline float4 rsqrt(float4 const& r );
+static inline float saturate(float const& r);
+static inline float1 saturate(float1 const& r);
+static inline float4 saturate(float4 const& r);
+static inline float4 scaleIdentity();
+static inline float4 scaleWeight(float4 const& s, float1 const& w);
+static inline void sincos(float4 const& u, float4& s, float4& c);
+static inline void sincos(float1 const& u, float1& s, float1& c);
+static inline void sincose(float4 const& u, float4& s, float4& c);
+static inline void sincose(float1 const& u, float1& s, float1& c);
+static inline float sgn(float const& r);
+static inline float1 sgn(float1 const& r);
+template <typename T> static inline vecexp4<vec4> sgn(vecexp4<T> const& x);
+template <typename T> static inline vecexp1<vec4> sgn(vecexp1<T> const& x);
+static inline float sign(float const& r);
+static inline float4 sign(float4 const& r);
+static inline float smoothstep( float min, float max, float x);
+static inline float smoothpulse( float minmin, float minmax, float maxmin, float maxmax, float x);
+static inline float1 sqrt(float1 const& r);
+static inline float4 sqrt(float4 const& r);
+static inline float1 sum(float4 const& r);	
+static inline float4 vector(float4 const& v);
+static inline float unrollangle(float angleRef, float angle);
+static inline float4 load(float const* v);
+static inline void store(float4 const& v, float* r);
+static inline void store(bool4 const& v, bool* r);
+
+
+static inline float abs(const float &r)
+{
+	return std::abs(r);
+}
+
+static inline float cos(float const& theta)
+{
+	return std::cos(theta);
+}
+
+static inline float rcp(const float &r)
+{
+	return 1.f/r;
+}
+
+static inline float rsqrt(const float& r)
+{
+	return 1.f/std::sqrt(r);
+}
+
+static inline float sin(float const& theta)
+{
+	return std::sin(theta);
+}
+
+static inline void sincos(float const& u, float& s, float& c)
+{
+	s = sin(u);
+	c = cos(u);		
+}
+
+static inline float tan(float const& theta)
+{
+	return std::tan(theta);
+}
+
+static inline float atan(float const& t)
+{
+	return std::atan(t);
+}
+
+static inline  float sqrt(const float& r)
+{
+	return std::sqrt(r);
+}	
+
+static inline float modf(float x, float &ip)
+{
+#if UNITY_FLASH
+	float intPart;
+	__asm __volatile__("%[RES] = (%[FARG] < 0 ? Math.ceil(%[FARG]) : Math.floor(%[FARG]));//modf" : [RES] "=f" (intPart) : [FARG] "f" (x));
+	ip = intPart;
+	return x-intPart;
+#else
+	return std::modf(x, &ip); 
+#endif
+}
+
+static inline float fmod(float x, float y)
+{
+	return std::fmod(x,y);
+}
+
+static inline  float pow(const float& x,const float& y)
+{
+	return std::pow(x,y);
+}	
+
+template <typename T> static inline vecexp4<vec4> abs(vecexp4<T> const& x)
+{
+	return vecexp4<vec4>( Vabs( x.eval() ) );
+}
+
+template <typename T> static inline vecexp1<T> abs(vecexp1<T> const& x)
+{
+	return vecexp1<T>( Vabs( x.eval() ) );
+}
+
+static inline float1 abs(float1 const& x)
+{
+	return float1( Vabs( x.eval() ) );
+}
+
+static inline bool all(bool4 const& r)
+{
+	return Vall(r.v); 
+}
+static inline bool any(bool4 const& r)
+{
+	return Vany(r.v); 
+}
+
+// x clamped to the range [a, b] as follows:
+// Returns a if x is less than a.
+// Returns b if x is greater than b.
+// Returns x otherwise.
+template <typename T> static inline T clamp(T const& v, T const& a, T const& b)
+{
+	return minimum(b, maximum(a, v));
+}
+
+template <typename L, typename R> static inline vecexp4<vec4> cond(bool4 const& c, vecexp4<L> const& l, vecexp4<R> const& r)
+{	
+	return vecexp4<vec4>( Vsel(c.v, l.eval(), r.eval()) ); 
+}
+
+template <typename L, typename R> static inline vecexp4<vec4> cond(bool1 const& c, vecexp4<L> const& l, vecexp4<R> const& r)
+{
+	return vecexp4<vec4>( Vsel(c.s, l.eval(), r.eval()) ); 
+}
+
+template <typename L, typename R> static inline vecexp1<vec4> cond(bool1 const& c, vecexp1<L> const& l, vecexp1<R> const& r)
+{
+	return vecexp4<vec4>( Vsel(c.s, l.eval(), r.eval()) ); 
+}
+
+static inline float cond(bool c, float const& a, float const& b)
+{
+	return c ? a : b;
+}
+
+static inline int cond(bool c, int const& a, int const& b)
+{
+	return int(b + (-int(c) & (a - b)));
+}
+
+static inline int cond(bool c, long int const& a, long int const& b)
+{
+	typedef long int long_int;
+	return long_int(b + (-long_int(c) & (a - b)));
+}
+
+static inline unsigned long cond(bool c, unsigned long const& a, unsigned long const& b)
+{
+	return b + (- long(c) & (a - b));
+}
+
+static inline unsigned int cond(bool c, unsigned int const& a, unsigned int const& b)
+{
+	return b + (- int(c) & (a - b));
+}
+
+// De Casteljau construction of bezier
+static inline float cubic(float const& a, float const& b, float const& c, float const& d, float const& u)
+{
+	const float ab = lerp(a,b,u);
+	const float bc = lerp(b,c,u);
+	const float cd = lerp(c,d,u);
+	const float abc = lerp(ab,bc,u);
+	const float bcd = lerp(bc,cd,u);
+	return lerp(abc, bcd, u);
+}
+
+static inline float4 cubic(float4 const& a, float4 const& b, float4 const& c, float4 const& d, float4 const& u)
+{
+	const float4 ab = lerp(a,b,u);
+	const float4 bc = lerp(b,c,u);
+	const float4 cd = lerp(c,d,u);
+	const float4 abc = lerp(ab,bc,u);
+	const float4 bcd = lerp(bc,cd,u);
+	return lerp(abc, bcd, u);
+}
+
+static inline float4 cross(float4 const& a, float4 const& b)
+{
+	return float4(a.yzxw()*b.zxyw() - a.zxyw()*b.yzxw());		
+}
+
+static inline float degrees(float const& rad)
+{
+	return M_RAD_2_DEGf*rad;
+}
+
+static inline float4 degrees(float4 const& rad)
+{
+	return float1(M_RAD_2_DEGf)*rad;
+}
+
+static inline float1 degrees(float1 const& rad)
+{
+	return float1(M_RAD_2_DEGf)*rad;
+}
+
+static inline float1 dot(float4 const& l, float4 const& r)
+{
+	return float1( Vdot(l.eval(), r.eval()) );
+}
+
+static inline float1 dot(float4 const& r)
+{
+	return float1( Vdot(r.eval(), r.eval()) );
+}
+
+static inline float1 length(float4 const& r)
+{
+	return float1(Vsqrt( Vdot(r.eval(), r.eval()) ));
+}
+
+static inline float lerp(float const& a, float const& b, float x)
+{
+	return a + x*(b - a);
+}
+
+static inline float1 lerp(float1 const& a, float1 const& b, float1 const& x)
+{
+	return a + x*(b - a);
+}
+
+static inline float4 lerp(float4 const& a, float4 const& b, float1 const& x)
+{
+	return a + x*(b - a);
+}
+
+static inline float4 lerp(float4 const& a, float4 const& b, float4 const& x)
+{
+	return a + x*(b - a);
+}
+
+template <typename T> static inline T maximum(T const& a, T const& b)
+{
+	return cond(a > b, a, b);
+}
+
+static inline float1 maximum(float4 const& r)
+{
+	return float1( Vlargest(r.eval()) );
+}
+
+template <typename T> static inline T minimum(T const& a, T const& b)
+{
+	return cond(a < b, a, b);
+}	
+
+static inline float1 minimum(float4 const& r)
+{
+	return float1( Vsmallest(r.eval()) );
+}
+
+static inline float4 normalize(float4 const& r)
+{
+	return float4( Vmul(r.eval(), Vrsqrt(Vdot(r.eval(), r.eval()) ) ));
+}	
+
+static inline float4 pow(float4 const& r, float1 const& e)
+{
+	float e1 = e.tofloat();	
+	
+	return float4( std::pow( r.x().tofloat(), e1), std::pow( r.y().tofloat(), e1), std::pow( r.z().tofloat(), e1), std::pow( r.w().tofloat(), e1));	
+}	
+
+static inline float radians(float const& deg)
+{
+	return M_DEG_2_RADf*deg;
+}
+
+static inline float4 radians(float4 const& deg)
+{
+	return float1(M_DEG_2_RADf)*deg;
+}
+
+static inline float4 rcp(float4 const& r)
+{
+	return float4(Vrcp(r.eval()));
+}	
+
+static inline float1 rcp(float1 const& r)
+{
+	return float1(Vrcp(r.eval()));
+}
+
+static inline float4 rsqrt(float4 const& r)
+{
+	return float4(Vrsqrt(r.eval()));
+}		
+
+static inline float saturate(float const& r)
+{
+	return clamp(r, 0.f, 1.f);
+}
+
+static inline float1 saturate(float1 const& r)
+{
+	return float1(Vmin( Vmax(r.eval(), Vzero()), Vone()));	
+}
+
+static inline float4 saturate(float4 const& r)
+{
+	return float4(Vmin( Vmax(r.eval(), Vzero()), Vone()));		
+}
+
+static inline float4 scaleIdentity()
+{
+	return float4::one();
+}
+
+static inline float4 scaleWeight(float4 const& s, float1 const& w)
+{
+	float4 s_abs = math::abs(s);
+	float4 s_sng = math::sgn(s);
+
+	return s_sng * pow( s_abs, w);	
+}
+
+static inline float4 scaleBlend(float4 const& sa, float4 const& sb,float1 const& w)
+{
+	const float4 saw = scaleWeight(sa, float1::one() - w);
+	const float4 sbw = scaleWeight(sb, w);
+	const float4 s_sng = math::sgn( cond( w > float1(.5), sb, sa) );
+	return s_sng * math::abs(saw * sbw);
+}
+
+// return -1 if r < 0
+// return 1 if r >= 0
+static inline float sgn(float const& r)
+{
+	return cond(r >= 0.f, 1.f, -1.f);
+}
+
+// return -1 if r < 0
+// return 1 if r >= 0
+static inline float1 sgn(float1 const& r)
+{
+	return float1(Vsgn(r.eval()));
+}
+
+// return -1 if r < 0
+// return 1 if r >= 0
+template <typename T> static inline vecexp4<vec4> sgn(vecexp4<T> const& x)
+{
+	return vecexp4<vec4>( Vsgn( x.eval()) );
+}
+
+// return -1 if r < 0
+// return 1 if r >= 0
+template <typename T> static inline vecexp1<vec4> sgn(vecexp1<T> const& x)
+{
+	return vecexp1<vec4>( Vsgn( x.eval() ) );
+}
+
+// return -1 if r < 0
+// return 0 if r == 0
+// return 1 if r > 0
+static inline float sign(float const& r)
+{
+	return cond( r > 0, 1.f, cond( r < 0, -1.f, 0.f));
+}
+
+// return -1 if r < 0
+// return 0 if r == 0
+// return 1 if r > 0
+static inline float4 sign(float4 const& r)
+{
+	return float4(Vsign(r.eval()));
+}
+
+static inline float4 smoothClamp(float4 const& v, float4 const& m, float1 const& r)
+{
+	return cond(v-m>float1::zero(),m+r*((v-m)/(v-m+r)),v);
+}
+
+static inline float smoothstep( float min, float max, float x)
+{
+	x = math::clamp(x, min, max);
+	return -2.f * math::pow((x-min)/(max-min), 3.f) + 3.f * math::pow((x-min)/(max-min), 2.f);
+}
+
+static inline float smoothpulse( float minmin, float minmax, float maxmin, float maxmax, float x)
+{
+	return smoothstep(minmin,minmax,x) - smoothstep(maxmin,maxmax,x); 
+}
+
+static inline float1 sqrt(float1 const& r)
+{
+	return float1(Vsqrt(r.eval()));
+}
+
+static inline float4 sqrt(float4 const& r)
+{
+	return float4(Vsqrt(r.eval()));
+}
+
+static inline float1 sum(float4 const& r)
+{
+	return float1(Vsum(r.eval()));
+}
+
+static inline float1 triangleAngle(math::float1 const& aLen, math::float1 const& aLen1, math::float1 const& aLen2)
+{ 
+	math::float1 c = clamp<float1>((aLen1*aLen1 + aLen2*aLen2 - aLen*aLen) / (aLen1*aLen2) / float1(2.f), -float1::one() , float1::one());
+	return math::float1(acos(c.tofloat()));
+}
+
+static inline float4 vector(float4 const& v)
+{
+	constant_float4( mask, 1.f,1.f,1.f,0.f);
+	return v*mask;
+}	
+
+static inline float4 vector(float const& x, float const& y, float const& z)
+{
+	return float4(x, y, z, 0);
+}
+
+static inline float unrollangle(float angleRef, float angle)
+{
+	float i;
+	float f = math::modf( (angleRef-angle)/(2.f*M_PIf), i);
+	return angle + ( (i+(math::abs(f) > 0.5f ? math::sgn(f) * 1 : 0)) * 2.f * M_PIf);
+}
+
+static inline float4 doubleAtan(float4 const& v)
+{
+	float ATTRIBUTE_ALIGN(ALIGN4F) av[4];
+	
+	store(v, av);
+
+	return float4(2.0f*atan(av[0]),2.0f*atan(av[1]),2.0f*atan(av[2]),2.0f*atan(av[3]));
+}
+
+
+// between range [-pi/2, pi/2] the maximum error is 8.186e-4
+static inline vec4f cos_estimate(vec4f x)
+{
+	// cos(x) = 1 - (c2*x^2) + (c4*x^4) - (c6*x^6)
+	// cos(x) = 1 + (-c2*x^2) + (c4*x^4) + (-c6*x^6) // 3 mul and 3 mul add
+	// let's bake sign into constant to remove some complexity
+	cvec4fs(c2, -0.5f);
+	cvec4fs(c4, 4.166666666667e-2f);
+	cvec4fs(c6, -1.38888889e-3f);
+	
+	// Use horner form to reduce the polynomial instruction count
+	// cos(x) = 1 + x^2*(c2 + x^2*(c4 + x^2*(c6))) // 1 mul and 3 mul add
+	vec4f x2 = Vmul(x,x);
+	return Vmadd(Vmadd(Vmadd(c6, x2, c4), x2, c2), x2, Vone());
+}
+
+
+// between range [-pi/2, pi/2] the maximum error is 9.1e-5
+static inline vec4f sin_estimate(vec4f x)
+{
+	// sin(x) = x - (c3*x^3) + (c5*x^5) - (c7*x^7)
+	// sin(x) = x + (-c3*x^3) + (c5*x^5) + (-c7*x^7) // 4 mul and 3 mul add
+	// let's bake sign into constant to remove some complexity
+	cvec4fs(c3, -0.166666567325592041015625f);
+	cvec4fs(c5, 8.33220803e-3f);
+	cvec4fs(c7, -1.95168955e-4f);
+	
+	// Use horner form to reduce the polynomial instruction count
+	// sin(x) = x * ( 1 + x^2*(c3 + x^2*(c5 + x^2*c7))) // 2 mul and 3 mul add
+	vec4f x2 = Vmul(x,x);
+	return Vmul(x, Vmadd(Vmadd(Vmadd(c7, x2, c5), x2, c3), x2, Vone()));
+}
+
+static inline float4 sin_est(float4 const& x)
+{
+	return float4( sin_estimate( x.eval() ) );
+}
+
+static inline float4 cos_est(float4 const& x)
+{
+	return float4( cos_estimate( x.eval() ) );
+}
+
+static inline void sincos(float4 const& u, float4& s, float4& c)
+{
+	float ATTRIBUTE_ALIGN(ALIGN4F) sv[4];
+	float ATTRIBUTE_ALIGN(ALIGN4F) cv[4];
+	float ATTRIBUTE_ALIGN(ALIGN4F) uv[4];
+	
+	store(u, uv);
+
+	sincos(uv[0], sv[0], cv[0]);
+	sincos(uv[1], sv[1], cv[1]);
+	sincos(uv[2], sv[2], cv[2]);
+	sincos(uv[3], sv[3], cv[3]);
+
+	s = load(sv);
+	c = load(cv);
+}
+
+static inline void sincos(float1 const& u, float1& s, float1& c)
+{
+	float sv;
+	float cv;
+	
+	sincos(u.tofloat(), sv, cv);
+
+	s = float1(sv);
+	c = float1(cv);
+}
+
+static inline void sincos_est(float4 const& u, float4& s, float4& c)
+{
+	s = sin_est(u);
+	c = cos_est(u);
+}
+
+static inline void sincos_est(float1 const& u, float1& s, float1& c)
+{
+	s = float1( sin_estimate( u.eval() ) );
+	c = float1( cos_estimate( u.eval() ) );
+}
+
+static inline float4 tan(float4 const& x)
+{
+	vec4f x2,x3;
+
+	// Compute x^2 and x^3
+	//
+	x2 = Vmul(x.eval(),x.eval());
+	x3 = Vmul(x2,x.eval());	    
+
+	// Compute both the sin and cos of the angles
+	// using a polynomial expression:
+	//   cx = 1.0f + x2 * (C0 * x2 + C1), and
+	//   sx = xl + x3 * S0
+	//
+	cvec4fs(c0, 0.0097099364f);
+	cvec4fs(c1, -0.4291161787f);
+	cvec4fs(s0, -0.0957822992f);
+
+	vec4f ct2 = Vmadd(c0,x2,c1);
+    
+	vec4f cx = Vmadd(ct2,x2, Vone());
+	vec4f sx = Vmadd(s0,x3, x.eval());
+
+	return float4(Vdiv(sx,cx));
+}
+
+static inline float4 atan(float4 const& x)
+{
+	//x - (x^3)/3 + (x^5)/5 - (x^7)/7 + ...
+	
+	cvec4fs(c3, 3.f);
+	cvec4fs(c5, 5.f);
+	cvec4fs(c7, 7.f);
+	vec4f x2 = Vmul(x.eval(),x.eval());
+	vec4f x3 = Vmul(x2,x.eval());
+	vec4f x5 = Vmul(x3,x2);
+	vec4f x7 = Vmul(x5,x2);
+
+	return float4(Vsub( x.eval(), Vadd(Vdiv( x3, c3), Vsub(Vdiv(x5,c5), Vdiv(x7,c7)))));
+}
+
+static inline float halfTan(float a)
+{
+	//float x = math::fmod(0.5f*abs(a)+M_PI_2,float(M_PI));
+	float x1 = (0.5f*abs(a)+M_PI_2f);
+	return tan(clamp(sign(a)*(x1-M_PI_2f),-M_PI_2f+M_EPSF,M_PI_2f-M_EPSF));
+}
+
+static inline float4 halfTan(float4 const& a)
+{	
+	static const float4 nM_PI_2(-M_PI_2f+M_EPSF);
+	static const float4 pM_PI_2( M_PI_2f+M_EPSF);
+
+	float4 x = float1(0.5f) * abs(a) + float1(M_PI_2f);		
+	return tan( math::clamp<float4>( sign(a) * (x-float4(M_PI_2f)), nM_PI_2, pM_PI_2  ));
+}
+
+static inline float4 mirror(float4 const& t)
+{
+	constant_float4(mirrorT,-1,1,1,1);
+	return t * mirrorT;
+}
+
+static inline float4 load(float const* v)
+{
+	return float4(Vloadpf(v, 0));
+}
+
+static inline void store(float4 const& v, float* r)
+{
+	Vstorepf(v.eval(), r, 0);
+}	
+
+static inline void store(bool4 const& v, bool* r)
+{
+	Vstorepb(v.v, r);
+}	
+
+}
+
+#endif
+