author	chai <chaifix@163.com>	2019-08-14 22:50:43 +0800
committer	chai <chaifix@163.com>	2019-08-14 22:50:43 +0800
commit	15740faf9fe9fe4be08965098bbf2947e096aeeb (patch)
tree	a730ec236656cc8cab5b13f088adfaed6bb218fb /Runtime/Math/Simd/neon.h

+Unity Runtime code (HEAD, master)

Diffstat (limited to 'Runtime/Math/Simd/neon.h')
-rw-r--r--	Runtime/Math/Simd/neon.h	548
1 file changed, 548 insertions, 0 deletions
diff --git a/Runtime/Math/Simd/neon.h b/Runtime/Math/Simd/neon.h
new file mode 100644
index 0000000..08196f9
--- /dev/null
+++ b/Runtime/Math/Simd/neon.h
@@ -0,0 +1,548 @@
+#ifndef SIMD_NEON_H
+#define SIMD_NEON_H
+
+#include <arm_neon.h>
+
+typedef float32x4_t vec4f;
+typedef float32x4_t vec4fs;
+typedef uint32x4_t vec4b;
+typedef uint32x4_t vec4bs;
+
+#define SWZ_MASK(x, y, z, w) (((w) << 6) | ((z) << 4) | ((y) << 2) | ((x)))
+#define SWZ_X(MASK) (((MASK) >> 0) & 3)
+#define SWZ_Y(MASK) (((MASK) >> 2) & 3)
+#define SWZ_Z(MASK) (((MASK) >> 4) & 3)
+#define SWZ_W(MASK) (((MASK) >> 6) & 3)
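+
+// Worked example (illustrative): the mask packs four 2-bit lane indices,
+// lowest bits first, so
+//   SWZ_MASK(1,2,3,0) == (0<<6)|(3<<4)|(2<<2)|1 == 0x39 == kYZWX (see below)
+//   SWZ_X(0x39)==1, SWZ_Y(0x39)==2, SWZ_Z(0x39)==3, SWZ_W(0x39)==0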
+
+//VPERMWI_CONST(x, y, z, w)
+#if UNITY_WINRT
+#define cvec4f(name, x,y,z,w) static const vec4f name = Vload4sf(x, y, z, w)
+#define cvec4b(name, x,y,z,w) static const vec4b name = Vload4sb(x, y, z, w)
+#define cvec4fs(name, s) static const vec4fs name = Vloadsf(s)
+#else
+#define cvec4f(name, x,y,z,w) static const vec4f name = {(x),(y),(z),(w)}
+#define cvec4b(name, x,y,z,w) static const vec4b name = {(x),(y),(z),(w)}
+#define cvec4fs(name, s) static const vec4fs name = {(s),(s),(s),(s)}
+#endif
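+
+// Usage sketch (illustrative; the constant names below are ours):
+//   cvec4f(kUp, 0.0f, 1.0f, 0.0f, 0.0f);  // static const vec4f kUp = {0,1,0,0}
+//   cvec4fs(kHalf, 0.5f);                 // every lane holds 0.5f
+// Both paths produce the same values; UNITY_WINRT merely builds them with
+// Vload4sf/Vloadsf instead of brace initialization.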
+
+enum simd_mask
+{
+ kXYZW = SWZ_MASK(0,1,2,3),
+ kXXXX = SWZ_MASK(0,0,0,0),
+ kYYYY = SWZ_MASK(1,1,1,1),
+ kZZZZ = SWZ_MASK(2,2,2,2),
+ kWWWW = SWZ_MASK(3,3,3,3),
+
+ kXWYZ = SWZ_MASK(0,3,1,2),
+ kXZWY = SWZ_MASK(0,2,3,1),
+
+ kYZWX = SWZ_MASK(1,2,3,0),
+ kYXZW = SWZ_MASK(1,0,2,3),
+ kYWZX = SWZ_MASK(1,3,2,0),
+ kYZXW = SWZ_MASK(1,2,0,3),
+ kYXWZ = SWZ_MASK(1,0,3,2),
+
+ kZWXY = SWZ_MASK(2,3,0,1),
+ kZYXW = SWZ_MASK(2,1,0,3),
+ kZYWX = SWZ_MASK(2,1,3,0),
+ kZXYW = SWZ_MASK(2,0,1,3),
+
+ kWYZX = SWZ_MASK(3,1,2,0),
+ kWXZY = SWZ_MASK(3,0,2,1),
+ kWYXZ = SWZ_MASK(3,1,0,2),
+ kWWWZ = SWZ_MASK(3,3,3,2),
+ kWWZZ = SWZ_MASK(3,3,2,2),
+ kWZYX = SWZ_MASK(3,2,1,0),
+};
+
+#define Vzero() vdupq_n_f32(0.0f)
+#define Vone() vdupq_n_f32(1.0f)
+
+#define Vfalse() vdupq_n_u32(0)
+#define Vtrue() vdupq_n_u32(0xFFFFFFFF)
+
+// Scratch union for reinterpreting a quad register as two float pairs,
+// byte pairs, or four scalar lanes.
+union U { float32x2x2_t f2x2; float32x4_t f4; uint8x8x2_t b8x2; float32_t f[4]; };
+
+// Generates the lhs() shared by the swizzle specializations below: swizzle l,
+// replace lane 0 with r's lane 0, then apply the swizzle once more (for
+// self-inverse masks this writes r.x into the lane the swizzle reads first).
+#define LHS_FUNCTION() \
+ static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r)\
+ {\
+ vec4f m = Vmove(rhs(l), r);\
+ return rhs(m);\
+ }
+
+//#define Vpermute(v, mask) v
+//#define Vmove(l, r) vextq_f32(l, r, 0)
+// Vmove(l, r): lane 0 comes from r, lanes 1-3 come from l, i.e. (r.x, l.y, l.z, l.w).
+MECANIM_FORCE_INLINE vec4f Vmove(vec4f l, vec4f r)
+{
+ uint32x4_t sel = Vfalse();
+ sel = vsetq_lane_u32(0xFFFFFFFF, sel, 0);
+ return vbslq_f32(sel, r, l);
+}
+template<int SWZ> struct Vswizzle;
+/*
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
+ {
+ ::uint32_t lanes[4];
+ uint32x4_t u = vreinterpretq_u32_f32(r);
+ uint32x4_t result;
+
+ lanes[0] = vgetq_lane_u32(u, 0);
+ lanes[1] = vgetq_lane_u32(u, 1);
+ lanes[2] = vgetq_lane_u32(u, 2);
+ lanes[3] = vgetq_lane_u32(u, 3);
+
+ result = vdupq_n_u32(lanes[SWZ_X(SWZ)]);
+ result = vsetq_lane_u32(lanes[SWZ_Y(SWZ)], result, 1);
+ result = vsetq_lane_u32(lanes[SWZ_Z(SWZ)], result, 2);
+ result = vsetq_lane_u32(lanes[SWZ_W(SWZ)], result, 3);
+
+ return vreinterpretq_f32_u32(result);
+ }
+
+ static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r)
+ {
+ vec4f m = Vmove(Vswizzle<SWZ>::rhs(l), r);
+ return Vswizzle<SWZ>::rhs(m);
+ }
+};
+*/
+
+template<> struct Vswizzle<kXYZW>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
+ {
+ return r;
+ }
+ static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r)
+ {
+ return Vmove(l, r);
+ }
+};
+template<> struct Vswizzle<kXXXX>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
+ {
+ return vdupq_lane_f32(vget_low_f32(r),0);
+ }
+
+ LHS_FUNCTION()
+};
+template<> struct Vswizzle<kYYYY>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
+ {
+ return vdupq_lane_f32(vget_low_f32(r),1);
+ }
+
+ LHS_FUNCTION()
+};
+template<> struct Vswizzle<kZZZZ>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
+ {
+ return vdupq_lane_f32(vget_high_f32(r),0);
+ }
+ LHS_FUNCTION()
+};
+template<> struct Vswizzle<kWWWW>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
+ {
+ return vdupq_lane_f32(vget_high_f32(r),1);
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kXWYZ>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ U u; u.f2x2 = vtrn_f32(vget_low_f32(p), vrev64_f32(vget_high_f32(p))); return u.f4;
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kXZWY>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0], vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1]));
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kYZWX>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vreinterpretq_f32_u32(vextq_u32(vreinterpretq_u32_f32(p), vreinterpretq_u32_f32(p), 1));
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kYXZW>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vrev64_f32(vget_low_f32(p)), vget_high_f32(p));
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kYWZX>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1], vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0]));
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kYZXW>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ U u;
+ u.f2x2 = vtrn_f32(vrev64_f32(vget_low_f32(p)), vget_high_f32(p));
+ return u.f4;
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kYXWZ>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vrev64q_f32(p);
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kZWXY>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vget_high_f32(p), vget_low_f32(p));
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kZYXW>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vrev64_f32(vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_low_f32(p)), vreinterpret_u32_f32(vget_high_f32(p)), 1))), vrev64_f32(vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_high_f32(p)), vreinterpret_u32_f32(vget_low_f32(p)), 1))));
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kZYWX>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vrev64_f32(vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_low_f32(p)), vreinterpret_u32_f32(vget_high_f32(p)), 1))), vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_high_f32(p)), vreinterpret_u32_f32(vget_low_f32(p)), 1)));
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kZXYW>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0]), vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1]);
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kWYZX>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ U u;
+ u.f4 = vrev64q_f32(p);
+ u.f2x2 = vtrn_f32(u.f2x2.val[1], u.f2x2.val[0]);
+ return u.f4;
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kWXZY>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ U u; u.f2x2 = vtrn_f32(vrev64_f32(vget_high_f32(p)), vget_low_f32(p)); return u.f4;
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kWYXZ>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1]), vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0]);
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kWWWZ>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vdup_lane_f32(vget_high_f32(p), 1), vrev64_f32(vget_high_f32(p)));
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kWWZZ>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ U u; u.f2x2 = vtrn_f32(vget_high_f32(p), vget_high_f32(p));
+ return vreinterpretq_f32_u32(vextq_u32(vreinterpretq_u32_f32(u.f4), vreinterpretq_u32_f32(u.f4), 2));
+ }
+ LHS_FUNCTION()
+};
+
+template<> struct Vswizzle<kWZYX>
+{
+ static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
+ {
+ return vcombine_f32(vrev64_f32(vget_high_f32(p)), vrev64_f32(vget_low_f32(p)));
+ }
+ LHS_FUNCTION()
+};
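+
+// Illustrative example: with p = (x,y,z,w) and q = (q0,q1,q2,q3),
+//   Vswizzle<kZWXY>::rhs(p)    -> (z, w, x, y)
+//   Vswizzle<kZWXY>::lhs(p, q) -> swizzles p, splices q's lane 0 in, and
+//   swizzles again; for a self-inverse mask such as kZWXY this yields
+//   (x, y, q0, w), i.e. p with the lane the swizzle reads first replaced.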
+
+static MECANIM_FORCE_INLINE float Vstoresf(vec4f r)
+{
+ return vgetq_lane_f32(r, 0);
+}
+
+static MECANIM_FORCE_INLINE bool Vstoresb(vec4b r)
+{
+ return vgetq_lane_u32(r, 0) != 0;
+}
+
+// Aligned store of all four lanes to base + offset.
+#define Vstorepf(v, base, offset) vst1q_f32((float32_t*)((base)+(offset)), (v))
+
+static MECANIM_FORCE_INLINE void Vstorepb(const vec4b v, bool* r)
+{
+ ::uint32_t u;
+ vst1q_lane_u32(&u, v, 0); r[0] = u != 0;
+ vst1q_lane_u32(&u, v, 1); r[1] = u != 0;
+ vst1q_lane_u32(&u, v, 2); r[2] = u != 0;
+ vst1q_lane_u32(&u, v, 3); r[3] = u != 0;
+}
+
+static MECANIM_FORCE_INLINE vec4f Vloadsf(float s)
+{
+ return vmovq_n_f32(s);
+}
+
+static MECANIM_FORCE_INLINE vec4b Vloadsb(bool s)
+{
+ const ::uint32_t false_true[2] = { 0, 0xFFFFFFFF };
+ return vdupq_n_u32(false_true[s ? 1 : 0]);
+}
+
+static MECANIM_FORCE_INLINE vec4f Vload4sf(float x, float y, float z, float w)
+{
+ float32x4_t result;
+ result = vdupq_n_f32(x);
+ result = vsetq_lane_f32(y, result, 1);
+ result = vsetq_lane_f32(z, result, 2);
+ result = vsetq_lane_f32(w, result, 3);
+ return result;
+}
+
+static MECANIM_FORCE_INLINE vec4b Vload4sb(bool x, bool y, bool z, bool w)
+{
+ const ::uint32_t val[4] =
+ {
+ x ? 0xffffffff : 0x00,
+ y ? 0xffffffff : 0x00,
+ z ? 0xffffffff : 0x00,
+ w ? 0xffffffff : 0x00
+ };
+
+ return vld1q_u32(&val[0]);
+}
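+
+// Usage sketch (illustrative): boolean lanes are full 32-bit masks,
+//   vec4b m = Vload4sb(true, false, true, false);  // lanes {~0u, 0, ~0u, 0}
+//   bool out[4]; Vstorepb(m, out);                 // out = {true, false, true, false}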
+
+static MECANIM_FORCE_INLINE vec4f Vloadpf(float const* buf, int offset)
+{
+ return vld1q_f32((float32_t const*)buf + offset);
+}
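+
+// Usage sketch (illustrative; src/dst are ours): copy four floats through a
+// quad register,
+//   vec4f v = Vloadpf(src, 0);
+//   Vstorepf(v, dst, 0);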
+
+#define Vadd(l, r) vaddq_f32(l, r)
+#define Vsub(l, r) vsubq_f32(l, r)
+#define Vmul(l, r) vmulq_f32(l, r)
+
+
+// return a*b+c : be aware that vmlaq_f32(a, b, c) computes a + b*c
+#define Vmadd(a, b, c) vmlaq_f32(c, a, b)
+// return a*b-c : be aware that vmlsq_f32(a, b, c) computes a - b*c
+#define Vmsub(a, b, c) Vneg(vmlsq_f32(c, a, b))
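+
+// Example (illustrative helper, not part of the original interface): linear
+// interpolation a + t*(b - a) expressed with Vmadd.
+static MECANIM_FORCE_INLINE vec4f VlerpExample(vec4f a, vec4f b, vec4f t)
+{
+ return Vmadd(t, Vsub(b, a), a); // t*(b-a) + a
+}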
+
+// Negate by flipping the IEEE sign bit (note: Vneg(0.0f) is -0.0f).
+static MECANIM_FORCE_INLINE vec4f Vneg(vec4f r)
+{
+ uint32x4_t sign_constant = vdupq_n_u32(0x80000000);
+ uint32x4_t negated = veorq_u32(vreinterpretq_u32_f32(r), sign_constant);
+ return vreinterpretq_f32_u32(negated);
+}
+
+// vector sgn: return -1, 1
+static MECANIM_FORCE_INLINE vec4f Vsgn(vec4f r)
+{
+ uint32x4_t sign_constant = vdupq_n_u32(0x80000000);
+ uint32x4_t signs = vandq_u32(vreinterpretq_u32_f32(r), sign_constant);
+ uint32x4_t ones = vdupq_n_u32 (0x3f800000);
+
+ return vreinterpretq_f32_u32(vorrq_u32(signs,ones));
+/* float32x4_t ones = Vone();
+ float32x4_t nones = Vneg(ones);
+ uint32x4_t cmp = vcltq_f32(r,Vzero());
+ return vbslq_f32(cmp,nones,ones);*/
+}
+
+// vector sgn: return -1, 0, 1
+static MECANIM_FORCE_INLINE vec4f Vsign(vec4f r)
+{
+ uint32x4_t sign_constant = vdupq_n_u32(0x80000000);
+ uint32x4_t signs = vandq_u32(vreinterpretq_u32_f32(r), sign_constant);
+ uint32x4_t ones = vdupq_n_u32 (0x3f800000);
+
+ return vreinterpretq_f32_u32(vorrq_u32( signs, vandq_u32( vmvnq_u32( vceqq_f32( r, Vzero())), ones)));
+}
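+
+// Worked example (illustrative): for r = (-2.5f, 0.0f, 3.0f, 7.0f)
+//   Vsgn(r)  -> (-1, 1, 1, 1)   (sign bit copied onto 1.0f; note Vsgn(-0.0f) is -1)
+//   Vsign(r) -> (-1, 0, 1, 1)   (lanes comparing equal to zero give 0)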
+
+#define Vinc(r) Vadd( (r), Vone())
+#define Vdec(r) Vsub( (r), Vone())
+
+static MECANIM_FORCE_INLINE vec4f Vabs(vec4f r)
+{
+ return vabsq_f32(r);
+}
+
+#define Vmax( l, r) vmaxq_f32(l, r)
+#define Vmin( l, r) vminq_f32(l, r)
+
+// Horizontal reductions: the scalar result is splatted across all four lanes.
+static MECANIM_FORCE_INLINE vec4fs Vlargest(vec4f r)
+{
+ float32x2_t temp = vpmax_f32 ( vget_high_f32(r), vget_low_f32(r) );
+ temp = vpmax_f32(temp, temp);
+ return vcombine_f32(temp,temp);
+}
+
+static MECANIM_FORCE_INLINE vec4fs Vsmallest(vec4f r)
+{
+ float32x2_t temp = vpmin_f32 ( vget_high_f32(r), vget_low_f32(r) );
+ temp = vpmin_f32(temp, temp);
+ return vcombine_f32(temp,temp);
+}
+
+static MECANIM_FORCE_INLINE vec4fs Vsum(vec4f r)
+{
+ float32x2_t temp = vpadd_f32 ( vget_high_f32(r), vget_low_f32(r) );
+ temp = vpadd_f32(temp, temp);
+ return vcombine_f32(temp,temp);
+}
+
+#define Vdot( l, r) Vsum( Vmul((l), (r)) )
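+
+// Usage sketch (illustrative): Vsum splats the reduction across all lanes, so
+// a scalar dot product reads back lane 0:
+//   float d = Vstoresf(Vdot(a, b));  // a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w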
+
+// Reciprocal square root: hardware estimate refined by two Newton-Raphson
+// steps; vrsqrtsq_f32(a, b) computes (3 - a*b) / 2.
+static MECANIM_FORCE_INLINE vec4f Vrsqrt(vec4f r)
+{
+ float32x4_t e = vrsqrteq_f32(r); // initial estimate
+ float32x4_t s = vmulq_f32(e, r);
+ float32x4_t v = vrsqrtsq_f32(s, e); // (3 - r*e*e)/2
+
+ e = vmulq_f32(e, v); // first refinement
+ s = vmulq_f32(e, r);
+ v = vrsqrtsq_f32(s, e);
+
+ return vmulq_f32(e, v); // second refinement
+}
+
+// Reciprocal: hardware estimate refined by two Newton-Raphson steps;
+// vrecpsq_f32(a, b) computes 2 - a*b. Clamping the estimate to +/-FLT_MAX
+// keeps the infinity that vrecpeq_f32 returns for r == 0 from producing
+// NaN (0 * inf) during refinement.
+static MECANIM_FORCE_INLINE vec4f Vrcp(vec4f r)
+{
+ cvec4fs(C0,-3.402823466e+38f); // -FLT_MAX
+ cvec4fs(C1, 3.402823466e+38f); // +FLT_MAX
+
+ float32x4_t R0 = vrecpeq_f32(r);
+ R0 = vmaxq_f32(R0, C0);
+ R0 = vminq_f32(R0, C1);
+
+ float32x4_t R1 = vrecpsq_f32(r, R0);
+ R0 = vmulq_f32(R0, R1);
+ R0 = vmaxq_f32(R0, C0);
+ R0 = vminq_f32(R0, C1);
+ R1 = vrecpsq_f32(r, R0);
+ return vmulq_f32(R0, R1);
+
+ //float32x4_t inv = vrecpeq_f32(r);
+ //float32x4_t step = vrecpsq_f32(r, inv);
+ //return vmulq_f32(step, inv);
+}
+
+// Division as l * (1/r) via the approximate reciprocal above (not IEEE-exact).
+static MECANIM_FORCE_INLINE vec4f Vdiv(const vec4f l, const vec4f r)
+{
+ return Vmul(l, Vrcp(r));
+}
+
+// Gathers one lane from each of four vectors into (x', y', z', w'). Note it
+// reads lane 2 of each input, which matches lane 0 for splatted scalars such
+// as the results of Vsum/Vlargest/Vsmallest.
+static MECANIM_FORCE_INLINE vec4f Vcombine(vec4f x, vec4f y, vec4f z, vec4f w)
+{
+ float32x2x2_t temp1 = vtrn_f32(vget_high_f32(x), vget_high_f32(y));
+ float32x2x2_t temp2 = vtrn_f32(vget_high_f32(z), vget_high_f32(w));
+ return vcombine_f32(temp1.val[0], temp2.val[0]);
+}
+
+// Vector comparison
+#define Vcmpeq( a, b) vceqq_f32(a, b)
+#define Vcmpneq( a, b) Vnot(vceqq_f32(a, b))
+#define Vcmpgt( a, b) vcgtq_f32(a, b)
+#define Vcmpge( a, b) vcgeq_f32(a, b)
+#define Vcmplt( a, b) vcltq_f32(a, b)
+#define Vcmple( a, b) vcleq_f32(a, b)
+
+// Bitwise select: bits set in c take their value from a, clear bits from b.
+static MECANIM_FORCE_INLINE vec4f Vsel(vec4b c, vec4f a, vec4f b)
+{
+ return vbslq_f32(c, a, b);
+}
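+
+// Usage sketch (illustrative): per-lane clamp to zero,
+//   vec4f nonNegative = Vsel(Vcmpge(v, Vzero()), v, Vzero());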
+
+// sqrt(r) as r * rsqrt(r); the select guards r == 0 lanes, where rsqrt(0)
+// is +inf and 0 * inf would otherwise yield NaN.
+#define Vsqrt(r) Vsel( Vcmpeq(r, Vzero()), Vzero(), Vmul(r,Vrsqrt(r)))
+
+// vector logics
+#define Vnot(r) vmvnq_u32(r)
+#define Vxnor(a, b) Vnot(veorq_u32(a, b))
+#define Vxor(a, b) veorq_u32(a, b)
+#define Vand(a, b) vandq_u32(a, b)
+#define Vor(a, b) vorrq_u32(a, b)
+
+// True when every lane of the mask is set (lanes are all-ones or all-zero).
+static MECANIM_FORCE_INLINE bool Vall(const vec4b a)
+{
+ ::uint32_t u[4];
+ vst1q_u32(u, a);
+ return (u[0] & u[1] & u[2] & u[3]) != 0;
+}
+
+// True when at least one lane of the mask is set.
+static MECANIM_FORCE_INLINE bool Vany(const vec4b a)
+{
+ ::uint32_t u[4];
+ vst1q_u32(u, a);
+ return (u[0] | u[1] | u[2] | u[3]) != 0;
+}
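+
+// Usage sketch (illustrative; threshold is ours): branch on a vector compare,
+//   if (Vany(Vcmpgt(v, threshold))) { /* at least one lane exceeded */ }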
+
+#endif