author     chai <chaifix@163.com>    2019-08-14 22:50:43 +0800
committer  chai <chaifix@163.com>    2019-08-14 22:50:43 +0800
commit     15740faf9fe9fe4be08965098bbf2947e096aeeb (patch)
tree       a730ec236656cc8cab5b13f088adfaed6bb218fb    /Runtime/Math/Simd/neon.h
Diffstat (limited to 'Runtime/Math/Simd/neon.h')
-rw-r--r--    Runtime/Math/Simd/neon.h    548
1 file changed, 548 insertions, 0 deletions
diff --git a/Runtime/Math/Simd/neon.h b/Runtime/Math/Simd/neon.h
new file mode 100644
index 0000000..08196f9
--- /dev/null
+++ b/Runtime/Math/Simd/neon.h
@@ -0,0 +1,548 @@

#ifndef SIMD_NEON_H
#define SIMD_NEON_H

#include <arm_neon.h>

typedef float32x4_t vec4f;
typedef float32x4_t vec4fs;
typedef uint32x4_t  vec4b;
typedef uint32x4_t  vec4bs;

// Swizzle masks pack one 2-bit source-lane index per destination lane:
// x in bits 0-1, y in bits 2-3, z in bits 4-5, w in bits 6-7.
#define SWZ_MASK(x, y, z, w) (((w) << 6) | ((z) << 4) | ((y) << 2) | ((x)))
#define SWZ_X(MASK) (((MASK) >> 0) & 3)
#define SWZ_Y(MASK) (((MASK) >> 2) & 3)
#define SWZ_Z(MASK) (((MASK) >> 4) & 3)
#define SWZ_W(MASK) (((MASK) >> 6) & 3)

//VPERMWI_CONST(x, y, z, w)
#if UNITY_WINRT
#define cvec4f(name, x,y,z,w) static const vec4f name = Vload4sf(x, y, z, w)
#define cvec4b(name, x,y,z,w) static const vec4b name = Vload4sb(x, y, z, w)
#define cvec4fs(name, s)      static const vec4fs name = Vloadsf(s)
#else
#define cvec4f(name, x,y,z,w) static const vec4f name = {(x),(y),(z),(w)}
#define cvec4b(name, x,y,z,w) static const vec4b name = {(x),(y),(z),(w)}
#define cvec4fs(name, s)      static const vec4fs name = {(s),(s),(s),(s)}
#endif

enum simd_mask
{
    kXYZW = SWZ_MASK(0,1,2,3),
    kXXXX = SWZ_MASK(0,0,0,0),
    kYYYY = SWZ_MASK(1,1,1,1),
    kZZZZ = SWZ_MASK(2,2,2,2),
    kWWWW = SWZ_MASK(3,3,3,3),

    kXWYZ = SWZ_MASK(0,3,1,2),
    kXZWY = SWZ_MASK(0,2,3,1),

    kYZWX = SWZ_MASK(1,2,3,0),
    kYXZW = SWZ_MASK(1,0,2,3),
    kYWZX = SWZ_MASK(1,3,2,0),
    kYZXW = SWZ_MASK(1,2,0,3),
    kYXWZ = SWZ_MASK(1,0,3,2),

    kZWXY = SWZ_MASK(2,3,0,1),
    kZYXW = SWZ_MASK(2,1,0,3),
    kZYWX = SWZ_MASK(2,1,3,0),
    kZXYW = SWZ_MASK(2,0,1,3),

    kWYZX = SWZ_MASK(3,1,2,0),
    kWXZY = SWZ_MASK(3,0,2,1),
    kWYXZ = SWZ_MASK(3,1,0,2),
    kWWWZ = SWZ_MASK(3,3,3,2),
    kWWZZ = SWZ_MASK(3,3,2,2),
    kWZYX = SWZ_MASK(3,2,1,0),
};

#define Vzero()  vdupq_n_f32(0.0f)
#define Vone()   vdupq_n_f32(1.0f)

#define Vfalse() vdupq_n_u32(0)
// All-bits-set boolean mask; vec4b is uint32x4_t, so build it with the u32 intrinsic
// (vdupq_n_f32(0xFFFFFFFF) would produce a float vector, not an all-ones mask).
#define Vtrue()  vdupq_n_u32(0xFFFFFFFF)

union U { float32x2x2_t f2x2; float32x4_t f4; uint8x8x2_t b8x2; float32_t f[4]; };

#define LHS_FUNTION() \
    static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r)\
    {\
        vec4f m = Vmove(rhs(l), r);\
        return rhs(m);\
    }

//#define Vpermute(v, mask) v
//#define Vmove(l, r) vextq_f32(l, r, 0)
MECANIM_FORCE_INLINE vec4f Vmove(vec4f l, vec4f r)
{
    uint32x4_t sel = Vfalse();
    sel = vsetq_lane_u32(0xFFFFFFFF, sel, 0);
    return vbslq_f32(sel, r, l);
}

template<int SWZ> struct Vswizzle;
/*
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
    {
        ::uint32_t lanes[4];
        uint32x4_t u = vreinterpretq_u32_f32(r);
        uint32x4_t result;

        lanes[0] = vgetq_lane_u32(u, 0);
        lanes[1] = vgetq_lane_u32(u, 1);
        lanes[2] = vgetq_lane_u32(u, 2);
        lanes[3] = vgetq_lane_u32(u, 3);

        result = vdupq_n_u32(lanes[SWZ_X(SWZ)]);
        result = vsetq_lane_u32(lanes[SWZ_Y(SWZ)], result, 1);
        result = vsetq_lane_u32(lanes[SWZ_Z(SWZ)], result, 2);
        result = vsetq_lane_u32(lanes[SWZ_W(SWZ)], result, 3);

        return vreinterpretq_f32_u32(result);
    }

    static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r)
    {
        vec4f m = Vmove(Vswizzle<SWZ>::rhs(l), r);
        return Vswizzle<SWZ>::rhs(m);
    }
};
*/

template<> struct Vswizzle<kXYZW>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
    {
        return r;
    }
    static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r)
    {
        return Vmove(l, r);
    }
};
template<> struct Vswizzle<kXXXX>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
    {
        return vdupq_lane_f32(vget_low_f32(r), 0);
    }
    LHS_FUNTION()
};
template<> struct Vswizzle<kYYYY>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
    {
        return vdupq_lane_f32(vget_low_f32(r), 1);
    }
    LHS_FUNTION()
};
template<> struct Vswizzle<kZZZZ>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
    {
        return vdupq_lane_f32(vget_high_f32(r), 0);
    }
    LHS_FUNTION()
};
template<> struct Vswizzle<kWWWW>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
    {
        return vdupq_lane_f32(vget_high_f32(r), 1);
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kXWYZ>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        U u; u.f2x2 = vtrn_f32(vget_low_f32(p), vrev64_f32(vget_high_f32(p))); return u.f4;
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kXZWY>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0], vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1]));
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kYZWX>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vreinterpretq_f32_u32(vextq_u32(vreinterpretq_u32_f32(p), vreinterpretq_u32_f32(p), 1));
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kYXZW>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vrev64_f32(vget_low_f32(p)), vget_high_f32(p));
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kYWZX>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1], vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0]));
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kYZXW>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        U u;
        u.f2x2 = vtrn_f32(vrev64_f32(vget_low_f32(p)), vget_high_f32(p));
        return u.f4;
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kYXWZ>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vrev64q_f32(p);
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kZWXY>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vget_high_f32(p), vget_low_f32(p));
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kZYXW>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vrev64_f32(vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_low_f32(p)), vreinterpret_u32_f32(vget_high_f32(p)), 1))), vrev64_f32(vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_high_f32(p)), vreinterpret_u32_f32(vget_low_f32(p)), 1))));
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kZYWX>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vrev64_f32(vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_low_f32(p)), vreinterpret_u32_f32(vget_high_f32(p)), 1))), vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_high_f32(p)), vreinterpret_u32_f32(vget_low_f32(p)), 1)));
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kZXYW>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0]), vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1]);
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kWYZX>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        U u;
        u.f4 = vrev64q_f32(p);
        u.f2x2 = vtrn_f32(u.f2x2.val[1], u.f2x2.val[0]);
        return u.f4;
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kWXZY>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        U u; u.f2x2 = vtrn_f32(vrev64_f32(vget_high_f32(p)), vget_low_f32(p)); return u.f4;
    }
    LHS_FUNTION()
};
template<> struct Vswizzle<kWYXZ>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1]), vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0]);
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kWWWZ>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vdup_lane_f32(vget_high_f32(p), 1), vrev64_f32(vget_high_f32(p)));
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kWWZZ>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        U u; u.f2x2 = vtrn_f32(vget_high_f32(p), vget_high_f32(p));
        return vreinterpretq_f32_u32(vextq_u32(vreinterpretq_u32_f32(u.f4), vreinterpretq_u32_f32(u.f4), 2));
    }
    LHS_FUNTION()
};

template<> struct Vswizzle<kWZYX>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f p)
    {
        return vcombine_f32(vrev64_f32(vget_high_f32(p)), vrev64_f32(vget_low_f32(p)));
    }
    LHS_FUNTION()
};

static MECANIM_FORCE_INLINE float Vstoresf(vec4f r)
{
    return vgetq_lane_f32(r, 0);
}

static MECANIM_FORCE_INLINE bool Vstoresb(vec4b r)
{
    return (vgetq_lane_u32(r, 0) > 0) ? true : false;
}

// Aligned store
#define Vstorepf(v, base, offset) vst1q_f32((float32_t*)((base)+(offset)), v);

static MECANIM_FORCE_INLINE void Vstorepb(const vec4b v, bool* r)
{
    ::uint32_t u;
    vst1q_lane_u32(&u, v, 0);
    r[0] = (u > 0) ? true : false;
    vst1q_lane_u32(&u, v, 1);
    r[1] = (u > 0) ? true : false;
    vst1q_lane_u32(&u, v, 2);
    r[2] = (u > 0) ? true : false;
    vst1q_lane_u32(&u, v, 3);
    r[3] = (u > 0) ? true : false;
}

static MECANIM_FORCE_INLINE vec4f Vloadsf(float s)
{
    return vmovq_n_f32(s);
}

static MECANIM_FORCE_INLINE vec4b Vloadsb(bool s)
{
    const ::uint32_t false_true[2] = { 0, 0xFFFFFFFF };
    return vdupq_n_u32(false_true[s ? 1 : 0]);
}

static MECANIM_FORCE_INLINE vec4f Vload4sf(float x, float y, float z, float w)
{
    float32x4_t result;
    result = vdupq_n_f32(x);
    result = vsetq_lane_f32(y, result, 1);
    result = vsetq_lane_f32(z, result, 2);
    result = vsetq_lane_f32(w, result, 3);
    return result;
}
static MECANIM_FORCE_INLINE vec4b Vload4sb(bool x, bool y, bool z, bool w)
{
    const ::uint32_t val[4] =
    {
        x ? 0xffffffff : 0x00,
        y ? 0xffffffff : 0x00,
        z ? 0xffffffff : 0x00,
        w ? 0xffffffff : 0x00
    };

    return vld1q_u32(&val[0]);
}

static MECANIM_FORCE_INLINE vec4f Vloadpf(float const* buf, int offset)
{
    return vld1q_f32((float32_t const*)buf + offset);
}

#define Vadd(l, r) vaddq_f32(l, r)
#define Vsub(l, r) vsubq_f32(l, r)
#define Vmul(l, r) vmulq_f32(l, r)

// return a*b+c : be aware that vmlaq_f32(c, a, b) computes c + a*b (accumulator first)
#define Vmadd(a, b, c) vmlaq_f32(c, a, b)
// return a*b-c : be aware that vmlsq_f32(c, a, b) computes c - a*b, hence the Vneg
#define Vmsub(a, b, c) Vneg(vmlsq_f32(c, a, b))

static MECANIM_FORCE_INLINE vec4f Vneg(vec4f r)
{
    uint32x4_t sign_constant = vdupq_n_u32(0x80000000);
    uint32x4_t negated = veorq_u32(vreinterpretq_u32_f32(r), sign_constant);
    return vreinterpretq_f32_u32(negated);
}

// vector sgn: return -1, 1
static MECANIM_FORCE_INLINE vec4f Vsgn(vec4f r)
{
    uint32x4_t sign_constant = vdupq_n_u32(0x80000000);
    uint32x4_t signs = vandq_u32(vreinterpretq_u32_f32(r), sign_constant);
    uint32x4_t ones = vdupq_n_u32(0x3f800000);

    return vreinterpretq_f32_u32(vorrq_u32(signs, ones));
/*  float32x4_t ones = Vone();
    float32x4_t nones = Vneg(ones);
    uint32x4_t cmp = vcltq_f32(r, Vzero());
    return vbslq_f32(cmp, nones, ones);*/
}

// vector sign: return -1, 0, 1
static MECANIM_FORCE_INLINE vec4f Vsign(vec4f r)
{
    uint32x4_t sign_constant = vdupq_n_u32(0x80000000);
    uint32x4_t signs = vandq_u32(vreinterpretq_u32_f32(r), sign_constant);
    uint32x4_t ones = vdupq_n_u32(0x3f800000);

    return vreinterpretq_f32_u32(vorrq_u32(signs, vandq_u32(vmvnq_u32(vceqq_f32(r, Vzero())), ones)));
}

#define Vinc(r) Vadd((r), Vone())
#define Vdec(r) Vsub((r), Vone())

static MECANIM_FORCE_INLINE vec4f Vabs(vec4f r)
{
    return vabsq_f32(r);
}

#define Vmax(l, r) vmaxq_f32(l, r)
#define Vmin(l, r) vminq_f32(l, r)

static MECANIM_FORCE_INLINE vec4fs Vlargest(vec4f r)
{
    float32x2_t temp = vpmax_f32(vget_high_f32(r), vget_low_f32(r));
    temp = vpmax_f32(temp, temp);
    return vcombine_f32(temp, temp);
}

static MECANIM_FORCE_INLINE vec4fs Vsmallest(vec4f r)
{
    float32x2_t temp = vpmin_f32(vget_high_f32(r), vget_low_f32(r));
    temp = vpmin_f32(temp, temp);
    return vcombine_f32(temp, temp);
}

static MECANIM_FORCE_INLINE vec4fs Vsum(vec4f r)
{
    float32x2_t temp = vpadd_f32(vget_high_f32(r), vget_low_f32(r));
    temp = vpadd_f32(temp, temp);
    return vcombine_f32(temp, temp);
}

#define Vdot(l, r) Vsum(Vmul((l), (r)))

static MECANIM_FORCE_INLINE vec4f Vrsqrt(vec4f r)
{
    // Two Newton-Raphson refinement steps on the vrsqrteq_f32 estimate.
    float32x4_t e = vrsqrteq_f32(r);
    float32x4_t s = vmulq_f32(e, r);
    float32x4_t v = vrsqrtsq_f32(s, e);

    e = vmulq_f32(e, v);
    s = vmulq_f32(e, r);
    v = vrsqrtsq_f32(s, e);

    return vmulq_f32(e, v);
}

static MECANIM_FORCE_INLINE vec4f Vrcp(vec4f r)
{
    cvec4fs(C0, -3.402823466e+38f);
    cvec4fs(C1,  3.402823466e+38f);

    float32x4_t R0 = vrecpeq_f32(r);
    R0 = vmaxq_f32(R0, C0);
    R0 = vminq_f32(R0, C1);

    float32x4_t R1 = vrecpsq_f32(r, R0);
    R0 = vmulq_f32(R0, R1);
    R0 = vmaxq_f32(R0, C0);
    R0 = vminq_f32(R0, C1);
    R1 = vrecpsq_f32(r, R0);
    return vmulq_f32(R0, R1);

    //float32x4_t inv = vrecpeq_f32(r);
    //float32x4_t step = vrecpsq_f32(r, inv);
    //return vmulq_f32(step, inv);
}

static MECANIM_FORCE_INLINE vec4f Vdiv(const vec4f l, const vec4f r)
{
    return Vmul(l, Vrcp(r));
}
static MECANIM_FORCE_INLINE vec4f Vcombine(vec4f x, vec4f y, vec4f z, vec4f w)
{
    float32x2x2_t temp1 = vtrn_f32(vget_high_f32(x), vget_high_f32(y));
    float32x2x2_t temp2 = vtrn_f32(vget_high_f32(z), vget_high_f32(w));
    return vcombine_f32(temp1.val[0], temp2.val[0]);
}

// Vector comparison
#define Vcmpeq(a, b)  vceqq_f32(a, b)
#define Vcmpneq(a, b) Vnot(vceqq_f32(a, b))
#define Vcmpgt(a, b)  vcgtq_f32(a, b)
#define Vcmpge(a, b)  vcgeq_f32(a, b)
#define Vcmplt(a, b)  vcltq_f32(a, b)
#define Vcmple(a, b)  vcleq_f32(a, b)

static MECANIM_FORCE_INLINE vec4f Vsel(vec4b c, vec4f a, vec4f b)
{
    return vbslq_f32(c, a, b);
}

#define Vsqrt(r) Vsel(Vcmpeq(r, Vzero()), Vzero(), Vmul(r, Vrsqrt(r)))

// vector logics
#define Vnot(r)     vmvnq_u32(r)
#define Vxnor(a, b) Vnot(veorq_u32(a, b))
#define Vxor(a, b)  veorq_u32(a, b)
#define Vand(a, b)  vandq_u32(a, b)
#define Vor(a, b)   vorrq_u32(a, b)

static MECANIM_FORCE_INLINE bool Vall(const vec4b a)
{
    ::uint32_t u[4];

    vst1q_lane_u32(&u[0], a, 0);
    vst1q_lane_u32(&u[1], a, 1);
    vst1q_lane_u32(&u[2], a, 2);
    vst1q_lane_u32(&u[3], a, 3);

    return (u[0] & u[1] & u[2] & u[3]);
}

static MECANIM_FORCE_INLINE bool Vany(const vec4b a)
{
    ::uint32_t u[4];

    vst1q_lane_u32(&u[0], a, 0);
    vst1q_lane_u32(&u[1], a, 1);
    vst1q_lane_u32(&u[2], a, 2);
    vst1q_lane_u32(&u[3], a, 3);

    return (u[0] | u[1] | u[2] | u[3]);
}

#endif
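
Note on usage: the header above is a thin wrapper layer (vec4f/vec4b typedefs, load/store helpers, arithmetic macros, and Vswizzle specializations) over the raw arm_neon.h intrinsics. The snippet below is a minimal usage sketch, not part of this commit; it assumes an ARM target compiled with NEON enabled, includes <stdint.h> for the ::uint32_t spelling the header relies on, and stubs MECANIM_FORCE_INLINE (normally supplied by the surrounding Mecanim build headers) to plain inline.

// Hypothetical usage sketch for the wrappers in neon.h (illustration only).
#include <stdint.h>
#include <stdio.h>

#ifndef MECANIM_FORCE_INLINE
#define MECANIM_FORCE_INLINE inline   // stub; the real definition comes from the Mecanim build config
#endif

#include "Runtime/Math/Simd/neon.h"

int main()
{
    vec4f a = Vload4sf(1.0f, 2.0f, 3.0f, 4.0f);
    vec4f b = Vload4sf(4.0f, 3.0f, 2.0f, 1.0f);

    vec4f  mad  = Vmadd(a, b, Vone());          // per-lane a*b + 1
    vec4fs dot  = Vdot(a, b);                   // horizontal sum of a*b, broadcast to all lanes
    vec4f  n    = Vmul(a, Vrsqrt(Vdot(a, a)));  // approximate normalization via reciprocal sqrt
    vec4f  yyyy = Vswizzle<kYYYY>::rhs(a);      // broadcast the y lane

    bool anyGreater = Vany(Vcmpgt(a, b));       // lane-wise compare reduced to a scalar bool

    printf("dot=%f mad.x=%f n.x=%f yyyy.x=%f any=%d\n",
           Vstoresf(dot), Vstoresf(mad), Vstoresf(n), Vstoresf(yyyy), (int)anyGreater);
    return 0;
}

The Vsqrt macro also shows why Vrsqrt matters here: sqrt(x) is computed as x * rsqrt(x), with the zero lanes selected back to zero through Vsel so the estimate never divides by zero.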