#ifndef SIMD_INTRINSIC_H
#define SIMD_INTRINSIC_H
/* This is the Math library back-end interface.
When declaring functions, always return results by value; you want to be sure that SIMD values stay in registers, otherwise you may get poor performance when the CPU needs to push registers back into memory.
Vector data is declared as a plain typedef, e.g. typedef __m128 vec4f. Most compilers won't recognize a vector type encapsulated in a class and will therefore generate more temporaries and push vectors back to memory; see the contrast sketched below.
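For illustration only (the wrapped type below is a hypothetical name, not part of the interface):
    typedef __m128 vec4f;              // plain typedef: compilers keep it in a SIMD register
    struct Vec4Wrapped { __m128 v; };  // wrapped form: often spilled back to memory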
To support a new platform you need to provide at least the following function set:
typedef __m128 vec4f; // vector 4 float packed
typedef __m128 vec4fs; // vector 4 float scalar
typedef __m128 vec4b; // vector 4 bool packed
typedef __m128 vec4bs; // vector 4 bool scalar
#define Vzero()
#define Vone()
#define Vpermute(v, mask)
#define Vmove(l, r)
// This template is part of the back-end because some instruction sets support swizzle operations that can be specialized, like the Xbox VMX rotate instruction that is used in the dot product
template<int SWZ> struct Vswizzle
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
    {
        return Vpermute(r, SWZ);
    }
    static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r)
    {
        return Vswizzle<SWZ>::rhs(Vmove(Vswizzle<SWZ>::rhs(l), r));
    }
};
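// Illustrative sketch only (not part of the required interface): a platform with a native
// rotate instruction could specialize the template instead of going through Vpermute.
// The constant SWZ_YZWX and the intrinsic native_rotate_left are hypothetical names.
template<> struct Vswizzle<SWZ_YZWX>
{
    static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) { return native_rotate_left(r); }
    static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r) { return rhs(Vmove(rhs(l), r)); }
};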
// Aligned store: store the vector v as 4 floats at address base
#define Vstorepf(v, base, offset)
// Return component x as a float
#define Vstoresf(r)
// Return component x as a bool
#define Vstoresb(r)
// Aligned store: store the vector as 4 bools at the given address
#define Vstorepb(vec4f v, bool* r)
// Aligned load: load 4 floats at address v into a vector register
#define Vloadpf(v, offset)
// Load a float value into a vector register and replicate it in all components
#define Vloadsf(s)
// Load a bool value into a vector register and replicate it in all components
#define Vloadsb(s)
// Load 4 float values into a vector register
#define Vload4sf(x, y, z, w)
// Load 4 bool values into a vector register
#define Vload4sb(x, y, z, w)
#define Vadd(l, r)
#define Vsub(l, r)
#define Vmul(l, r)
#define Vdiv(l, r)
#define Vmadd(a, b, c)
#define Vmsub(a, b, c)
#define Vneg(r)
// Vector sgn: returns -1 or 1
#define Vsgn(r)
// Vector sign: returns -1, 0, or 1
#define Vsign(r)
#define Vinc(r)
#define Vdec(r)
#define Vabs(r)
#define Vmax(l, r)
#define Vmin(l, r)
// Return the largest of the 4 components
#define Vlargest(r)
// Return the smallest of the 4 components
#define Vsmallest(r)
#define Vsum(r)
#define Vdot( l, r)
#define Vsqrt(r)
#define Vrsqrt(r)
#define Vrcp(r)
// Merge the low bytes of the 4 vectors
#define Vcombine(x, y, z, w)
// Vector comparisons
#define Vcmpeq(a, b)
#define Vcmpneq(a, b)
#define Vcmpgt(a, b)
#define Vcmpge(a, b)
#define Vcmplt(a, b)
#define Vcmple(a, b)
#define Vsel(c, a, b)
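// Illustrative example, assuming the usual convention that Vsel picks a where the mask c is true:
//   Vsel(Vcmpgt(a, b), a, b) is a branchless per-component maximum, equivalent to Vmax(a, b)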
// Vector logic operations
#define Vnot(r)
#define Vxnor(a, b)
#define Vxor(a, b)
#define Vand(a, b)
#define Vor(a, b)
#define Vall(a)
#define Vany(a)
*/
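/* Illustrative sketch only, not part of this header: on an SSE platform the back-end
   (sse.h, included below) would typically map a subset of the interface to intrinsics,
   along these lines:

       #include <xmmintrin.h>
       typedef __m128 vec4f;
       #define Vzero()     _mm_setzero_ps()
       #define Vadd(l, r)  _mm_add_ps((l), (r))
       #define Vsub(l, r)  _mm_sub_ps((l), (r))
       #define Vmul(l, r)  _mm_mul_ps((l), (r))
       #define Vmin(l, r)  _mm_min_ps((l), (r))
       #define Vmax(l, r)  _mm_max_ps((l), (r))

   The real definitions, including the exact offset conventions for Vloadpf/Vstorepf,
   live in the platform headers selected below.
*/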
#if defined(__INTEL_COMPILER) || defined(__ICL) || defined(_MSC_VER)
#include <cstddef>
#define ATTRIBUTE_ALIGN(a) __declspec(align(a))
#define ALIGN4F 16
#define MECANIM_FORCE_INLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#include <cstddef>
#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#if ((__GNUC__ >= 3) && (__GNUC_MINOR__ >= 1)) || (__GNUC__ >= 4) || __has_attribute(always_inline)
#ifdef _DEBUG
#ifndef MECANIM_FORCE_INLINE
#define MECANIM_FORCE_INLINE inline
#endif
#else
#ifndef MECANIM_FORCE_INLINE
#define MECANIM_FORCE_INLINE inline __attribute__((always_inline))
#endif
#endif
#endif
#if defined(__GNUC__) || __has_attribute(aligned)
#define ATTRIBUTE_ALIGN(a) __attribute__ ((aligned(a)))
#endif
#define ALIGN4F 16
#endif
#ifndef MECANIM_FORCE_INLINE
#define MECANIM_FORCE_INLINE inline
#endif
#ifndef ATTRIBUTE_ALIGN
#define ATTRIBUTE_ALIGN(a)
#endif
#ifndef ALIGN4F
#define ALIGN4F 16
#endif
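// Usage sketch (illustrative): ATTRIBUTE_ALIGN is meant to be applied to vector-sized data
// so that aligned loads and stores are legal, e.g.
//   static ATTRIBUTE_ALIGN(ALIGN4F) const float kOne[4] = { 1.f, 1.f, 1.f, 1.f };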
#if UNITY_FORCE_FPU
#include "Runtime/Math/Simd/fpu.h"
#elif UNITY_XENON
#include "Runtime/Math/Simd/xenon.h"
#elif UNITY_PS3
#include "Runtime/Math/Simd/ppu.h"
#elif UNITY_WIN && UNITY_SUPPORTS_SSE
#include "Runtime/Math/Simd/sse.h"
#elif UNITY_OSX
#include "Runtime/Math/Simd/sse.h"
#elif UNITY_SUPPORTS_NEON && (!UNITY_ANDROID)
#include "Runtime/Math/Simd/neon.h"
#else
#include "Runtime/Math/Simd/fpu.h"
#endif
//#define DEBUG_SIMD_ASSERT_IF 1
#if DEBUG_SIMD_ASSERT_IF
#define SIMD_ASSERT_IF(x) AssertIf(x)
#else
#define SIMD_ASSERT_IF(x)
#endif
#endif