diff options
Diffstat (limited to 'Runtime/Math')
49 files changed, 15246 insertions, 0 deletions
diff --git a/Runtime/Math/AnimationCurve.cpp b/Runtime/Math/AnimationCurve.cpp new file mode 100644 index 0000000..07b976c --- /dev/null +++ b/Runtime/Math/AnimationCurve.cpp @@ -0,0 +1,964 @@ +#include "UnityPrefix.h" +#include "AnimationCurve.h" +#include "Runtime/Utilities/LogAssert.h" +using namespace std; + +#define kOneThird (1.0F / 3.0F) +#define kMaxTan 5729577.9485111479F + +int ToInternalInfinity (int pre); +int FromInternalInfinity (int pre); + +int ToInternalInfinity (int pre) +{ + if (pre == kRepeat) + return AnimationCurve::kInternalRepeat; + else if (pre == kPingPong) + return AnimationCurve::kInternalPingPong; + else + return AnimationCurve::kInternalClamp; +} + +int FromInternalInfinity (int pre) +{ + if (pre == AnimationCurve::kInternalRepeat) + return kRepeat; + else if (pre == AnimationCurve::kInternalPingPong) + return kPingPong; + else + return kClampForever; +} + +template<class T> +KeyframeTpl<T>::KeyframeTpl (float t, const T& v) +{ + time = t; + value = v; + inSlope = Zero<T>(); + outSlope = Zero<T>(); + +#if UNITY_EDITOR + tangentMode = 0; +#endif +} + + +template<class T> +void AnimationCurveTpl<T>::InvalidateCache () +{ + m_Cache.time = std::numeric_limits<float>::infinity (); + m_Cache.index = 0; + m_ClampCache.time = std::numeric_limits<float>::infinity (); + m_ClampCache.index = 0; +} + +template<class T> +pair<float, float> AnimationCurveTpl<T>::GetRange () const +{ + if (!m_Curve.empty ()) + return make_pair (m_Curve[0].time, m_Curve.back ().time); + else + return make_pair (std::numeric_limits<float>::infinity (), -std::numeric_limits<float>::infinity ()); +} + + + +///@TODO: Handle step curves correctly +template<class T> +void AnimationCurveTpl<T>::EvaluateWithoutCache (float curveT, T& output)const +{ + DebugAssertIf (!IsValid ()); + curveT = WrapTime (curveT); + + int lhsIndex, rhsIndex; + FindIndexForSampling (m_Cache, curveT, lhsIndex, rhsIndex); + const Keyframe& lhs = m_Curve[lhsIndex]; + const Keyframe& rhs = 
m_Curve[rhsIndex]; + + float dx = rhs.time - lhs.time; + T m1; + T m2; + float t; + if (dx != 0.0F) + { + t = (curveT - lhs.time) / dx; + m1 = lhs.outSlope * dx; + m2 = rhs.inSlope * dx; + } + else + { + t = 0.0F; + m1 = Zero<T>(); + m2 = Zero<T>(); + } + + output = HermiteInterpolate (t, lhs.value, m1, m2, rhs.value); + HandleSteppedCurve(lhs, rhs, output); + DebugAssertIf(!IsFinite(output)); +} + +template<class T> +inline void EvaluateCache (const typename AnimationCurveTpl<T>::Cache& cache, float curveT, T& output) +{ +// DebugAssertIf (curveT < cache.time - kCurveTimeEpsilon || curveT > cache.timeEnd + kCurveTimeEpsilon); + float t = curveT - cache.time; + output = (t * (t * (t * cache.coeff[0] + cache.coeff[1]) + cache.coeff[2])) + cache.coeff[3]; + DebugAssertIf (!IsFinite(output)); +} + +void SetupStepped (float* coeff, const KeyframeTpl<float>& lhs, const KeyframeTpl<float>& rhs) +{ + // If either of the tangents in the segment are set to stepped, make the constant value equal the value of the left key + if (lhs.outSlope == std::numeric_limits<float>::infinity() || rhs.inSlope == std::numeric_limits<float>::infinity()) + { + coeff[0] = 0.0F; + coeff[1] = 0.0F; + coeff[2] = 0.0F; + coeff[3] = lhs.value; + } +} + +void HandleSteppedCurve (const KeyframeTpl<float>& lhs, const KeyframeTpl<float>& rhs, float& value) +{ + if (lhs.outSlope == std::numeric_limits<float>::infinity() || rhs.inSlope == std::numeric_limits<float>::infinity()) + value = lhs.value; +} + +void HandleSteppedTangent (const KeyframeTpl<float>& lhs, const KeyframeTpl<float>& rhs, float& tangent) +{ + if (lhs.outSlope == std::numeric_limits<float>::infinity() || rhs.inSlope == std::numeric_limits<float>::infinity()) + tangent = std::numeric_limits<float>::infinity(); +} + + +void SetupStepped (Vector3f* coeff, const KeyframeTpl<Vector3f>& lhs, const KeyframeTpl<Vector3f>& rhs) +{ + for (int i=0;i<3;i++) + { + // If either of the tangents in the segment are set to stepped, make the constant 
value equal the value of the left key + if (lhs.outSlope[i] == std::numeric_limits<float>::infinity() || rhs.inSlope[i] == std::numeric_limits<float>::infinity()) + { + coeff[0][i] = 0.0F; + coeff[1][i] = 0.0F; + coeff[2][i] = 0.0F; + coeff[3][i] = lhs.value[i]; + } + } +} + +void HandleSteppedCurve (const KeyframeTpl<Vector3f>& lhs, const KeyframeTpl<Vector3f>& rhs, Vector3f& value) +{ + for (int i=0;i<3;i++) + { + if (lhs.outSlope[i] == std::numeric_limits<float>::infinity() || rhs.inSlope[i] == std::numeric_limits<float>::infinity()) + value[i] = lhs.value[i]; + } +} + +void HandleSteppedTangent (const KeyframeTpl<Vector3f>& lhs, const KeyframeTpl<Vector3f>& rhs, Vector3f& value) +{ + for (int i=0;i<3;i++) + { + if (lhs.outSlope[i] == std::numeric_limits<float>::infinity() || rhs.inSlope[i] == std::numeric_limits<float>::infinity()) + value[i] = std::numeric_limits<float>::infinity(); + } +} + +void SetupStepped (Quaternionf* coeff, const KeyframeTpl<Quaternionf>& lhs, const KeyframeTpl<Quaternionf>& rhs) +{ + // If either of the tangents in the segment are set to stepped, make the constant value equal the value of the left key + if (lhs.outSlope[0] == std::numeric_limits<float>::infinity() || rhs.inSlope[0] == std::numeric_limits<float>::infinity() || + lhs.outSlope[1] == std::numeric_limits<float>::infinity() || rhs.inSlope[1] == std::numeric_limits<float>::infinity() || + lhs.outSlope[2] == std::numeric_limits<float>::infinity() || rhs.inSlope[2] == std::numeric_limits<float>::infinity() || + lhs.outSlope[3] == std::numeric_limits<float>::infinity() || rhs.inSlope[3] == std::numeric_limits<float>::infinity() ) + { + for (int i=0;i<4;i++) + { + coeff[0][i] = 0.0F; + coeff[1][i] = 0.0F; + coeff[2][i] = 0.0F; + coeff[3][i] = lhs.value[i]; + } + } +} + +void HandleSteppedCurve (const KeyframeTpl<Quaternionf>& lhs, const KeyframeTpl<Quaternionf>& rhs, Quaternionf& value) +{ + if (lhs.outSlope[0] == std::numeric_limits<float>::infinity() || rhs.inSlope[0] == 
std::numeric_limits<float>::infinity() || + lhs.outSlope[1] == std::numeric_limits<float>::infinity() || rhs.inSlope[1] == std::numeric_limits<float>::infinity() || + lhs.outSlope[2] == std::numeric_limits<float>::infinity() || rhs.inSlope[2] == std::numeric_limits<float>::infinity() || + lhs.outSlope[3] == std::numeric_limits<float>::infinity() || rhs.inSlope[3] == std::numeric_limits<float>::infinity() ) + { + value = lhs.value; + } +} + +void HandleSteppedTangent (const KeyframeTpl<Quaternionf>& lhs, const KeyframeTpl<Quaternionf>& rhs, Quaternionf& tangent) +{ + for (int i=0;i<4;i++) + { + if (lhs.outSlope[i] == std::numeric_limits<float>::infinity() || rhs.inSlope[i] == std::numeric_limits<float>::infinity()) + tangent[i] = std::numeric_limits<float>::infinity(); + } +} + +template<class T> +void AnimationCurveTpl<T>::CalculateCacheData (Cache& cache, int lhsIndex, int rhsIndex, float timeOffset) const +{ + const Keyframe& lhs = m_Curve[lhsIndex]; + const Keyframe& rhs = m_Curve[rhsIndex]; +// DebugAssertIf (timeOffset < -0.001F || timeOffset - 0.001F > rhs.time - lhs.time); + cache.index = lhsIndex; + cache.time = lhs.time + timeOffset; + cache.timeEnd = rhs.time + timeOffset; + cache.index = lhsIndex; + + float dx, length; + T dy; + T m1, m2, d1, d2; + + dx = rhs.time - lhs.time; + dx = max(dx, 0.0001F); + dy = rhs.value - lhs.value; + length = 1.0F / (dx * dx); + + m1 = lhs.outSlope; + m2 = rhs.inSlope; + d1 = m1 * dx; + d2 = m2 * dx; + + cache.coeff[0] = (d1 + d2 - dy - dy) * length / dx; + cache.coeff[1] = (dy + dy + dy - d1 - d1 - d2) * length; + cache.coeff[2] = m1; + cache.coeff[3] = lhs.value; + SetupStepped(cache.coeff, lhs, rhs); + + DebugAssertIf(!IsFinite(cache.coeff[0])); + DebugAssertIf(!IsFinite(cache.coeff[1])); + DebugAssertIf(!IsFinite(cache.coeff[2])); + DebugAssertIf(!IsFinite(cache.coeff[3])); +} + +// When we look for the next index, how many keyframes do we just loop ahead instead of binary searching? 
#define SEARCH_AHEAD 3

///@TODO: Cleanup old code to completely get rid of this
// Returns the index of the segment (left key) containing curveT.
// First probes a few keys around the cached index, then falls back to
// binary search. Result is clamped to [0, keyCount - 2].
template<class T>
int AnimationCurveTpl<T>::FindIndex (const Cache& cache, float curveT) const
{
	#if SEARCH_AHEAD >= 0
	int cacheIndex = cache.index;
	if (cacheIndex != -1)
	{
		// We can not use the cache time or time end since that is in unwrapped time space!
		float time = m_Curve[cacheIndex].time;

		if (curveT > time)
		{
			if (cacheIndex + SEARCH_AHEAD < static_cast<int>(m_Curve.size()))
			{
				for (int i=0;i<SEARCH_AHEAD;i++)
				{
					if (curveT < m_Curve[cacheIndex + i + 1].time)
						return cacheIndex + i;
				}
			}
		}
		else
		{
			if (cacheIndex - SEARCH_AHEAD >= 0)
			{
				for (int i=0;i<SEARCH_AHEAD;i++)
				{
					if (curveT > m_Curve[cacheIndex - i - 1].time)
						return cacheIndex - i - 1;
				}
			}
		}
	}

	#endif

	///@ use cache to index into next if not possible use binary search
	const_iterator i = std::lower_bound (m_Curve.begin (), m_Curve.end (), curveT, KeyframeCompare());
	int index = distance (m_Curve.begin (), i);
	index--;
	index = min<int> (m_Curve.size () - 2, index);
	index = max<int> (0, index);

	return index;
}

///@TODO: Cleanup old code to completely get rid of this
// Cache-less variant: returns the segment index containing curveT, or -1
// when curveT lies on or outside the first/last key time.
template<class T>
int AnimationCurveTpl<T>::FindIndex (float curveT) const
{
	pair<float, float> range = GetRange ();
	if (curveT <= range.first || curveT >= range.second)
		return -1;

	const_iterator i = std::lower_bound (m_Curve.begin (), m_Curve.end (), curveT, KeyframeCompare());
	AssertIf (i == m_Curve.end ());
	int index = distance (m_Curve.begin (), i);
	index--;
	index = min<int> (m_Curve.size () - 2, index);
	index = max<int> (0, index);

	AssertIf (curveT < m_Curve[index].time || curveT > m_Curve[index+1].time);
	return index;
}

// Finds the pair of key indices (lhs, rhs == lhs + 1, clamped to the last
// key) whose times bracket curveT. Probes around the cached index first,
// then falls back to an inlined upper-bound binary search.
// Precondition: curveT lies within the key time range.
template<class T>
void AnimationCurveTpl<T>::FindIndexForSampling (const Cache& cache, float curveT, int& lhs, int& rhs) const
{
	AssertIf (curveT < GetRange ().first || curveT > GetRange ().second);
	int actualSize = m_Curve.size();
	const Keyframe* frames = &m_Curve[0];

	// Reference implementation:
	// (index is the last value that is equal to or smaller than curveT)
	#if 0
	int foundIndex = 0;
	for (int i=0;i<actualSize;i++)
	{
		if (frames[i].time <= curveT)
			foundIndex = i;
	}

	lhs = foundIndex;
	rhs = min<int>(lhs + 1, actualSize - 1);
	AssertIf (curveT < m_Curve[lhs].time || curveT > m_Curve[rhs].time);
	// NOTE(review): the AssertIf below is missing its ';' — harmless while
	// this reference path stays disabled under #if 0.
	AssertIf(frames[rhs].time == curveT && frames[lhs].time != curveT)
	return;
	#endif


	#if SEARCH_AHEAD > 0
	int cacheIndex = cache.index;
	if (cacheIndex != -1)
	{
		// We can not use the cache time or time end since that is in unwrapped time space!
		float time = m_Curve[cacheIndex].time;

		if (curveT > time)
		{
			// Walk forward from the cached segment.
			for (int i=0;i<SEARCH_AHEAD;i++)
			{
				int index = cacheIndex + i;
				if (index + 1 < actualSize && frames[index + 1].time > curveT)
				{
					lhs = index;

					rhs = min<int>(lhs + 1, actualSize - 1);
					AssertIf (curveT < frames[lhs].time || curveT > frames[rhs].time);
					AssertIf(frames[rhs].time == curveT && frames[lhs].time != curveT);
					return;
				}
			}
		}
		else
		{
			// Walk backward from the cached segment.
			for (int i=0;i<SEARCH_AHEAD;i++)
			{
				int index = cacheIndex - i;
				if (index >= 0 && curveT >= frames[index].time)
				{
					lhs = index;
					rhs = min<int>(lhs + 1, actualSize - 1);
					// NOTE(review): mixes frames[] and m_Curve[] — same data,
					// cosmetic inconsistency only.
					AssertIf (curveT < frames[lhs].time || curveT > m_Curve[rhs].time);
					AssertIf(frames[rhs].time == curveT && frames[lhs].time != curveT);
					return;
				}
			}
		}
	}

	#endif

	// Fall back to using binary search
	// upper bound (first value larger than curveT)
	int __len = actualSize;
	int __half;
	int __middle;
	int __first = 0;
	while (__len > 0)
	{
		__half = __len >> 1;
		__middle = __first + __half;

		if (curveT < frames[__middle].time)
			__len = __half;
		else
		{
			__first = __middle;
			++__first;
			__len = __len - __half - 1;
		}
	}

	// If not within range, we pick the last element twice
	lhs = __first - 1;
	rhs = min(actualSize - 1, __first);

	AssertIf(lhs < 0 || lhs >= actualSize);
	AssertIf(rhs < 0 || rhs >= actualSize);

	AssertIf (curveT < m_Curve[lhs].time || curveT > m_Curve[rhs].time);
	AssertIf(frames[rhs].time == curveT && frames[lhs].time != curveT);
}

// Sets how the curve extrapolates before the first key. Takes a public
// wrap mode (kRepeat/kPingPong/...) and stores the internal constant.
template<class T>
void AnimationCurveTpl<T>::SetPreInfinity (int pre)
{
	m_PreInfinity = ToInternalInfinity(pre);
	InvalidateCache ();
}

// Sets how the curve extrapolates after the last key (public wrap mode).
template<class T>
void AnimationCurveTpl<T>::SetPostInfinity (int post)
{
	m_PostInfinity = ToInternalInfinity(post);
	InvalidateCache ();
}

// Returns the pre-extrapolation mode as a public wrap-mode constant.
template<class T>
int AnimationCurveTpl<T>::GetPreInfinity () const
{
	return FromInternalInfinity(m_PreInfinity);
}

// Returns the post-extrapolation mode as a public wrap-mode constant.
template<class T>
int AnimationCurveTpl<T>::GetPostInfinity () const
{
	return FromInternalInfinity(m_PostInfinity);
}

// Samples the curve at curveT, always clamping outside the key range
// (the pre/post-infinity modes are ignored). Uses and refills the
// dedicated m_ClampCache so repeated nearby samples cost only a single
// cubic evaluation. Precondition: the curve is valid (see IsValid()).
template<class T>
T AnimationCurveTpl<T>::EvaluateClamp (float curveT) const
{
	T output;
	if (curveT >= m_ClampCache.time && curveT < m_ClampCache.timeEnd)
	{
//		AssertIf (!CompareApproximately (EvaluateCache (m_Cache, curveT), EvaluateWithoutCache (curveT), 0.001F));
		EvaluateCache<T> (m_ClampCache, curveT, output);
		return output;
	}
	else
	{
		DebugAssertIf (!IsValid ());

		float begTime = m_Curve[0].time;
		float endTime = m_Curve.back().time;

		if (curveT > endTime)
		{
			// Constant cache holding the last key's value forever after.
			m_ClampCache.time = endTime;
			m_ClampCache.timeEnd = std::numeric_limits<float>::infinity ();
			m_ClampCache.coeff[0] = m_ClampCache.coeff[1] = m_ClampCache.coeff[2] = Zero<T>();
			m_ClampCache.coeff[3] = m_Curve[m_Curve.size()-1].value;
		}
		else if (curveT < begTime)
		{
			// Constant cache holding the first key's value; the window is
			// extended 1000 time units back so nearby samples stay cached.
			m_ClampCache.time = curveT - 1000.0F;
			m_ClampCache.timeEnd = begTime;
			m_ClampCache.coeff[0] = m_ClampCache.coeff[1] = m_ClampCache.coeff[2] = Zero<T>();
			m_ClampCache.coeff[3] = m_Curve[0].value;
		}
		else
		{
			int lhs, rhs;
			FindIndexForSampling (m_ClampCache, curveT, lhs, rhs);
			CalculateCacheData (m_ClampCache, lhs, rhs, 0.0F);
		}

//		AssertIf (!CompareApproximately (EvaluateCache (m_Cache, curveT), EvaluateWithoutCache (curveT), 0.001F));
		EvaluateCache<T> (m_ClampCache, curveT, output);
		return output;
	}
}

// Samples the curve at curveT honoring the pre/post-infinity wrap modes.
// Fast path: the cached cubic already covers curveT. Otherwise the cache
// is rebuilt for the containing segment (or a constant / time-shifted
// segment for clamp / repeat extrapolation); ping-pong falls back to
// uncached evaluation. An invalid curve returns its single key's value,
// or zero when empty.
template<class T>
T AnimationCurveTpl<T>::Evaluate (float curveT) const
{
	int lhs, rhs;
	T output;
	if (curveT >= m_Cache.time && curveT < m_Cache.timeEnd)
	{
//		AssertIf (!CompareApproximately (EvaluateCache (m_Cache, curveT), EvaluateWithoutCache (curveT), 0.001F));
		EvaluateCache<T> (m_Cache, curveT, output);
		return output;
	}
	// @TODO: Optimize IsValid () away if by making the non-valid case always use the m_Cache codepath
	else if (IsValid ())
	{
		float begTime = m_Curve[0].time;
		float endTime = m_Curve.back().time;
		float wrappedTime;

		if (curveT >= endTime)
		{
			if (m_PostInfinity == kInternalClamp)
			{
				m_Cache.time = endTime;
				m_Cache.timeEnd = std::numeric_limits<float>::infinity ();
				m_Cache.coeff[0] = m_Cache.coeff[1] = m_Cache.coeff[2] = Zero<T>();
				m_Cache.coeff[3] = m_Curve[m_Curve.size()-1].value;
			}
			else if (m_PostInfinity == kInternalRepeat)
			{
				// Cache the wrapped segment shifted into unwrapped time.
				wrappedTime = Repeat (curveT, begTime, endTime);

				FindIndexForSampling (m_Cache, wrappedTime, lhs, rhs);
				CalculateCacheData (m_Cache, lhs, rhs, curveT - wrappedTime);
			}
			///@todo optimize pingpong by making it generate a cache too
			else
			{
				EvaluateWithoutCache (curveT, output);
				return output;
			}
		}
		else if (curveT < begTime)
		{
			if (m_PreInfinity == kInternalClamp)
			{
				m_Cache.time = curveT - 1000.0F;
				m_Cache.timeEnd = begTime;
				m_Cache.coeff[0] = m_Cache.coeff[1] = m_Cache.coeff[2] = Zero<T>();
				m_Cache.coeff[3] = m_Curve[0].value;
			}
			else if (m_PreInfinity == kInternalRepeat)
			{
				wrappedTime = Repeat (curveT, begTime, endTime);
				FindIndexForSampling (m_Cache, wrappedTime, lhs, rhs);
				CalculateCacheData (m_Cache, lhs, rhs, curveT - wrappedTime);
			}
			///@todo optimize pingpong by making it generate a cache too
			else
			{
				EvaluateWithoutCache (curveT, output);
				return output;
			}
		}
		else
		{
			FindIndexForSampling (m_Cache, curveT, lhs, rhs);
			CalculateCacheData (m_Cache, lhs, rhs, 0.0F);
		}

		// AssertIf (!CompareApproximately (EvaluateCache (m_Cache, curveT), EvaluateWithoutCache (curveT), 0.001F));
		EvaluateCache<T> (m_Cache, curveT, output);
		return output;
	}
	else
	{
		if (m_Curve.size () == 1)
			return m_Curve.begin()->value;
		else
			return Zero<T> ();
	}
}

// Maps an out-of-range time into the key range according to the
// pre/post-infinity modes (clamp, ping-pong, or repeat).
template<class T>
float AnimationCurveTpl<T>::WrapTime (float curveT) const
{
	DebugAssertIf (!IsValid ());

	float begTime = m_Curve[0].time;
	float endTime = m_Curve.back().time;

	if (curveT < begTime)
	{
		if (m_PreInfinity == kInternalClamp)
			curveT = begTime;
		else if (m_PreInfinity == kInternalPingPong)
			curveT = PingPong (curveT, begTime, endTime);
		else
			curveT = Repeat (curveT, begTime, endTime);
	}
	else if (curveT > endTime)
	{
		if (m_PostInfinity == kInternalClamp)
			curveT = endTime;
		else if (m_PostInfinity == kInternalPingPong)
			curveT = PingPong (curveT, begTime, endTime);
		else
			curveT = Repeat (curveT, begTime, endTime);
	}
	return curveT;
}

// Inserts a key keeping the container sorted by time. Returns the index
// of the inserted key, or -1 if a key with that time already exists
// (no insertion is performed).
template<class T>
int AnimationCurveTpl<T>::AddKey (const Keyframe& key)
{
	InvalidateCache ();

	iterator i = std::lower_bound (m_Curve.begin (), m_Curve.end (), key);

	// is not included in container and value is not a duplicate
	if (i == end () || key < *i)
	{
		iterator ii = m_Curve.insert (i, key);
		return std::distance (m_Curve.begin (), ii);
	}
	else
		return -1;
}

// Erases the key range [begin, end) and drops any cached segment.
template<class T>
void AnimationCurveTpl<T>::RemoveKeys (iterator begin, iterator end)
{
	InvalidateCache ();
	m_Curve.erase (begin, end);
}

// Multiplies all key values and slopes of a float curve by scale.
void ScaleCurveValue (AnimationCurve& curve, float scale)
{
	for (int i=0;i<curve.GetKeyCount ();i++)
	{
		curve.GetKey (i).value *= scale;
		curve.GetKey (i).inSlope *= scale;
		curve.GetKey (i).outSlope *= scale;
	}

	curve.InvalidateCache();
}

// Adds a constant offset to all key values (slopes are unaffected).
void OffsetCurveValue (AnimationCurve& curve, float offset)
{
	for (int i=0;i<curve.GetKeyCount ();i++)
		curve.GetKey (i).value += offset;

	curve.InvalidateCache();
}

// Stretches the curve along the time axis; slopes (dValue/dTime) are
// divided by the same factor so the curve shape is preserved.
// NOTE(review): assumes scale != 0 — a zero scale divides slopes by zero.
void ScaleCurveTime (AnimationCurve& curve, float scale)
{
	for (int i=0;i<curve.GetKeyCount ();i++)
	{
		curve.GetKey (i).time *= scale;
		curve.GetKey (i).inSlope /= scale;
		curve.GetKey (i).outSlope /= scale;
	}
	curve.InvalidateCache();
}

// Shifts all keys along the time axis by offset.
void OffsetCurveTime (AnimationCurve& curve, float offset)
{
	for (int i=0;i<curve.GetKeyCount ();i++)
		curve.GetKey (i).time += offset;
	curve.InvalidateCache();
}


/*

Calculating tangents from a hermite spline () Realtime rendering page 56



> On this first pass we're stuck with linear keyframing, because we just
> don't have the cycles in game to go to splines. I know this would help a
> lot, but it isn't an option.

In Granny I do successive least-squares approximations do the data. I
take the array of samples for a given channel (ie., position) which is 2x
oversampled or better from the art tool. I then start by doing a
least-square solve for a spline of arbitrary degree with knots at either
end. I compute the error over the spline from the original samples, and
add knots in the areas of highest error. Repeat and salt to taste.

Since it's fairly easy to write a single solver that solves for any degree
of spline, I use the same solver for linear keyframes as I do for
quadratic keyframes, cubic keyframes, or even "0th order keframes", which
is to say if you don't want to interpolate _at all_, the solver can still
place the "stop-motion" key frames in the best locations. But hopefully
no one is still doing that kind of animation.

If I were doing this over again (which I probably will at some point), I
would probably use some kind of weird waveletty scheme now. I decided not
to do that originally, because I had about a month to do the entire
spline/solver/reduction thing the first time, and it had to be highly
optimized. So I didn't want to have to learn wavelets and spline solvers
at the same time.
Next time I'll spend some time on wavelets and probably
use some sort of hierarchical reduction scheme instead, primarily so you
can change how densely your keyframes are placed at run-time, and so you
can get hard edges easier (motions which are intended to have sharp
discontinuities aren't handled well at all by my current scheme).

- Casey



--

> Is anybody looking at the way errors add up? eg. if you do some
> reduction on the hip, and the thigh, and the ankle bone channels, the
> result is a foot that moves quite a bit differently than it should.

This has been on my list for a long time. I don't think it's a simple
case of error analysis though. I think it's more a case for using
discrete skeletal changes or integrated IK, because hey, if what you care
about is having a particular thing stay in one place, then it seems to me
that the best way to compress that is by just saying "this stays in one
place", rather than trying to spend a lot of data on the joint curves
necessary to make that so.

> Also, is anybody doing things like using IK in the reducer, to make sure
> that even in the reduced version the feet stay in exactly the same spot?

That doesn't work with splines, unfortunately, because the splines are
continuous, and you will always have the feet slipping as a result (if not
at the keyframes, then in between for sure). So you end up with the
problem that the IK reducer would need to shove a metric assload of keys
into the streams, which defeats the compression.

From Ian:

> You said you are doing this on linear data as well. I can see that this
> would work, but do you find you get a fairly minimal result? I can
> envisage cases where you'd get many unneeded keyframes from initial
> 'breaks' at points of large error.

I'm not sure what you mean by this. The error is controllable, so you say
how much you are willing to accept. You get a minimal spline for the
error that you ask for, but no more - obviously in the areas of high
error, it has to add more keys, but that's what you want. The objective
of compression, at least in my opinion, is not to remove detail, but
rather to more efficiently store places where there is less detail.

> I think I may be missing the point. Do you do least squares on the
> already fitted cures (ie, a chi squared test) then curve fit separately,
> or do you use least squares to fit your actual spline to the data?

The latter. The incremental knot addition sets up the t_n's of an
arbitrary degree spline. You have a matrix A that has sample-count rows
and knot-count columns. The vector you're looking for is x, the spline
vector, which has knot-count rows. You're producing b, the vector of
samples, which has sample-count rows. So it's Ax = b, but A is
rectangular. You solve via A^T Ax = A^Tb, a simple least squares problem.
You can solve it any way you like. I chose a straightforward
implementation, because there's no numerical difficulties with these
things.

The version that comes with Granny is a highly optimized A^T Ax = A^Tb
solver, which constructs A^T A and A^T b directly and sparsely (so it is
O(n) in the number of samples, instead of O(n^3)). The A^T A is band
diagonal, because of the sparsity pattern of A, so I use a sparse banded
cholesky solver on the back end. It's _extremely_ fast. In fact, the
stupid part of Granny's solver is actually the other part (the error
analysis + knot addition), because it adds very few knots per cycle, so on
a really long animation it can call the least-squares Ax = b solver many
hundreds of times for a single animation, and it _still_ never takes more
than 10 seconds or so even on animations with many hundred frames. So
really, the right place to fix currently is the stupid knot addition
algorithm, which really should be improved quite a bit.
+ +> I have not used least squared before and from the reading I have done +> (numerical recipes, last night), it seem that the equation of the +> line/spline are inherent in the method and it needs to be reformulated +> to use a different line/spline. (The version I read was for fitting a +> straight line to a data-set, which I know won't work for quaternions +> slerps anyway) + +Quaternion lerps are great, and work great with splines. I use them +throughout all of Granny - there is no slerping anywhere. Slerping is not +a very useful thing in a run-time engine, in my opinion, unless you are +dealing with orientations that are greater the 90 degrees apart. It +allows me to use the same solver for position, rotation, AND scale/shear, +with no modifications. + +- Casey + + +----- +> When is it best to use quaternions and when to use normal matrix +> rotations? + +In Granny, I use quaternions for everything except the final composite +phase where rotations (and everything else) are multiplied by their +parents to produce the final world-space results. Since we support +scale/shear, orientation, and position, it tends to be fastest at the +composition stage to convert the orientation from quaternion to matrix, +and then do all the matrix concatenation together. I'm not sure I would +do the same if I didn't support scale/shear. It might be a bit more fun +and efficient to go ahead and do the whole pipe in quaternion and only +convert to matrices at the very end (maybe somebody else has played with +that and can comment?) + +> And for character animation with moving joints quaternions is usually +> used, but why? 
+ +There's lots of nice things about quaternions: + +1) You can treat the linearly if you want to (so you can blend animations +quickly and easily) + +2) You can treat them non-linearly if you want to (so you can do exact +geodesic interpolation at a constant speed) + +3) You can convert them to a matrix with no transcendentals + +4) They are compact, and can be easily converted to a 3-element +representation when necessary (ie., any one of the four values can be +easily re-generated from the other 3) + +5) They cover 720 degrees of rotation, not 360 (if you do a lot of work +with character animation, you will appreciate this!), and the +neighborhooding operator is extremely simple and fast (inner product and a +negation) + +6) They are easy to visualize (they're not much more complicated than +angle/axis) and manipulate (ie., the axis of rotation is obvious, you can +transform the axis of rotation without changing the amount of rotation, +and even just use a regular 3-vector transform, etc.) + +7) You can easily transform vectors with them without converting to a +matrix if you want to + +8) They are easy to renormalize, and they can never become "unorthogonal" + +9) No gimbal lock + +10) There is a simple, fast, and relevant distance metric (the inner +product) + +> Is it just by trial and error that we decide when we have to use +> quarternions? + +Well, I can't speak for everyone, but I have been extremely careful in my +selection of quaternions. With Granny 1 I did quaternions to get some +practice with them, but when I did 2 I did a lot of research and wrote +code to visualize the actions of various rotational representations and +operations, and I can very confidently say there's no better choice for +general character animation than quaternions, at least among the +representations that I know (euler, angle/axis, matrix, quaternion, exp +map). 
There are some other mappings that I've come across since I worked +everything out (like the Rational Map), that I have not experimented with, +because so far quaternions have been working superbly so I haven't had +much cause to go hunting. + +> "Quarternions have the ability to have smooth interpollated rotations", +> I've made smooth interpollated rotations with normal matrix rotations +> though. + +Well, it's not so much what you can do as how easy it is to do it. +Quaternions can be interpolated directly as a smooth geodesic (via slerp) +or via a nonlinear geodesic (lerp). The latter is particularly powerful +because it distributes - it's a linear operator. So you can literally +plug it in to anything that would work, like splines, multi-point blends, +etc., and it will "just work" (well, that's a bit of an exaggeration, +because there is a neighborhooding concern, but it's always easy to do). + +> "They take less room, 4 elements versus 9 and some operations +> are cheaper in terms of CPU cycles". I accept this reason except someone +> said that quarternions use more CPU cycles, so I'm not sure who to +> believe. + +It depends what you're doing. Quaternions take more CPU cycles to +transform a vector than a matrix does. But quaternions are MUCH less +expensive for lots of other operations, like, for example, splining. +This is why it's very useful to use quaternions for everything except the +final stage of your pipe, where matrices are more appropriate. + +> "Quarternions are not susceptible to gimbal lock. Gimbal lock shows its +> face when two axes point in the same direction." So if no axes face in +> the same direction then this is not a reason to use quarternions. +> ... +> Am I right in saying that if no axes face the same direction it is +> possible to represent all rotations with normal matrix rotations? + +Matrix rotations are not subject to gimbal lock. That's Euler angles. + +- Casey + + +Least squares is very simple. 
Given a set of data points (2x oversampled in Casey's case) and a knot vector, compute the coefficients for the control points of the spline that minimize squared error. This involves solving a linear system where each row corresponds to a data point (di - they are uniformly spaced so each point has a corresponding ti which is in the domain of the curve) and each column corresponds to a control point for the curve (cj the unknowns). So matrix element Aij is Bj(ti) where Bj is the basis function for the jth control point. The right hand side is just a vector of the di and the x's are the unknown control points. Since you are compressing things there will be many more rows than columns in this linear system so you don't have to worry about over fitting problems (which you would if there were more DOF...)

-Peter-Pike

 -----Original Message-----
 From: Ian Elsley [mailto:ielsley@kushgames.com]
 Sent: Tue 12/17/2002 10:35 AM
 To: gdalgorithms-list@lists.sourceforge.net
 Cc:
 Subject: RE: [Algorithms] Re: Keyframe reduction



 Casey

 I think I may be missing the point. Do you do least squares on the
 already fitted cures (ie, a chi squared test) then curve fit separately,
 or do you use least squares to fit your actual spline to the data?

 I have not used least squared before and from the reading I have done
 (numerical recipes, last night), it seem that the equation of the
 line/spline are inherent in the method and it needs to be reformulated
 to use a different line/spline. (The version I read was for fitting a
 straight line to a data-set, which I know won't work for quaternions
 slerps anyway)

 I see the elegance of this method and would love to work it out, but
 I've got a few blanks here.

 Any pointers?
+ + Thanks in advance, + + Ian + + + + + + + q + q' = ----- + |q| + +For renormalizing, since you are very close to 1, I usually use the +tangent-line approximation, which was suggested by Checker a long long +time ago for 3D vectors and works just peachy for 4D ones as well: + +inline void NormalizeCloseToOne4(float *Dest) +{ + float const Sum = (Dest[0] * Dest[0] + + Dest[1] * Dest[1] + + Dest[2] * Dest[2] + + Dest[3] * Dest[3]); + + float const ApproximateOneOverRoot = (3.0f - Sum) * 0.5f; + + Dest[0] *= ApproximateOneOverRoot; + Dest[1] *= ApproximateOneOverRoot; + Dest[2] *= ApproximateOneOverRoot; + Dest[3] *= ApproximateOneOverRoot; +} + + +*/ + + + +#define INSTANTIATE(T) \ +template EXPORT_COREMODULE std::pair<float, float> AnimationCurveTpl<T>::GetRange() const; \ +template EXPORT_COREMODULE T AnimationCurveTpl<T>::Evaluate (float curveT) const; \ +template EXPORT_COREMODULE void AnimationCurveTpl<T>::RemoveKeys (iterator begin, iterator end);\ +template EXPORT_COREMODULE int AnimationCurveTpl<T>::AddKey (const Keyframe& key);\ +template EXPORT_COREMODULE int AnimationCurveTpl<T>::FindIndex (float time) const; \ +template EXPORT_COREMODULE int AnimationCurveTpl<T>::FindIndex (const Cache& cache, float time) const; \ +template EXPORT_COREMODULE void AnimationCurveTpl<T>::InvalidateCache (); \ +template EXPORT_COREMODULE void AnimationCurveTpl<T>::SetPreInfinity (int mode); \ +template EXPORT_COREMODULE void AnimationCurveTpl<T>::SetPostInfinity (int mode); \ +template EXPORT_COREMODULE int AnimationCurveTpl<T>::GetPreInfinity () const; \ +template EXPORT_COREMODULE int AnimationCurveTpl<T>::GetPostInfinity () const; \ +template EXPORT_COREMODULE KeyframeTpl<T>::KeyframeTpl (float time, const T& value); + +INSTANTIATE(float) +INSTANTIATE(Vector3f) +INSTANTIATE(Quaternionf) + +template EXPORT_COREMODULE void AnimationCurveTpl<float>::CalculateCacheData (Cache& cache, int lhs, int rhs, float timeOffset) const; +template EXPORT_COREMODULE float 
AnimationCurveTpl<float>::EvaluateClamp (float curveT) const; +template EXPORT_COREMODULE Quaternionf AnimationCurveTpl<Quaternionf>::EvaluateClamp (float curveT) const; +template EXPORT_COREMODULE Vector3f AnimationCurveTpl<Vector3f>::EvaluateClamp (float curveT) const; diff --git a/Runtime/Math/AnimationCurve.h b/Runtime/Math/AnimationCurve.h new file mode 100644 index 0000000..e88629a --- /dev/null +++ b/Runtime/Math/AnimationCurve.h @@ -0,0 +1,324 @@ +#pragma once + +#include "Runtime/Utilities/dynamic_array.h" +#include "Runtime/Serialize/SerializeUtility.h" +#include "Runtime/Math/FloatConversion.h" +#include "Runtime/Math/Vector3.h" +#include "Runtime/Math/Quaternion.h" +#include "Runtime/Modules/ExportModules.h" + +enum { kDefaultWrapMode = 0, kClamp = 1 << 0, kRepeat = 1 << 1, kPingPong = 1 << 2, kClampForever = 1 << 3 }; +#define kCurveTimeEpsilon 0.00001F + +/* + AnimationCurves in Maya are represented as time/value keys with 2D tangents which are always of normalized length. + From the tangents a slope is calculated (tangent.y / tangent.x) -> (thus length of the tangent doesn't matter) + + When the slope is multiplied by the time range of the curve (rhs.time - lhs.time) it can be evaluated using + a standard hermite interpolator. + + In the Unity AnimationCurve the slopes are directly stored in the keyframe instead of the 2D tangent vectors. +*/ + +///@TODO: Curve templates suck. +/// Lets make some implementation where they share the same data structure and only Evaluate is specialized. 
+ +template<class T> +struct KeyframeTpl +{ + // DECLARE_SERIALIZE_OPTIMIZE_TRANSFER (Keyframe) + inline static const char* GetTypeString () { return "Keyframe"; } + inline static bool IsAnimationChannel () { return false; } + inline static bool MightContainPPtr () { return false; } + // Disable transfer optimization in Editor because tangentMode optimized serialization when reading AssetBundles will corrupt data + inline static bool AllowTransferOptimization () + { + #if UNITY_EDITOR + return false; + #else + return true; + #endif + } + template<class TransferFunction> + void Transfer (TransferFunction& transfer); + + float time; + T value; + T inSlope; + T outSlope; + #if UNITY_EDITOR + int tangentMode; + #endif + + #if UNITY_EDITOR + KeyframeTpl () + { + tangentMode = 0; + } + #else + KeyframeTpl () {} + #endif + KeyframeTpl (float t, const T& v); + + + friend bool operator < (const KeyframeTpl& lhs, const KeyframeTpl& rhs) { return lhs.time < rhs.time; } +}; + +enum AnimationCurveType { + kFloatCurve = 0, + kVector3Curve = 1, + kQuaternionCurve = 2 +}; + +template<class T> +class EXPORT_COREMODULE AnimationCurveTpl +{ + public: + + DECLARE_SERIALIZE_NO_PPTR (AnimationCurve) + + /// Stores the curve as a pure cubic function with 4 coefficients + struct Cache + { + int index; + float time; + float timeEnd; + T coeff[4]; + + Cache () { time = std::numeric_limits<float>::infinity (); index=0; timeEnd = 0.0f; memset(&coeff, 0, sizeof(coeff)); } + void Invalidate () { time = std::numeric_limits<float>::infinity (); index=0; } + }; + + typedef KeyframeTpl<T> Keyframe; + + typedef dynamic_array<Keyframe> KeyframeContainer; + typedef typename KeyframeContainer::iterator iterator; + typedef typename KeyframeContainer::const_iterator const_iterator; + +public: + AnimationCurveTpl () + { + m_PreInfinity = m_PostInfinity = kInternalClamp; + } + + /// Evaluates the AnimationCurve caching the segment. 
+ T Evaluate (float curveT) const; + T EvaluateClamp (float curveT) const; + + bool IsValid () const { return m_Curve.size () >= 2; } + + int AddKey (const Keyframe& key); + + /// Performs no error checking. And doesn't invalidate the cache! + void AddKeyBackFast (const Keyframe& key) { m_Curve.push_back (key); } + + const Keyframe& GetKey (int index) const { AssertMsg(index >= 0 && index < m_Curve.size(), "Index (%d) is out of range [0, %i)", index, (int)m_Curve.size()); return m_Curve[index]; } + + /// When changing the keyframe using GetKey you are not allowed to change the time! + /// After modifying a key you have to call InvalidateCache + Keyframe& GetKey (int index) { AssertMsg(index >= 0 && index < m_Curve.size(), "Index (%d) is out of range [0, %i)", index, (int)m_Curve.size()); return const_cast<Keyframe&> (m_Curve[index]); } + + iterator begin () { return m_Curve.begin (); } + iterator end () { return m_Curve.end (); } + const_iterator begin () const { return m_Curve.begin (); } + const_iterator end () const { return m_Curve.end (); } + + void InvalidateCache (); + + int GetKeyCount () const { return m_Curve.size (); } + + void RemoveKeys (iterator begin, iterator end); + + /// Returns the first and last keyframe time + std::pair<float, float> GetRange () const; + + enum { kInternalPingPong = 0, kInternalRepeat = 1, kInternalClamp = 2 }; + + // How does the curve before the first keyframe + void SetPreInfinity (int pre); + int GetPreInfinity () const; + // How does the curve behave after the last keyframe + void SetPostInfinity (int post); + int GetPostInfinity () const; + + // How does the curve before the first keyframe + void SetPreInfinityInternal (int pre) { m_PreInfinity = pre; InvalidateCache (); } + int GetPreInfinityInternal () const { return m_PreInfinity; } + // How does the curve behave after the last keyframe + void SetPostInfinityInternal (int post) { m_PostInfinity = post; InvalidateCache (); } + int GetPostInfinityInternal () const { 
return m_PostInfinity; } + + void Assign (const Keyframe* begin, const Keyframe* end) { m_Curve.assign (begin, end); InvalidateCache(); } + void Swap (KeyframeContainer& newArray) { m_Curve.swap(newArray); InvalidateCache(); } + void Sort () { std::sort(m_Curve.begin(), m_Curve.end()); InvalidateCache(); } + + void ResizeUninitialized (int size) { m_Curve.resize_uninitialized(size); } + + ///@TODO: Cleanup old code to completely get rid of this + int FindIndex (const Cache& cache, float curveT) const; + + ///@TODO: Cleanup old code to completely get rid of this + /// Returns the closest keyframe index that is less than time. + /// Returns -1 if time is outside the range of the curve + int FindIndex (float time) const; + + void CalculateCacheData (Cache& cache, int lhs, int rhs, float timeOffset) const; + + private: + + void FindIndexForSampling (const Cache& cache, float curveT, int& lhs, int& rhs) const; + + /// Evaluates the AnimationCurve directly. + void EvaluateWithoutCache (float curveT, T& output)const; + + float WrapTime (float curveT) const; + + mutable Cache m_Cache; + mutable Cache m_ClampCache; + + KeyframeContainer m_Curve; + int m_PreInfinity; + int m_PostInfinity; +}; + +typedef AnimationCurveTpl<float> AnimationCurveBase; +typedef AnimationCurveTpl<float> AnimationCurve; +typedef AnimationCurveTpl<Quaternionf> AnimationCurveQuat; +typedef AnimationCurveTpl<Vector3f> AnimationCurveVec3; + +template<class T> +template<class TransferFunction> +inline void KeyframeTpl<T>::Transfer (TransferFunction& transfer) +{ + TRANSFER (time); + TRANSFER (value); + TRANSFER (inSlope); + TRANSFER (outSlope); + #if UNITY_EDITOR + if (!transfer.IsSerializingForGameRelease()) + TRANSFER (tangentMode); + #endif +} + +template<class T> +template<class TransferFunction> +inline void AnimationCurveTpl<T>::Transfer (TransferFunction& transfer) +{ + transfer.SetVersion(2); + + transfer.Transfer (m_Curve, "m_Curve", kHideInEditorMask); + transfer.Transfer (m_PreInfinity, 
"m_PreInfinity", kHideInEditorMask); + transfer.Transfer (m_PostInfinity, "m_PostInfinity", kHideInEditorMask); + + if (transfer.IsReading ()) + InvalidateCache (); +} + +inline int TimeToFrame (float time, float sampleRate) +{ + return RoundfToInt(time * sampleRate); +} + +inline float FrameToTime (int frame, float sampleRate) +{ + return (float)frame / sampleRate; +} + +inline float FloatFrameToTime (float frame, float sampleRate) +{ + return frame / sampleRate; +} + + +void HandleSteppedCurve (const KeyframeTpl<float>& lhs, const KeyframeTpl<float>& rhs, float& value); +void HandleSteppedTangent (const KeyframeTpl<float>& lhs, const KeyframeTpl<float>& rhs, float& value); + +void HandleSteppedCurve (const KeyframeTpl<Vector3f>& lhs, const KeyframeTpl<Vector3f>& rhs, Vector3f& value); +void HandleSteppedTangent (const KeyframeTpl<Vector3f>& lhs, const KeyframeTpl<Vector3f>& rhs, Vector3f& tangent); + +void HandleSteppedCurve (const KeyframeTpl<Quaternionf>& lhs, const KeyframeTpl<Quaternionf>& rhs, Quaternionf& tangent); +void HandleSteppedTangent (const KeyframeTpl<Quaternionf>& lhs, const KeyframeTpl<Quaternionf>& rhs, Quaternionf& tangent); + +inline float PingPong (float t, float length) +{ + t = Repeat (t, length * 2.0F); + t = length - Abs (t - length); + return t; +} + + +inline float Repeat (float t, float begin, float end) +{ + return Repeat (t - begin, end - begin) + begin; +} + +inline double RepeatD (double t, double begin, double end) +{ + return RepeatD (t - begin, end - begin) + begin; +} + +inline float PingPong (float t, float begin, float end) +{ + return PingPong (t - begin, end - begin) + begin; +} + +#if (defined(__GNUC__) && (__GNUC__ >= 4 && __GNUC_MINOR__ >= 3)) || defined(__clang__) + // in GCC 4.3 and above the explicit template specialization cannot have a storage class + #define SPEC_STORAGE_CLASS inline +#else + #define SPEC_STORAGE_CLASS static +#endif + +#define kMaxTan 5729577.9485111479F + +template<class T> +static T MaxTan () { 
return kMaxTan; } + +template<> +SPEC_STORAGE_CLASS Quaternionf MaxTan<Quaternionf> () { return Quaternionf(kMaxTan, kMaxTan, kMaxTan, kMaxTan); } + +template<> +SPEC_STORAGE_CLASS Vector3f MaxTan<Vector3f> () { return Vector3f(kMaxTan, kMaxTan, kMaxTan); } + +#undef kMaxTan + +template<class T> +static T Zero () { return T (); } + +template<> +SPEC_STORAGE_CLASS Quaternionf Zero<Quaternionf> () { return Quaternionf(0.0F, 0.0F, 0.0F, 0.0F); } + +template<> +SPEC_STORAGE_CLASS Vector3f Zero<Vector3f> () { return Vector3f(0.0F, 0.0F, 0.0F); } + +#undef SPEC_STORAGE_CLASS + +void ScaleCurveValue (AnimationCurve& curve, float scale); +void OffsetCurveValue (AnimationCurve& curve, float offset); +void ScaleCurveTime (AnimationCurve& curve, float scale); +void OffsetCurveTime (AnimationCurve& curve, float offset); + +template<class T> +inline T HermiteInterpolate (float t, T p0, T m0, T m1, T p1) +{ + float t2 = t * t; + float t3 = t2 * t; + + float a = 2.0F * t3 - 3.0F * t2 + 1.0F; + float b = t3 - 2.0F * t2 + t; + float c = t3 - t2; + float d = -2.0F * t3 + 3.0F * t2; + + return a * p0 + b * m0 + c * m1 + d * p1; +} + +struct KeyframeCompare +{ + template<class T> + bool operator ()(KeyframeTpl<T> const& k, float t) { return k.time < t; } + // These are necessary for debug STL (validation of predicates) + template<class T> + bool operator ()(KeyframeTpl<T> const& k1, KeyframeTpl<T> const& k2) { return k1.time < k2.time; } + template<class T> + bool operator ()(float t, KeyframeTpl<T> const& k) { return !operator() (k, t); } +}; diff --git a/Runtime/Math/Color.h b/Runtime/Math/Color.h new file mode 100644 index 0000000..594598b --- /dev/null +++ b/Runtime/Math/Color.h @@ -0,0 +1,293 @@ +#ifndef COLOR_H +#define COLOR_H + +#include "Runtime/Serialize/SerializeUtility.h" +#include "Runtime/Serialize/SerializationMetaFlags.h" +#include <algorithm> +#include "Runtime/Utilities/Utility.h" +#include "FloatConversion.h" +#include "Runtime/Serialize/SwapEndianBytes.h" + +class 
ColorRGBAf +{ + public: + float r, g, b, a; + + DEFINE_GET_TYPESTRING_IS_ANIMATION_CHANNEL (ColorRGBA) + + ColorRGBAf () {} + + ColorRGBAf (float inR, float inG, float inB, float inA = 1.0F) : r(inR), g(inG), b(inB), a(inA) {} + explicit ColorRGBAf (const float* c) : r(c[0]), g(c[1]), b(c[2]), a(c[3]) {} + + template<class TransferFunction> + void Transfer (TransferFunction& transfer); + + void Set (float inR, float inG, float inB, float inA) {r = inR; g = inG; b = inB; a = inA;} + + void SetHex (UInt32 hex) + { + Set(float (hex >> 24) / 255.0f, + float ((hex >> 16) & 255) / 255.0f, + float ((hex >> 8) & 255) / 255.0f, + float (hex & 255) / 255.0f); + } + + UInt32 GetHex () const + { + UInt32 hex = (NormalizedToByte(r) << 24) | (NormalizedToByte(g) << 16) | (NormalizedToByte(b) << 8) | NormalizedToByte(a); + return hex; + } + + float AverageRGB () const {return (r+g+b)*(1.0F / 3.0F);} + float GreyScaleValue () const { return r * 0.30f + g * 0.59f + b * 0.11f; } + + ColorRGBAf& operator = (const ColorRGBAf& in) { Set (in.r, in.g, in.b, in.a); return *this; } + + bool Equals(const ColorRGBAf& inRGB) const + { + return (r == inRGB.r && g == inRGB.g && b == inRGB.b && a == inRGB.a); + } + + bool NotEquals(const ColorRGBAf& inRGB) const + { + return (r != inRGB.r || g != inRGB.g || b != inRGB.b || a != inRGB.a); + } + + float* GetPtr () {return &r;} + const float* GetPtr () const {return &r;} + + ColorRGBAf& operator += (const ColorRGBAf &inRGBA) + { + r += inRGBA.r; g += inRGBA.g; b += inRGBA.b; a += inRGBA.a; + return *this; + } + + ColorRGBAf& operator *= (const ColorRGBAf &inRGBA) + { + r *= inRGBA.r; g *= inRGBA.g; b *= inRGBA.b; a *= inRGBA.a; + return *this; + } + +private: + // intentionally undefined + bool operator == (const ColorRGBAf& inRGB) const; + bool operator != (const ColorRGBAf& inRGB) const; +}; + + +inline ColorRGBAf operator + (const ColorRGBAf& inC0, const ColorRGBAf& inC1) +{ + return ColorRGBAf (inC0.r + inC1.r, inC0.g + inC1.g, inC0.b + inC1.b, 
inC0.a + inC1.a); +} + +inline ColorRGBAf operator * (const ColorRGBAf& inC0, const ColorRGBAf& inC1) +{ + return ColorRGBAf (inC0.r * inC1.r, inC0.g * inC1.g, inC0.b * inC1.b, inC0.a * inC1.a); +} + +inline ColorRGBAf operator * (float inScale, const ColorRGBAf& inC0) +{ + return ColorRGBAf (inC0.r * inScale, inC0.g * inScale, inC0.b * inScale, inC0.a * inScale); +} + +inline ColorRGBAf operator * (const ColorRGBAf& inC0, float inScale) +{ + return ColorRGBAf (inC0.r * inScale, inC0.g * inScale, inC0.b * inScale, inC0.a * inScale); +} + +inline ColorRGBAf Lerp (const ColorRGBAf& c0, const ColorRGBAf& c1, float t) +{ + return (1.0f - t) * c0 + t * c1; +} + + + +class ColorRGBA32 +{ + public: + + UInt8 r, g, b, a; + + DEFINE_GET_TYPESTRING_IS_ANIMATION_CHANNEL (ColorRGBA) + + ColorRGBA32 () {} + + ColorRGBA32 (UInt8 inR, UInt8 inG, UInt8 inB, UInt8 inA) { r = inR; g = inG; b = inB; a = inA; } + ColorRGBA32 (UInt32 c) { *(UInt32*)this = c; } + void Set (UInt8 inR, UInt8 inG, UInt8 inB, UInt8 inA) { r = inR; g = inG; b = inB; a = inA; } + + ColorRGBA32 operator = (const ColorRGBA32& c) { *(UInt32*)this = *((UInt32*)&c); return *this;} + + ColorRGBA32 (const ColorRGBAf& c) { Set (c); } + + operator ColorRGBAf() const + { + return ColorRGBAf (ByteToNormalized(r), ByteToNormalized(g), ByteToNormalized(b), ByteToNormalized(a)); + } + + UInt32 AsUInt32 () const { return *(UInt32*)this; } + + void operator = (const ColorRGBAf& c) + { + Set (c); + } + + UInt32 GetUInt32 () { return *(UInt32*)this; } + + void Set (const ColorRGBAf& c) + { + r = NormalizedToByte(c.r); + g = NormalizedToByte(c.g); + b = NormalizedToByte(c.b); + a = NormalizedToByte(c.a); + } + + template<class TransferFunction> + void Transfer (TransferFunction& transfer) + { + transfer.SetVersion (2); + UInt32* c = reinterpret_cast<UInt32*> (this); + // When transferring colors we shouldn't swap bytes. 
+ // UInt32 already convert endianess by default so we convert it two times to keep it the same :) + if (transfer.ConvertEndianess ()) + { + if (transfer.IsReading()) + { + transfer.Transfer (*c, "rgba", kHideInEditorMask); + SwapEndianBytes (*c); + } + else + { + UInt32 temp = *c; + SwapEndianBytes (temp); + transfer.Transfer (temp, "rgba", kHideInEditorMask); + } + } + else + { + transfer.Transfer (*c, "rgba", kHideInEditorMask); + } + } + + UInt8& operator [] (long i) {return GetPtr () [i];} + const UInt8& operator [] (long i)const {return GetPtr () [i];} + + bool operator == (const ColorRGBA32& inRGB) const + { + return (r == inRGB.r && g == inRGB.g && b == inRGB.b && a == inRGB.a) ? true : false; + } + + bool operator != (const ColorRGBA32& inRGB) const + { + return (r != inRGB.r || g != inRGB.g || b != inRGB.b || a != inRGB.a) ? true : false; + } + + UInt8* GetPtr () {return &r;} + const UInt8* GetPtr ()const {return &r;} + + inline ColorRGBA32 operator * (int scale) const + { + //AssertIf (scale < 0 || scale > 255); + scale += 1; + const UInt32& u = reinterpret_cast<const UInt32&> (*this); + UInt32 lsb = (((u & 0x00ff00ff) * scale) >> 8) & 0x00ff00ff; + UInt32 msb = (((u & 0xff00ff00) >> 8) * scale) & 0xff00ff00; + lsb |= msb; + return ColorRGBA32 (lsb); + } + + inline void operator *= (const ColorRGBA32& inC1) + { +#if 0 + r = (r * inC1.r) / 255; + g = (g * inC1.g) / 255; + b = (b * inC1.b) / 255; + a = (a * inC1.a) / 255; +#else // This is much faster, but doesn't guarantee 100% matching result (basically color values van vary 1/255 but not at ends, check out unit test in cpp file). 
+ UInt32& u = reinterpret_cast<UInt32&> (*this); + const UInt32& v = reinterpret_cast<const UInt32&> (inC1); + UInt32 result = (((u & 0x000000ff) * ((v & 0x000000ff) + 1)) >> 8) & 0x000000ff; + result |= (((u & 0x0000ff00) >> 8) * (((v & 0x0000ff00) >> 8) + 1)) & 0x0000ff00; + result |= (((u & 0x00ff0000) * (((v & 0x00ff0000) >> 16) + 1)) >> 8) & 0x00ff0000; + result |= (((u & 0xff000000) >> 8) * (((v & 0xff000000) >> 24) + 1)) & 0xff000000; + u = result; +#endif +} + + inline ColorRGBA32 SwizzleToBGRA() const { return ColorRGBA32(b, g, r, a); } + inline ColorRGBA32 SwizzleToBGR() const { return ColorRGBA32(b, g, r, 255); } + inline ColorRGBA32 SwizzleToARGB() const { return ColorRGBA32(a, r, g, b); } + inline ColorRGBA32 UnswizzleBGRA() const { return ColorRGBA32(b, g, r, a); } + inline ColorRGBA32 UnswizzleARGB() const { return ColorRGBA32(g, b, a, r); } +}; + +#if GFX_OPENGLESxx_ONLY + inline ColorRGBA32 SwizzleColorForPlatform(const ColorRGBA32& col) { return col; } + inline ColorRGBA32 UnswizzleColorForPlatform(const ColorRGBA32& col) { return col; } +#elif UNITY_XENON || UNITY_PS3 || UNITY_WII + inline ColorRGBA32 SwizzleColorForPlatform(const ColorRGBA32& col) { return col.SwizzleToARGB(); } + inline ColorRGBA32 UnswizzleColorForPlatform(const ColorRGBA32& col) { return col.UnswizzleARGB(); } +#else + inline ColorRGBA32 SwizzleColorForPlatform(const ColorRGBA32& col) { return col.SwizzleToBGRA(); } + inline ColorRGBA32 UnswizzleColorForPlatform(const ColorRGBA32& col) { return col.UnswizzleBGRA(); } +#endif + +struct OpColorRGBA32ToUInt32 +{ + typedef UInt32 result_type; + UInt32 operator() (ColorRGBA32 const& arg) const { return arg.AsUInt32(); } +}; + +inline ColorRGBA32 operator + (const ColorRGBA32& inC0, const ColorRGBA32& inC1) +{ + return ColorRGBA32 (std::min<int> (inC0.r + inC1.r, 255), + std::min<int> (inC0.g + inC1.g, 255), + std::min<int> (inC0.b + inC1.b, 255), + std::min<int> (inC0.a + inC1.a, 255)); +} + +inline ColorRGBA32 operator * (const 
ColorRGBA32& inC0, const ColorRGBA32& inC1) +{ +#if 0 + return ColorRGBA32 ((inC0.r * inC1.r) / 255, + (inC0.g * inC1.g) / 255, + (inC0.b * inC1.b) / 255, + (inC0.a * inC1.a) / 255); +#else + // This is much faster, but doesn't guarantee 100% matching result (basically color values van vary 1/255 but not at ends, check out unit test in cpp file). + const UInt32& u = reinterpret_cast<const UInt32&> (inC0); + const UInt32& v = reinterpret_cast<const UInt32&> (inC1); + UInt32 result = (((u & 0x000000ff) * ((v & 0x000000ff) + 1)) >> 8) & 0x000000ff; + result |= (((u & 0x0000ff00) >> 8) * (((v & 0x0000ff00) >> 8) + 1)) & 0x0000ff00; + result |= (((u & 0x00ff0000) * (((v & 0x00ff0000) >> 16) + 1)) >> 8) & 0x00ff0000; + result |= (((u & 0xff000000) >> 8) * (((v & 0xff000000) >> 24) + 1)) & 0xff000000; + return ColorRGBA32 (result); +#endif +} + +inline ColorRGBA32 Lerp(const ColorRGBA32& c0, const ColorRGBA32& c1, int scale) +{ + //AssertIf (scale < 0 || scale > 255); + const UInt32& u0 = reinterpret_cast<const UInt32&> (c0); + const UInt32& u1 = reinterpret_cast<const UInt32&> (c1); + UInt32 vx = u0 & 0x00ff00ff; + UInt32 rb = vx + ((((u1 & 0x00ff00ff) - vx) * scale) >> 8) & 0x00ff00ff; + vx = u0 & 0xff00ff00; + return ColorRGBA32( rb | (vx + ((((u1 >> 8) & 0x00ff00ff) - (vx >> 8)) * scale) & 0xff00ff00) ); +} + + +template<class TransferFunction> +void ColorRGBAf::Transfer (TransferFunction& transfer) +{ + transfer.AddMetaFlag (kTransferUsingFlowMappingStyle); + transfer.Transfer (r, "r", kHideInEditorMask); + transfer.Transfer (g, "g", kHideInEditorMask); + transfer.Transfer (b, "b", kHideInEditorMask); + transfer.Transfer (a, "a", kHideInEditorMask); +} + + +#endif + diff --git a/Runtime/Math/ColorSpaceConversion.cpp b/Runtime/Math/ColorSpaceConversion.cpp new file mode 100644 index 0000000..08d58f5 --- /dev/null +++ b/Runtime/Math/ColorSpaceConversion.cpp @@ -0,0 +1,17 @@ +#include "UnityPrefix.h" +#include "ColorSpaceConversion.h" +#include 
"Runtime/Misc/PlayerSettings.h"
+
+
+// Returns the color space the project is configured to use.
+// Returns kUninitializedColorSpace while no PlayerSettings object exists
+// (GetPlayerSettingsPtr() is null), so callers must be prepared to handle
+// that sentinel value.
+ColorSpace GetActiveColorSpace ()
+{
+	if (GetPlayerSettingsPtr())
+		return GetPlayerSettings().GetValidatedColorSpace();
+	else
+		return kUninitializedColorSpace;
+}
+
+/*
+	TODO:
+	* Fog colors in fixed function pipeline are not adjusted. Ask aras how the fog color gets put into the shader. ApplyFog does something with builtin shader params but it's never passed to shaderstate??? WTF.
+*/
\ No newline at end of file
diff --git a/Runtime/Math/ColorSpaceConversion.h b/Runtime/Math/ColorSpaceConversion.h
new file mode 100644
index 0000000..4eb11da
--- /dev/null
+++ b/Runtime/Math/ColorSpaceConversion.h
@@ -0,0 +1,123 @@
+#pragma once
+
+#include "Color.h"
+
+// kUninitializedColorSpace (-1) is the sentinel returned by
+// GetActiveColorSpace() while the PlayerSettings object is not available.
+enum ColorSpace { kUninitializedColorSpace = -1, kGammaColorSpace = 0, kLinearColorSpace, kMaxColorSpace };
+
+ColorSpace GetActiveColorSpace ();
+
+
+// sRGB (gamma) -> linear transfer function, per:
+// http://www.opengl.org/registry/specs/EXT/framebuffer_sRGB.txt
+// http://www.opengl.org/registry/specs/EXT/texture_sRGB_decode.txt
+// { cs / 12.92, cs <= 0.04045 }
+// { ((cs + 0.055)/1.055)^2.4, cs > 0.04045 }
+
+inline float GammaToLinearSpace (float value)
+{
+	if (value <= 0.04045F)
+		return value / 12.92F;
+	else if (value < 1.0F)
+		return pow((value + 0.055F)/1.055F, 2.4F);
+	else
+		// Inputs > 1 continue along the pure power curve instead of clamping,
+		// presumably to preserve HDR values (mirrors the HDR note on
+		// LinearToGammaSpace below) -- confirm against callers.
+		return pow(value, 2.4F);
+}
+
+// Linear -> sRGB (gamma) transfer function, per:
+// http://www.opengl.org/registry/specs/EXT/framebuffer_sRGB.txt
+// http://www.opengl.org/registry/specs/EXT/texture_sRGB_decode.txt
+// { 0.0, 0 <= cl
+// { 12.92 * c, 0 < cl < 0.0031308
+// { 1.055 * cl^0.41666 - 0.055, 0.0031308 <= cl < 1
+// { 1.0, cl >= 1 <- This has been adjusted since we want to maintain HDR colors
+
+inline float LinearToGammaSpace (float value)
+{
+	if (value <= 0.0F)
+		return 0.0F;
+	else if (value <= 0.0031308F)
+		return 12.92F * value;
+	else if (value <= 1.0F)
+		return 1.055F * powf(value, 0.41666F) - 0.055F;
+	else
+		// Inputs > 1 stay on the power curve (no clamp) so HDR colors survive.
+		return powf(value, 0.41666F);
+}
+
+// Piecewise-linear gamma -> linear conversion for the Xenon target,
+// clamped to [0,1]. Presumably matches the Xbox 360 hardware PWL gamma
+// ramp -- confirm against the platform documentation.
+inline float GammaToLinearSpaceXenon(float val)
+{
+	float ret;
+	if (val < 0)
+		ret = 0;
+	else if (val < 0.25f)
+		ret = 0.25f * val;
+	else if (val < 0.375f)
+		ret = (1.0f/16.0f) + 0.5f*(val-0.25f);
+	else if (val < 0.75f)
+		ret = 0.125f + 1.0f*(val-0.375f);
+	else if (val < 1.0f)
+		ret = 0.5f + 2.0f*(val-0.75f);
+	else
+		ret = 1.0f;
+	return ret;
+}
+
+// Inverse of GammaToLinearSpaceXenon: piecewise-linear linear -> gamma
+// conversion, clamped to [0,1].
+inline float LinearToGammaSpaceXenon(float val)
+{
+	float ret;
+	if (val < 0)
+		ret = 0;
+	else if (val < (1.0f/16.0f))
+		ret = 4.0f * val;
+	else if (val < (1.0f/8.0f))
+		ret = (1.0f/4.0f) + 2.0f*(val-(1.0f/16.0f));
+	else if (val < 0.5f)
+		ret = 0.375f + 1.0f*(val-0.125f);
+	else if (val < 1.0f)
+		ret = 0.75f + 0.5f*(val-0.50f);
+	else
+		ret = 1.0f;
+
+	return ret;
+}
+
+// Per-channel color overloads: r, g and b are converted, alpha is passed
+// through untouched.
+inline ColorRGBAf GammaToLinearSpace (const ColorRGBAf& value)
+{
+	return ColorRGBAf(GammaToLinearSpace(value.r), GammaToLinearSpace(value.g), GammaToLinearSpace(value.b), value.a);
+}
+
+inline ColorRGBAf LinearToGammaSpace (const ColorRGBAf& value)
+{
+	return ColorRGBAf(LinearToGammaSpace(value.r), LinearToGammaSpace(value.g), LinearToGammaSpace(value.b), value.a);
+}
+
+inline ColorRGBAf GammaToLinearSpaceXenon (const ColorRGBAf& value)
+{
+	return ColorRGBAf(GammaToLinearSpaceXenon(value.r), GammaToLinearSpaceXenon(value.g), GammaToLinearSpaceXenon(value.b), value.a);
+}
+
+inline ColorRGBAf LinearToGammaSpaceXenon (const ColorRGBAf& value)
+{
+	return ColorRGBAf(LinearToGammaSpaceXenon(value.r), LinearToGammaSpaceXenon(value.g), LinearToGammaSpaceXenon(value.b), value.a);
+}
+
+// The *ActiveColorSpace helpers convert only when the project is running
+// in linear color space; in gamma (or uninitialized) color space the value
+// is returned unchanged.
+inline float GammaToActiveColorSpace (float value)
+{
+	if (GetActiveColorSpace () == kLinearColorSpace)
+		return GammaToLinearSpace(value);
+	else
+		return value;
+}
+
+inline ColorRGBAf GammaToActiveColorSpace (const ColorRGBAf& value)
+{
+	if (GetActiveColorSpace () == kLinearColorSpace)
+		return GammaToLinearSpace(value);
+	else
+		return value;
+}
+
+inline ColorRGBAf ActiveToGammaColorSpace (const ColorRGBAf& value)
+{
+	if (GetActiveColorSpace () == kLinearColorSpace)
+		return LinearToGammaSpace(value);
+	else
+		return value;
+}
\ No newline at end of file diff --git a/Runtime/Math/FloatConversion.cpp b/Runtime/Math/FloatConversion.cpp new file mode 100644 index 0000000..a76126d --- /dev/null +++ b/Runtime/Math/FloatConversion.cpp @@ -0,0 +1,98 @@ +#include "UnityPrefix.h" +#include "FloatConversion.h" + +#if UNITY_EDITOR + +#include "External/UnitTest++/src/UnitTest++.h" + +FloatToHalfConverter::FloatToHalfConverter() +{ + for (int i = 0; i < 256; i++) + { + int e = i - 127; + if (e < -24) + { + // Too small to represent becomes zero + m_ExponentTable[i] = 0x0000; + m_MantissaShift[i] = 24; + } + else if (e < -14) + { + // Small numbers become denormals + m_ExponentTable[i] = 0x0400 >> (-14 - e); + m_MantissaShift[i] = -1 - e; + } + else if (e < 16) + { + // Handle normalized numbers + m_ExponentTable[i] = (15 + e) << 10; + m_MantissaShift[i] = 13; + } + else if (e < 128) + { + // Large numbers become infinity + m_ExponentTable[i] = 0x7C00; + m_MantissaShift[i] = 24; + } + else + { + // Handle infinity and NaN + m_ExponentTable[i] = 0x7C00; + m_MantissaShift[i] = 13; + } + } +} + +FloatToHalfConverter g_FloatToHalf; + +SUITE (FloatConversionTests) +{ +TEST(FloatConversionTests_FloatToHalf) +{ + // 1 bit sign + for (int s = 0; s <= 1; s++) + { + // 5 bits exponent + for (int ebits = 0; ebits < (1 << 5); ebits++) + { + // 10 bits mantissa + for (int m = 0; m < (1 << 10); m++) + { + int orig = (s << 15) | (ebits << 10) | m; + float val; + HalfToFloat(orig, val); + UInt16 conv; + g_FloatToHalf.Convert(val, conv); + CHECK_EQUAL(orig, conv); + } + } + } +} + +TEST(FloatConversionTests_IsFinite) +{ + float infF = std::numeric_limits<float>::infinity(); + CHECK(IsFinite(0.0f)); + CHECK(IsFinite(1.0f)); + CHECK(IsFinite(FLT_MIN)); + CHECK(IsFinite(FLT_MAX)); + CHECK(IsFinite(-FLT_MIN)); + CHECK(IsFinite(-FLT_MAX)); + CHECK(!IsFinite(infF)); + CHECK(!IsFinite(-infF)); + CHECK(!IsFinite(infF-infF)); + + double infD = std::numeric_limits<double>::infinity(); + CHECK(IsFinite(0.0)); + 
CHECK(IsFinite(1.0)); + CHECK(IsFinite(DBL_MIN)); + CHECK(IsFinite(DBL_MAX)); + CHECK(IsFinite(-DBL_MIN)); + CHECK(IsFinite(-DBL_MAX)); + CHECK(!IsFinite(infD)); + CHECK(!IsFinite(-infD)); + CHECK(!IsFinite(infD-infD)); +} +} + +#endif // UNITY_EDITOR diff --git a/Runtime/Math/FloatConversion.h b/Runtime/Math/FloatConversion.h new file mode 100644 index 0000000..eb6b49c --- /dev/null +++ b/Runtime/Math/FloatConversion.h @@ -0,0 +1,696 @@ +#ifndef FLOATCONVERSION_H +#define FLOATCONVERSION_H + +#include <algorithm> +#include <cmath> +#include <limits> +#include <math.h> + +#if !UNITY_EXTERNAL_TOOL +#include "Runtime/Utilities/LogAssert.h" +#endif + +#if defined(SN_TARGET_PS3) +# include <ppu_intrinsics.h> +#elif defined(__GNUC__) && defined(__ppc__) +# include <ppc_intrinsics.h> +#endif + +#ifndef kPI + #define kPI 3.14159265358979323846264338327950288419716939937510F +#endif + +const float kBiggestFloatSmallerThanOne = 0.99999994f; +const double kBiggestDoubleSmallerThanOne = 0.99999999999999989; + +#if defined(_XBOX) +#define __FSELF __fself +#elif defined(SN_TARGET_PS3) +#define __FSELF __fsels +#endif + +inline float FloatMin(float a, float b) +{ +#if defined(_XBOX) || defined(SN_TARGET_PS3) + return __FSELF((a)-(b), b, a); +#else + return std::min(a, b); +#endif +} + +inline float FloatMax(float a, float b) +{ +#if defined(_XBOX) || defined(SN_TARGET_PS3) + return __FSELF((a)-(b), a, b); +#else + return std::max(a, b); +#endif +} + +inline float Abs (float v) +{ +#if defined(__ppc__) && (defined(__MWERKS__) || defined(SN_TARGET_PS3)) + return __fabsf(v); +#elif defined(_XBOX) + return __fabs(v); +#else + return v < 0.0F ? -v : v; +#endif +} + +inline double Abs (double v) +{ + return v < 0.0 ? -v : v; +} + +inline int Abs (int v) +{ + return v < 0 ? -v : v; +} + +// Floor, ceil and round functions. +// +// When changing or implementing these functions, make sure the tests in MathTest.cpp +// still pass. 
+// +// Floor: rounds to the largest integer smaller than or equal to the input parameter. +// Ceil: rounds to the smallest integer larger than or equal to the input parameter. +// Round: rounds to the nearest integer. Ties (0.5) are rounded up to the smallest integer +// larger than or equal to the input parameter. +// Chop/truncate: use a normal integer cast. +// +// Windows: +// Casts are as fast as a straight fistp on an SSE equipped CPU. This is by far the most common +// scenario and will result in the best code for most users. fistp will use the rounding mode set +// in the control register (round to nearest by default), and needs fiddling to work properly. +// This actually makes code that attempt to use fistp slower than a cast. +// Unless we want round to nearest, in which case fistp should be the best choice, right? But +// it is not. The default rounding mode is round to nearest, but in case of a tie (0.5), round to +// nearest even is used. Thus 0.5 is rounded down to 0, 1.5 is rounded up to 2. +// Conclusion - fistp is useless without stupid fiddling around that actually makes is slower than +// an SSE cast. +// +// OS X Intel: +// Needs investigating +// +// OS X PowerPC: +// Needs investigating +// +// Xbox 360: +// Needs investigating +// +// PS3: +// Needs investigating +// +// iPhone: +// Needs investigating +// +// Android: +// Needs investigating + + +inline int FloorfToInt (float f) +{ + DebugAssertIf (f < INT_MIN || f > INT_MAX); + return f >= 0 ? (int)f : (int)(f - kBiggestFloatSmallerThanOne); +} + +inline UInt32 FloorfToIntPos (float f) +{ + DebugAssertIf (f < 0 || f > UINT_MAX); + return (UInt32)f; +} + +inline float Floorf (float f) +{ + // Use std::floor(). + // We are interested in reliable functions that do not lose precision. + // Casting to int and back to float would not be helpful. + return floor (f); +} + +inline double Floord (double f) +{ + // Use std::floor(). 
+ // We are interested in reliable functions that do not lose precision. + // Casting to int and back to float would not be helpful. + return floor (f); +} + + +inline int CeilfToInt (float f) +{ + DebugAssertIf (f < INT_MIN || f > INT_MAX); + return f >= 0 ? (int)(f + kBiggestFloatSmallerThanOne) : (int)(f); +} + +inline UInt32 CeilfToIntPos (float f) +{ + DebugAssertIf (f < 0 || f > UINT_MAX); + return (UInt32)(f + kBiggestFloatSmallerThanOne); +} + +inline float Ceilf (float f) +{ + // Use std::ceil(). + // We are interested in reliable functions that do not lose precision. + // Casting to int and back to float would not be helpful. + return ceil (f); +} + +inline double Ceild (double f) +{ + // Use std::ceil(). + // We are interested in reliable functions that do not lose precision. + // Casting to int and back to float would not be helpful. + return ceil (f); +} + + +inline int RoundfToInt (float f) +{ + return FloorfToInt (f + 0.5F); +} + +inline UInt32 RoundfToIntPos (float f) +{ + return FloorfToIntPos (f + 0.5F); +} + +inline float Roundf (float f) +{ + return Floorf (f + 0.5F); +} + +inline double Roundf (double f) +{ + return Floord (f + 0.5); +} + + +/// Fast conversion of float [0...1] to 0 ... 65535 +inline int NormalizedToWord (float f) +{ + f = FloatMax (f, 0.0F); + f = FloatMin (f, 1.0F); + return RoundfToIntPos (f * 65535.0f); +} + +/// Fast conversion of float [0...1] to 0 ... 65535 +inline float WordToNormalized (int p) +{ + AssertIf(p < 0 || p > 65535); + return (float)p / 65535.0F; +} + +/// Fast conversion of float [0...1] to 0 ... 255 +inline int NormalizedToByte (float f) +{ + f = FloatMax (f, 0.0F); + f = FloatMin (f, 1.0F); + return RoundfToIntPos (f * 255.0f); +} + +/// Fast conversion of float [0...1] to 0 ... 
255 +inline float ByteToNormalized (int p) +{ + AssertIf(p < 0 || p > 255); + return (float)p / 255.0F; +} + + +// Returns float remainder for t / length +inline float Repeat (float t, float length) +{ + return t - Floorf (t / length) * length; +} + +// Returns double remainder for t / length +inline double RepeatD (double t, double length) +{ + return t - floor (t / length) * length; +} + +// Returns relative angle on the interval (-pi, pi] +inline float DeltaAngleRad (float current, float target) +{ + float delta = Repeat ((target - current), 2 * kPI); + if (delta > kPI) + delta -= 2 * kPI; + return delta; +} + +// Returns true if the distance between f0 and f1 is smaller than epsilon +inline bool CompareApproximately (float f0, float f1, float epsilon = 0.000001F) +{ + float dist = (f0 - f1); + dist = Abs (dist); + return dist < epsilon; +} + +/// CopySignf () returns x with its sign changed to y's. +inline float CopySignf (float x, float y) +{ + union + { + float f; + UInt32 i; + } u, u0, u1; + u0.f = x; u1.f = y; + UInt32 a = u0.i; + UInt32 b = u1.i; + SInt32 mask = 1 << 31; + UInt32 sign = b & mask; + a &= ~mask; + a |= sign; + + u.i = a; + return u.f; +} + +inline int CompareFloatRobustSignUtility (float A) +{ + // The sign bit of a number is the high bit. + union + { + float f; + int i; + } u; + u.f = A; + return (u.i) & 0x80000000; +} + +inline bool CompareFloatRobust (float f0, float f1, int maxUlps = 10) +{ + // After adjusting floats so their representations are lexicographically + // ordered as twos-complement integers a very small positive number + // will compare as 'close' to a very small negative number. If this is + // not desireable, and if you are on a platform that supports + // subnormals (which is the only place the problem can show up) then + // you need this check. + // The check for A == B is because zero and negative zero have different + // signs but are equal to each other. 
+ if (CompareFloatRobustSignUtility(f0) != CompareFloatRobustSignUtility(f1)) + return f0 == f1; + + union + { + float f; + int i; + } u0, u1; + u0.f = f0; + u1.f = f1; + int aInt = u0.i; + // Make aInt lexicographically ordered as a twos-complement int + if (aInt < 0) + aInt = 0x80000000 - aInt; + // Make bInt lexicographically ordered as a twos-complement int + int bInt = u1.i; + if (bInt < 0) + bInt = 0x80000000 - bInt; + + // Now we can compare aInt and bInt to find out how far apart A and B + // are. + int intDiff = Abs (aInt - bInt); + if (intDiff <= maxUlps) + return true; + return false; +} + +// Returns the t^2 +template<class T> +T Sqr (const T& t) +{ + return t * t; +} + +#define kDeg2Rad (2.0F * kPI / 360.0F) +#define kRad2Deg (1.F / kDeg2Rad) + +inline float Deg2Rad (float deg) +{ + // TODO : should be deg * kDeg2Rad, but can't be changed, + // because it changes the order of operations and that affects a replay in some RegressionTests + return deg / 360.0F * 2.0F * kPI; +} + +inline float Rad2Deg (float rad) +{ + // TODO : should be rad * kRad2Deg, but can't be changed, + // because it changes the order of operations and that affects a replay in some RegressionTests + return rad / 2.0F / kPI * 360.0F; +} + +inline float Lerp (float from, float to, float t) +{ + return to * t + from * (1.0F - t); +} + +inline bool IsNAN (float value) +{ + #if defined __APPLE_CC__ + return value != value; + #elif _MSC_VER + return _isnan(value) != 0; + #else + return isnan (value); + #endif +} + +inline bool IsNAN (double value) +{ + #if defined __APPLE_CC__ + return value != value; + #elif _MSC_VER + return _isnan(value) != 0; + #else + return isnan (value); + #endif +} + +inline bool IsPlusInf(float value) { return value == std::numeric_limits<float>::infinity (); } +inline bool IsMinusInf(float value) { return value == -std::numeric_limits<float>::infinity (); } + +inline bool IsFinite(const float& value) +{ + // Returns false if value is NaN or +/- infinity + UInt32 
intval = *reinterpret_cast<const UInt32*>(&value); + return (intval & 0x7f800000) != 0x7f800000; +} + +inline bool IsFinite(const double& value) +{ + // Returns false if value is NaN or +/- infinity + UInt64 intval = *reinterpret_cast<const UInt64*>(&value); + return (intval & 0x7ff0000000000000LL) != 0x7ff0000000000000LL; +} + +inline float InvSqrt (float p) { return 1.0F / sqrt (p); } +inline float Sqrt (float p) { return sqrt (p); } + +/// - Almost highest precision sqrt +/// - Returns 0 if value is 0 or -1 +/// inline float FastSqrt (float value) + +/// - Almost highest precision inv sqrt +/// - if value == 0 or -0 it returns 0. +/// inline float FastInvSqrt (float value) + +/// - Low precision inv sqrt approximately +/// - if value == 0 or -0 it returns nan or undefined +/// inline float FastestInvSqrt (float value) + +#if defined(__ppc__) || defined(SN_TARGET_PS3) + +#if UNITY_WII +// Copied from <CodeWarrior>\PowerPC_EABI_Support\MSL\MSL_C\PPC_EABI\Include\math_ppc_inlines.h +// Requires hardware floating to be enabled +// P.S I've also profiled with function below which uses fabs(x) == 0.0F, it's two times slower than this one +inline float FastSqrt (float x) +{ + static const double _half=.5f; + static const double _three=3.0f; + + if(x > 0.0f) + { + double xd = (double)x; + double guess = __frsqrte(xd); /* returns an approximation to */ + guess = _half*guess*(_three - guess*guess*xd); /* now have 12 sig bits */ + guess = _half*guess*(_three - guess*guess*xd); /* now have 24 sig bits */ + return (float)(xd * guess); + } + else if (x < 0.0) + return NAN; + else + return x; +} +#else +/// - Accurate to 1 bit precision +/// - returns zero if x is zero +inline float FastSqrt (float x) +{ + const float half = 0.5; + const float one = 1.0; + float B, y0, y1; + + // This'll NaN if it hits frsqrte. 
Handle both +0.0 and -0.0 + if (fabs(x) == 0.0F) + return x; + + B = x; + +#if defined(__GNUC__) && !defined(SN_TARGET_PS3) + y0 = __frsqrtes(B); +#else + y0 = __frsqrte(B); +#endif + // First refinement step + + y1 = y0 + half*y0*(one - B*y0*y0); + + // Second refinement step -- copy the output of the last step to the input of this step + + y0 = y1; + y1 = y0 + half*y0*(one - B*y0*y0); + + // Get sqrt(x) from x * 1/sqrt(x) + return x * y1; +} +#endif + +/// - Accurate to 1 bit precision +/// - returns zero if f is zero +inline float FastInvSqrt( float f ) +{ + float result; + float estimate, estimate2; + float oneHalf = 0.5f; + float one = oneHalf + oneHalf; + //Calculate a 5 bit starting estimate for the reciprocal sqrt +#if defined(__GNUC__) && !defined(SN_TARGET_PS3) + estimate = estimate2 = __frsqrtes ( f ); +#else + estimate = estimate2 = __frsqrte ( f ); +#endif + + //if you require less precision, you may reduce the number of loop iterations + estimate = estimate + oneHalf * estimate * ( one - f * estimate * estimate ); + estimate = estimate + oneHalf * estimate * ( one - f * estimate * estimate ); + +#if defined(__GNUC__) && !defined(SN_TARGET_PS3) + result = __fsels( -f, estimate2, estimate ); +#else + result = __fsel( -f, estimate2, estimate ); +#endif + return result; +} + +/// Fast inverse sqrt function +inline float FastestInvSqrt (float value) +{ + #if defined (__ppc__) && (defined (__MWERKS__) || defined(SN_TARGET_PS3)) + return (float)__frsqrte (value); + #elif defined (__ppc__) + return (float)__frsqrtes(value); + #else + return 1.0F / sqrtf (value); + #endif +} + +#else + +inline float FastSqrt (float value) +{ + return sqrtf (value); +} + +inline float FastInvSqrt( float f ) +{ + // The Newton iteration trick used in FastestInvSqrt is a bit faster on + // Pentium4 / Windows, but lower precision. Doing two iterations is precise enough, + // but actually a bit slower. 
+ if (fabs(f) == 0.0F) + return f; + return 1.0F / sqrtf (f); +} + +inline float FastestInvSqrt( float f ) +{ + union + { + float f; + int i; + } u; + float fhalf = 0.5f*f; + u.f = f; + int i = u.i; + i = 0x5f3759df - (i>>1); + u.i = i; + f = u.f; + f = f*(1.5f - fhalf*f*f); + // f = f*(1.5f - fhalf*f*f); // uncommenting this would be two iterations + return f; +} + +#endif + +inline float SqrtImpl (float f) +{ + #if UNITY_WII || UNITY_FLASH + return FastSqrt (f); + #else + return sqrt (f); + #endif +} +inline float Sin (float f) +{ + return sinf (f); +} + +inline float Pow (float f, float f2) +{ + return powf (f, f2); +} + +inline float Cos (float f) +{ + return cosf (f); +} + +inline float Sign (float f) +{ +#if defined(_XBOX) + return __fsel(f, 1.0f, -1.0f); +#else + if (f < 0.0F) + return -1.0F; + else + return 1.0; +#endif +} + +#if UNITY_EDITOR + +class FloatToHalfConverter +{ +public: + FloatToHalfConverter(); + + void Convert(const float& src, UInt16& dest) + { + UInt32 bits = *reinterpret_cast<const UInt32*>(&src); + UInt8 index = UInt8(bits >> 23); + UInt32 sign = bits & 0x80000000; + UInt32 mantissa = bits & 0x007fffff; + dest = (sign >> 16) | m_ExponentTable[index] | (mantissa >> m_MantissaShift[index]); + } + +private: + UInt16 m_ExponentTable[256]; + UInt8 m_MantissaShift[256]; +}; + +extern FloatToHalfConverter g_FloatToHalf; + +#endif // UNITY_EDITOR + +#if UNITY_SUPPORTS_SSE +#include "Runtime/Math/Simd/SimdMath.h" + +#define SSE_CONST4(name, val) static const ALIGN16 UInt32 name[4] = { (val), (val), (val), (val) } +#define CONST_M128I(name) *(const __m128i *)&name + +static ALIGN16 UInt16 source[] = {0,0,0,0,0,0,0,0}; +static ALIGN16 float destination[] = {0.0,0.0,0.0,0.0}; + +static void HalfToFloat(UInt16 src, float& dest) +{ + SSE_CONST4(mask_nosign, 0x7fff); + SSE_CONST4(smallest_normal, 0x0400); + SSE_CONST4(infinity, 0x7c00); + SSE_CONST4(expadjust_normal, (127 - 15) << 23); + SSE_CONST4(magic_denorm, 113 << 23); + + source[0] = src; + 
__m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(source)); + __m128i mnosign = CONST_M128I(mask_nosign); + __m128i eadjust = CONST_M128I(expadjust_normal); + __m128i smallest = CONST_M128I(smallest_normal); + __m128i infty = CONST_M128I(infinity); + __m128i expmant = _mm_and_si128(mnosign, in); + __m128i justsign = _mm_xor_si128(in, expmant); + __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant); + __m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant); + __m128i shifted = _mm_slli_epi32(expmant, 13); + __m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust); + __m128i adjusted = _mm_add_epi32(eadjust, shifted); + __m128i den1 = _mm_add_epi32(shifted, CONST_M128I(magic_denorm)); + __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan); + __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm); + __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm)); + __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2)); + __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4); + __m128i sign = _mm_slli_epi32(justsign, 16); + __m128 out = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign)); + _mm_storeu_ps(destination, out); + dest = destination[0]; +#undef SSE_CONST4 +#undef CONST_M128I +} + +#else + +static void HalfToFloat(UInt16 src, float& dest) +{ + // Integer alias + UInt32& bits = *reinterpret_cast<UInt32*>(&dest); + + // Based on Fabian Giesen's public domain half_to_float_fast3 + static const UInt32 magic = { 113 << 23 }; + const float& magicFloat = *reinterpret_cast<const float*>(&magic); + static const UInt32 shiftedExp = 0x7c00 << 13; // exponent mask after shift + + // Mask out sign bit + bits = src & 0x7fff; + if (bits) + { + // Move exponent + mantissa to correct bits + bits <<= 13; + UInt32 exponent = bits & shiftedExp; + if (exponent == 0) + { + // Handle denormal + bits += magic; + dest -= magicFloat; + } + else if (exponent == shiftedExp) // Inf/NaN + bits += (255 - 31) << 23; 
+ else + bits += (127 - 15) << 23; + } + + // Copy sign bit + bits |= (src & 0x8000) << 16; +} + +#endif + +using std::cos; +using std::pow; +using std::atan2; +using std::acos; +using std::sin; +using std::sqrt; +using std::log; +using std::exp; + +// On non-C99 platforms log2 is not available, so approximate it. +#if UNITY_WIN || UNITY_XENON || UNITY_ANDROID || UNITY_FLASH || UNITY_WEBGL +#define kNaturalLogarithm2 0.693147180559945309417 +#define Log2(x) (logf(x) / kNaturalLogarithm2) +#else +#define Log2(x) log2f(x) +#endif + + +#endif diff --git a/Runtime/Math/FloatExceptions.cpp b/Runtime/Math/FloatExceptions.cpp new file mode 100644 index 0000000..d8c9563 --- /dev/null +++ b/Runtime/Math/FloatExceptions.cpp @@ -0,0 +1,18 @@ +#include "FloatExceptions.h" + + +#if 0 && DEBUGMODE && defined(__SSE__) + +void InitFloatExceptions () +{ + _MM_SET_EXCEPTION_MASK (_MM_GET_EXCEPTION_MASK () & ~_MM_MASK_INVALID); +} + +#else + +void InitFloatExceptions () +{ +} + +#endif + diff --git a/Runtime/Math/FloatExceptions.h b/Runtime/Math/FloatExceptions.h new file mode 100644 index 0000000..d2002e9 --- /dev/null +++ b/Runtime/Math/FloatExceptions.h @@ -0,0 +1,6 @@ +#ifndef FLOATEXCEPTIONS_H +#define FLOATEXCEPTIONS_H + +void InitFloatExceptions (); + +#endif diff --git a/Runtime/Math/Gradient.cpp b/Runtime/Math/Gradient.cpp new file mode 100644 index 0000000..9544a2b --- /dev/null +++ b/Runtime/Math/Gradient.cpp @@ -0,0 +1,310 @@ +#include "UnityPrefix.h" +#include "Runtime/BaseClasses/ObjectDefines.h" +#include "Runtime/Serialize/TransferFunctions/SerializeTransfer.h" +#include "Runtime/Math/Gradient.h" + +GradientNEW::GradientNEW() +: m_NumColorKeys(2) +, m_NumAlphaKeys(2) +{ + m_Keys[0] = m_Keys[1] = ColorRGBA32(0xffffffff); + m_ColorTime[0] = m_AlphaTime[0] = NormalizedToWord(0.0f); + m_ColorTime[1] = m_AlphaTime[1] = NormalizedToWord(1.0f); + + for(UInt32 i = 2; i < kGradientMaxNumKeys; i++) + { + m_Keys[i] = ColorRGBA32(0); + m_ColorTime[i] = NormalizedToWord(0.0f); + 
m_AlphaTime[i] = NormalizedToWord(0.0f); + } +} + +GradientNEW::~GradientNEW() +{ +} + +void GradientNEW::SetKeys (ColorKey* colorKeys, unsigned numColorKeys, AlphaKey* alphaKeys, unsigned numAlphaKeys) +{ + SetColorKeys (colorKeys, numColorKeys); + SetAlphaKeys (alphaKeys, numAlphaKeys); +} + +void GradientNEW::SwapColorKeys(int i, int j) +{ + ColorRGBA32 tmpCol = m_Keys[i]; + UInt16 tmpTime = m_ColorTime[i]; + m_Keys[i].r = m_Keys[j].r; + m_Keys[i].g = m_Keys[j].g; + m_Keys[i].b = m_Keys[j].b; + m_ColorTime[i] = m_ColorTime[j]; + m_Keys[j].r = tmpCol.r; + m_Keys[j].g = tmpCol.g; + m_Keys[j].b = tmpCol.b; + m_ColorTime[j] = tmpTime; +} + +void GradientNEW::SwapAlphaKeys(int i, int j) +{ + ColorRGBA32 tmpCol = m_Keys[i]; + UInt16 tmpTime = m_AlphaTime[i]; + m_Keys[i].a = m_Keys[j].a; + m_AlphaTime[i] = m_AlphaTime[j]; + m_Keys[j].a = tmpCol.a; + m_AlphaTime[j] = tmpTime; +} + +void GradientNEW::SetColorKeys (ColorKey* colorKeys, unsigned numKeys) +{ + DebugAssert (numKeys <= kGradientMaxNumKeys); + if (numKeys > kGradientMaxNumKeys) + numKeys = kGradientMaxNumKeys; + + for (int i=0; i<numKeys; ++i) + { + const ColorRGBAf& color = colorKeys[i].m_Color; + m_Keys[i].r = NormalizedToByte(color.r); + m_Keys[i].g = NormalizedToByte(color.g); + m_Keys[i].b = NormalizedToByte(color.b); + m_ColorTime[i] = NormalizedToWord(colorKeys[i].m_Time); + } + m_NumColorKeys = numKeys; + + // Ensure sorted! 
+ int i = 0; + const int keyCount = m_NumColorKeys; + while ((i + 1) < keyCount) + { + if (m_ColorTime[i] > m_ColorTime[i+1]) + { + SwapColorKeys(i, i + 1); + if (i > 0) + i -= 2; + } + i++; + } + + ValidateColorKeys(); +} + +void GradientNEW::SetAlphaKeys (AlphaKey* alphaKeys, unsigned numKeys) +{ + DebugAssert (numKeys <= kGradientMaxNumKeys); + if (numKeys > kGradientMaxNumKeys) + numKeys = kGradientMaxNumKeys; + + for (int i=0; i<numKeys; ++i) + { + float alpha = alphaKeys[i].m_Alpha; + m_Keys[i].a = NormalizedToByte(alpha); + m_AlphaTime[i] = NormalizedToWord(alphaKeys[i].m_Time); + } + m_NumAlphaKeys = numKeys; + + // Ensure sorted! + int i = 0; + const int keyCount = m_NumAlphaKeys; + while ((i + 1) < keyCount) + { + if (m_AlphaTime[i] > m_AlphaTime[i+1]) + { + SwapAlphaKeys(i, i + 1); + if (i > 0) + i -= 2; + } + i++; + } + + ValidateAlphaKeys(); +} + +ColorRGBA32 GradientNEW::GetConstantColor () const +{ + return m_Keys[0]; +} + +void GradientNEW::SetConstantColor (ColorRGBA32 color) +{ + m_Keys[0] = color; + m_NumAlphaKeys = 1; + m_NumColorKeys = 1; +} + +void GradientNEW::ValidateColorKeys() +{ + // Make sure there is a minimum of 2 keys + if(m_NumColorKeys < 2) + { + m_NumColorKeys = 2; + for(int rgb = 0; rgb < 3; rgb++) + m_Keys[1][rgb] = m_Keys[0][rgb]; + m_ColorTime[0] = NormalizedToWord(0.0f); + m_ColorTime[1] = NormalizedToWord(1.0f); + } +} + +void GradientNEW::ValidateAlphaKeys() +{ + // Make sure there is a minimum of 2 keys + if(m_NumAlphaKeys < 2) + { + m_NumAlphaKeys = 2; + m_Keys[1].a = m_Keys[0].a; + m_AlphaTime[0] = NormalizedToWord(0.0f); + m_AlphaTime[1] = NormalizedToWord(1.0f); + } +} + +void GradientNEW::InitializeOptimized(OptimizedGradient& gradient) +{ + // Copy all time values + for(int i = 0; i < m_NumColorKeys; ++i) + gradient.times[i] = m_ColorTime[i]; + + for(int i = 0, i2 = m_NumColorKeys; i < m_NumAlphaKeys; ++i, ++i2) + gradient.times[i2] = m_AlphaTime[i]; + + // Remove duplicates + int keyCount = m_NumColorKeys + 
m_NumAlphaKeys; + for(int i = 0; i < keyCount-1; ++i) + { + for(int j = i+1; j < keyCount; ) + { + if(gradient.times[i] == gradient.times[j]) + { + std::swap(gradient.times[j], gradient.times[keyCount-1]); + keyCount--; + continue; + } + ++j; + } + } + + // Sort + int i = 0; + while ((i + 1) < keyCount) + { + if (gradient.times[i] > gradient.times[i+1]) + { + std::swap(gradient.times[i], gradient.times[i+1]); + if (i > 0) + i -= 2; + } + i++; + } + + for(int i = 0; i < keyCount; ++i) + gradient.colors[i] = Evaluate(WordToNormalized(gradient.times[i])); + gradient.keyCount = keyCount; + + for(int i = 1; i < keyCount; ++i) + gradient.rcp[i] = ((((1<<24)) / std::max<UInt32>(gradient.times[i] - gradient.times[i-1], 1)))+1; +} + +template<class TransferFunction> +void GradientNEW::Transfer (TransferFunction& transfer) +{ + AssertIf (kGradientMaxNumKeys > 9); + + const char* kKeyNames [kGradientMaxNumKeys] = { "key0", "key1", "key2", "key3", "key4", "key5", "key6", "key7", }; + for(UInt32 i = 0; i < kGradientMaxNumKeys; i++) + transfer.Transfer(m_Keys[i], kKeyNames[i], kHideInEditorMask); + + const char* kColorTimeNames [kGradientMaxNumKeys] = { "ctime0", "ctime1", "ctime2", "ctime3", "ctime4", "ctime5", "ctime6", "ctime7", }; + for(UInt32 i = 0; i < kGradientMaxNumKeys; i++) + transfer.Transfer(m_ColorTime[i], kColorTimeNames[i], kHideInEditorMask); + + const char* kAlphaTimeNames [kGradientMaxNumKeys] = { "atime0", "atime1", "atime2", "atime3", "atime4", "atime5", "atime6", "atime7", }; + for(UInt32 i = 0; i < kGradientMaxNumKeys; i++) + transfer.Transfer(m_AlphaTime[i], kAlphaTimeNames[i], kHideInEditorMask); + + transfer.Transfer (m_NumColorKeys, "m_NumColorKeys", kHideInEditorMask); + transfer.Transfer (m_NumAlphaKeys, "m_NumAlphaKeys", kHideInEditorMask); + transfer.Align(); + + if(transfer.IsReading()) + { + ValidateColorKeys(); + ValidateAlphaKeys(); + } +} +INSTANTIATE_TEMPLATE_TRANSFER(GradientNEW) + + +#if ENABLE_UNIT_TESTS + +#include 
"External/UnitTest++/src/UnitTest++.h" + +bool CompareColors(ColorRGBA32 c0, ColorRGBA32 c1, int tolerance) +{ + if(Abs(c0.r-c1.r) > tolerance) + return false; + if(Abs(c0.g-c1.g) > tolerance) + return false; + if(Abs(c0.b-c1.b) > tolerance) + return false; + if(Abs(c0.a-c1.a) > tolerance) + return false; + return true; +} + +SUITE (GradientTests) +{ +TEST (GradientTests_GradientEvaluate) +{ + // Set up rainbow gradient + GradientNEW gradient; + gradient.SetNumColorKeys(5); + gradient.SetNumAlphaKeys(3); + gradient.GetKey(0) = ColorRGBA32(0xff, 0x00, 0x00, 0xff); + gradient.GetKey(1) = ColorRGBA32(0xf8, 0xff, 0x00, 0x00); + gradient.GetKey(2) = ColorRGBA32(0x00, 0xff, 0x49, 0xff); + gradient.GetKey(3) = ColorRGBA32(0x22, 0x00, 0xff, 0x00); + gradient.GetKey(4) = ColorRGBA32(0xff, 0x00, 0xe6, 0x00); + gradient.GetColorTime(0) = 0x0000; + gradient.GetColorTime(1) = 0x40c1; + gradient.GetColorTime(2) = 0x9212; + gradient.GetColorTime(3) = 0xce4e; + gradient.GetColorTime(4) = 0xffff; + gradient.GetAlphaTime(0) = 0x0000; + gradient.GetAlphaTime(1) = 0x8000; + gradient.GetAlphaTime(2) = 0xffff; + + CHECK_EQUAL(ColorRGBA32(0xff, 0x00, 0x00, 0xff) == gradient.Evaluate(0.0f), true); + CHECK_EQUAL(ColorRGBA32(0xfd, 0x31, 0x00, 0xe6) == gradient.Evaluate(0.05f), true); + CHECK_EQUAL(ColorRGBA32(0xfa, 0x96, 0x00, 0xb3) == gradient.Evaluate(0.15f), true); + CHECK_EQUAL(ColorRGBA32(0xf8, 0xfc, 0x00, 0x7f) == gradient.Evaluate(0.25f), true); + CHECK_EQUAL(ColorRGBA32(0xac, 0xff, 0x16, 0x4c) == gradient.Evaluate(0.35f), true); + CHECK_EQUAL(ColorRGBA32(0x5e, 0xff, 0x2d, 0x19) == gradient.Evaluate(0.45f), true); + CHECK_EQUAL(ColorRGBA32(0x10, 0xff, 0x44, 0x18) == gradient.Evaluate(0.55f), true); + CHECK_EQUAL(ColorRGBA32(0x0b, 0xa9, 0x86, 0x4b) == gradient.Evaluate(0.65f), true); + CHECK_EQUAL(ColorRGBA32(0x19, 0x3c, 0xd3, 0x7e) == gradient.Evaluate(0.75f), true); + CHECK_EQUAL(ColorRGBA32(0x54, 0x00, 0xf9, 0xb2) == gradient.Evaluate(0.85f), true); + CHECK_EQUAL(ColorRGBA32(0xc6, 
0x00, 0xec, 0xe5) == gradient.Evaluate(0.95f), true); + CHECK_EQUAL(ColorRGBA32(0xff, 0x00, 0xe6, 0xff) == gradient.Evaluate(1.0f), true); + + OptimizedGradient optGradient; + gradient.InitializeOptimized(optGradient); + +#if UNITY_LINUX +#warning Investigate/fix GradientEvaluateTest! +#else + // Being off by 1LSB is okay... (due to rounding) + for(float time = 0.0f; time <= 1.0f; time += 0.02f) + CHECK_EQUAL(CompareColors(optGradient.Evaluate(time), gradient.Evaluate(time), 1), true); + + // ... but require exactness precicely at key times + for(int i = 0; i < 5; i++) + { + float time = WordToNormalized(gradient.GetColorTime(i)); + CHECK_EQUAL(CompareColors(optGradient.Evaluate(time), gradient.Evaluate(time), 0), true); + } + for(int i = 0; i < 3; i++) + { + float time = WordToNormalized(gradient.GetAlphaTime(i)); + CHECK_EQUAL(CompareColors(optGradient.Evaluate(time), gradient.Evaluate(time), 0), true); + } +#endif +} +} + +#endif diff --git a/Runtime/Math/Gradient.h b/Runtime/Math/Gradient.h new file mode 100644 index 0000000..8469033 --- /dev/null +++ b/Runtime/Math/Gradient.h @@ -0,0 +1,219 @@ +#ifndef GRADIENT_H +#define GRADIENT_H + +#include "Color.h" +#include "Runtime/Utilities/LogAssert.h" + +enum +{ + kGradientMaxNumKeys = 8, + kOptimizedGradientMaxNumKeys = kGradientMaxNumKeys + kGradientMaxNumKeys, // color keys + alpha keys +}; + +// Optimized version of gradient +struct OptimizedGradient +{ + static inline UInt32 InverseLerpWordOptimized (UInt32 from, UInt32 rcp, UInt32 v) + { + DebugAssert((from & 0xffff) == from); + DebugAssert((v & 0xffff) == v); + return ((v - from) * rcp)>>16; + } + + inline ColorRGBA32 Evaluate(float normalizedTime) const + { + DebugAssert((normalizedTime >= 0.0f) && (normalizedTime <= 1.0f)); + DebugAssert(keyCount >= 2); + + UInt32 time = NormalizedToWord(normalizedTime); + + // Color blend + const UInt32 numKeys = keyCount; + time = std::min(std::max((UInt32)times[0], time), (UInt32)times[keyCount-1]); // TODO: Is this 
necessary? + for (int i = 1; i < numKeys; i++) + { + const UInt32 currTime = times[i]; + if(time <= currTime) + { + const UInt32 prevTime = times[i-1]; + const UInt32 frac = InverseLerpWordOptimized(prevTime, rcp[i], time); + return Lerp (colors[i-1], colors[i], frac); + } + } + return ColorRGBA32 (0xff,0xff,0xff,0xff); + } + + ColorRGBA32 colors[kOptimizedGradientMaxNumKeys]; + UInt32 times[kOptimizedGradientMaxNumKeys]; + UInt32 rcp[kOptimizedGradientMaxNumKeys]; // precomputed reciprocals + UInt32 keyCount; +}; + +// Work in progress (Rename NEW to something else when found..) +class GradientNEW +{ +public: + GradientNEW (); + ~GradientNEW (); + + DECLARE_SERIALIZE_NO_PPTR (GradientNEW) + + ColorRGBA32 Evaluate(float time) const; + + struct ColorKey + { + DEFINE_GET_TYPESTRING (GradientColorKey) + + ColorKey () {} + ColorKey (ColorRGBAf color, float time) {m_Color = color; m_Time = time;} + ColorRGBAf m_Color; + float m_Time; + }; + + struct AlphaKey + { + DEFINE_GET_TYPESTRING (GradientAlphaKey) + + AlphaKey () {} + AlphaKey (float alpha, float time) {m_Alpha = alpha; m_Time = time;} + float m_Alpha; + float m_Time; + }; + + void SetKeys (ColorKey* colorKeys, unsigned numColorKeys, AlphaKey* alphaKeys, unsigned numAlphaKeys); + + void SetColorKeys (ColorKey* colorKeys, unsigned numKeys); + void SetAlphaKeys (AlphaKey* alphaKeys, unsigned numKeys); + + void SetNumColorKeys (int numColorKeys) { m_NumColorKeys = numColorKeys;}; + void SetNumAlphaKeys (int numAlphaKeys) { m_NumAlphaKeys = numAlphaKeys; }; + + int GetNumColorKeys () const { return m_NumColorKeys; } + int GetNumAlphaKeys () const { return m_NumAlphaKeys; } + + ColorRGBA32& GetKey (unsigned index) { return m_Keys[index]; } + const ColorRGBA32& GetKey (unsigned index) const { return m_Keys[index]; } + + UInt16& GetColorTime (unsigned index) { return m_ColorTime[index]; } + const UInt16& GetColorTime (unsigned index) const { return m_ColorTime[index]; } + + UInt16& GetAlphaTime(unsigned index) { return 
m_AlphaTime[index]; } + const UInt16& GetAlphaTime(unsigned index) const { return m_AlphaTime[index]; } + + ColorRGBA32 GetConstantColor () const; + void SetConstantColor (ColorRGBA32 color); + + void SwapColorKeys (int i, int j); + void SwapAlphaKeys (int i, int j); + + void InitializeOptimized(OptimizedGradient& g); + +private: + static inline UInt32 InverseLerpWord (UInt32 from, UInt32 to, UInt32 v) + { + DebugAssert((from & 0xffff) == from); + DebugAssert((to & 0xffff) == to); + DebugAssert((v & 0xffff) == v); + DebugAssert (from <= to); + UInt32 nom = (v - from) << 16; + UInt32 den = std::max<UInt32>(to - from, 1); + UInt32 res = nom / den; + return res; + } + + static inline UInt32 LerpByte(UInt32 u0, UInt32 u1, UInt32 scale) + { + DebugAssert((u0 & 0xff) == u0); + DebugAssert((u1 & 0xff) == u1); + //DebugAssert((scale & 0xff) == scale); + return u0 + (((u1 - u0) * scale) >> 8) & 0xff; + } + + void ValidateColorKeys(); + void ValidateAlphaKeys(); + + ColorRGBA32 m_Keys[kGradientMaxNumKeys]; + UInt16 m_ColorTime[kGradientMaxNumKeys]; + UInt16 m_AlphaTime[kGradientMaxNumKeys]; + UInt8 m_NumColorKeys; + UInt8 m_NumAlphaKeys; +}; + +inline ColorRGBA32 GradientNEW::Evaluate(float normalizedTime) const +{ + DebugAssert((normalizedTime >= 0.0f) && (normalizedTime <= 1.0f)); + DebugAssert(m_NumColorKeys >= 2); + DebugAssert(m_NumAlphaKeys >= 2); + + ColorRGBA32 color = ColorRGBA32 (0xff,0xff,0xff,0xff); + const UInt32 time = NormalizedToWord(normalizedTime); + + // Color blend + const UInt32 numColorKeys = m_NumColorKeys; + const UInt32 timeColor = std::min(std::max((UInt32)m_ColorTime[0], time), (UInt32)m_ColorTime[numColorKeys-1]); + for (int i = 1; i < numColorKeys; i++) + { + const UInt32 currTime = m_ColorTime[i]; + if(timeColor <= currTime) + { + const UInt32 prevTime = m_ColorTime[i-1]; + const UInt32 frac = InverseLerpWord(prevTime, currTime, timeColor) >> 8; // frac is byte + color = Lerp (m_Keys[i-1], m_Keys[i], frac); + break; + } + } + + // Alpha blend + 
const UInt32 numAlphaKeys = m_NumAlphaKeys; + const UInt32 timeAlpha = std::min(std::max((UInt32)m_AlphaTime[0], time), (UInt32)m_AlphaTime[numAlphaKeys-1]); + for (int i = 1; i < numAlphaKeys; i++) + { + const UInt32 currTime = m_AlphaTime[i]; + if(timeAlpha <= currTime) + { + const UInt32 prevTime = m_AlphaTime[i-1]; + const UInt32 frac = InverseLerpWord(prevTime, currTime, timeAlpha) >> 8; // frac is byte + color.a = LerpByte(m_Keys[i-1].a, m_Keys[i].a, frac); + break; + } + } + + return color; +} + +/// Simple class to interpolate between colors. +template<int size> +class GradientDeprecated + { + public: + DEFINE_GET_TYPESTRING (Gradient) + + template<class TransferFunc> + void Transfer (TransferFunc& transfer) { + AssertIf (size > 9); + char name[] = "m_Color[ ]"; + for (int i=0;i<size;i++) + { + name[8] = '0' + i; + transfer.Transfer (m_Colors[i], name); + } + } + + /// Get a color + ColorRGBA32 &operator[] (int i) { AssertIf (i < 0 || i >= size); return m_Colors[i]; } + /// Get a color + const ColorRGBA32 &operator[] (int i) const { AssertIf (i < 0 || i >= size); return m_Colors[i]; } + + /// Get the color value at a given position + /// @param position a position in unnormalized 16.16 bit fixed + ColorRGBA32 GetFixed (UInt32 position) const { + AssertIf ((position >> 16) >= size - 1); + return Lerp (m_Colors[position >> 16], m_Colors[(position >> 16) + 1], (position >> 8) & 255); + } + + private: + /// The array of colors this interpolator works through + ColorRGBA32 m_Colors[size]; +}; + +#endif diff --git a/Runtime/Math/MathTests.cpp b/Runtime/Math/MathTests.cpp new file mode 100644 index 0000000..229406a --- /dev/null +++ b/Runtime/Math/MathTests.cpp @@ -0,0 +1,726 @@ +#include "UnityPrefix.h" +#include "Configuration/UnityConfigure.h" + +#if ENABLE_UNIT_TESTS + +#include "External/UnitTest++/src/UnitTest++.h" +#include "Runtime/Math/FloatConversion.h" +#include "Runtime/Math/Color.h" +#include "Runtime/Geometry/AABB.h" +#include 
"Runtime/Geometry/Plane.h" +#include "Runtime/Utilities/BitUtility.h" +#include "Runtime/Input/TimeManager.h" +#include "Runtime/Math/Random/Random.h" +#include "Runtime/Math/SphericalHarmonics.h" +#if UNITY_WIN && !UNITY_WINRT +#include "External/DirectX/builds/dx9include/d3dx9.h" +#endif +#include <vector> + + +SUITE (MathTests) +{ + +#if UNITY_WIN +#pragma warning( disable : 4723 ) //required for the divide by 0 that's happening in this test. +#endif + +TEST (Math_Nan) +{ + struct + { + inline bool IsNANNew (float A) + { + // A NAN has an exponent of 255 (shifted left 23 positions) and a non-zero mantissa. + int exp = *(int*)&A & 0x7F800000; + int mantissa = *(int*)&A & 0x007FFFFF; + return exp == 0x7F800000 && mantissa != 0; + } + + inline void operator () (bool expect, float A) + { + CHECK_EQUAL (IsNANNew (A), IsNAN (A)); + CHECK_EQUAL (expect, IsNAN (A)); + } + } CheckNAN; + + float f = 0.0F; + float f0 = 0.0F; + + f = f / f0; + + CheckNAN (true, f); + + CheckNAN (true, std::numeric_limits<float>::quiet_NaN ()); + CheckNAN (true, std::numeric_limits<float>::signaling_NaN ()); + CheckNAN (false, 1.0F); + CheckNAN (false, 0.0F); +} + +#if UNITY_WIN +#pragma warning( default : 4723 ) +#endif + +TEST (Math_Matrix) +{ + Matrix4x4f m0, m1, m2, m6; + + for (int i = 0; i < 16; ++i) + { + m0.m_Data[i] = (float)i; + m1.m_Data[15 - i] = (float)i; + } + + MultiplyMatrices4x4REF (&m0, &m1, &m2); + MultiplyMatrices4x4 (&m0, &m1, &m6); + CHECK_EQUAL (0, memcmp (m2.m_Data, m6.m_Data, sizeof(Matrix4x4f))); + + TransposeMatrix4x4REF(&m0, &m2); + TransposeMatrix4x4(&m0, &m6); + CHECK_EQUAL (0, memcmp (m2.m_Data, m6.m_Data, sizeof(Matrix4x4f))); + + Vector3f v (2.0F, 5.0F, 2.0F); + Vector3f res (2.0F, 5.0F, -2.0F); + + Quaternionf q; + Quaternionf backConvertedQ; + Matrix3x3f m3; + Vector3f tempResult; + + q = AxisAngleToQuaternion (Vector3f::yAxis, kPI / 2.0F); + QuaternionToMatrix (q, m3); + + CHECK_EQUAL (true, CompareApproximately (RotateVectorByQuat(q, v), res)); + 
CHECK_EQUAL (true, CompareApproximately (m3.MultiplyPoint3 (v), res)); + + MatrixToQuaternion (m3, backConvertedQ); + CHECK_EQUAL (true, CompareApproximately (backConvertedQ, q)); + + Vector3f axis; + float roll; + + QuaternionToAxisAngle (backConvertedQ, &axis, &roll); + CHECK_EQUAL (true, CompareApproximately (axis, Vector3f::yAxis)); + CHECK_CLOSE (kPI / 2.0F, roll, 0.000001F); + + q = Inverse (q); + m3.Invert (); + MatrixToQuaternion (m3, backConvertedQ); + CHECK_EQUAL (true, CompareApproximately (backConvertedQ, q)); + + tempResult = RotateVectorByQuat (q, res); + CHECK_EQUAL (true, CompareApproximately (tempResult, v)); + tempResult = RotateVectorByQuat (backConvertedQ, res); + CHECK_EQUAL (true, CompareApproximately (tempResult, v)); + tempResult = m3.MultiplyPoint3 (res); + CHECK_EQUAL (true, CompareApproximately (tempResult, v)); +} + + + +TEST (Math_NormalizeFastTest) +{ + Vector3f input[] = { Vector3f (0.0f, 0.1f, 0.0f), Vector3f (0.0f, 0.0f, 0.0f), Vector3f (-0.0f, -0.0f, -0.0f) }; + Vector3f output[] = { Vector3f (0.0f, 1.f, 0.0f), Vector3f (0.0f, 0.0f, 0.0f), Vector3f (-0.0f, -0.0f, -0.0f) }; + + for (int i=0;i<3;i++) + { + Vector3f normalized = NormalizeFast(input[i]); + CHECK (CompareApproximately (output[i] , normalized, 0.0001f)); + } +} + + + +TEST (Math_MatrixQuaternionConversion) +{ + Rand rand (GetTimeSinceStartup ()); + for (int i = 0; i < 500; ++i) + { + Quaternionf rot = RandomQuaternion (rand); + Quaternionf outq, outq2; + Matrix3x3f m, outm; + QuaternionToMatrix (rot, m); + Vector3f angle; + + MatrixToEuler (m, angle); + EulerToMatrix (angle, outm); + outq2 = EulerToQuaternion (angle); + + MatrixToQuaternion (outm, outq); + CHECK (CompareApproximately (m , outm, 0.1f)); + CHECK_CLOSE (1, Abs (Dot (outq, rot)), 0.01f); + CHECK_CLOSE (1, Abs (Dot (outq2, rot)), 0.01f); + } +} + + +TEST (Math_EulerAngles) +{ + struct + { + void operator() (float x, float y, float z) + { + Quaternionf quat = EulerToQuaternion (Vector3f (Deg2Rad (x), Deg2Rad 
(y), Deg2Rad (z))); + Matrix3x3f quatM; + QuaternionToMatrix (quat, quatM); + + Vector3f euler = QuaternionToEuler (quat); + Vector3f eulerDeg (Rad2Deg (euler.x), Rad2Deg (euler.y), Rad2Deg (euler.z)); + Quaternionf newquat = EulerToQuaternion (euler); + + CHECK_CLOSE (Abs (Dot (newquat, quat)), 1, 0.01f); + } + } TestEulerAngles; + + TestEulerAngles ( 90.0f, 45.0f, 0.0f); + TestEulerAngles ( 90.0f, 90.0f, 0.0f); + TestEulerAngles (270.0f, 0.0f, 0.0f); + TestEulerAngles (270.0f, 40.0f, 0.0f); +} + +TEST (Math_EulerAnglesMatchAxisAngle) +{ + Quaternionf quat = AxisAngleToQuaternion(Vector3f::yAxis, Deg2Rad(20.0F)); + Vector3f euler = QuaternionToEuler(quat); + CHECK_EQUAL (true, CompareApproximately (0, euler.x)); + CHECK_EQUAL (true, CompareApproximately (Deg2Rad(20.0F), euler.y)); + CHECK_EQUAL (true, CompareApproximately (0, euler.z)); +} + +// This test fails with the current version of QuaternionToEuler. The angles +// being close to gimbal lock will snap to 90 degree increments. +#if 0 + +TEST (Math_QuaternionToEulerHandlesGimbalLock) +{ + Quaternionf quat = EulerToQuaternion (Vector3f (Deg2Rad (269.5f), 0.f, 0.f)); +// printf( "%f, %f, %f, %f\n", quat.x, quat.y, quat.z, quat.w); + Vector3f euler = QuaternionToEuler (quat); +// printf( "%f, %f, %f\n", Rad2Deg(euler.x), Rad2Deg(euler.y), Rad2Deg(euler.z)); + Quaternionf quat1 = EulerToQuaternion (euler); +// printf( "%f, %f, %f, %f\n", quat1.x, quat1.y, quat1.z, quat1.w); + + CHECK_CLOSE (269.5f, Rad2Deg (euler.x), 0.01f); + CHECK_CLOSE (0.f, euler.y, 0.01f); + CHECK_CLOSE (0.f, euler.z, 0.01f); + + quat = EulerToQuaternion (Vector3f (Deg2Rad (89.5f), 0.f, 0.f)); + euler = QuaternionToEuler (quat); + + CHECK_CLOSE (89.5f, Rad2Deg (euler.x), 0.01f); + CHECK_CLOSE (0.f, euler.y, 0.01f); + CHECK_CLOSE (0.f, euler.z, 0.01f); + + quat = EulerToQuaternion (Vector3f (Deg2Rad (89.0f), 0.f, 0.f)); + euler = QuaternionToEuler (quat); + + CHECK_CLOSE (89.0f, Rad2Deg (euler.x), 0.01f); + CHECK_CLOSE (0.f, euler.y, 0.01f); 
+ CHECK_CLOSE (0.f, euler.z, 0.01f); + + quat = EulerToQuaternion (Vector3f (Deg2Rad (88.5f), 0.f, 0.f)); + euler = QuaternionToEuler (quat); + +// printf( "%f, %f, %f\n", Rad2Deg(euler.x), Rad2Deg(euler.y), Rad2Deg(euler.z)); + + CHECK_CLOSE (88.5f, Rad2Deg (euler.x), 0.01f); + CHECK_CLOSE (0.f, euler.y, 0.01f); + CHECK_CLOSE (0.f, euler.z, 0.01f); +} + +#endif + +TEST (Math_ColorRGBA32Lerp) +{ +#if UNITY_LINUX +#warning Investigate/fix ColorRGBA32 Tests! +#else + ColorRGBA32 c0, c1, res; + + c0 = ColorRGBA32 (100, 150, 255, 0); + c1 = ColorRGBA32 (200, 100, 0, 255); + + res = Lerp (c0, c1, 0); + CHECK (ColorRGBA32 (100, 150, 255, 0) == res); + + res = Lerp (c0, c1, 90); + CHECK (ColorRGBA32 (135, 132,165,89) == res); + + res = Lerp (c0, c1, 200); + CHECK (ColorRGBA32 (178, 110,55,199) == res); + + res = Lerp (c0, c1, 255); + CHECK (ColorRGBA32 (199, 100, 0, 254) == res); +#endif +} + + +TEST (Math_ColorRGBA32Scale) +{ +#if UNITY_LINUX +#warning Investigate/fix ColorRGBA32 Tests! +#else + ColorRGBA32 c0, res; + + c0 = ColorRGBA32 (100, 150, 255, 150); + + res = c0 * 0; + CHECK (ColorRGBA32 (0, 0, 0, 0) == res); + + res = c0 * 20; + CHECK (ColorRGBA32 (8, 12, 20, 12) == res); + + res = c0 * 150; + CHECK (ColorRGBA32 (58, 88, 150, 88) == res); + + res = c0 * 255; + CHECK (ColorRGBA32 (100, 150, 255, 150) == res); +#endif +} + +void TestMultiplyColorRGBA32(const ColorRGBA32 input0, const ColorRGBA32 input1, int tolerance) +{ + ColorRGBA32 expected; + ColorRGBA32 actual; + expected.r = (input0.r * input1.r) / 255; + expected.g = (input0.g * input1.g) / 255; + expected.b = (input0.b * input1.b) / 255; + expected.a = (input0.a * input1.a) / 255; + actual = input0*input1; + + CHECK_CLOSE((int)expected.r, (int)actual.r, tolerance); + CHECK_CLOSE((int)expected.g, (int)actual.g, tolerance); + CHECK_CLOSE((int)expected.b, (int)actual.b, tolerance); + CHECK_CLOSE((int)expected.a, (int)actual.a, tolerance); +} + +TEST (Math_ColorRGBA32Muliply) +{ + for(int i = 0; i < 256; 
i+=4) + { + TestMultiplyColorRGBA32(ColorRGBA32(0,0,0,0), ColorRGBA32(i+0,i+1,i+2,i+3), 0); + TestMultiplyColorRGBA32(ColorRGBA32(i+0,i+1,i+2,i+3), ColorRGBA32(0,0,0,0), 0); + TestMultiplyColorRGBA32(ColorRGBA32(i+0,i+1,i+2,i+3), ColorRGBA32(0xff,0xff,0xff,0xff), 0); + TestMultiplyColorRGBA32(ColorRGBA32(0xff,0xff,0xff,0xff), ColorRGBA32(i+0,i+1,i+2,i+3), 0); + } + + for(int i = 0; i < 256; i+=4) + for(int j = i; j < 256; j+=4) + TestMultiplyColorRGBA32(ColorRGBA32(j+0,j+1,j+2,j+3), ColorRGBA32(i+0,i+1,i+2,i+3), 1); +} + +// Reference Implementation: D3DX; thus only test on Windows +#if UNITY_WIN && !UNITY_WINRT +TEST (Math_SphericalHarmonics) +{ + Rand r; + + for (int i = 0; i < 10000; ++i) + { + float x = r.GetFloat () * 2.0f - 1.0f; + float y = r.GetFloat () * 2.0f - 1.0f; + float z = r.GetFloat () * 2.0f - 1.0f; + float sh[9], d3dxsh[9]; + + SHEvalDirection9 (x, y, z, sh); + D3DXSHEvalDirection (d3dxsh, 3, &D3DXVECTOR3 (x, y, z)); + for (int j = 0; j < 9; ++j) + { + CHECK_CLOSE (sh[j], d3dxsh[j], 0.000001f); + } + + float shR[9], shG[9], shB[9]; + float d3dxshR[9], d3dxshG[9], d3dxshB[9]; + SHEvalDirectionalLight9 (x, y, z, 0.1f, 0.2f, 0.3f, shR, shG, shB); + D3DXSHEvalDirectionalLight (3, &D3DXVECTOR3(x,y,z), 0.1f, 0.2f, 0.3f, d3dxshR, d3dxshG, d3dxshB); + for (int j = 0; j < 9; ++j) + { + CHECK_CLOSE (shR[j], d3dxshR[j], 0.000001f); + CHECK_CLOSE (shG[j], d3dxshG[j], 0.000001f); + CHECK_CLOSE (shB[j], d3dxshB[j], 0.000001f); + } + } +} +#endif + + +void FabsPerformance (); + +TEST (Math_Repeat) +{ + CHECK_EQUAL (15.0F, Repeat (-5.0F, 20.0F)); + CHECK_EQUAL ( 5.0F, Repeat (5.0F, 20.0F)); + CHECK_EQUAL ( 5.0F, Repeat (25.0F, 20.0F)); + CHECK_EQUAL ( 0.0F, Repeat (20.0F, 20.0F)); + CHECK_EQUAL ( 0.0F, Repeat (0.0F, 20.0F)); + CHECK_EQUAL (19.9F, Repeat (-0.1F, 20.0F)); + CHECK_EQUAL (10.0F, Repeat (-10.0F, 20.0F)); + CHECK_CLOSE (0.139999F, Repeat (0.699999F, 0.14F), 1e-5f); + //CHECK (Repeat (0.69999999F, 0.14F) >= 0.0f) // This fails! 
Revisit for the next breaking version. + + // Our Repeat inverts when in negative space + CHECK_CLOSE ( 3.0F, Repeat ( 3.0F, 5.0F), 1e-5f); + CHECK_CLOSE (-2.0F, Repeat ( 3.0F, -5.0F), 1e-5f); + CHECK_CLOSE (-3.0F, Repeat (-3.0F, -5.0F), 1e-5f); + CHECK_CLOSE ( 2.0F, Repeat (-3.0F, 5.0F), 1e-5f); + CHECK_CLOSE ( 0.0F, Repeat ( 0.0F, -1.0F), 1e-5f); + CHECK_CLOSE ( 0.0F, Repeat ( 0.0F, 1.0F), 1e-5f); + + CHECK_CLOSE ( 1.0F, Repeat (-59.0F, 30.0F), 1e-5f); + CHECK_CLOSE ( 0.0F, Repeat (-60.0F, 30.0F), 1e-5f); + CHECK_CLOSE (29.0F, Repeat (-61.0F, 30.0F), 1e-5f); +} + +TEST (Math_DeltaAngleRad) +{ + CHECK_EQUAL (0, DeltaAngleRad (12345.67890F, 12345.67890F)); + + CHECK_EQUAL (kPI, DeltaAngleRad (0, -kPI)); + CHECK_EQUAL (kPI, DeltaAngleRad (0, kPI)); + CHECK_EQUAL (kPI, DeltaAngleRad (kPI, 0)); + + CHECK_EQUAL (0, DeltaAngleRad (1.0F, 1.0F+2*kPI)); + CHECK_EQUAL (0, DeltaAngleRad (1.0F+2*kPI, 1.0F)); + + CHECK_CLOSE ( kPI/2, DeltaAngleRad (0, 5*kPI/2), 1e-5f); + CHECK_CLOSE (-kPI/2, DeltaAngleRad (0, 7*kPI/2), 1e-5f); +} + +/* +TEST (Math_RoundFunctions) +{ + struct + { + void operator() (float t, int floor, int ceil, int round) + { + CHECK_EQUAL (floor, std::floor(t)); + CHECK_EQUAL (ceil, std::ceil(t)); + + CHECK_EQUAL (round, RoundfToInt(t)); + CHECK_EQUAL (round, Roundf(t)); + + CHECK_EQUAL (floor, Floorf(t)); + CHECK_EQUAL (floor, FloorfToInt(t)); + + CHECK_EQUAL (ceil, Ceilf(t)); + CHECK_EQUAL (ceil, CeilfToInt(t)); + + if (t >= 0.0F) + { + CHECK_EQUAL (RoundfToIntPos(t), round); + CHECK_EQUAL (FloorfToIntPos(t), floor); + CHECK_EQUAL (CeilfToIntPos(t), ceil); + } + } + } TestRoundFunctions; + + CHECK_EQUAL (64, NextPowerOfTwo (33)); + CHECK_EQUAL (32, NextPowerOfTwo (32)); + CHECK_EQUAL (32, NextPowerOfTwo (31)); + + TestRoundFunctions (15.1F, 15, 16, 15); + TestRoundFunctions (0.9F, 0, 1, 1); + + TestRoundFunctions (1.0F, 1, 1, 1); + TestRoundFunctions (2.0F, 2, 2, 2); + + TestRoundFunctions (5.9F, 5, 6, 6); + TestRoundFunctions (7.1F, 7, 8, 7); + 
TestRoundFunctions (7.6F, 7, 8, 8); + TestRoundFunctions (0.49F, 0, 1, 0); + TestRoundFunctions (120000.51F, 120000, 120001, 120001); + + TestRoundFunctions (-19.7F, -20, -19, -20); + TestRoundFunctions (-16.01F, -17, -16, -16); + TestRoundFunctions (-25.0F, -25, -25, -25); + TestRoundFunctions (-25.501F, -26, -25, -26); + TestRoundFunctions (-5.9F, -6, -5, -6); + TestRoundFunctions (-7.1F, -8, -7, -7); + TestRoundFunctions (-7.6F, -8, -7, -8); + TestRoundFunctions (-0.1F, -1, 0, 0); + TestRoundFunctions (-0.0000011F, -1, 0, 0); + TestRoundFunctions (-0.25F, -1, 0, 0); + TestRoundFunctions (-0.49F, -1, 0, 0); + TestRoundFunctions (-0.51F, -1, 0, -1); + TestRoundFunctions (-0.6F, -1, 0, -1); + TestRoundFunctions (-1.0F, -1, -1, -1); + TestRoundFunctions (-2.0F, -2, -2, -2); + TestRoundFunctions (-1.01F, -2, -1, -1); + TestRoundFunctions (-100000.49F, -100001, -100000, -100000); + + CHECK_EQUAL (1, RoundfToInt(0.5F)); + CHECK_EQUAL (2, RoundfToInt(1.5F)); + CHECK_EQUAL (0, RoundfToInt(-0.5F)); + CHECK_EQUAL (-1, RoundfToInt(-1.5F)); + + // Rounding up or down, doesn't have to match floor / ceil function. Pick fastest + //ErrorIf (TestFloor (120000.51F, 120000, 120000, 120000)); + + CHECK_EQUAL (15, FloorfToIntPos (15.1F)); + CHECK_EQUAL (0, FloorfToIntPos (0.9F)); + + CHECK_EQUAL (16, CeilfToIntPos (15.1F)); + CHECK_EQUAL (1, CeilfToIntPos (0.9F)); + + CHECK_EQUAL (15, RoundfToIntPos (15.1F)); + CHECK_EQUAL (1, RoundfToIntPos (0.9F)); +} +*/ + +TEST (Math_TransformPoints) +{ + Vector3f v (1, 0, 0); + Matrix4x4f tr; + tr.SetTR (Vector3f(10,0,0), AxisAngleToQuaternion (Vector3f::zAxis, Deg2Rad (90))); + + //Must ignore the translation, and work when input and output are the same. 
+ TransformPoints3x3 (tr, &v, &v, 1); + + CHECK (CompareApproximately (v, Vector3f(0, 1, 0))); +} + + +TEST (TypeSizes) +{ + CHECK_EQUAL (4, sizeof(SInt32)); + CHECK_EQUAL (4, sizeof(UInt32)); + + CHECK_EQUAL (2, sizeof(SInt16)); + CHECK_EQUAL (2, sizeof(UInt16)); + + CHECK_EQUAL (1, sizeof(UInt8)); + CHECK_EQUAL (1, sizeof(SInt8)); + + CHECK_EQUAL (8, sizeof(UInt64)); + CHECK_EQUAL (8, sizeof(SInt64)); +} + + +TEST (Math_QuaternionMatrixEquivalence) +{ + Matrix3x3f m; + EulerToMatrix (Vector3f (Deg2Rad (15.0F), Deg2Rad (20.0F), Deg2Rad (64.0F)), m); + + Quaternionf q; + MatrixToQuaternion (m, q); + + Vector3f v (25.3F, 27.14F, 34.2F); + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + Vector3f quatRes = RotateVectorByQuat (q, v); + + CHECK (CompareApproximately (matrixRes, quatRes)); + } + + { + Matrix3x3f m2 (m); + m2.Scale (Vector3f (1.0F, 1.0F, -1.0F)); + + CHECK (CompareApproximately (m2.GetDeterminant (), -1.0F)); + } + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + Vector3f quatRes = RotateVectorByQuat (q, Vector3f (v.x, v.y, v.z)); + + CHECK (CompareApproximately (matrixRes, quatRes)); + } + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + + Quaternionf modQ = q; + modQ.x *= -1.0F; + modQ.y *= -1.0F; + modQ.z *= -1.0F; + modQ.w *= -1.0F; + Vector3f quatRes = RotateVectorByQuat(modQ, v); + + CHECK (CompareApproximately (matrixRes, quatRes)); + } + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + + Quaternionf modQ = q; + modQ.x *= 1.0F; + modQ.y *= -1.0F; + modQ.z *= -1.0F; + modQ.w *= -1.0F; + + Vector3f quatRes = RotateVectorByQuat(modQ, v); + + CHECK (!CompareApproximately (matrixRes, quatRes)); + } + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + + Quaternionf modQ = q; + modQ.x *= 1.0F; + modQ.y *= 1.0F; + modQ.z *= -1.0F; + modQ.w *= -1.0F; + + Vector3f quatRes = RotateVectorByQuat(modQ, v); + + CHECK (!CompareApproximately (matrixRes, quatRes)); + } + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + + Quaternionf modQ = q; + modQ.x *= 
1.0F; + modQ.y *= 1.0F; + modQ.z *= 1.0F; + modQ.w *= -1.0F; + + Vector3f quatRes = RotateVectorByQuat(modQ, v); + + CHECK (!CompareApproximately (matrixRes, quatRes)); + } + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + + Quaternionf modQ = q; + modQ.x *= -1.0F; + modQ.y *= -1.0F; + modQ.z *= 1.0F; + modQ.w *= -1.0F; + + Vector3f quatRes = RotateVectorByQuat(modQ, v); + + CHECK (!CompareApproximately (matrixRes, quatRes)); + } + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + + Quaternionf modQ = q; + modQ.x *= -1.0F; + modQ.y *= -1.0F; + modQ.z *= 1.0F; + modQ.w *= 1.0F; + + Vector3f quatRes = RotateVectorByQuat(modQ, v); + + CHECK (!CompareApproximately (matrixRes, quatRes)); + } + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + + Quaternionf modQ = q; + modQ.x *= 1.0F; + modQ.y *= 1.0F; + modQ.z *= -1.0F; + modQ.w *= 1.0F; + + Vector3f quatRes = RotateVectorByQuat(modQ, v); + + CHECK (!CompareApproximately (matrixRes, quatRes)); + } + + { + Vector3f matrixRes (m.MultiplyPoint3 (v)); + + Quaternionf modQ = q; + modQ.x *= -1.0F; + modQ.y *= 1.0F; + modQ.z *= -1.0F; + modQ.w *= 1.0F; + + Vector3f quatRes = RotateVectorByQuat(modQ, v); + + CHECK (!CompareApproximately (matrixRes, quatRes)); + } +} + + +TEST (Math_ColorMisc) +{ + { + ColorRGBA32 c0 (100, 150, 100, 0); + ColorRGBA32 c1 (200, 100, 0, 200); + ColorRGBA32 res = c0 + c1; + CHECK (res == ColorRGBA32 (255, 250, 100, 200)); + } + + { + ColorRGBA32 res = ColorRGBA32 (150, 150, 150, 150) + ColorRGBA32 (150, 150, 150, 150); + CHECK (res == ColorRGBA32 (255, 255, 255, 255)); + } +} + + +TEST (Math_TransformAABB) +{ + Matrix4x4f m; + + for (int i = 0; i < 16; ++i) + m.m_Data[i] = (float)(7-i); + + AABB aabb(Vector3f(1,2,3), Vector3f(4,5,6)); + + AABB aabbSlow; + TransformAABBSlow(aabb, m, aabbSlow); + + AABB aabbRef; + TransformAABB(aabb, m, aabbRef); + + CHECK (CompareApproximately (aabbSlow.m_Center, aabbRef.m_Center)); + CHECK (CompareApproximately (aabbSlow.m_Extent, aabbRef.m_Extent)); +} + +TEST 
(Math_BitsInMask) +{ + CHECK_EQUAL (0, BitsInMask(0x0)); + CHECK_EQUAL (32, BitsInMask(0xFFFFFFFF)); + CHECK_EQUAL (1, BitsInMask(0x1)); + CHECK_EQUAL (1, BitsInMask(0x80000000)); + CHECK_EQUAL (2, BitsInMask(0x5)); + CHECK_EQUAL (3, BitsInMask(0x7)); + CHECK_EQUAL (24, BitsInMask(0xDEADBEEF)); + CHECK_EQUAL (19, BitsInMask(0xCAFE1337)); +} + +TEST (Math_BitsInMask64) +{ + CHECK_EQUAL (0, BitsInMask64(0x0000000000000000ULL)); + CHECK_EQUAL (64, BitsInMask64(0xFFFFFFFFFFFFFFFFULL)); + CHECK_EQUAL (1, BitsInMask64(0x0000000000000001ULL)); + CHECK_EQUAL (2, BitsInMask64(0x8000000080000000ULL)); + CHECK_EQUAL (2, BitsInMask64(0x0000000000000005ULL)); + CHECK_EQUAL (3, BitsInMask64(0x0000000000000007ULL)); + CHECK_EQUAL (24, BitsInMask64(0x00000000DEADBEEFULL)); + CHECK_EQUAL (19, BitsInMask64(0x00000000CAFE1337ULL)); + CHECK_EQUAL (43, BitsInMask64(0xCAFE1337DEADBEEFULL)); +} + +TEST (Math_Normalize) +{ + Plane p; + p.SetABCD(0,0,0,1); + p.NormalizeRobust(); + Vector3f n = p.GetNormal(); + CHECK (IsNormalized(n)); + + p.SetABCD(2.5e-5f, 3.1e-5f, 1.2e-5f, 1.f); + p.NormalizeRobust(); + n = p.GetNormal(); + CHECK (IsNormalized(n)); + + Vector3f normal(2.3e-5f, 2.1e-5f, 3.2e-5f); + float invOriginalLength; + normal = NormalizeRobust(normal, invOriginalLength); + CHECK (CompareApproximately (22394.295f, invOriginalLength)); +} + +} + + +#endif diff --git a/Runtime/Math/Matrix3x3.cpp b/Runtime/Math/Matrix3x3.cpp new file mode 100644 index 0000000..ac92b70 --- /dev/null +++ b/Runtime/Math/Matrix3x3.cpp @@ -0,0 +1,596 @@ +#include "UnityPrefix.h" +#include "Matrix3x3.h" +#include "Matrix4x4.h" +using namespace std; + +namespace +{ + Matrix3x3f CreateIdentityMatrix3x3f () + { + Matrix3x3f temp; + temp.SetIdentity (); + return temp; + } + + Matrix3x3f CreateZeroMatrix3x3f () + { + Matrix3x3f temp; + temp.SetZero (); + return temp; + } +} + +const Matrix3x3f Matrix3x3f::identity = CreateIdentityMatrix3x3f (); +const Matrix3x3f Matrix3x3f::zero = CreateZeroMatrix3x3f (); + +void 
GetRotMatrixNormVec (float* out, const float* inVec, float radians);

// Copies the 3x3 rotation/scale part of a 4x4 matrix: m_Data elements
// 0-2, 4-6 and 8-10; the 4x4's translation/projection parts are dropped.
Matrix3x3f& Matrix3x3f::operator = (const Matrix4x4f& other)
{
	m_Data[0] = other.m_Data[0];
	m_Data[1] = other.m_Data[1];
	m_Data[2] = other.m_Data[2];

	m_Data[3] = other.m_Data[4];
	m_Data[4] = other.m_Data[5];
	m_Data[5] = other.m_Data[6];

	m_Data[6] = other.m_Data[8];
	m_Data[7] = other.m_Data[9];
	m_Data[8] = other.m_Data[10];
	return *this;
}

// Same extraction as operator= above: construct from the 3x3 part of a 4x4.
Matrix3x3f::Matrix3x3f (const Matrix4x4f& other)
{
	m_Data[0] = other.m_Data[0];
	m_Data[1] = other.m_Data[1];
	m_Data[2] = other.m_Data[2];

	m_Data[3] = other.m_Data[4];
	m_Data[4] = other.m_Data[5];
	m_Data[5] = other.m_Data[6];

	m_Data[6] = other.m_Data[8];
	m_Data[7] = other.m_Data[9];
	m_Data[8] = other.m_Data[10];
}

Matrix3x3f& Matrix3x3f::SetIdentity ()
{
	Get (0, 0) = 1.0F;	Get (0, 1) = 0.0F;	Get (0, 2) = 0.0F;
	Get (1, 0) = 0.0F;	Get (1, 1) = 1.0F;	Get (1, 2) = 0.0F;
	Get (2, 0) = 0.0F;	Get (2, 1) = 0.0F;	Get (2, 2) = 1.0F;
	return *this;
}

Matrix3x3f& Matrix3x3f::SetZero ()
{
	Get (0, 0) = 0.0F;	Get (0, 1) = 0.0F;	Get (0, 2) = 0.0F;
	Get (1, 0) = 0.0F;	Get (1, 1) = 0.0F;	Get (1, 2) = 0.0F;
	Get (2, 0) = 0.0F;	Get (2, 1) = 0.0F;	Get (2, 2) = 0.0F;
	return *this;
}

// Columns of the matrix become the given basis vectors (inX, inY, inZ).
Matrix3x3f& Matrix3x3f::SetOrthoNormalBasis (const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ)
{
	Get (0, 0) = inX[0];	Get (0, 1) = inY[0];	Get (0, 2) = inZ[0];
	Get (1, 0) = inX[1];	Get (1, 1) = inY[1];	Get (1, 2) = inZ[1];
	Get (2, 0) = inX[2];	Get (2, 1) = inY[2];	Get (2, 2) = inZ[2];
	return *this;
}

// Rows of the matrix become the given basis vectors, i.e. the transpose of
// SetOrthoNormalBasis — which is the inverse for an orthonormal basis.
Matrix3x3f& Matrix3x3f::SetOrthoNormalBasisInverse (const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ)
{
	Get (0, 0) = inX[0];	Get (1, 0) = inY[0];	Get (2, 0) = inZ[0];
	Get (0, 1) = inX[1];	Get (1, 1) = inY[1];	Get (2, 1) = inZ[1];
	Get (0, 2) = inX[2];	Get (1, 2) = inY[2];	Get (2, 2) = inZ[2];
	return *this;
}

// Sets this matrix to a pure (diagonal) scale.
Matrix3x3f& Matrix3x3f::SetScale (const Vector3f& inScale)
{
	Get (0, 0) = inScale[0];	Get (0, 1) = 0.0F;			Get (0, 2) = 0.0F;
	Get (1, 0) = 0.0F;			Get (1, 1) = inScale[1];	Get (1, 2) = 0.0F;
	Get (2, 0) = 0.0F;			Get (2, 1) = 0.0F;			Get (2, 2) = inScale[2];
	return *this;
}

// True when every element is within 'threshold' of the identity matrix.
bool Matrix3x3f::IsIdentity (float threshold) {
	if (CompareApproximately (Get (0,0),1.0f, threshold) && CompareApproximately (Get (0,1),0.0f, threshold) && CompareApproximately (Get (0,2),0.0f, threshold) &&
		CompareApproximately (Get (1,0),0.0f, threshold) && CompareApproximately (Get (1,1),1.0f, threshold) && CompareApproximately (Get (1,2),0.0f, threshold) &&
		CompareApproximately (Get (2,0),0.0f, threshold) && CompareApproximately (Get (2,1),0.0f, threshold) && CompareApproximately (Get (2,2),1.0f, threshold))
		return true;
	return false;
}


// Scales column k by inScale[k], i.e. post-multiplies by a scale matrix.
Matrix3x3f& Matrix3x3f::Scale (const Vector3f& inScale)
{
	Get (0, 0) *= inScale[0];
	Get (1, 0) *= inScale[0];
	Get (2, 0) *= inScale[0];

	Get (0, 1) *= inScale[1];
	Get (1, 1) *= inScale[1];
	Get (2, 1) *= inScale[1];

	Get (0, 2) *= inScale[2];
	Get (1, 2) *= inScale[2];
	Get (2, 2) *= inScale[2];
	return *this;
}

// Determinant by cofactor expansion (rule of Sarrus).
float Matrix3x3f::GetDeterminant () const
{
	float fCofactor0 = Get (0, 0) * Get (1, 1) * Get (2, 2);
	float fCofactor1 = Get (0, 1) * Get (1, 2) * Get (2, 0);
	float fCofactor2 = Get (0, 2) * Get (1, 0) * Get (2, 1);

	float fCofactor3 = Get (0, 2) * Get (1, 1) * Get (2, 0);
	float fCofactor4 = Get (0, 1) * Get (1, 0) * Get (2, 2);
	float fCofactor5 = Get (0, 0) * Get (1, 2) * Get (2, 1);

	return fCofactor0 + fCofactor1 + fCofactor2 - fCofactor3 - fCofactor4 - fCofactor5;
}

// In-place transpose: swap the three off-diagonal pairs.
Matrix3x3f& Matrix3x3f::Transpose ()
{
	swap (Get (0, 1), Get (1, 0));
	swap (Get (0, 2), Get (2, 0));
	swap (Get (2, 1), Get (1, 2));
	return *this;
}
/*
Matrix3x3f& Matrix3x3f::Transpose (const Matrix3x3f& inMat)
{
	int i;
	for (i=0;i<3;i++)
	{
		Get (i, 0) = inMat.Get (0, i);
		Get (i, 1) = inMat.Get (1, i);
		Get (i, 2) = inMat.Get (2, i);
	}
	return *this;
}
*/

// Inverts in place by round-tripping through a full 4x4 inverse (numerically
// robust); returns false and leaves *this = (inverted attempt) on failure.
bool Matrix3x3f::Invert ()
{
	///@TODO make a fast but robust inverse matrix 3x3
	Matrix4x4f m = *this;
	bool success = InvertMatrix4x4_Full( m.GetPtr(), m.GetPtr() );
	*this = m;
	return success;

#if 0
	////// THIS METHOD IS NUMERICALLY LESS ROBUST
	// Invert a 3x3 using cofactors.  This is faster than using a generic
	// Gaussian elimination because of the loop overhead of such a method.

	Matrix3x3f kInverse;

	kInverse.Get (0, 0) = Get (1, 1) * Get (2, 2) - Get (1, 2) * Get (2, 1);
	kInverse.Get (0, 1) = Get (0, 2) * Get (2, 1) - Get (0, 1) * Get (2, 2);
	kInverse.Get (0, 2) = Get (0, 1) * Get (1, 2) - Get (0, 2) * Get (1, 1);
	kInverse.Get (1, 0) = Get (1, 2) * Get (2, 0) - Get (1, 0) * Get (2, 2);
	kInverse.Get (1, 1) = Get (0, 0) * Get (2, 2) - Get (0, 2) * Get (2, 0);
	kInverse.Get (1, 2) = Get (0, 2) * Get (1, 0) - Get (0, 0) * Get (1, 2);
	kInverse.Get (2, 0) = Get (1, 0) * Get (2, 1) - Get (1, 1) * Get (2, 0);
	kInverse.Get (2, 1) = Get (0, 1) * Get (2, 0) - Get (0, 0) * Get (2, 1);
	kInverse.Get (2, 2) = Get (0, 0) * Get (1, 1) - Get (0, 1) * Get (1, 0);

	float fDet = Get (0, 0) * kInverse.Get (0, 0) + Get (0, 1) * kInverse.Get (1, 0) + Get (0, 2) * kInverse.Get (2, 0);

	if (Abs (fDet) > Vector3f::epsilon)
	{
		kInverse /= fDet;
		*this = kInverse;
		return true;
	}
	else
	{
		SetZero ();
		return false;
	}
	#endif
}

// Inverse-transpose, e.g. for transforming normals.
void Matrix3x3f::InvertTranspose ()
{
	Invert ();
	Transpose ();
}

Matrix3x3f& Matrix3x3f::operator *= (float f)
{
	for (int i=0;i<9;i++)
		m_Data[i] *= f;
	return *this;
}

// In-place matrix product: *this = *this * inM, computed one row at a time.
Matrix3x3f& Matrix3x3f::operator *= (const Matrix3x3f& inM)
{
	int i;
	for (i=0;i<3;i++)
	{
		// Copy row i first, since it is overwritten while still being read.
		float v[3] = {Get (i, 0), Get (i, 1), Get (i, 2)};
		Get (i, 0) = v[0] * inM.Get (0, 0) + v[1] * inM.Get (1, 0) + v[2] * inM.Get (2, 0);
		Get (i, 1) = v[0] * inM.Get (0, 1) + v[1] * inM.Get (1, 1) + v[2] * inM.Get (2, 1);
		Get (i, 2) = v[0] * inM.Get (0, 2) + v[1] * inM.Get (1, 2) + v[2] * inM.Get (2, 2);
	}
	return *this;
}

// In-place product with the upper-left 3x3 of a 4x4 matrix; the 4x4's
// translation column is ignored.
Matrix3x3f& Matrix3x3f::operator *= (const Matrix4x4f& inM)
{
	int i;
	for (i=0;i<3;i++)
	{
		// Copy row i first, since it is overwritten while still being read.
		float v[3] = {Get (i, 0), Get (i, 1), Get (i, 2)};
		Get (i, 0) = v[0] * inM.Get (0, 0) + v[1] * inM.Get (1, 0) + v[2] * inM.Get (2, 0);
		Get (i, 1) = v[0] * inM.Get (0, 1) + v[1] * inM.Get (1, 1) + v[2] * inM.Get (2, 1);
		Get (i, 2) = v[0] * inM.Get (0, 2) + v[1] * inM.Get (1, 2) + v[2] * inM.Get (2, 2);
	}
	return *this;
}

// Sets this matrix to a rotation of 'radians' around 'rotationAxis'.
// NOTE(review): the helper's name suggests it expects a *normalized*
// axis — confirm at call sites.
Matrix3x3f& Matrix3x3f::SetAxisAngle (const Vector3f& rotationAxis, float radians)
{
	GetRotMatrixNormVec (m_Data, rotationAxis.GetPtr (), radians);
	return *this;
}

void fromToRotation(const float from[3], const float to[3],float mtx[3][3]);

// Sets this matrix to the rotation mapping direction 'from' onto 'to';
// fromToRotation (defined below) requires both to be normalized.
Matrix3x3f& Matrix3x3f::SetFromToRotation (const Vector3f& from, const Vector3f& to)
{
	float mtx[3][3];
	fromToRotation (from.GetPtr (), to.GetPtr (), mtx);
	Get (0, 0) = mtx[0][0];	Get (0, 1) = mtx[0][1];	Get (0, 2) = mtx[0][2];
	Get (1, 0) = mtx[1][0];	Get (1, 1) = mtx[1][1];	Get (1, 2) = mtx[1][2];
	Get (2, 0) = mtx[2][0];	Get (2, 1) = mtx[2][1];	Get (2, 2) = mtx[2][2];
	return *this;
}

// Wraps each euler component into roughly [0, 2*pi); values within the small
// tolerance of 0 or 2*pi are left untouched to avoid needless flipping.
inline void MakePositive (Vector3f& euler)
{
	const float negativeFlip = -0.0001F;
	const float positiveFlip = (kPI * 2.0F) - 0.0001F;

	if (euler.x < negativeFlip)
		euler.x += 2.0 * kPI;
	else if (euler.x > positiveFlip)
		euler.x -= 2.0 * kPI;

	if (euler.y < negativeFlip)
		euler.y += 2.0 * kPI;
	else if (euler.y > positiveFlip)
		euler.y -= 2.0 * kPI;

	if (euler.z < negativeFlip)
		euler.z += 2.0 * kPI;
	else if (euler.z > positiveFlip)
		euler.z -= 2.0 * kPI;
}

// Canonicalizes euler angles after a matrix decomposition. Currently just
// MakePositive; the commented-out code explored picking the "smaller" of the
// two equivalent euler representations.
inline void SanitizeEuler (Vector3f& euler)
{
	MakePositive (euler);
	/*
	Vector3f option0 = euler;
	option0.x = kPI - option0.x;
	option0.y = kPI - option0.y;
	option0.z = kPI - option0.z;

	MakePositive (euler);
	MakePositive (option0);
	if (option0.x+option0.y+option0.z < euler.x+euler.y+euler.z)
		euler = option0;
	*/
}

// Builds a rotation matrix from euler angles v (radians). Inverse of
// MatrixToEuler below, which documents the YXZ rotation order.
void EulerToMatrix (const Vector3f& v, Matrix3x3f& matrix)
{
	float
cx = cos (v.x);
	float sx = sin (v.x);
	float cy = cos (v.y);
	float sy = sin (v.y);
	float cz = cos (v.z);
	float sz = sin (v.z);

	matrix.Get(0,0) = cy*cz + sx*sy*sz;
	matrix.Get(0,1) = cz*sx*sy - cy*sz;
	matrix.Get(0,2) = cx*sy;

	matrix.Get(1,0) = cx*sz;
	matrix.Get(1,1) = cx*cz;
	matrix.Get(1,2) = -sx;

	matrix.Get(2,0) = -cz*sy + cy*sx*sz;
	matrix.Get(2,1) = cy*cz*sx + sy*sz;
	matrix.Get(2,2) = cx*cy;
}

/// This is YXZ euler conversion.
/// Returns true when the decomposition is unique; near gimbal lock
/// (|m(1,2)| ~ 1) z is forced to 0 and false is returned.
bool MatrixToEuler (const Matrix3x3f& matrix, Vector3f& v)
{
	// from http://www.geometrictools.com/Documentation/EulerAngles.pdf
	// YXZ order
	if ( matrix.Get(1,2) < 0.999F ) // some fudge for imprecision
	{
		if ( matrix.Get(1,2) > -0.999F ) // some fudge for imprecision
		{
			v.x = asin(-matrix.Get(1,2));
			v.y = atan2(matrix.Get(0,2), matrix.Get(2,2));
			v.z = atan2(matrix.Get(1,0), matrix.Get(1,1));
			SanitizeEuler (v);
			return true;
		}
		else
		{
			// WARNING.  Not unique.  YA - ZA = atan2(r01,r00)
			v.x = kPI * 0.5F;
			v.y = atan2(matrix.Get (0,1), matrix.Get(0,0));
			v.z = 0.0F;
			SanitizeEuler (v);

			return false;
		}
	}
	else
	{
		// WARNING.  Not unique.  YA + ZA = atan2(-r01,r00)
		v.x = -kPI * 0.5F;
		v.y = atan2(-matrix.Get(0,1),matrix.Get(0,0));
		v.z = 0.0F;
		SanitizeEuler (v);
		return false;
	}
}


#define EPSILON 0.000001

#define CROSS(dest,v1,v2){ \
	dest[0]=v1[1]*v2[2]-v1[2]*v2[1]; \
	dest[1]=v1[2]*v2[0]-v1[0]*v2[2]; \
	dest[2]=v1[0]*v2[1]-v1[1]*v2[0];}

#define DOT(v1,v2) (v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2])

#define SUB(dest,v1,v2){ \
	dest[0]=v1[0]-v2[0]; \
	dest[1]=v1[1]-v2[1]; \
	dest[2]=v1[2]-v2[2];}

/*
 * A function for creating a rotation matrix that rotates a vector called
 * "from" into another vector called "to".
 * Input : from[3], to[3] which both must be *normalized* non-zero vectors
 * Output: mtx[3][3] -- a 3x3 matrix in column-major form
 * Author: Tomas Möller, 1999
 */
void fromToRotation(const float* from, const float* to,float mtx[3][3])
{
	float v[3];
	float e,h;
	CROSS(v,from,to);
	e=DOT(from,to);	// cosine of the angle between "from" and "to"
	if(e>1.0-EPSILON)	/* "from" almost or equal to "to"-vector? */
	{
		/* return identity */
		mtx[0][0]=1.0; mtx[0][1]=0.0; mtx[0][2]=0.0;
		mtx[1][0]=0.0; mtx[1][1]=1.0; mtx[1][2]=0.0;
		mtx[2][0]=0.0; mtx[2][1]=0.0; mtx[2][2]=1.0;
	}
	else if(e<-1.0+EPSILON)	/* "from" almost or equal to negated "to"? */
	{
		/* 180-degree case: the cross product degenerates, so build an
		   explicit basis perpendicular to "from" instead. */
		float up[3],left[3];
		float invlen;
		float fxx,fyy,fzz,fxy,fxz,fyz;
		float uxx,uyy,uzz,uxy,uxz,uyz;
		float lxx,lyy,lzz,lxy,lxz,lyz;
		/* left=CROSS(from, (1,0,0)) */
		left[0]=0.0; left[1]=from[2]; left[2]=-from[1];
		if(DOT(left,left)<EPSILON) /* was left=CROSS(from,(1,0,0)) a good choice? */
		{
			/* here we know that left = CROSS(from, (0,1,0)) will be a good choice */
			left[0]=-from[2]; left[1]=0.0; left[2]=from[0];
		}
		/* normalize "left" */
		invlen=1.0f/sqrt(DOT(left,left));
		left[0]*=invlen;
		left[1]*=invlen;
		left[2]*=invlen;
		CROSS(up,left,from);
		/* now we have a coordinate system, i.e., a basis;    */
		/* M=(from, up, left), and we want to rotate to:      */
		/* N=(-from, up, -left). This is done with the matrix:*/
		/* N*M^T where M^T is the transpose of M              */
		fxx=-from[0]*from[0]; fyy=-from[1]*from[1]; fzz=-from[2]*from[2];
		fxy=-from[0]*from[1]; fxz=-from[0]*from[2]; fyz=-from[1]*from[2];

		uxx=up[0]*up[0]; uyy=up[1]*up[1]; uzz=up[2]*up[2];
		uxy=up[0]*up[1]; uxz=up[0]*up[2]; uyz=up[1]*up[2];

		lxx=-left[0]*left[0]; lyy=-left[1]*left[1]; lzz=-left[2]*left[2];
		lxy=-left[0]*left[1]; lxz=-left[0]*left[2]; lyz=-left[1]*left[2];
		/* symmetric matrix */
		mtx[0][0]=fxx+uxx+lxx; mtx[0][1]=fxy+uxy+lxy; mtx[0][2]=fxz+uxz+lxz;
		mtx[1][0]=mtx[0][1];   mtx[1][1]=fyy+uyy+lyy; mtx[1][2]=fyz+uyz+lyz;
		mtx[2][0]=mtx[0][2];   mtx[2][1]=mtx[1][2];   mtx[2][2]=fzz+uzz+lzz;
	}
	else  /* the most common case, unless "from"="to", or "from"=-"to" */
	{
		/* ...otherwise use this hand optimized version (9 mults less) */
		float hvx,hvz,hvxy,hvxz,hvyz;
		h=(1.0f-e)/DOT(v,v);
		hvx=h*v[0];
		hvz=h*v[2];
		hvxy=hvx*v[1];
		hvxz=hvx*v[2];
		hvyz=hvz*v[1];
		mtx[0][0]=e+hvx*v[0]; mtx[0][1]=hvxy-v[2];     mtx[0][2]=hvxz+v[1];
		mtx[1][0]=hvxy+v[2];  mtx[1][1]=e+h*v[1]*v[1]; mtx[1][2]=hvyz-v[0];
		mtx[2][0]=hvxz-v[1];  mtx[2][1]=hvyz+v[0];     mtx[2][2]=e+hvz*v[2];
	}
}

// Right handed. Builds an orthonormal basis whose third column is the
// normalized viewVec and whose second column lies in the viewVec/upVec
// plane. Returns false on degenerate input: a ~zero viewVec or an upVec
// ~parallel to viewVec (both set *m to identity), or a non-orthonormal
// result (*m is left untouched in that case).
bool LookRotationToMatrix (const Vector3f& viewVec, const Vector3f& upVec, Matrix3x3f* m)
{
	Vector3f z = viewVec;
	// compute u0
	float mag = Magnitude (z);
	if (mag < Vector3f::epsilon)
	{
		m->SetIdentity();
		return false;
	}
	z /= mag;

	Vector3f x = Cross (upVec, z);
	mag = Magnitude (x);
	if (mag < Vector3f::epsilon)
	{
		m->SetIdentity();
		return false;
	}
	x /= mag;

	Vector3f y (Cross (z, x));
	if (!CompareApproximately (SqrMagnitude (y), 1.0F))
		return false;

	m->SetOrthoNormalBasis (x, y, z);
	return true;
}
/*
//Left handed
bool LookRotationToMatrixLeftHanded (const Vector3f& viewVec, const Vector3f& upVec, Matrix3x3f* m)
{
	Vector3f z = viewVec;
	// compute u0
	float mag = Magnitude (z);
	if (mag < Vector3f::epsilon)
		return false;
	z /=
mag; + + Vector3f x = Cross (z, upVec); + mag = Magnitude (x); + if (mag < Vector3f::epsilon) + return false; + x /= mag; + + Vector3f y (Cross (x, z)); + if (!CompareApproximately (SqrMagnitude (y), 1.0F)) + return false; + + m->SetOrthoNormalBasis (x, y, z); + return true; +} +*/ + +void GetRotMatrixNormVec (float* out, const float* inVec, float radians) +{ + /* This function contributed by Erich Boleyn (erich@uruk.org) */ + /* This function used from the Mesa OpenGL code (matrix.c) */ + float s, c; + float vx, vy, vz, xx, yy, zz, xy, yz, zx, xs, ys, zs, one_c; + + s = sin (radians); + c = cos (radians); + + vx = inVec[0]; + vy = inVec[1]; + vz = inVec[2]; + +#define M(row,col) out[row*3 + col] + /* + * Arbitrary axis rotation matrix. + * + * This is composed of 5 matrices, Rz, Ry, T, Ry', Rz', multiplied + * like so: Rz * Ry * T * Ry' * Rz'. T is the final rotation + * (which is about the X-axis), and the two composite transforms + * Ry' * Rz' and Rz * Ry are (respectively) the rotations necessary + * from the arbitrary axis to the X-axis then back. They are + * all elementary rotations. + * + * Rz' is a rotation about the Z-axis, to bring the axis vector + * into the x-z plane. Then Ry' is applied, rotating about the + * Y-axis to bring the axis vector parallel with the X-axis. The + * rotation about the X-axis is then performed. Ry and Rz are + * simply the respective inverse transforms to bring the arbitrary + * axis back to its original orientation. The first transforms + * Rz' and Ry' are considered inverses, since the data from the + * arbitrary axis gives you info on how to get to it, not how + * to get away from it, and an inverse must be applied. 
+ * + * The basic calculation used is to recognize that the arbitrary + * axis vector (x, y, z), since it is of unit length, actually + * represents the sines and cosines of the angles to rotate the + * X-axis to the same orientation, with theta being the angle about + * Z and phi the angle about Y (in the order described above) + * as follows: + * + * cos ( theta ) = x / sqrt ( 1 - z^2 ) + * sin ( theta ) = y / sqrt ( 1 - z^2 ) + * + * cos ( phi ) = sqrt ( 1 - z^2 ) + * sin ( phi ) = z + * + * Note that cos ( phi ) can further be inserted to the above + * formulas: + * + * cos ( theta ) = x / cos ( phi ) + * sin ( theta ) = y / cos ( phi ) + * + * ...etc. Because of those relations and the standard trigonometric + * relations, it is pssible to reduce the transforms down to what + * is used below. It may be that any primary axis chosen will give the + * same results (modulo a sign convention) using thie method. + * + * Particularly nice is to notice that all divisions that might + * have caused trouble when parallel to certain planes or + * axis go away with care paid to reducing the expressions. + * After checking, it does perform correctly under all cases, since + * in all the cases of division where the denominator would have + * been zero, the numerator would have been zero as well, giving + * the expected result. 
+ */ + + xx = vx * vx; + yy = vy * vy; + zz = vz * vz; + xy = vx * vy; + yz = vy * vz; + zx = vz * vx; + xs = vx * s; + ys = vy * s; + zs = vz * s; + one_c = 1.0F - c; + + M(0,0) = (one_c * xx) + c; + M(1,0) = (one_c * xy) - zs; + M(2,0) = (one_c * zx) + ys; + + M(0,1) = (one_c * xy) + zs; + M(1,1) = (one_c * yy) + c; + M(2,1) = (one_c * yz) - xs; + + M(0,2) = (one_c * zx) - ys; + M(1,2) = (one_c * yz) + xs; + M(2,2) = (one_c * zz) + c; + +#undef M +} + + +void OrthoNormalize (Matrix3x3f& matrix) +{ + Vector3f* c0 = (Vector3f*)matrix.GetPtr () + 0; + Vector3f* c1 = (Vector3f*)matrix.GetPtr () + 3; + Vector3f* c2 = (Vector3f*)matrix.GetPtr () + 6; + OrthoNormalize (c0, c1, c2); +} diff --git a/Runtime/Math/Matrix3x3.h b/Runtime/Math/Matrix3x3.h new file mode 100644 index 0000000..f884349 --- /dev/null +++ b/Runtime/Math/Matrix3x3.h @@ -0,0 +1,119 @@ +#pragma once + +#include "Vector3.h" +#include "Runtime/Modules/ExportModules.h" + +class EXPORT_COREMODULE Matrix3x3f +{ + public: + + float m_Data[9]; + + ///@todo: Can't be Transfer optimized because Transfer doesn't write the same as memory layout + DECLARE_SERIALIZE_NO_PPTR (Matrix3x3f) + + Matrix3x3f () {} + Matrix3x3f (float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22) { Get (0,0) = m00; Get (1,0) = m10; Get (2,0) = m20; Get (0,1) = m01; Get (1,1) =m11; Get (2,1) = m21; Get (0,2) = m02; Get (1,2) = m12; Get (2,2) = m22; } + explicit Matrix3x3f (const class Matrix4x4f& m); + // The Get function accesses the matrix in std math convention + // m0,0 m0,1 m0,2 + // m1,0 m1,1 m1,2 + // m2,0 m2,1 m2,2 + + // The floats are laid out: + // m0 m3 m6 + // m1 m4 m7 + // m2 m5 m8 + + + float& Get (int row, int column) { return m_Data[row + (column * 3)]; } + const float& Get (int row, int column)const { return m_Data[row + (column * 3)]; } + + float& operator [] (int row) { return m_Data[row]; } + float operator [] (int row) const { return m_Data[row]; } + + float* GetPtr () { 
return m_Data; }
	const float* GetPtr () const { return m_Data; }

	// Returns column 'col' — the col-th basis vector of the rotation.
	Vector3f GetColumn (int col) const { return Vector3f (Get (0, col), Get (1, col), Get (2, col)); }
	Matrix3x3f& operator = (const class Matrix4x4f& m);

	Matrix3x3f& operator *= (const Matrix3x3f& inM);
	Matrix3x3f& operator *= (const class Matrix4x4f& inM);
	friend Matrix3x3f operator * (const Matrix3x3f& lhs, const Matrix3x3f& rhs) { Matrix3x3f temp (lhs); temp *= rhs; return temp; }
	Vector3f MultiplyVector3 (const Vector3f& inV) const;
	void MultiplyVector3 (const Vector3f& inV, Vector3f& output) const;

	// For a 3x3 matrix points and vectors transform identically.
	Vector3f MultiplyPoint3 (const Vector3f& inV) const { return MultiplyVector3 (inV); }
	Vector3f MultiplyVector3Transpose (const Vector3f& inV) const;
	Vector3f MultiplyPoint3Transpose (const Vector3f& inV) const { return MultiplyVector3Transpose (inV); }

	Matrix3x3f& operator *= (float f);
	Matrix3x3f& operator /= (float f) { return *this *= (1.0F / f); }

	float GetDeterminant () const;

//	Matrix3x3f& Transpose (const Matrix3x3f& inM);
	Matrix3x3f& Transpose ();
//	Matrix3x3f& Invert (const Matrix3x3f& inM)	{ return Transpose (inM); }
	bool Invert ();
	void InvertTranspose ();

	Matrix3x3f& SetIdentity ();
	Matrix3x3f& SetZero ();
	Matrix3x3f& SetFromToRotation (const Vector3f& from, const Vector3f& to);
	Matrix3x3f& SetAxisAngle (const Vector3f& rotationAxis, float radians);
	Matrix3x3f& SetOrthoNormalBasis (const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ);
	Matrix3x3f& SetOrthoNormalBasisInverse (const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ);
	Matrix3x3f& SetScale (const Vector3f& inScale);
	Matrix3x3f& Scale (const Vector3f& inScale);

	bool IsIdentity (float threshold = Vector3f::epsilon);

	static const Matrix3x3f zero;
	static const Matrix3x3f identity;
};

// Generates an orthornormal basis from a look at rotation, returns if it was successful
// (Righthanded)
bool LookRotationToMatrix (const Vector3f& viewVec, const Vector3f& upVec, Matrix3x3f* m);

bool MatrixToEuler (const Matrix3x3f& matrix, Vector3f& v);
void EulerToMatrix (const Vector3f& v, Matrix3x3f& matrix);

// res = M * v (column-vector convention; see the storage diagram above).
inline Vector3f Matrix3x3f::MultiplyVector3 (const Vector3f& v) const
{
	Vector3f res;
	res.x = m_Data[0] * v.x + m_Data[3] * v.y + m_Data[6] * v.z;
	res.y = m_Data[1] * v.x + m_Data[4] * v.y + m_Data[7] * v.z;
	res.z = m_Data[2] * v.x + m_Data[5] * v.y + m_Data[8] * v.z;
	return res;
}

// Same product as above, written into 'output' to avoid a temporary.
inline void Matrix3x3f::MultiplyVector3 (const Vector3f& v, Vector3f& output) const
{
	output.x = m_Data[0] * v.x + m_Data[3] * v.y + m_Data[6] * v.z;
	output.y = m_Data[1] * v.x + m_Data[4] * v.y + m_Data[7] * v.z;
	output.z = m_Data[2] * v.x + m_Data[5] * v.y + m_Data[8] * v.z;
}


// res = M^T * v (which equals M^-1 * v when M is orthonormal).
inline Vector3f Matrix3x3f::MultiplyVector3Transpose (const Vector3f& v) const
{
	Vector3f res;
	res.x = Get (0, 0) * v.x + Get (1, 0) * v.y + Get (2, 0) * v.z;
	res.y = Get (0, 1) * v.x + Get (1, 1) * v.y + Get (2, 1) * v.z;
	res.z = Get (0, 2) * v.x + Get (1, 2) * v.y + Get (2, 2) * v.z;
	return res;
}


// Serializes the nine elements row by row under the names e00..e22.
template<class TransferFunction>
inline void Matrix3x3f::Transfer (TransferFunction& t)
{
	t.Transfer (Get (0, 0), "e00"); t.Transfer (Get (0, 1), "e01"); t.Transfer (Get (0, 2), "e02");
	t.Transfer (Get (1, 0), "e10"); t.Transfer (Get (1, 1), "e11"); t.Transfer (Get (1, 2), "e12");
	t.Transfer (Get (2, 0), "e20"); t.Transfer (Get (2, 1), "e21"); t.Transfer (Get (2, 2), "e22");
}

void EXPORT_COREMODULE OrthoNormalize (Matrix3x3f& matrix);
diff --git a/Runtime/Math/Matrix4x4.cpp b/Runtime/Math/Matrix4x4.cpp
new file mode 100644
index 0000000..660a6bf
--- /dev/null
+++ b/Runtime/Math/Matrix4x4.cpp
@@ -0,0 +1,805 @@
#include "UnityPrefix.h"
#include "Matrix4x4.h"
#include "Matrix3x3.h"
#include "Quaternion.h"
#include "Runtime/Utilities/Utility.h"

using namespace std;

namespace
{
	// File-local helper so the 'identity' constant below can be initialized
	// from a single expression.
	Matrix4x4f CreateIdentityMatrix4x4f ()
	{
		Matrix4x4f temp;
		temp.SetIdentity ();
		return temp;
	}
}

const Matrix4x4f Matrix4x4f::identity = CreateIdentityMatrix4x4f ();

// Copies 16 floats that are already in this class' memory order.
Matrix4x4f::Matrix4x4f (const float data[16])
{
	for (int i=0; i<16; i++)
		m_Data[i] = data[i];
}

// Embeds a 3x3 matrix into the upper-left corner; the translation is zero
// and the bottom row becomes (0,0,0,1). Both classes store columns
// contiguously, so the copy is three straight column runs.
Matrix4x4f::Matrix4x4f (const Matrix3x3f &other)
{
	m_Data[0] = other.m_Data[0];
	m_Data[1] = other.m_Data[1];
	m_Data[2] = other.m_Data[2];
	m_Data[3] = 0.0F;

	m_Data[4] = other.m_Data[3];
	m_Data[5] = other.m_Data[4];
	m_Data[6] = other.m_Data[5];
	m_Data[7] = 0.0F;

	m_Data[8] = other.m_Data[6];
	m_Data[9] = other.m_Data[7];
	m_Data[10] = other.m_Data[8];
	m_Data[11] = 0.0F;

	m_Data[12] = 0.0F;
	m_Data[13] = 0.0F;
	m_Data[14] = 0.0F;
	m_Data[15] = 1.0F;
}

// Same expansion as the converting constructor above.
Matrix4x4f& Matrix4x4f::operator = (const Matrix3x3f& other)
{
	m_Data[0] = other.m_Data[0];
	m_Data[1] = other.m_Data[1];
	m_Data[2] = other.m_Data[2];
	m_Data[3] = 0.0F;

	m_Data[4] = other.m_Data[3];
	m_Data[5] = other.m_Data[4];
	m_Data[6] = other.m_Data[5];
	m_Data[7] = 0.0F;

	m_Data[8] = other.m_Data[6];
	m_Data[9] = other.m_Data[7];
	m_Data[10] = other.m_Data[8];
	m_Data[11] = 0.0F;

	m_Data[12] = 0.0F;
	m_Data[13] = 0.0F;
	m_Data[14] = 0.0F;
	m_Data[15] = 1.0F;
	return *this;
}

// Element-wise comparison against the identity with tolerance 'threshold'.
bool Matrix4x4f::IsIdentity (float threshold) const
{
	if (CompareApproximately (Get (0,0),1.0f, threshold) && CompareApproximately (Get (0,1),0.0f, threshold) && CompareApproximately (Get (0,2),0.0f, threshold) && CompareApproximately (Get (0,3),0.0f, threshold) &&
		CompareApproximately (Get (1,0),0.0f, threshold) && CompareApproximately (Get (1,1),1.0f, threshold) && CompareApproximately (Get (1,2),0.0f, threshold) && CompareApproximately (Get (1,3),0.0f, threshold) &&
		CompareApproximately (Get (2,0),0.0f, threshold) && CompareApproximately (Get (2,1),0.0f, threshold) && CompareApproximately (Get (2,2),1.0f, threshold) && CompareApproximately (Get (2,3),0.0f, threshold) &&
		CompareApproximately (Get (3,0),0.0f, threshold) && CompareApproximately (Get (3,1),0.0f, threshold) && CompareApproximately (Get (3,2),0.0f, threshold) && CompareApproximately (Get (3,3),1.0f, threshold))
		return true;
	return false;
}

// Full 4x4 determinant via the explicit 24-term cofactor expansion,
// accumulated in double precision to limit cancellation error.
double Matrix4x4f::GetDeterminant () const
{
	double m00 = Get(0, 0);	double m01 = Get(0, 1);	double m02 = Get(0, 2);	double m03 = Get(0, 3);
	double m10 = Get(1, 0);	double m11 = Get(1, 1);	double m12 = Get(1, 2);	double m13 = Get(1, 3);
	double m20 = Get(2, 0);	double m21 = Get(2, 1);	double m22 = Get(2, 2);	double m23 = Get(2, 3);
	double m30 = Get(3, 0);	double m31 = Get(3, 1);	double m32 = Get(3, 2);	double m33 = Get(3, 3);

	double result =
		m03 * m12 * m21 * m30 - m02 * m13 * m21 * m30 - m03 * m11 * m22 * m30 + m01 * m13 * m22 * m30 +
		m02 * m11 * m23 * m30 - m01 * m12 * m23 * m30 - m03 * m12 * m20 * m31 + m02 * m13 * m20 * m31 +
		m03 * m10 * m22 * m31 - m00 * m13 * m22 * m31 - m02 * m10 * m23 * m31 + m00 * m12 * m23 * m31 +
		m03 * m11 * m20 * m32 - m01 * m13 * m20 * m32 - m03 * m10 * m21 * m32 + m00 * m13 * m21 * m32 +
		m01 * m10 * m23 * m32 - m00 * m11 * m23 * m32 - m02 * m11 * m20 * m33 + m01 * m12 * m20 * m33 +
		m02 * m10 * m21 * m33 - m00 * m12 * m21 * m33 - m01 * m10 * m22 * m33 + m00 * m11 * m22 * m33;
	return result;
}

Matrix4x4f& Matrix4x4f::operator *= (const Matrix4x4f& inM1)
{
	Assert(&inM1 != this);	// aliasing would corrupt the product below
	Matrix4x4f tmp;
	MultiplyMatrices4x4(this, &inM1, &tmp);
	*this = tmp;
	return *this;
}

// res = lhs * rhs for affine matrices (bottom rows assumed to be 0,0,0,1).
// The commented-out terms are exactly the ones that drop out under that
// assumption.
void MultiplyMatrices3x4( const Matrix4x4f& lhs, const Matrix4x4f& rhs, Matrix4x4f& res)
{
	for (int i=0;i<3;i++)
	{
		res.m_Data[i] = lhs.m_Data[i] * rhs.m_Data[0] + lhs.m_Data[i+4] * rhs.m_Data[1] + lhs.m_Data[i+8] * rhs.m_Data[2];// + lhs.m_Data[i+12] * rhs.m_Data[3];
		res.m_Data[i+4] = lhs.m_Data[i] * rhs.m_Data[4] + lhs.m_Data[i+4] * rhs.m_Data[5] + lhs.m_Data[i+8] * rhs.m_Data[6];// + lhs.m_Data[i+12] * rhs.m_Data[7];
		res.m_Data[i+8] = lhs.m_Data[i] * rhs.m_Data[8] + lhs.m_Data[i+4] * rhs.m_Data[9] + lhs.m_Data[i+8] * rhs.m_Data[10];// + lhs.m_Data[i+12] * rhs.m_Data[11];
res.m_Data[i+12] = lhs.m_Data[i] * rhs.m_Data[12] + lhs.m_Data[i+4] * rhs.m_Data[13] + lhs.m_Data[i+8] * rhs.m_Data[14] + lhs.m_Data[i+12];// * rhs.m_Data[15];
	}

	// Fixed affine bottom row.
	res.m_Data[3] = 0.0f;
	res.m_Data[7] = 0.0f;
	res.m_Data[11] = 0.0f;
	res.m_Data[15] = 1.0f;
}


Matrix4x4f& Matrix4x4f::SetIdentity ()
{
	Get (0, 0) = 1.0;	Get (0, 1) = 0.0;	Get (0, 2) = 0.0;	Get (0, 3) = 0.0;
	Get (1, 0) = 0.0;	Get (1, 1) = 1.0;	Get (1, 2) = 0.0;	Get (1, 3) = 0.0;
	Get (2, 0) = 0.0;	Get (2, 1) = 0.0;	Get (2, 2) = 1.0;	Get (2, 3) = 0.0;
	Get (3, 0) = 0.0;	Get (3, 1) = 0.0;	Get (3, 2) = 0.0;	Get (3, 3) = 1.0;
	return *this;
}

// Columns = the given basis vectors; translation is zero.
Matrix4x4f& Matrix4x4f::SetOrthoNormalBasis (const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ)
{
	Get (0, 0) = inX[0];	Get (0, 1) = inY[0];	Get (0, 2) = inZ[0];	Get (0, 3) = 0.0;
	Get (1, 0) = inX[1];	Get (1, 1) = inY[1];	Get (1, 2) = inZ[1];	Get (1, 3) = 0.0;
	Get (2, 0) = inX[2];	Get (2, 1) = inY[2];	Get (2, 2) = inZ[2];	Get (2, 3) = 0.0;
	Get (3, 0) = 0.0;	Get (3, 1) = 0.0;	Get (3, 2) = 0.0;	Get (3, 3) = 1.0;
	return *this;
}

// Rows = the basis vectors, i.e. the transpose of the above — the inverse
// rotation when the basis is orthonormal.
Matrix4x4f& Matrix4x4f::SetOrthoNormalBasisInverse (const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ)
{
	Get (0, 0) = inX[0];	Get (1, 0) = inY[0];	Get (2, 0) = inZ[0];	Get (3, 0) = 0.0;
	Get (0, 1) = inX[1];	Get (1, 1) = inY[1];	Get (2, 1) = inZ[1];	Get (3, 1) = 0.0;
	Get (0, 2) = inX[2];	Get (1, 2) = inY[2];	Get (2, 2) = inZ[2];	Get (3, 2) = 0.0;
	Get (0, 3) = 0.0;	Get (1, 3) = 0.0;	Get (2, 3) = 0.0;	Get (3, 3) = 1.0;
	return *this;
}

// Basis in the upper-left 3x3 plus a translation in the fourth column.
Matrix4x4f& Matrix4x4f::SetPositionAndOrthoNormalBasis (const Vector3f& inPosition, const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ)
{
	Get (0, 0) = inX[0];	Get (0, 1) = inY[0];	Get (0, 2) = inZ[0];	Get (0, 3) = inPosition[0];
	Get (1, 0) = inX[1];	Get (1, 1) = inY[1];	Get (1, 2) = inZ[1];	Get (1, 3) = inPosition[1];
	Get (2, 0) = inX[2];	Get (2, 1) = inY[2];	Get (2, 2) = inZ[2];	Get (2, 3) = inPosition[2];
	Get (3, 0) = 0.0;	Get (3, 1) = 0.0;	Get (3, 2) = 0.0;	Get (3, 3) = 1.0;
	return *this;
}

Matrix4x4f& Matrix4x4f::SetScale (const Vector3f& inScale)
{
	Get (0, 0) = inScale[0];	Get (0, 1) = 0.0;	Get (0, 2) = 0.0;	Get (0, 3) = 0.0;
	Get (1, 0) = 0.0;	Get (1, 1) = inScale[1];	Get (1, 2) = 0.0;	Get (1, 3) = 0.0;
	Get (2, 0) = 0.0;	Get (2, 1) = 0.0;	Get (2, 2) = inScale[2];	Get (2, 3) = 0.0;
	Get (3, 0) = 0.0;	Get (3, 1) = 0.0;	Get (3, 2) = 0.0;	Get (3, 3) = 1.0;
	return *this;
}

// Post-multiplies by a scale matrix: this = this * Scale(inScale).
// Scales the first three columns (all four rows, so non-affine bottom rows
// are handled too).
Matrix4x4f& Matrix4x4f::Scale (const Vector3f& inScale)
{
	Get (0, 0) *= inScale[0];
	Get (1, 0) *= inScale[0];
	Get (2, 0) *= inScale[0];
	Get (3, 0) *= inScale[0];

	Get (0, 1) *= inScale[1];
	Get (1, 1) *= inScale[1];
	Get (2, 1) *= inScale[1];
	Get (3, 1) *= inScale[1];

	Get (0, 2) *= inScale[2];
	Get (1, 2) *= inScale[2];
	Get (2, 2) *= inScale[2];
	Get (3, 2) *= inScale[2];
	return *this;
}

// Post-multiplies by a translation: this = this * Translate(inTrans).
// Only the fourth column changes.
Matrix4x4f& Matrix4x4f::Translate (const Vector3f& inTrans)
{
	Get (0, 3) = Get (0, 0) * inTrans[0] + Get (0, 1) * inTrans[1] + Get (0, 2) * inTrans[2] + Get (0, 3);
	Get (1, 3) = Get (1, 0) * inTrans[0] + Get (1, 1) * inTrans[1] + Get (1, 2) * inTrans[2] + Get (1, 3);
	Get (2, 3) = Get (2, 0) * inTrans[0] + Get (2, 1) * inTrans[1] + Get (2, 2) * inTrans[2] + Get (2, 3);
	Get (3, 3) = Get (3, 0) * inTrans[0] + Get (3, 1) * inTrans[1] + Get (3, 2) * inTrans[2] + Get (3, 3);
	return *this;
}

Matrix4x4f& Matrix4x4f::SetTranslate (const Vector3f& inTrans)
{
	Get (0, 0) = 1.0;	Get (0, 1) = 0.0;	Get (0, 2) = 0.0;	Get (0, 3) = inTrans[0];
	Get (1, 0) = 0.0;	Get (1, 1) = 1.0;	Get (1, 2) = 0.0;	Get (1, 3) = inTrans[1];
	Get (2, 0) = 0.0;	Get (2, 1) = 0.0;	Get (2, 2) = 1.0;	Get (2, 3) = inTrans[2];
	Get (3, 0) = 0.0;	Get (3, 1) = 0.0;	Get (3, 2) = 0.0;	Get (3, 3) = 1.0;
	return *this;
}

// Right-handed OpenGL-style perspective projection (z mapped to [-1,1]).
// 'fovy' is the vertical field of view in degrees.
Matrix4x4f& Matrix4x4f::SetPerspective(
	float fovy,
	float aspect,
	float zNear,
	float zFar )
{
	float cotangent, deltaZ;
	float radians = Deg2Rad (fovy / 2.0f);
	cotangent = cos (radians) / sin (radians);
	deltaZ = zNear - zFar;

	Get (0,0) = cotangent / aspect;	Get (0,1) = 0.0F;	Get (0,2) = 0.0F;	Get (0,3) = 0.0F;
	Get (1,0) = 0.0F;	Get (1,1) = cotangent;	Get (1,2) = 0.0F;	Get (1,3) = 0.0F;
	Get (2,0) = 0.0F;	Get (2,1) = 0.0F;	Get (2,2) = (zFar + zNear) / deltaZ;	Get (2,3) = 2.0F * zNear * zFar / deltaZ;
	Get (3,0) = 0.0F;	Get (3,1) = 0.0F;	Get (3,2) = -1.0F;	Get (3,3) = 0.0F;

	return *this;
}

// As SetPerspective, but takes the precomputed cotangent of half the fov
// and assumes a square aspect ratio.
Matrix4x4f& Matrix4x4f::SetPerspectiveCotan(
	float cotangent,
	float zNear,
	float zFar )
{
	float deltaZ = zNear - zFar;

	Get (0,0) = cotangent;	Get (0,1) = 0.0F;	Get (0,2) = 0.0F;	Get (0,3) = 0.0F;
	Get (1,0) = 0.0F;	Get (1,1) = cotangent;	Get (1,2) = 0.0F;	Get (1,3) = 0.0F;
	Get (2,0) = 0.0F;	Get (2,1) = 0.0F;	Get (2,2) = (zFar + zNear) / deltaZ;	Get (2,3) = 2.0F * zNear * zFar / deltaZ;
	Get (3,0) = 0.0F;	Get (3,1) = 0.0F;	Get (3,2) = -1.0F;	Get (3,3) = 0.0F;

	return *this;
}

// OpenGL-style orthographic projection (same mapping as glOrtho).
Matrix4x4f& Matrix4x4f::SetOrtho (
	float left,
	float right,
	float bottom,
	float top,
	float zNear,
	float zFar )
{
	SetIdentity ();

	float deltax = right - left;
	float deltay = top - bottom;
	float deltaz = zFar - zNear;

	Get(0,0) = 2.0F / deltax;
	Get(0,3) = -(right + left) / deltax;
	Get(1,1) = 2.0F / deltay;
	Get(1,3) = -(top + bottom) / deltay;
	Get(2,2) = -2.0F / deltaz;
	Get(2,3) = -(zFar + zNear) / deltaz;
	return *this;
}

// OpenGL-style perspective frustum (same mapping as glFrustum).
Matrix4x4f& Matrix4x4f::SetFrustum (
	float left,
	float right,
	float bottom,
	float top,
	float nearval,
	float farval )
{
	float x, y, a, b, c, d, e;

	x = (2.0F * nearval) / (right - left);
	y = (2.0F * nearval) / (top - bottom);
	a = (right + left) / (right - left);
	b = (top + bottom) / (top - bottom);
	c = -(farval + nearval) / (farval - nearval);
	d = -(2.0f * farval * nearval) / (farval - nearval);
	e = -1.0f;

	Get (0,0) = x;	Get (0,1) = 0.0;	Get (0,2) = a;	Get (0,3) = 0.0;
	Get (1,0) = 0.0;	Get (1,1) = y;	Get (1,2) = b;	Get (1,3) = 0.0;
	Get (2,0) = 0.0;
Get (2,1) = 0.0;	Get (2,2) = c;	Get (2,3) = d;
	Get (3,0) = 0.0;	Get (3,1) = 0.0;	Get (3,2) = e;	Get (3,3) = 0.0;
	return *this;
}



// Classifies the scale contained in 'matrix' by measuring its axis lengths:
// kNoScaleTransform when all are ~1 (within epsilon), kUniformScaleTransform
// (the scale is returned through outUniformScale) when the axes match each
// other, otherwise kNonUniformScaleTransform.
TransformType ComputeTransformType (const Matrix4x4f& matrix, float& outUniformScale, float epsilon)
{
	float lengthX = Magnitude(matrix.GetAxisX());
	float lengthY = Magnitude(matrix.GetAxisY());
	float lengthZ = Magnitude(matrix.GetAxisZ());
	float minAxis = std::min(std::min(lengthX, lengthY), lengthZ);
	float maxAxis = std::max(std::max(lengthX, lengthY), lengthZ);
	TransformType transType = kNoScaleTransform;
	outUniformScale = 1.0f;
	if (minAxis < 1.0 - epsilon || maxAxis > 1.0 + epsilon)
	{
		if (minAxis != 0.0f && maxAxis / minAxis < 1.0 + epsilon)
		{
			transType = kUniformScaleTransform;
			outUniformScale = minAxis;
		}
		else
			transType = kNonUniformScaleTransform;
	}
	return transType;
}


// Element (r,c) of a column-major float[16] matrix.
#define MAT(m,r,c) (m)[(c)*4+(r)]

// Shared failure path for the inversion routines: zero the output and
// report failure.
#define RETURN_ZERO \
{ \
	for (int i=0;i<16;i++) \
		out[i] = 0.0F; \
	return false; \
}

// 4x4 matrix inversion by Gaussian reduction with partial pivoting followed by back/substitution;
// with loops manually unrolled.
// Fully general 4x4 inverse (handles projective matrices) via Gaussian
// elimination with partial pivoting on the augmented system [A | I].
// 'm' and 'out' are column-major float[16]; element (r,c) is at [c*4 + r].
// Returns false — and zero-fills 'out' — when the matrix is singular.

// |x| without a math-library call.
static inline float AbsF (float x) { return x < 0.0F ? -x : x; }

// Shared failure path: the contract zero-fills the output on failure.
static inline bool ZeroFill16 (float* out)
{
	for (int i = 0; i < 16; i++)
		out[i] = 0.0F;
	return false;
}

bool InvertMatrix4x4_Full(const float* m, float* out)
{
	// Augmented rows [A | I]; pivot "swaps" only exchange row pointers.
	float w[4][8];
	float* p0 = w[0];
	float* p1 = w[1];
	float* p2 = w[2];
	float* p3 = w[3];
	for (int r = 0; r < 4; r++)
	{
		for (int c = 0; c < 4; c++)
		{
			w[r][c] = m[c*4 + r];
			w[r][c+4] = (r == c) ? 1.0F : 0.0F;
		}
	}

	float* tswap;

	// --- column 0: select the largest pivot, then eliminate below it ---
	if (AbsF (p3[0]) > AbsF (p2[0])) { tswap = p3; p3 = p2; p2 = tswap; }
	if (AbsF (p2[0]) > AbsF (p1[0])) { tswap = p2; p2 = p1; p1 = tswap; }
	if (AbsF (p1[0]) > AbsF (p0[0])) { tswap = p1; p1 = p0; p0 = tswap; }
	if (p0[0] == 0.0F)
		return ZeroFill16 (out);

	float f1 = p1[0]/p0[0];
	float f2 = p2[0]/p0[0];
	float f3 = p3[0]/p0[0];
	float s;
	s = p0[1]; p1[1] -= f1 * s; p2[1] -= f2 * s; p3[1] -= f3 * s;
	s = p0[2]; p1[2] -= f1 * s; p2[2] -= f2 * s; p3[2] -= f3 * s;
	s = p0[3]; p1[3] -= f1 * s; p2[3] -= f2 * s; p3[3] -= f3 * s;
	// The identity half starts mostly zero, so skip no-op updates.
	s = p0[4]; if (s != 0.0F) { p1[4] -= f1 * s; p2[4] -= f2 * s; p3[4] -= f3 * s; }
	s = p0[5]; if (s != 0.0F) { p1[5] -= f1 * s; p2[5] -= f2 * s; p3[5] -= f3 * s; }
	s = p0[6]; if (s != 0.0F) { p1[6] -= f1 * s; p2[6] -= f2 * s; p3[6] -= f3 * s; }
	s = p0[7]; if (s != 0.0F) { p1[7] -= f1 * s; p2[7] -= f2 * s; p3[7] -= f3 * s; }

	// --- column 1 ---
	if (AbsF (p3[1]) > AbsF (p2[1])) { tswap = p3; p3 = p2; p2 = tswap; }
	if (AbsF (p2[1]) > AbsF (p1[1])) { tswap = p2; p2 = p1; p1 = tswap; }
	if (p1[1] == 0.0F)
		return ZeroFill16 (out);

	f2 = p2[1]/p1[1];
	f3 = p3[1]/p1[1];
	p2[2] -= f2 * p1[2]; p3[2] -= f3 * p1[2];
	p2[3] -= f2 * p1[3]; p3[3] -= f3 * p1[3];
	s = p1[4]; if (s != 0.0F) { p2[4] -= f2 * s; p3[4] -= f3 * s; }
	s = p1[5]; if (s != 0.0F) { p2[5] -= f2 * s; p3[5] -= f3 * s; }
	s = p1[6]; if (s != 0.0F) { p2[6] -= f2 * s; p3[6] -= f3 * s; }
	s = p1[7]; if (s != 0.0F) { p2[7] -= f2 * s; p3[7] -= f3 * s; }

	// --- column 2 ---
	if (AbsF (p3[2]) > AbsF (p2[2])) { tswap = p3; p3 = p2; p2 = tswap; }
	if (p2[2] == 0.0F)
		return ZeroFill16 (out);

	f3 = p3[2]/p2[2];
	p3[3] -= f3 * p2[3];
	p3[4] -= f3 * p2[4];
	p3[5] -= f3 * p2[5];
	p3[6] -= f3 * p2[6];
	p3[7] -= f3 * p2[7];

	// --- last pivot check ---
	if (p3[3] == 0.0F)
		return ZeroFill16 (out);

	// --- back substitution, bottom row first ---
	s = 1.0F/p3[3];
	p3[4] *= s; p3[5] *= s; p3[6] *= s; p3[7] *= s;

	f2 = p2[3];
	s = 1.0F/p2[2];
	p2[4] = s * (p2[4] - p3[4] * f2); p2[5] = s * (p2[5] - p3[5] * f2);
	p2[6] = s * (p2[6] - p3[6] * f2); p2[7] = s * (p2[7] - p3[7] * f2);
	f1 = p1[3];
	p1[4] -= p3[4] * f1; p1[5] -= p3[5] * f1;
	p1[6] -= p3[6] * f1; p1[7] -= p3[7] * f1;
	float f0 = p0[3];
	p0[4] -= p3[4] * f0; p0[5] -= p3[5] * f0;
	p0[6] -= p3[6] * f0; p0[7] -= p3[7] * f0;

	f1 = p1[2];
	s = 1.0F/p1[1];
	p1[4] = s * (p1[4] - p2[4] * f1); p1[5] = s * (p1[5] - p2[5] * f1);
	p1[6] = s * (p1[6] - p2[6] * f1); p1[7] = s * (p1[7] - p2[7] * f1);
	f0 = p0[2];
	p0[4] -= p2[4] * f0; p0[5] -= p2[5] * f0;
	p0[6] -= p2[6] * f0; p0[7] -= p2[7] * f0;

	f0 = p0[1];
	s = 1.0F/p0[0];
	p0[4] = s * (p0[4] - p1[4] * f0); p0[5] = s * (p0[5] - p1[5] * f0);
	p0[6] = s * (p0[6] - p1[6] * f0); p0[7] = s * (p0[7] - p1[7] * f0);

	// Columns of the inverse come from the transformed identity half.
	for (int c = 0; c < 4; c++)
	{
		out[c*4 + 0] = p0[c+4];
		out[c*4 + 1] = p1[c+4];
		out[c*4 + 2] = p2[c+4];
		out[c*4 + 3] = p3[c+4];
	}
	return true;
}

// Invert 3D transformation
// matrix (not perspective). Adapted from graphics gems 2.
// Inverts the upper-left 3x3 via its determinant and adjugate, then maps the
// translation column through that inverse. 'in' and 'out' are column-major
// float[16]; element (r,c) lives at [c*4 + r]. Returns false — and
// zero-fills 'out' — when the upper-left 3x3 is (nearly) singular.
//
// BUGFIX: all inputs are now read up-front into locals. The previous
// revision read 'in' lazily while writing 'out', so calling it with
// out == in (in-place inversion) silently corrupted the result.
bool InvertMatrix4x4_General3D( const float* in, float* out )
{
	// Upper-left 3x3 (mRC = row R, column C) and the translation column.
	const float m00 = in[0],  m01 = in[4],  m02 = in[8],  m03 = in[12];
	const float m10 = in[1],  m11 = in[5],  m12 = in[9],  m13 = in[13];
	const float m20 = in[2],  m21 = in[6],  m22 = in[10], m23 = in[14];

	// Determinant of the 3x3, accumulating positive and negative terms
	// separately to reduce cancellation (as in the Graphics Gems original).
	float pos = 0.0F;
	float neg = 0.0F;
	float t;
	t =  m00 * m11 * m22; if (t >= 0.0F) pos += t; else neg += t;
	t =  m10 * m21 * m02; if (t >= 0.0F) pos += t; else neg += t;
	t =  m20 * m01 * m12; if (t >= 0.0F) pos += t; else neg += t;
	t = -m20 * m11 * m02; if (t >= 0.0F) pos += t; else neg += t;
	t = -m10 * m01 * m22; if (t >= 0.0F) pos += t; else neg += t;
	t = -m00 * m21 * m12; if (t >= 0.0F) pos += t; else neg += t;

	float det = pos + neg;

	// Singularity test on det^2 keeps the original tolerance semantics.
	if (det * det < 1e-25)
	{
		for (int i = 0; i < 16; i++)
			out[i] = 0.0F;
		return false;
	}

	det = 1.0F / det;

	// Adjugate * (1/det). out[c*4 + r] is element (r,c).
	out[0]  =  (m11 * m22 - m21 * m12) * det;
	out[4]  = -(m01 * m22 - m21 * m02) * det;
	out[8]  =  (m01 * m12 - m11 * m02) * det;
	out[1]  = -(m10 * m22 - m20 * m12) * det;
	out[5]  =  (m00 * m22 - m20 * m02) * det;
	out[9]  = -(m00 * m12 - m10 * m02) * det;
	out[2]  =  (m10 * m21 - m20 * m11) * det;
	out[6]  = -(m00 * m21 - m20 * m01) * det;
	out[10] =  (m00 * m11 - m10 * m01) * det;

	// Inverse translation: -(R^-1 * T).
	out[12] = - (m03 * out[0] + m13 * out[4] + m23 * out[8]);
	out[13] = - (m03 * out[1] + m13 * out[5] + m23 * out[9]);
	out[14] = - (m03 * out[2] + m13 * out[6] + m23 * out[10]);

	// Affine bottom row.
	out[3]  = 0.0f;
	out[7]  = 0.0f;
	out[11] = 0.0f;
	out[15] = 1.0f;

	return true;
}

#undef MAT
#undef RETURN_ZERO
tmp[3]*src[2] + tmp[4]*src[3]; + dst[5] = tmp[0]*src[0] + tmp[7]*src[2] + tmp[8]*src[3]; + dst[5] -= tmp[1]*src[0] + tmp[6]*src[2] + tmp[9]*src[3]; + dst[6] = tmp[3]*src[0] + tmp[6]*src[1] + tmp[11]*src[3]; + dst[6] -= tmp[2]*src[0] + tmp[7]*src[1] + tmp[10]*src[3]; + dst[7] = tmp[4]*src[0] + tmp[9]*src[1] + tmp[10]*src[2]; + dst[7] -= tmp[5]*src[0] + tmp[8]*src[1] + tmp[11]*src[2]; + // calculate pairs for second 8 elements (cofactors) + tmp[0] = src[2]*src[7]; + tmp[1] = src[3]*src[6]; + tmp[2] = src[1]*src[7]; + tmp[3] = src[3]*src[5]; + tmp[4] = src[1]*src[6]; + tmp[5] = src[2]*src[5]; + tmp[6] = src[0]*src[7]; + tmp[7] = src[3]*src[4]; + tmp[8] = src[0]*src[6]; + tmp[9] = src[2]*src[4]; + tmp[10] = src[0]*src[5]; + tmp[11] = src[1]*src[4]; + // calculate second 8 elements (cofactors) + dst[8] = tmp[0]*src[13] + tmp[3]*src[14] + tmp[4]*src[15]; + dst[8] -= tmp[1]*src[13] + tmp[2]*src[14] + tmp[5]*src[15]; + dst[9] = tmp[1]*src[12] + tmp[6]*src[14] + tmp[9]*src[15]; + dst[9] -= tmp[0]*src[12] + tmp[7]*src[14] + tmp[8]*src[15]; + dst[10] = tmp[2]*src[12] + tmp[7]*src[13] + tmp[10]*src[15]; + dst[10]-= tmp[3]*src[12] + tmp[6]*src[13] + tmp[11]*src[15]; + dst[11] = tmp[5]*src[12] + tmp[8]*src[13] + tmp[11]*src[14]; + dst[11]-= tmp[4]*src[12] + tmp[9]*src[13] + tmp[10]*src[14]; + dst[12] = tmp[2]*src[10] + tmp[5]*src[11] + tmp[1]*src[9]; + dst[12]-= tmp[4]*src[11] + tmp[0]*src[9] + tmp[3]*src[10]; + dst[13] = tmp[8]*src[11] + tmp[0]*src[8] + tmp[7]*src[10]; + dst[13]-= tmp[6]*src[10] + tmp[9]*src[11] + tmp[1]*src[8]; + dst[14] = tmp[6]*src[9] + tmp[11]*src[11] + tmp[3]*src[8]; + dst[14]-= tmp[10]*src[11] + tmp[2]*src[8] + tmp[7]*src[9]; + dst[15] = tmp[10]*src[10] + tmp[4]*src[8] + tmp[9]*src[9]; + dst[15]-= tmp[8]*src[9] + tmp[11]*src[10] + tmp[5]*src[8]; + // calculate determinant + det=src[0]*dst[0]+src[1]*dst[1]+src[2]*dst[2]+src[3]*dst[3]; + // calculate matrix inverse + if( CompareApproximately(det,0.0f) ) + { + for (int i=0;i<16;i++) + dst[i] = 0.0F; + return 
false; + } + + det = 1.0f/det; + for (int j = 0; j < 16; j++) + dst[j] *= det; + + return true; +} +*/ + + +/* +// SSE based matrix inverse from Intel's "Streaming SIMD Extensions - Inverse of 4x4 Matrix" paper. +// Does not seem to be much faster on Core 2 Duo. Keeping it here just in case. + +#include <emmintrin.h> + +bool InvertMatrix4x4( const float* src, float* dst ) +{ + __m128 minor0, minor1, minor2, minor3; + __m128 row0, row1, row2, row3; + __m128 det, tmp1; + tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4)); + row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12)); + row0 = _mm_shuffle_ps(tmp1, row1, 0x88); + row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); + tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)); + row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14)); + row2 = _mm_shuffle_ps(tmp1, row3, 0x88); + row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row2, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_mul_ps(row1, tmp1); + minor1 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); + minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); + minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row1, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); + minor3 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); + minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + row2 = _mm_shuffle_ps(row2, row2, 0x4E); + minor0 = 
_mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); + minor2 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); + minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row0, row1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row0, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); + minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); + // ----------------------------------------------- + tmp1 = _mm_mul_ps(row0, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); + // ----------------------------------------------- + det = _mm_mul_ps(row0, minor0); + det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); + det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); + // TODO: detect zero determinant + tmp1 = _mm_rcp_ss(det); + det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); + det = _mm_shuffle_ps(det, det, 0x00); + minor0 = _mm_mul_ps(det, minor0); + _mm_storel_pi((__m64*)(dst), minor0); + _mm_storeh_pi((__m64*)(dst+2), minor0); + minor1 = _mm_mul_ps(det, minor1); + 
_mm_storel_pi((__m64*)(dst+4), minor1); + _mm_storeh_pi((__m64*)(dst+6), minor1); + minor2 = _mm_mul_ps(det, minor2); + _mm_storel_pi((__m64*)(dst+ 8), minor2); + _mm_storeh_pi((__m64*)(dst+10), minor2); + minor3 = _mm_mul_ps(det, minor3); + _mm_storel_pi((__m64*)(dst+12), minor3); + _mm_storeh_pi((__m64*)(dst+14), minor3); + + return true; +} +*/ + + +Matrix4x4f& Matrix4x4f::Transpose () +{ + swap(Get (0,1),Get (1,0)); + swap(Get (0,2),Get (2,0)); + swap(Get (0,3),Get (3,0)); + swap(Get (1,2),Get (2,1)); + swap(Get (1,3),Get (3,1)); + swap(Get (2,3),Get (3,2)); + return *this; +} + +Matrix4x4f& Matrix4x4f::Copy (const Matrix4x4f& inM) +{ + CopyMatrix(inM.m_Data, m_Data); + return *this; +} + +void fromToRotation(const float from[3], const float to[3],float mtx[3][3]); + +Matrix4x4f& Matrix4x4f::SetFromToRotation (const Vector3f& from, const Vector3f& to) +{ + float mtx[3][3]; + fromToRotation (from.GetPtr (), to.GetPtr (), mtx); + Get (0, 0) = mtx[0][0]; Get (0, 1) = mtx[0][1]; Get (0, 2) = mtx[0][2]; Get (0, 3) = 0.0; + Get (1, 0) = mtx[1][0]; Get (1, 1) = mtx[1][1]; Get (1, 2) = mtx[1][2]; Get (1, 3) = 0.0; + Get (2, 0) = mtx[2][0]; Get (2, 1) = mtx[2][1]; Get (2, 2) = mtx[2][2]; Get (2, 3) = 0.0; + Get (3, 0) = 0.0; Get (3, 1) = 0.0; Get (3, 2) = 0.0; Get (3, 3) = 1.0; + return *this; +} + +bool CompareApproximately (const Matrix4x4f& lhs, const Matrix4x4f& rhs, float dist) +{ + for (int i=0;i<16;i++) + { + if (!CompareApproximately (lhs[i], rhs[i], dist)) + return false; + } + return true; +} + +void Matrix4x4f::SetTR (const Vector3f& pos, const Quaternionf& q) +{ + QuaternionToMatrix (q, *this); + m_Data[12] = pos[0]; + m_Data[13] = pos[1]; + m_Data[14] = pos[2]; +} + +void Matrix4x4f::SetTRS (const Vector3f& pos, const Quaternionf& q, const Vector3f& s) +{ + QuaternionToMatrix (q, *this); + + m_Data[0] *= s[0]; + m_Data[1] *= s[0]; + m_Data[2] *= s[0]; + + m_Data[4] *= s[1]; + m_Data[5] *= s[1]; + m_Data[6] *= s[1]; + + m_Data[8] *= s[2]; + m_Data[9] *= 
s[2]; + m_Data[10] *= s[2]; + + m_Data[12] = pos[0]; + m_Data[13] = pos[1]; + m_Data[14] = pos[2]; +} + +void Matrix4x4f::SetTRInverse (const Vector3f& pos, const Quaternionf& q) +{ + QuaternionToMatrix (::Inverse (q), *this); + Translate (Vector3f (-pos[0], -pos[1], -pos[2])); +} + +void TransformPoints3x3 (const Matrix4x4f& matrix, const Vector3f* in, Vector3f* out, int count) +{ + Matrix3x3f m = Matrix3x3f(matrix); + for (int i=0;i<count;i++) + out[i] = m.MultiplyPoint3 (in[i]); +} + +void TransformPoints3x4 (const Matrix4x4f& matrix, const Vector3f* in, Vector3f* out, int count) +{ + for (int i=0;i<count;i++) + out[i] = matrix.MultiplyPoint3 (in[i]); +} + +void TransformPoints3x3 (const Matrix4x4f& matrix, const Vector3f* in, size_t inStride, Vector3f* out, size_t outStride, int count) +{ + Matrix3x3f m = Matrix3x3f(matrix); + for (int i=0;i<count; ++i, in = Stride (in, inStride), out = Stride (out, outStride)) + { + *out = m.MultiplyPoint3 (*in); + } +} + +void TransformPoints3x4 (const Matrix4x4f& matrix, const Vector3f* in, size_t inStride, Vector3f* out, size_t outStride, int count) +{ + for (int i=0;i<count; ++i, in = Stride (in, inStride), out = Stride (out, outStride)) + { + *out = matrix.MultiplyPoint3 (*in); + } +} + + +#include "Matrix4x4_REF.cpp" diff --git a/Runtime/Math/Matrix4x4.h b/Runtime/Math/Matrix4x4.h new file mode 100644 index 0000000..2e14091 --- /dev/null +++ b/Runtime/Math/Matrix4x4.h @@ -0,0 +1,410 @@ +#ifndef MATRIX4X4_H +#define MATRIX4X4_H + +#include "Vector3.h" +#include "Vector4.h" +#include "Runtime/Utilities/Prefetch.h" +#include "Runtime/Misc/CPUInfo.h" +#include "Runtime/Modules/ExportModules.h" + +#if (UNITY_SUPPORTS_SSE || UNITY_SUPPORTS_VMX) +# include "Simd/SimdMath.h" +#endif + + +class Matrix3x3f; +class Matrix4x4f; +class Quaternionf; + +/// Uniform transform scales x, y, z in the same amount, +/// NonUniform transform scales x, y, z differently and might contain skew. 
/// kOddNegativeScaleTransform means that FrontFace(CCW) should be used (An odd number of scale axes is negative)
enum TransformType
{
	kNoScaleTransform = 0,
	kUniformScaleTransform = 1 << 0,
	kNonUniformScaleTransform = 1 << 1,
	kOddNegativeScaleTransform = 1 << 2
};
ENUM_FLAGS(TransformType);

inline bool IsNoScaleTransform (TransformType type)         { return type == kNoScaleTransform; }
inline bool IsNonUniformScaleTransform (TransformType type) { return (type & kNonUniformScaleTransform) != 0; }

/// Classifies a matrix's scale (none/uniform/non-uniform, odd-negative) and
/// reports the uniform scale factor through outUniformScale.
TransformType ComputeTransformType (const Matrix4x4f& matrix, float& outUniformScale, float epsilon = Vector3f::epsilon);

/// Invert an arbitrary 4x4 matrix / an affine ("general 3D") 4x4 matrix.
/// Both take/return raw 16-float column-major arrays and return false on failure.
bool InvertMatrix4x4_Full( const float* m, float* out );
bool InvertMatrix4x4_General3D( const float* m, float* out );


/// Matrices in unity are column major.
/// m_Data[row + column*4] holds element (row, column); the translation lives in
/// elements 12..14 (fourth column).
class EXPORT_COREMODULE Matrix4x4f
{
	public:
	float m_Data[16];


	///@todo: Can't be Transfer optimized because Transfer doesn't write the same as memory layout
	DECLARE_SERIALIZE_NO_PPTR (Matrix4x4f)

	// Default constructor leaves the matrix uninitialized on purpose.
	Matrix4x4f () {}
	Matrix4x4f (const Matrix3x3f &other);
	explicit Matrix4x4f (const float data[16]);

	// Element access by (row, column); data is column-major.
	float& Get (int row, int column)            { return m_Data[row + (column*4)]; }
	const float& Get (int row, int column)const { return m_Data[row + (column*4)]; }
	float* GetPtr ()                            { return m_Data; }
	const float* GetPtr ()const                 { return m_Data; }

	// Flat access into the 16-element column-major array.
	float operator [] (int index) const         { return m_Data[index]; }
	float& operator [] (int index)              { return m_Data[index]; }

	Matrix4x4f& operator *= (const Matrix4x4f& inM);

	Matrix4x4f& operator = (const Matrix3x3f& m);

	// Direction transforms (no translation) and point transforms (with translation).
	// The Perspective* variants divide by the resulting w and return false when
	// w is (near) zero.
	Vector3f MultiplyVector3 (const Vector3f& inV) const;
	void MultiplyVector3 (const Vector3f& inV, Vector3f& output) const;
	bool PerspectiveMultiplyVector3( const Vector3f& inV, Vector3f& output ) const;
	Vector3f MultiplyPoint3 (const Vector3f& inV) const;
	void MultiplyPoint3 (const Vector3f& inV, Vector3f& output) const;
	bool PerspectiveMultiplyPoint3( const Vector3f& inV, Vector3f& output ) const;
	// Multiply by the transpose of the upper-left 3x3; for a pure rotation this
	// is the inverse transform without computing a full inversion.
	Vector3f InverseMultiplyPoint3Affine (const Vector3f& inV) const;
	Vector3f InverseMultiplyVector3Affine (const Vector3f& inV) const;

	bool IsIdentity (float epsilon = Vector3f::epsilon) const;

	double GetDeterminant() const;

	// NOTE(review): the success flag of InvertMatrix4x4_Full is discarded here;
	// for a singular matrix m_Data is left as the routine wrote it — confirm
	// callers check invertibility separately when it matters.
	Matrix4x4f& Invert_Full() {
		InvertMatrix4x4_Full( m_Data, m_Data );
		return *this;
	}
	static bool Invert_Full( const Matrix4x4f &inM, Matrix4x4f &outM ) {
		return InvertMatrix4x4_Full( inM.m_Data, outM.m_Data );
	}
	static bool Invert_General3D( const Matrix4x4f &inM, Matrix4x4f &outM ) {
		return InvertMatrix4x4_General3D( inM.m_Data, outM.m_Data );
	}

	Matrix4x4f& Transpose ();

	Matrix4x4f& Copy (const Matrix4x4f& inM);

	Matrix4x4f& SetIdentity ();
	Matrix4x4f& SetPerspective( float fovy, float aspect, float zNear, float zFar );
	// rad = Deg2Rad(fovy/2), contanHalfFOV = cos(rad)/sin(rad)
	Matrix4x4f& SetPerspectiveCotan( float cotanHalfFOV, float zNear, float zFar );
	Matrix4x4f& SetOrtho( float left, float right, float bottom, float top, float zNear, float zFar );
	Matrix4x4f& SetFrustum( float left, float right, float bottom, float top, float nearval, float farval );

	// Basis/translation accessors: axes are the first three columns, position
	// is the fourth column.
	Vector3f GetAxisX() const;
	Vector3f GetAxisY() const;
	Vector3f GetAxisZ() const;
	Vector3f GetPosition() const;
	Vector4f GetRow(int row) const;
	Vector4f GetColumn(int col) const;
	// these set only these components of the matrix, everything else is untouched!
	void SetAxisX( const Vector3f& v );
	void SetAxisY( const Vector3f& v );
	void SetAxisZ( const Vector3f& v );
	void SetPosition( const Vector3f& v );
	void SetRow( int row, const Vector4f& v );
	void SetColumn( int col, const Vector4f& v );

	Matrix4x4f& SetTranslate (const Vector3f& inTrans);
	Matrix4x4f& SetOrthoNormalBasis (const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ);
	Matrix4x4f& SetOrthoNormalBasisInverse (const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ);
	Matrix4x4f& SetScale (const Vector3f& inScale);
	Matrix4x4f& SetPositionAndOrthoNormalBasis (const Vector3f& inPosition, const Vector3f& inX, const Vector3f& inY, const Vector3f& inZ);

	Matrix4x4f& Translate (const Vector3f& inTrans);
	Matrix4x4f& Scale (const Vector3f& inScale);

	Matrix4x4f& SetFromToRotation (const Vector3f& from, const Vector3f& to);

	// Compose translation+rotation (+scale) matrices, or the inverse of a TR matrix.
	void SetTR (const Vector3f& pos, const Quaternionf& q);
	void SetTRS (const Vector3f& pos, const Quaternionf& q, const Vector3f& s);
	void SetTRInverse (const Vector3f& pos, const Quaternionf& q);

	static const Matrix4x4f identity;
};

bool CompareApproximately (const Matrix4x4f& lhs, const Matrix4x4f& rhs, float dist = Vector3f::epsilon);

/// Transforms an array of vertices. input may be the same as output.
void EXPORT_COREMODULE TransformPoints3x3 (const Matrix4x4f &matrix, const Vector3f* input, Vector3f* ouput, int count);
void EXPORT_COREMODULE TransformPoints3x4 (const Matrix4x4f &matrix, const Vector3f* input, Vector3f* ouput, int count);
// Strided variants: input/output elements are inStride/outStride bytes apart.
void EXPORT_COREMODULE TransformPoints3x3 (const Matrix4x4f &matrix, const Vector3f* input, size_t inStride, Vector3f* ouput, size_t outStride, int count);
void EXPORT_COREMODULE TransformPoints3x4 (const Matrix4x4f &matrix, const Vector3f* input, size_t inStride, Vector3f* ouput, size_t outStride, int count);

void MultiplyMatrices3x4( const Matrix4x4f& lhs, const Matrix4x4f& rhs, Matrix4x4f& res);

// Portable scalar reference implementations; the macros below select these when
// no SIMD path is available for the target.
void MultiplyMatrices4x4REF(const Matrix4x4f* __restrict lhs, const Matrix4x4f* __restrict rhs, Matrix4x4f* __restrict res);
void CopyMatrixREF( const float* __restrict lhs, float* __restrict res);
void TransposeMatrix4x4REF (const Matrix4x4f* __restrict lhs, Matrix4x4f* __restrict res);

// foreach R[i] = A[i] * B[i]
void MultiplyMatrixArray4x4REF(const Matrix4x4f* __restrict arrayA, const Matrix4x4f* __restrict arrayB,
							   Matrix4x4f* __restrict arrayRes, size_t count);
// foreach R[i] = BASE * A[i] * B[i]
void MultiplyMatrixArrayWithBase4x4REF (const Matrix4x4f* __restrict base,
										const Matrix4x4f* __restrict arrayA, const Matrix4x4f* __restrict arrayB,
										Matrix4x4f* __restrict arrayRes, size_t count);

#if (UNITY_AUTO_DETECT_VECTOR_UNIT && UNITY_SUPPORTS_SSE)
#	define DECLARE_SIMD_FUNC(f) f##Simd
#else
#	define DECLARE_SIMD_FUNC(f) f
#endif

// Platform dispatch: pick SSE/VMX, NEON, VFP, or the scalar REF routines.
#if (UNITY_SUPPORTS_SSE || UNITY_SUPPORTS_VMX)
#	include "Simd/Matrix4x4Simd.h"
#elif UNITY_SUPPORTS_NEON

// These toolchains expose the assembly entry points with a leading underscore.
#if UNITY_ANDROID || UNITY_WINRT || UNITY_BB10 || UNITY_TIZEN
	#define MultiplyMatrices4x4_NEON _MultiplyMatrices4x4_NEON
	#define CopyMatrix_NEON _CopyMatrix_NEON
	#define TransposeMatrix4x4_NEON _TransposeMatrix4x4_NEON

	#define MultiplyMatrixArray4x4_NEON _MultiplyMatrixArray4x4_NEON
	#define MultiplyMatrixArrayWithBase4x4_NEON _MultiplyMatrixArrayWithBase4x4_NEON
#endif

// Hand-written NEON routines (Matrix4x4_NEON.s / .asm).
extern "C"
{
	void CopyMatrix_NEON(const float* __restrict lhs, float* __restrict res);
	void TransposeMatrix4x4_NEON(const Matrix4x4f* __restrict lhs, Matrix4x4f* __restrict res);

	void MultiplyMatrices4x4_NEON(const Matrix4x4f* __restrict lhs, const Matrix4x4f* __restrict rhs, Matrix4x4f* __restrict res);
	void MultiplyMatrixArray4x4_NEON(const Matrix4x4f* __restrict arrayA, const Matrix4x4f* __restrict arrayB,
									 Matrix4x4f* __restrict arrayRes, size_t count);
	void MultiplyMatrixArrayWithBase4x4_NEON(const Matrix4x4f* __restrict base,
											 const Matrix4x4f* __restrict arrayA, const Matrix4x4f* __restrict arrayB,
											 Matrix4x4f* __restrict arrayRes, size_t count);
}

#if UNITY_ANDROID && UNITY_SUPPORTS_NEON && UNITY_SUPPORTS_VFP

	#define MultiplyMatrices4x4_VFP _MultiplyMatrices4x4_VFP
	#define MultiplyMatrixArray4x4_VFP _MultiplyMatrixArray4x4_VFP

	extern "C"
	{
		void MultiplyMatrices4x4_VFP(const Matrix4x4f* __restrict lhs, const Matrix4x4f* __restrict rhs, Matrix4x4f* __restrict res);
		void MultiplyMatrixArray4x4_VFP(const Matrix4x4f* __restrict arrayA, const Matrix4x4f* __restrict arrayB,
										Matrix4x4f* __restrict arrayRes, size_t count);
	}

	// NEON support is probed at runtime on these devices; fall back to VFP/REF.
	#define CopyMatrix(a,b)								CPUInfo::HasNEONSupport() ? CopyMatrix_NEON(a,b) : CopyMatrixREF(a,b)
	#define TransposeMatrix4x4(a,b)						CPUInfo::HasNEONSupport() ? TransposeMatrix4x4_NEON(a,b) : TransposeMatrix4x4REF(a,b)

	#define MultiplyMatrices4x4(a,b,c)					CPUInfo::HasNEONSupport() ? MultiplyMatrices4x4_NEON(a,b,c) : MultiplyMatrices4x4_VFP(a,b,c)
	#define MultiplyMatrixArray4x4(a,b,c,d)				CPUInfo::HasNEONSupport() ? MultiplyMatrixArray4x4_NEON(a,b,c,d) : MultiplyMatrixArray4x4_VFP(a,b,c,d)
	#define MultiplyMatrixArrayWithBase4x4(a,b,c,d,e)	CPUInfo::HasNEONSupport() ? MultiplyMatrixArrayWithBase4x4_NEON(a,b,c,d,e) : MultiplyMatrixArrayWithBase4x4REF(a,b,c,d,e)

#else

	// NEON guaranteed at compile time: bind straight to the assembly routines.
	#define CopyMatrix CopyMatrix_NEON
	#define TransposeMatrix4x4 TransposeMatrix4x4_NEON

	#define MultiplyMatrices4x4 MultiplyMatrices4x4_NEON
	#define MultiplyMatrixArray4x4 MultiplyMatrixArray4x4_NEON
	#define MultiplyMatrixArrayWithBase4x4 MultiplyMatrixArrayWithBase4x4_NEON

#endif

#elif UNITY_SUPPORTS_VFP

#if UNITY_ANDROID
	#define MultiplyMatrices4x4_VFP _MultiplyMatrices4x4_VFP
	#define MultiplyMatrixArray4x4_VFP _MultiplyMatrixArray4x4_VFP
#endif

extern "C"
{
	void MultiplyMatrices4x4_VFP(const Matrix4x4f* __restrict lhs, const Matrix4x4f* __restrict rhs, Matrix4x4f* __restrict res);
	void MultiplyMatrixArray4x4_VFP(const Matrix4x4f* __restrict arrayA, const Matrix4x4f* __restrict arrayB,
									Matrix4x4f* __restrict arrayRes, size_t count);
}

	// VFP-only targets: multiplies in VFP assembly, the rest scalar.
	#define CopyMatrix CopyMatrixREF
	#define TransposeMatrix4x4 TransposeMatrix4x4REF

	#define MultiplyMatrices4x4 MultiplyMatrices4x4_VFP
	#define MultiplyMatrixArray4x4 MultiplyMatrixArray4x4_VFP
	#define MultiplyMatrixArrayWithBase4x4 MultiplyMatrixArrayWithBase4x4REF


#else

	// No SIMD at all: everything goes through the scalar reference code.
	#define CopyMatrix CopyMatrixREF
	#define TransposeMatrix4x4 TransposeMatrix4x4REF

	#define MultiplyMatrices4x4 MultiplyMatrices4x4REF
	#define MultiplyMatrixArray4x4 MultiplyMatrixArray4x4REF
	#define MultiplyMatrixArrayWithBase4x4 MultiplyMatrixArrayWithBase4x4REF

#endif


// Axes are the first three columns of the column-major data; position is the fourth.
inline Vector3f Matrix4x4f::GetAxisX() const {
	return Vector3f( Get(0,0), Get(1,0), Get(2,0) );
}
inline Vector3f Matrix4x4f::GetAxisY() const {
	return Vector3f( Get(0,1), Get(1,1), Get(2,1) );
}
inline Vector3f Matrix4x4f::GetAxisZ() const {
	return Vector3f( Get(0,2), Get(1,2), Get(2,2) );
}
inline Vector3f Matrix4x4f::GetPosition() const {
	return Vector3f( Get(0,3), Get(1,3), Get(2,3) );
}
inline Vector4f Matrix4x4f::GetRow(int row) const {
	return Vector4f( Get(row,0), Get(row,1), Get(row,2), Get(row,3)
);
}
inline Vector4f Matrix4x4f::GetColumn(int col) const {
	return Vector4f( Get(0,col), Get(1,col), Get(2,col), Get(3,col) );
}
// Writers for the basis vectors (columns 0..2) and translation (column 3);
// the fourth row is left untouched.
inline void Matrix4x4f::SetAxisX( const Vector3f& v ) {
	Get(0,0) = v.x; Get(1,0) = v.y; Get(2,0) = v.z;
}
inline void Matrix4x4f::SetAxisY( const Vector3f& v ) {
	Get(0,1) = v.x; Get(1,1) = v.y; Get(2,1) = v.z;
}
inline void Matrix4x4f::SetAxisZ( const Vector3f& v ) {
	Get(0,2) = v.x; Get(1,2) = v.y; Get(2,2) = v.z;
}
inline void Matrix4x4f::SetPosition( const Vector3f& v ) {
	Get(0,3) = v.x; Get(1,3) = v.y; Get(2,3) = v.z;
}
inline void Matrix4x4f::SetRow( int row, const Vector4f& v ) {
	Get(row,0) = v.x; Get(row,1) = v.y; Get(row,2) = v.z; Get(row,3) = v.w;
}
inline void Matrix4x4f::SetColumn( int col, const Vector4f& v ) {
	Get(0,col) = v.x; Get(1,col) = v.y; Get(2,col) = v.z; Get(3,col) = v.w;
}


// Point transform: upper-left 3x3 plus the translation column (m_Data[12..14]).
// Ignores the fourth row, i.e. assumes an affine matrix.
inline Vector3f Matrix4x4f::MultiplyPoint3 (const Vector3f& v) const
{
	Vector3f res;
	res.x = m_Data[0] * v.x + m_Data[4] * v.y + m_Data[ 8] * v.z + m_Data[12];
	res.y = m_Data[1] * v.x + m_Data[5] * v.y + m_Data[ 9] * v.z + m_Data[13];
	res.z = m_Data[2] * v.x + m_Data[6] * v.y + m_Data[10] * v.z + m_Data[14];
	return res;
}

inline void Matrix4x4f::MultiplyPoint3 (const Vector3f& v, Vector3f& output) const
{
	output.x = m_Data[0] * v.x + m_Data[4] * v.y + m_Data[ 8] * v.z + m_Data[12];
	output.y = m_Data[1] * v.x + m_Data[5] * v.y + m_Data[ 9] * v.z + m_Data[13];
	output.z = m_Data[2] * v.x + m_Data[6] * v.y + m_Data[10] * v.z + m_Data[14];
}


// Direction transform: upper-left 3x3 only, no translation.
inline Vector3f Matrix4x4f::MultiplyVector3 (const Vector3f& v) const
{
	Vector3f res;
	res.x = m_Data[0] * v.x + m_Data[4] * v.y + m_Data[ 8] * v.z;
	res.y = m_Data[1] * v.x + m_Data[5] * v.y + m_Data[ 9] * v.z;
	res.z = m_Data[2] * v.x + m_Data[6] * v.y + m_Data[10] * v.z;
	return res;
}

inline void Matrix4x4f::MultiplyVector3 (const Vector3f& v, Vector3f& output) const
{
	output.x = m_Data[0] * v.x + m_Data[4] * v.y + m_Data[ 8] * v.z;
	output.y = m_Data[1] * v.x + m_Data[5] * v.y + m_Data[ 9] * v.z;
	output.z = m_Data[2] * v.x + m_Data[6] * v.y + m_Data[10] * v.z;
}


// Full projective point transform with perspective divide. Returns false and
// zeroes the output when |w| <= 1e-7 (point at/near the projection plane).
inline bool Matrix4x4f::PerspectiveMultiplyPoint3( const Vector3f& v, Vector3f& output ) const
{
	Vector3f res;
	float w;
	res.x = Get (0, 0) * v.x + Get (0, 1) * v.y + Get (0, 2) * v.z + Get (0, 3);
	res.y = Get (1, 0) * v.x + Get (1, 1) * v.y + Get (1, 2) * v.z + Get (1, 3);
	res.z = Get (2, 0) * v.x + Get (2, 1) * v.y + Get (2, 2) * v.z + Get (2, 3);
	w     = Get (3, 0) * v.x + Get (3, 1) * v.y + Get (3, 2) * v.z + Get (3, 3);
	if( Abs(w) > 1.0e-7f )
	{
		float invW = 1.0f / w;
		output.x = res.x * invW;
		output.y = res.y * invW;
		output.z = res.z * invW;
		return true;
	}
	else
	{
		output.x = 0.0f;
		output.y = 0.0f;
		output.z = 0.0f;
		return false;
	}
}

// Same as above but for a direction (no translation column added in).
inline bool Matrix4x4f::PerspectiveMultiplyVector3( const Vector3f& v, Vector3f& output ) const
{
	Vector3f res;
	float w;
	res.x = Get (0, 0) * v.x + Get (0, 1) * v.y + Get (0, 2) * v.z;
	res.y = Get (1, 0) * v.x + Get (1, 1) * v.y + Get (1, 2) * v.z;
	res.z = Get (2, 0) * v.x + Get (2, 1) * v.y + Get (2, 2) * v.z;
	w     = Get (3, 0) * v.x + Get (3, 1) * v.y + Get (3, 2) * v.z;
	if( Abs(w) > 1.0e-7f )
	{
		float invW = 1.0f / w;
		output.x = res.x * invW;
		output.y = res.y * invW;
		output.z = res.z * invW;
		return true;
	}
	else
	{
		output.x = 0.0f;
		output.y = 0.0f;
		output.z = 0.0f;
		return false;
	}
}

// Subtracts the translation, then multiplies by the transposed 3x3 — the exact
// inverse only when the 3x3 part is orthonormal (pure rotation).
inline Vector3f Matrix4x4f::InverseMultiplyPoint3Affine (const Vector3f& inV) const
{
	Vector3f v (inV.x - Get (0, 3), inV.y - Get (1, 3), inV.z - Get (2, 3));
	Vector3f res;
	res.x = Get (0, 0) * v.x + Get (1, 0) * v.y + Get (2, 0) * v.z;
	res.y = Get (0, 1) * v.x + Get (1, 1) * v.y + Get (2, 1) * v.z;
	res.z = Get (0, 2) * v.x + Get (1, 2) * v.y + Get (2, 2) * v.z;
	return res;
}

// Direction version: multiply by the transposed 3x3 (inverse for orthonormal bases).
inline Vector3f Matrix4x4f::InverseMultiplyVector3Affine (const Vector3f& v) const
{
	Vector3f res;
	res.x = Get (0, 0) * v.x + Get (1, 0) * v.y + Get (2, 0) * v.z;
	res.y = Get (0, 1) * v.x + Get (1, 1) * v.y + Get (2, 1) * v.z;
	res.z = Get (0, 2) * v.x + Get (1, 2) * v.y + Get (2, 2) * v.z;
	return res;
}

// Serializes all 16 elements by name, row by row.
template<class TransferFunction> inline
void Matrix4x4f::Transfer (TransferFunction& t)
{
	t.Transfer (Get (0, 0), "e00"); t.Transfer (Get (0, 1), "e01"); t.Transfer (Get (0, 2), "e02"); t.Transfer (Get (0, 3), "e03");
	t.Transfer (Get (1, 0), "e10"); t.Transfer (Get (1, 1), "e11"); t.Transfer (Get (1, 2), "e12"); t.Transfer (Get (1, 3), "e13");
	t.Transfer (Get (2, 0), "e20"); t.Transfer (Get (2, 1), "e21"); t.Transfer (Get (2, 2), "e22"); t.Transfer (Get (2, 3), "e23");
	t.Transfer (Get (3, 0), "e30"); t.Transfer (Get (3, 1), "e31"); t.Transfer (Get (3, 2), "e32"); t.Transfer (Get (3, 3), "e33");
}

// Uses non-short-circuit '&' to combine the checks branchlessly.
// NOTE(review): elements 3, 7 and 11 are deliberately not checked — presumably
// because they are constant 0 in the affine matrices this is used on; confirm
// before relying on this for fully general matrices.
inline bool IsFinite (const Matrix4x4f& f)
{
	return
	IsFinite(f.m_Data[0]) & IsFinite(f.m_Data[1]) & IsFinite(f.m_Data[2]) &
	IsFinite(f.m_Data[4]) & IsFinite(f.m_Data[5]) & IsFinite(f.m_Data[6]) &
	IsFinite(f.m_Data[8]) & IsFinite(f.m_Data[9]) & IsFinite(f.m_Data[10]) &
	IsFinite(f.m_Data[12]) & IsFinite(f.m_Data[13]) & IsFinite(f.m_Data[14]) & IsFinite(f.m_Data[15]);
}

#endif
diff --git a/Runtime/Math/Matrix4x4_NEON.asm b/Runtime/Math/Matrix4x4_NEON.asm
new file mode 100644
index 0000000..59d6bce
--- /dev/null
+++ b/Runtime/Math/Matrix4x4_NEON.asm
@@ -0,0 +1,197 @@
	AREA .text, CODE

	EXPORT _CopyMatrix_NEON
	EXPORT _TransposeMatrix4x4_NEON
	EXPORT _MultiplyMatrices4x4_NEON
	EXPORT _MultiplyMatrixArray4x4_NEON
	EXPORT _MultiplyMatrixArrayWithBase4x4_NEON

; void CopyMatrix_NEON(const float* lhs /* r0 */, float* res /* r1 */)
|_CopyMatrix_NEON| PROC
	vld1.32 {d0-d3}, [r0]!
	vld1.32 {d4-d7}, [r0]
	vst1.32 {d0-d3}, [r1]!
	vst1.32 {d4-d7}, [r1]
	bx lr
	ENDP


; void TransposeMatrix4x4_NEON(const Matrix4x4f* lhs /* r0 */, Matrix4x4f* res /* r1 */)
; vld4 de-interleaves on load, so a plain vst1 writes the transpose.
|_TransposeMatrix4x4_NEON| PROC
	vld4.32 {d0,d2,d4,d6}, [r0]!
	vld4.32 {d1,d3,d5,d7}, [r0]
	vst1.32 {d0-d3}, [r1]!
	vst1.32 {d4-d7}, [r1]
	bx lr
	ENDP


; void MultiplyMatrices4x4_NEON(const Matrix4x4f* lhs /* r0 */, const Matrix4x4f* rhs /* r1 */, Matrix4x4f* res /* r2 */)
|_MultiplyMatrices4x4_NEON| PROC
	vld1.32 {d0-d3}, [r1]!
	vld1.32 {d16-d17}, [r0]!
	vmul.f32 q12, q8, d0[0]
	vld1.32 {d4-d5}, [r1]!
	vmul.f32 q13, q8, d2[0]
	vld1.32 {d6-d7}, [r1]!
	vmul.f32 q14, q8, d4[0]
	vld1.32 {d18-d19}, [r0]!
	vmul.f32 q15, q8, d6[0]
	vld1.32 {d20-d21}, [r0]!
	vmla.f32 q12, q9, d0[1]
	vld1.32 {d22-d23}, [r0]!
	vmla.f32 q13, q9, d2[1]
	vmla.f32 q14, q9, d4[1]
	vmla.f32 q15, q9, d6[1]
	vmla.f32 q12, q10, d1[0]
	vmla.f32 q13, q10, d3[0]
	vmla.f32 q14, q10, d5[0]
	vmla.f32 q15, q10, d7[0]
	vmla.f32 q12, q11, d1[1]
	vmla.f32 q13, q11, d3[1]
	vmla.f32 q14, q11, d5[1]
	vmla.f32 q15, q11, d7[1]
	vst1.32 {d24-d27}, [r2]!
	vst1.32 {d28-d31}, [r2]!
	bx lr
	ENDP


; void MultiplyMatrixArray4x4_NEON(const Matrix4x4f* arrayA /* r0 */, const Matrix4x4f* arrayB /* r1 */,
;                                  Matrix4x4f* arrayRes /* r2 */, size_t count /* r3 */)
; Loop is unrolled 2x; r3 becomes the end pointer (count * 64 bytes past r0).
|_MultiplyMatrixArray4x4_NEON| PROC
	vpush {d8-d15}
	add.w r3, r0, r3, lsl #6
	vld1.32 {d0-d3}, [r1]!
	vld1.32 {d4-d7}, [r1]!
	vld1.32 {d16-d17}, [r0]!
	nop

|_MultiplyMatrixArray4x4_NEON_loop|
	vmul.f32 q12, q8, d0[0]
	vld1.32 {d18-d19}, [r0]!
	vmul.f32 q13, q8, d2[0]
	vmul.f32 q14, q8, d4[0]
	vmul.f32 q15, q8, d6[0]
	vmla.f32 q12, q9, d0[1]
	vld1.32 {d20-d21}, [r0]!
	vmla.f32 q13, q9, d2[1]
	vld1.32 {d8-d11}, [r1]!
	vmla.f32 q14, q9, d4[1]
	vld1.32 {d12-d15}, [r1]!
	vmla.f32 q15, q9, d6[1]
	vmla.f32 q12, q10, d1[0]
	vld1.32 {d22-d23}, [r0]!
	vmla.f32 q13, q10, d3[0]
	vmla.f32 q14, q10, d5[0]
	vmla.f32 q15, q10, d7[0]
	vmla.f32 q12, q11, d1[1]
	vld1.32 {d16-d17}, [r0]!
	vmla.f32 q13, q11, d3[1]
	vmla.f32 q14, q11, d5[1]
	vmla.f32 q15, q11, d7[1]
	vst1.32 {d24-d27}, [r2]!
	vst1.32 {d28-d31}, [r2]!
	cmp r0, r3
	bcs.w |_MultiplyMatrixArray4x4_out|
	vmul.f32 q12, q8, d8[0]
	vld1.32 {d18-d19}, [r0]!
	vmul.f32 q13, q8, d10[0]
	vmul.f32 q14, q8, d12[0]
	vmul.f32 q15, q8, d14[0]
	vmla.f32 q12, q9, d8[1]
	vld1.32 {d20-d21}, [r0]!
	vmla.f32 q13, q9, d10[1]
	vld1.32 {d0-d3}, [r1]!
	vmla.f32 q14, q9, d12[1]
	vld1.32 {d4-d7}, [r1]!
	vmla.f32 q15, q9, d14[1]
	vmla.f32 q12, q10, d9[0]
	vld1.32 {d22-d23}, [r0]!
	vmla.f32 q13, q10, d11[0]
	vmla.f32 q14, q10, d13[0]
	vmla.f32 q15, q10, d15[0]
	vmla.f32 q12, q11, d9[1]
	vld1.32 {d16-d17}, [r0]!
	vmla.f32 q13, q11, d11[1]
	vmla.f32 q14, q11, d13[1]
	vmla.f32 q15, q11, d15[1]
	vst1.32 {d24-d27}, [r2]!
	vst1.32 {d28-d31}, [r2]!
	cmp r0, r3
	bcc.w |_MultiplyMatrixArray4x4_NEON_loop|
	nop.w

|_MultiplyMatrixArray4x4_out|
	vpop {d8-d15}
	bx lr
	ENDP


; void MultiplyMatrixArrayWithBase4x4_NEON(const Matrix4x4f* base /* r0 */,
;                                          const Matrix4x4f* arrayA /* r1 */, const Matrix4x4f* arrayB /* r2 */,
;                                          Matrix4x4f* arrayRes /* r3 */, size_t count /* stack */)
|_MultiplyMatrixArrayWithBase4x4_NEON| PROC
	mov ip, sp
	vpush {d8-d15}
	stmdb sp!, {r4, r5}
	ldr.w r4, [ip]
	add.w r4, r1, r4, lsl #6
	vld1.32 {d16-d17}, [r1]!
	vld1.32 {d0-d3}, [r2]!
	vld1.32 {d4-d7}, [r2]!
	vld1.32 {d20-d23}, [r0]!
	add.w r5, r0, #16
	vmul.f32 q4, q8, d0[0]
	vmul.f32 q5, q8, d2[0]
	nop.w
	nop.w
	nop.w

|_MultiplyMatrixArrayWithBase4x4_NEON_loop|
	vld1.32 {d18-d19}, [r1]!
	vmul.f32 q6, q8, d4[0]
	vmul.f32 q7, q8, d6[0]
	vmla.f32 q4, q9, d0[1]
	vmla.f32 q5, q9, d2[1]
	vld1.32 {d16-d17}, [r1]!
	vmla.f32 q6, q9, d4[1]
	vmla.f32 q7, q9, d6[1]
	vmla.f32 q4, q8, d1[0]
	vmla.f32 q5, q8, d3[0]
	vld1.32 {d18-d19}, [r1]!
	vmla.f32 q6, q8, d5[0]
	vmla.f32 q7, q8, d7[0]
	cmp r1, r4
	vmla.f32 q4, q9, d1[1]
	vmla.f32 q5, q9, d3[1]
	vld1.32 {d16-d17}, [r0]
	vmla.f32 q6, q9, d5[1]
	vmla.f32 q7, q9, d7[1]
	vld1.32 {d18-d19}, [r5]
	vmul.f32 q12, q10, d8[0]
	vmul.f32 q13, q10, d10[0]
	vld1.32 {d0-d1}, [r2]!
	vmul.f32 q14, q10, d12[0]
	vmul.f32 q15, q10, d14[0]
	vmla.f32 q12, q11, d8[1]
	vmla.f32 q13, q11, d10[1]
	vld1.32 {d2-d3}, [r2]!
	vmla.f32 q14, q11, d12[1]
	vmla.f32 q15, q11, d14[1]
	vmla.f32 q12, q8, d9[0]
	vmla.f32 q13, q8, d11[0]
	vld1.32 {d4-d5}, [r2]!
	vmla.f32 q14, q8, d13[0]
	vmla.f32 q12, q9, d9[1]
	vmla.f32 q13, q9, d11[1]
	vmla.f32 q15, q8, d15[0]
	vld1.32 {d6-d7}, [r2]!
	vmla.f32 q14, q9, d13[1]
	vld1.32 {d16-d17}, [r1]!
	vmla.f32 q15, q9, d15[1]
	vst1.32 {d24-d27}, [r3]!
	vmul.f32 q4, q8, d0[0]
	vmul.f32 q5, q8, d2[0]
	vst1.32 {d28-d31}, [r3]!
+ bcc.w |_MultiplyMatrixArrayWithBase4x4_NEON_loop| + pop {r4, r5} + vpop {d8-d15} + bx lr + nop + ENDP + + + END diff --git a/Runtime/Math/Matrix4x4_NEON.s b/Runtime/Math/Matrix4x4_NEON.s new file mode 100644 index 0000000..12f2ffd --- /dev/null +++ b/Runtime/Math/Matrix4x4_NEON.s @@ -0,0 +1,375 @@ +#define UNITY_ASSEMBLER +#include "Configuration/PrefixConfigure.h" + +#if UNITY_SUPPORTS_NEON + +.set device,0 +.set device,__arm__ + +.if device + +//.code32 + +.globl _CopyMatrix_NEON +.globl _TransposeMatrix4x4_NEON +.globl _MultiplyMatrices4x4_NEON +.globl _MultiplyMatrixArray4x4_NEON +.globl _MultiplyMatrixArrayWithBase4x4_NEON + +#if UNITY_ANDROID +.hidden _CopyMatrix_NEON +.hidden _TransposeMatrix4x4_NEON +.hidden _MultiplyMatrices4x4_NEON +.hidden _MultiplyMatrixArray4x4_NEON +.hidden _MultiplyMatrixArrayWithBase4x4_NEON +#endif + + +//=========================================================================================================================================== + +// void CopyMatrix_NEON(const float* __restrict lhs, float* __restrict res) +_CopyMatrix_NEON: +// r0: src +// r1: dst + +vld1.32 {q0,q1}, [r0]! +vld1.32 {q2,q3}, [r0] +vst1.32 {q0,q1}, [r1]! +vst1.32 {q2,q3}, [r1] + +bx lr + + +//=========================================================================================================================================== + +// void TransposeMatrix4x4_NEON(const Matrix4x4f* __restrict lhs, Matrix4x4f* __restrict res) +_TransposeMatrix4x4_NEON: +// r0: src +// r1: dst + +vld4.32 {d0,d2,d4,d6}, [r0]! +vld4.32 {d1,d3,d5,d7}, [r0] +vst1.32 {d0,d1,d2,d3}, [r1]! 
+vst1.32 {d4,d5,d6,d7}, [r1] + +bx lr + + +//=========================================================================================================================================== + +// void MultiplyMatrices4x4_NEON(const Matrix4x4f* __restrict lhs, const Matrix4x4f* __restrict rhs, Matrix4x4f* __restrict res) +_MultiplyMatrices4x4_NEON: +// r0: A +// r1: B +// r2: dst + +vld1.32 {q0,q1}, [r1]! // load Brow1-2 +vld1.32 {q8}, [r0]! // load Arow1 + +// R = Arow1 * Bcol1 +vmul.f32 q12, q8, d0[0] +vld1.32 {q2}, [r1]! // load Brow3 + +vmul.f32 q13, q8, d2[0] +vld1.32 {q3}, [r1]! // load Brow4 + +vmul.f32 q14, q8, d4[0] +vld1.32 {q9}, [r0]! // load Arow2 + +vmul.f32 q15, q8, d6[0] +vld1.32 {q10}, [r0]! // load Arow3 + +// R += Arow2 * Bcolumn2 +vmla.f32 q12, q9, d0[1] +vld1.32 {q11}, [r0]! // load Arow4 + +vmla.f32 q13, q9, d2[1] +vmla.f32 q14, q9, d4[1] +vmla.f32 q15, q9, d6[1] + +// R += Arow3 * Bcolumn3 +vmla.f32 q12, q10, d1[0] +vmla.f32 q13, q10, d3[0] +vmla.f32 q14, q10, d5[0] +vmla.f32 q15, q10, d7[0] + +// R += Arow4 * Bcolumn4 +vmla.f32 q12, q11, d1[1] +vmla.f32 q13, q11, d3[1] +vmla.f32 q14, q11, d5[1] +vmla.f32 q15, q11, d7[1] + +vst1.32 {q12,q13}, [r2]! +vst1.32 {q14,q15}, [r2]! + +bx lr + + +//=========================================================================================================================================== + +// void MultiplyMatrixArray4x4_NEON(const Matrix4x4f* arrayA, const Matrix4x4f* arrayB, Matrix4x4f* arrayRes, size_t count) +_MultiplyMatrixArray4x4_NEON: +// r0: A +// r1: B +// r2: dst +// r3: A end + +vpush {d8-d15} +add r3, r0, r3, lsl #6 + +vld1.32 {q0,q1}, [r1]! +vld1.32 {q2,q3}, [r1]! +vld1.32 {q8}, [r0]! + + +.align 4 +_MultiplyMatrixArray4x4_NEON_loop: + +vmul.f32 q12, q8, d0[0] +vld1.32 {q9}, [r0]! // load Arow2 + +vmul.f32 q13, q8, d2[0] +vmul.f32 q14, q8, d4[0] +vmul.f32 q15, q8, d6[0] + + +vmla.f32 q12, q9, d0[1] + +vld1.32 {q10}, [r0]! // load Arow3 +vmla.f32 q13, q9, d2[1] + +vld1.32 {q4,q5}, [r1]! 
// load B[i+1]
+vmla.f32 q14, q9, d4[1]
+
+vld1.32 {q6,q7}, [r1]! // load B[i+1]
+vmla.f32 q15, q9, d6[1]
+
+vmla.f32 q12, q10, d1[0]
+vld1.32 {q11}, [r0]! // load Arow4 (q9=row2, q10=row3, q11=row4)
+
+vmla.f32 q13, q10, d3[0]
+vmla.f32 q14, q10, d5[0]
+vmla.f32 q15, q10, d7[0]
+
+vmla.f32 q12, q11, d1[1]
+vld1.32 {q8}, [r0]! // load A[i+1]row1
+
+vmla.f32 q13, q11, d3[1]
+vmla.f32 q14, q11, d5[1]
+vmla.f32 q15, q11, d7[1]
+
+vst1.32 {q12,q13}, [r2]!
+vst1.32 {q14,q15}, [r2]!
+
+cmp r0, r3
+bcs _MultiplyMatrixArray4x4_out
+
+
+vmul.f32 q12, q8, d8[0]
+vld1.32 {q9}, [r0]! // load A[i+1]row2
+
+vmul.f32 q13, q8, d10[0]
+vmul.f32 q14, q8, d12[0]
+vmul.f32 q15, q8, d14[0]
+
+vmla.f32 q12, q9, d8[1]
+
+vld1.32 {q10}, [r0]! // load A[i+1]row3
+vmla.f32 q13, q9, d10[1]
+
+vld1.32 {q0,q1}, [r1]! // load B[i+2]
+vmla.f32 q14, q9, d12[1]
+
+vld1.32 {q2,q3}, [r1]! // load B[i+2]
+vmla.f32 q15, q9, d14[1]
+
+vmla.f32 q12, q10, d9[0]
+vld1.32 {q11}, [r0]! // load A[i+1]row4
+
+vmla.f32 q13, q10, d11[0]
+vmla.f32 q14, q10, d13[0]
+vmla.f32 q15, q10, d15[0]
+
+vmla.f32 q12, q11, d9[1]
+vld1.32 {q8}, [r0]! // load A[i+2]row1
+
+vmla.f32 q13, q11, d11[1]
+vmla.f32 q14, q11, d13[1]
+vmla.f32 q15, q11, d15[1]
+
+vst1.32 {q12,q13}, [r2]!
+vst1.32 {q14,q15}, [r2]!
+ +cmp r0, r3 +bcc _MultiplyMatrixArray4x4_NEON_loop + + +.align 4 +_MultiplyMatrixArray4x4_out: + +vpop {d8-d15} +bx lr + + + +//=========================================================================================================================================== + +#define MT_11_1 \ + vmul.f32 q12, q10, d8[0] ; \ + vmul.f32 q13, q10, d10[0] ; + +#define MT_11_2 \ + vmul.f32 q14, q10, d12[0] ; \ + vmul.f32 q15, q10, d14[0] ; + +#define MT_22_1 \ + vmla.f32 q12, q11, d8[1] ; \ + vmla.f32 q13, q11, d10[1] ; + +#define MT_22_2 \ + vmla.f32 q14, q11, d12[1] ; \ + vmla.f32 q15, q11, d14[1] ; + +#define MT_33_1 \ + vmla.f32 q12, q8, d9[0] ; \ + vmla.f32 q13, q8, d11[0] ; + +#define MT_33_2_44_1 \ + vmla.f32 q14, q8, d13[0] ; \ + vmla.f32 q12, q9, d9[1] ; \ + vmla.f32 q13, q9, d11[1] ; \ + vmla.f32 q15, q8, d15[0] ; + + + +// void MultiplyMatrixArrayWithBase4x4_NEON( const Matrix4x4f* base, const Matrix4x4f* arrayA, const Matrix4x4f* arrayB, Matrix4x4f* arrayRes, size_t count ) +_MultiplyMatrixArrayWithBase4x4_NEON: +// r0: base +// r1: A +// r2: B +// r3: dst +// r4: A end + +mov ip, sp + +vpush {d8-d15} +stmfd sp!, {r4-r5} + + +ldr r4, [ip, #0] +add r4, r1, r4, lsl #6 + + +vld1.32 {q8}, [r1]! // load Arow1 +vld1.32 {q0,q1}, [r2]! // load Brow1-2 +vld1.32 {q2,q3}, [r2]! // load Brow3-4 +vld1.32 {q10,q11}, [r0]! // load Mrow1-2 + +add r5, r0, #16 + +// T = Arow1 * Bcol1 + +vmul.f32 q4, q8, d0[0] +vmul.f32 q5, q8, d2[0] + + +.align 4 +_MultiplyMatrixArrayWithBase4x4_NEON_loop: + +// T = Arow1 * Bcol1 + +vld1.32 {q9}, [r1]! // load Arow2 +vmul.f32 q6, q8, d4[0] +vmul.f32 q7, q8, d6[0] + +// T += Arow2 * Bcol2 + +vmla.f32 q4, q9, d0[1] +vmla.f32 q5, q9, d2[1] + + vld1.32 {q8}, [r1]! // load Arow3 +vmla.f32 q6, q9, d4[1] +vmla.f32 q7, q9, d6[1] + +// T += Arow3 * Bcol3 + +vmla.f32 q4, q8, d1[0] +vmla.f32 q5, q8, d3[0] + +vld1.32 {q9}, [r1]! 
// load Arow4
+
+vmla.f32 q6, q8, d5[0]
+vmla.f32 q7, q8, d7[0]
+
+cmp r1, r4
+
+// T += Arow4 * Bcol4
+
+vmla.f32 q4, q9, d1[1]
+vmla.f32 q5, q9, d3[1]
+
+vld1.32 {q8}, [r0] // load Mrow3
+vmla.f32 q6, q9, d5[1]
+vmla.f32 q7, q9, d7[1]
+
+// R = M * T
+
+vld1.32 {q9}, [r5] // load Mrow4
+MT_11_1
+
+// NOTE(review): exit uses signed 'bge' on the pointer compare (cmp r1, r4 above),
+// while the loop-back below uses unsigned 'bcc'; the sibling routine uses unsigned
+// 'bcs'. 'bhs' would be the unsigned-consistent exit -- confirm addresses can
+// never land in the signed-negative half of the address space.
+bge _MultiplyMatrixArrayWithBase4x4_NEON_epilogue
+
+vld1.32 {q0}, [r2]! // load B[i+1]row1
+MT_11_2
+MT_22_1
+
+vld1.32 {q1}, [r2]! // load B[i+1]row2
+MT_22_2
+
+MT_33_1
+vld1.32 {q2}, [r2]! // load B[i+1]row3
+
+MT_33_2_44_1
+vld1.32 {q3}, [r2]! // load B[i+1]row4
+
+vmla.f32 q14, q9, d13[1]
+vld1.32 {q8}, [r1]! // load A[i+1]row1
+
+vmla.f32 q15, q9, d15[1]
+vst1.32 {q12,q13}, [r3]!
+
+// interleave T = Arow1 * Bcol1
+vmul.f32 q4, q8, d0[0]
+vmul.f32 q5, q8, d2[0]
+vst1.32 {q14,q15}, [r3]!
+
+bcc _MultiplyMatrixArrayWithBase4x4_NEON_loop
+
+.align 4
+_MultiplyMatrixArrayWithBase4x4_NEON_epilogue:
+
+MT_11_2
+MT_22_1
+MT_22_2
+MT_33_1
+MT_33_2_44_1
+vmla.f32 q14, q9, d13[1]
+vmla.f32 q15, q9, d15[1]
+vst1.32 {q12,q13}, [r3]!
+vst1.32 {q14,q15}, [r3]!
+ + +ldmfd sp!, {r4-r5} +vpop {d8-d15} +bx lr + +.endif + +#undef MT_11_1 +#undef MT_11_2 +#undef MT_22_1 +#undef MT_22_2 +#undef MT_33_1 +#undef MT_33_2_44_1 + +#endif diff --git a/Runtime/Math/Matrix4x4_REF.cpp b/Runtime/Math/Matrix4x4_REF.cpp new file mode 100644 index 0000000..290e0d8 --- /dev/null +++ b/Runtime/Math/Matrix4x4_REF.cpp @@ -0,0 +1,60 @@ +#include "UnityPrefix.h" +#include "Matrix4x4.h" +#include <algorithm> + +void CopyMatrixREF ( const float* __restrict lhs, float* __restrict res) +{ + ::memcpy( res, lhs, sizeof(Matrix4x4f) ); +} + +void TransposeMatrix4x4REF (const Matrix4x4f* __restrict lhs, Matrix4x4f* __restrict res) +{ + CopyMatrix(lhs->m_Data, res->m_Data); + std::swap( res->Get(0,1), res->Get(1,0) ); + std::swap( res->Get(0,2), res->Get(2,0) ); + std::swap( res->Get(0,3), res->Get(3,0) ); + std::swap( res->Get(1,2), res->Get(2,1) ); + std::swap( res->Get(1,3), res->Get(3,1) ); + std::swap( res->Get(2,3), res->Get(3,2) ); +} + +void MultiplyMatrices4x4REF (const Matrix4x4f* __restrict lhs, const Matrix4x4f* __restrict rhs, Matrix4x4f* __restrict res) +{ + Assert (lhs != rhs && lhs != res && rhs != res); + for (int i=0;i<4;i++) + { + res->m_Data[i] = lhs->m_Data[i] * rhs->m_Data[0] + lhs->m_Data[i+4] * rhs->m_Data[1] + lhs->m_Data[i+8] * rhs->m_Data[2] + lhs->m_Data[i+12] * rhs->m_Data[3]; + res->m_Data[i+4] = lhs->m_Data[i] * rhs->m_Data[4] + lhs->m_Data[i+4] * rhs->m_Data[5] + lhs->m_Data[i+8] * rhs->m_Data[6] + lhs->m_Data[i+12] * rhs->m_Data[7]; + res->m_Data[i+8] = lhs->m_Data[i] * rhs->m_Data[8] + lhs->m_Data[i+4] * rhs->m_Data[9] + lhs->m_Data[i+8] * rhs->m_Data[10] + lhs->m_Data[i+12] * rhs->m_Data[11]; + res->m_Data[i+12] = lhs->m_Data[i] * rhs->m_Data[12] + lhs->m_Data[i+4] * rhs->m_Data[13] + lhs->m_Data[i+8] * rhs->m_Data[14] + lhs->m_Data[i+12] * rhs->m_Data[15]; + } +} + +void MultiplyMatrixArray4x4REF (const Matrix4x4f* __restrict a, const Matrix4x4f* __restrict b, Matrix4x4f* __restrict res, size_t count) +{ + Assert(a); + 
Assert(b); + Assert(res); + + for (size_t i = 0; i < count; ++i) + { + MultiplyMatrices4x4(a+i, b+i, res+i); + } +} + +void MultiplyMatrixArrayWithBase4x4REF (const Matrix4x4f* __restrict base, + const Matrix4x4f* __restrict a, const Matrix4x4f* __restrict b, Matrix4x4f* __restrict res, size_t count) +{ + Assert(base); + Assert(a); + Assert(b); + Assert(res); + + Matrix4x4f tmp; + for (size_t i = 0; i < count; ++i) + { + MultiplyMatrices4x4(base, a+i, &tmp); + MultiplyMatrices4x4(&tmp, b+i, res+i); + } +} + diff --git a/Runtime/Math/Matrix4x4_VFP.s b/Runtime/Math/Matrix4x4_VFP.s new file mode 100644 index 0000000..1745cc3 --- /dev/null +++ b/Runtime/Math/Matrix4x4_VFP.s @@ -0,0 +1,149 @@ +#define UNITY_ASSEMBLER +#include "Configuration/PrefixConfigure.h" +#include "Runtime/Utilities/VFPUtility.h" + +#if UNITY_SUPPORTS_VFP + +.syntax unified + +.set device,0 +.set device,__arm__ + +.if device + +//.code32 + +.globl _MultiplyMatrices4x4_VFP +.globl _MultiplyMatrixArray4x4_VFP + +#if UNITY_ANDROID + +.hidden _MultiplyMatrices4x4_VFP +.hidden _MultiplyMatrixArray4x4_VFP + +#endif + + +//=========================================================================================================================================== + + +// void MultiplyMatrices4x4_VFP(const Matrix4x4f* __restrict lhs, const Matrix4x4f* __restrict rhs, Matrix4x4f* __restrict res) +_MultiplyMatrices4x4_VFP: +// r0: A +// r1: B +// r2: dst + +vpush {d8-d15} + +mov ip, r0 + +// VFP_VECTOR_LENGTH(3) + +mov r0, ip + +vldmia.32 r0, {s8-s23} +vldmia.32 r1!, {s0-s7} + +FMULS4 (24,25,26,27, 8,9,10,11, 0,0,0,0) +FMULS4 (28,29,30,31, 8,9,10,11, 4,4,4,4) + +FMACS4 (24,25,26,27, 12,13,14,15, 1,1,1,1) +FMACS4 (28,29,30,31, 12,13,14,15, 5,5,5,5) + +FMACS4 (24,25,26,27, 16,17,18,19, 2,2,2,2) +FMACS4 (28,29,30,31, 16,17,18,19, 6,6,6,6) + +FMACS4 (24,25,26,27, 20,21,22,23, 3,3,3,3) +FMACS4 (28,29,30,31, 20,21,22,23, 7,7,7,7) + + +vstmia.32 r2!, {s24-s31} +vldmia.32 r1, {s0-s7} + +FMULS4 (24,25,26,27, 8,9,10,11, 
0,0,0,0) +FMULS4 (28,29,30,31, 8,9,10,11, 4,4,4,4) + +FMACS4 (24,25,26,27, 12,13,14,15, 1,1,1,1) +FMACS4 (28,29,30,31, 12,13,14,15, 5,5,5,5) + +FMACS4 (24,25,26,27, 16,17,18,19, 2,2,2,2) +FMACS4 (28,29,30,31, 16,17,18,19, 6,6,6,6) + +FMACS4 (24,25,26,27, 20,21,22,23, 3,3,3,3) +FMACS4 (28,29,30,31, 20,21,22,23, 7,7,7,7) + +vstmia.32 r2, {s24-s31} + +// VFP_VECTOR_LENGTH_ZERO + +vpop {d8-d15} +bx lr + + +//=========================================================================================================================================== + +// void MultiplyMatrixArray4x4_VFP(const Matrix4x4f* arrayA, const Matrix4x4f* arrayB, Matrix4x4f* arrayRes, size_t count) +_MultiplyMatrixArray4x4_VFP: +// r0: A +// r1: B +// r2: dst +// r3: A end + +vpush {d8-d15} + +mov ip, r0 + +// VFP_VECTOR_LENGTH(3) + +mov r0, ip +add r3, r0, r3, lsl #6 + + +.align 4 +_MultiplyMatrixArray4x4_VFP_loop: + +vldmia.32 r0!, {s16-s31} +vldmia.32 r1!, {s0-s7} + +FMULS4 (8,9,10,11, 16,17,18,19, 0,0,0,0) +FMULS4 (12,13,14,15, 16,17,18,19, 4,4,4,4) + +FMACS4 (8,9,10,11, 20,21,22,23, 1,1,1,1) +FMACS4 (12,13,14,15, 20,21,22,23, 5,5,5,5) + +FMACS4 (8,9,10,11, 24,25,26,27, 2,2,2,2) +FMACS4 (12,13,14,15, 24,25,26,27, 6,6,6,6) + +FMACS4 (8,9,10,11, 28,29,30,31, 3,3,3,3) +FMACS4 (12,13,14,15, 28,29,30,31, 7,7,7,7) + + +vldmia.32 r1!, {s0-s7} +vstmia.32 r2!, {s8-s15} + +FMULS4 (8,9,10,11, 16,17,18,19, 0,0,0,0) +FMULS4 (12,13,14,15, 16,17,18,19, 4,4,4,4) + +FMACS4 (8,9,10,11, 20,21,22,23, 1,1,1,1) +FMACS4 (12,13,14,15, 20,21,22,23, 5,5,5,5) + +FMACS4 (8,9,10,11, 24,25,26,27, 2,2,2,2) +FMACS4 (12,13,14,15, 24,25,26,27, 6,6,6,6) + +FMACS4 (8,9,10,11, 28,29,30,31, 3,3,3,3) +FMACS4 (12,13,14,15, 28,29,30,31, 7,7,7,7) + +vstmia.32 r2!, {s8-s15} + +cmp r0, r3 +bcc _MultiplyMatrixArray4x4_VFP_loop + +// VFP_VECTOR_LENGTH_ZERO + +vpop {d8-d15} +bx lr + + +.endif + +#endif
\ No newline at end of file diff --git a/Runtime/Math/PodMathTypes.h b/Runtime/Math/PodMathTypes.h new file mode 100644 index 0000000..6ac55a5 --- /dev/null +++ b/Runtime/Math/PodMathTypes.h @@ -0,0 +1,24 @@ +#ifndef POD_MATH_TYPES_H_ +#define POD_MATH_TYPES_H_ + +namespace pod +{ + +struct v2f +{ + float x, y; +}; + +struct v3f +{ + float x, y, z; +}; + +struct m44f +{ + float m[16]; +}; + +} + +#endif diff --git a/Runtime/Math/Polynomials.h b/Runtime/Math/Polynomials.h new file mode 100644 index 0000000..b32b172 --- /dev/null +++ b/Runtime/Math/Polynomials.h @@ -0,0 +1,93 @@ +#ifndef POLYNOMIALS_H +#define POLYNOMIALS_H + + +// Returns the highest root for the cubic x^3 + px^2 + qx + r +inline double CubicPolynomialRoot(const double p, const double q, const double r) +{ + double rcp3 = 1.0/3.0; + double half = 0.5; + double po3 = p*rcp3; + double po3_2 = po3*po3; + double po3_3 = po3_2*po3; + double b = po3_3 - po3*q*half + r*half; + double a = -po3_2 + q*rcp3; + double a3 = a*a*a; + double det = a3 + b*b; + + if (det >= 0) + { + double r0 = sqrt(det) - b; + r0 = r0 > 0 ? pow(r0, rcp3) : -pow(-r0, rcp3); + + return - po3 - a/r0 + r0; + } + + double abs = sqrt(-a3); + double arg = acos(-b/abs); + abs = pow(abs, rcp3); + abs = abs - a/abs; + arg = -po3 + abs*cos(arg*rcp3); + return arg; +} + +// Calculates all real roots of polynomial ax^2 + bx + c (and returns how many) +inline int QuadraticPolynomialRootsGeneric(const float a, const float b, const float c, float& r0, float& r1) +{ + const float eps = 0.00001f; + if (Abs(a) < eps) + { + if (Abs(b) > eps) + { + r0 = -c/b; + return 1; + } + else + return 0; + } + + float disc = b*b - 4*a*c; + if (disc < 0.0f) + return 0; + + const float halfRcpA = 0.5f/a; + const float sqrtDisc = sqrt(disc); + r0 = (sqrtDisc-b)*halfRcpA; + r1 = (-sqrtDisc-b)*halfRcpA; + return 2; +} + +// Calculates all the roots for the cubic ax^3 + bx^2 + cx + d. Max num roots is 3. 
+inline int CubicPolynomialRootsGeneric(float* roots, const double a, const double b, const double c, const double d) +{ + int numRoots = 0; + if(Abs(a) >= 0.0001f) + { + const double p = b / a; + const double q = c / a; + const double r = d / a; + roots[0] = CubicPolynomialRoot(p, q, r); + numRoots++; + + double la = a; + double lb = b + a * roots[0]; + double lc = c + b*roots[0] + a*roots[0]*roots[0]; + numRoots += QuadraticPolynomialRootsGeneric(la, lb, lc, roots[1], roots[2]); + } + else + { + numRoots += QuadraticPolynomialRootsGeneric(b, c, d, roots[0], roots[1]); + } + + return numRoots; +} + +// Specialized version of QuadraticPolynomialRootsGeneric that returns the largest root +inline float QuadraticPolynomialRoot(const float a, const float b, const float c) +{ + float r0, r1; + QuadraticPolynomialRootsGeneric(a, b, c, r0, r1); + return r0; +} + +#endif diff --git a/Runtime/Math/Quaternion.cpp b/Runtime/Math/Quaternion.cpp new file mode 100644 index 0000000..605cea2 --- /dev/null +++ b/Runtime/Math/Quaternion.cpp @@ -0,0 +1,449 @@ +#include "UnityPrefix.h" +#include "Quaternion.h" +#include <limits> +/* +Quaternionf Slerp(const Quaternionf& a, const Quaternionf& b, float time) +{ + #if DEBUGMODE + float debugLengthA = Magnitude (a); + float debugLengthB = Magnitude (b); + #endif + // ==================================================== + // AART - Advanced Animation and Rendering Techniques + // ==================================================== + + float cosom = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; + + if ( (1 + cosom) > std::numeric_limits<float>::epsilon() ) + { + float sp; + float sq; + + if ( (1 - cosom) > std::numeric_limits<float>::epsilon() ) + { + double omega = acos(cosom); + double sinom = 1.0 / sin(omega); + + sp = (sin((1 - time) * omega) * sinom); + sq = (sin(time * omega) * sinom); + } + else + { + sp = 1 - time; + sq = time; + } + + Quaternionf res = Quaternionf ( + a.x*sp + b.x*sq, + a.y*sp + b.y*sq, + a.z*sp + b.z*sq, + a.w*sp + 
b.w*sq); + AssertIf (!CompareApproximately (SqrMagnitude (res), 1.0F) && + CompareApproximately (SqrMagnitude (b), 1.0) && + CompareApproximately (SqrMagnitude (a), 1.0)); + return res; + } + else + { + float halfpi = pi / 2; + float sp = sin((1 - time) * halfpi); + float sq = sin(time * halfpi); + + Quaternionf res = Quaternionf ( + a.x*sp - a.y*sq, + a.y*sp + a.x*sq, + a.z*sp - a.w*sq, + a.z); + + AssertIf (!CompareApproximately (SqrMagnitude (res), 1.0F) && + CompareApproximately (SqrMagnitude (b), 1.0) && + CompareApproximately (SqrMagnitude (a), 1.0)); + return res; + } +} +*/ + +Quaternionf Slerp( const Quaternionf& q1, const Quaternionf& q2, float t ) +{ + // Quaternionf q3 = new Quaternionf(); + float dot = Dot( q1, q2 ); + + // dot = cos(theta) + // if (dot < 0), q1 and q2 are more than 90 degrees apart, + // so we can invert one to reduce spinning + Quaternionf tmpQuat; + if (dot < 0.0f ) + { + dot = -dot; + tmpQuat.Set( -q2.x, + -q2.y, + -q2.z, + -q2.w ); + } + else + tmpQuat = q2; + + + if (dot < 0.95f ) + { + float angle = acos(dot); + float sinadiv, sinat, sinaomt; + sinadiv = 1.0f/sin(angle); + sinat = sin(angle*t); + sinaomt = sin(angle*(1.0f-t)); + tmpQuat.Set( (q1.x*sinaomt+tmpQuat.x*sinat)*sinadiv, + (q1.y*sinaomt+tmpQuat.y*sinat)*sinadiv, + (q1.z*sinaomt+tmpQuat.z*sinat)*sinadiv, + (q1.w*sinaomt+tmpQuat.w*sinat)*sinadiv ); +// AssertIf (!CompareApproximately (SqrMagnitude (tmpQuat), 1.0F)); + return tmpQuat; + + } + + // if the angle is small, use linear interpolation + + else + { + return Lerp(q1,tmpQuat,t); + } + +} + +float AngularDistance (const Quaternionf& lhs, const Quaternionf& rhs) +{ + float dot = Dot (lhs, rhs); + if (dot < 0.0f ) + dot = -dot; + return acos (std::min (1.0F, dot)) * 2.0F; +} +/* +Quaternionf EulerXYZToQuaternion (const Vector3f& someEulerAngles) +{ + float cX (cos (someEulerAngles.x / 2.0f)); + float sX (sin (someEulerAngles.x / 2.0f)); + + float cY (cos (someEulerAngles.y / 2.0f)); + float sY (sin (someEulerAngles.y 
/ 2.0f)); + + float cZ (cos (someEulerAngles.z / 2.0f)); + float sZ (sin (someEulerAngles.z / 2.0f)); + + Quaternionf qX (sX, 0.0F, 0.0F, cX); + Quaternionf qY (0.0F, sY, 0.0F, cY); + Quaternionf qZ (0.0F, 0.0F, sZ, cZ); + + Quaternionf q = (qZ * qY) * qX; + AssertIf (!CompareApproximately (SqrMagnitude (q), 1.0F)); + return q; +} +*/ + +Quaternionf EulerToQuaternion (const Vector3f& someEulerAngles) +{ + float cX (cos (someEulerAngles.x / 2.0f)); + float sX (sin (someEulerAngles.x / 2.0f)); + + float cY (cos (someEulerAngles.y / 2.0f)); + float sY (sin (someEulerAngles.y / 2.0f)); + + float cZ (cos (someEulerAngles.z / 2.0f)); + float sZ (sin (someEulerAngles.z / 2.0f)); + + Quaternionf qX (sX, 0.0F, 0.0F, cX); + Quaternionf qY (0.0F, sY, 0.0F, cY); + Quaternionf qZ (0.0F, 0.0F, sZ, cZ); + + Quaternionf q = (qY * qX) * qZ; + AssertIf (!CompareApproximately (SqrMagnitude (q), 1.0F)); + return q; +} + +#if 1 + +Vector3f QuaternionToEuler (const Quaternionf& quat) +{ + Matrix3x3f m; + Vector3f rot; + QuaternionToMatrix (quat, m); + MatrixToEuler (m, rot); + return rot; +} + +#else + +// Version of QuaternionToEuler that prevents "snapping" on X when getting +// close to gimbal lock. Noticeably changes behavior compared to version +// above, so deactivated for now. 
+ +Vector3f QuaternionToEuler(const Quaternionf& q) +{ + const float sqw = q.w * q.w; + const float sqx = q.x * q.x; + const float sqy = q.y * q.y; + const float sqz = q.z * q.z; + + const float unit = sqx + sqy + sqz + sqw; + const float test = q.x * q.y + q.z * q.w; + + float yaw = 0.0f; + float pitch = 0.0f; + float roll = 0.0f; + + // North pole singularity + if (test > 0.499f * unit) + { + yaw = 2.0f * atan2 (q.x, q.w); + pitch = kPI * 0.5f; + roll = 0.0f; + } + + // South pole singularity + else if (test < -0.499f * unit) + { + yaw = -2.0f * atan2 (q.x, q.w); + pitch = -kPI * 0.5f; + roll = 0.0f; + } + + else + { + yaw = atan2 (2.0f * q.y * q.w - 2.0f * q.x * q.z , sqx - sqy - sqz + sqw); + pitch = asin (2.0f * test/unit); + roll = atan2 (2.0f * q.x * q.w - 2.0f * q.y * q.z , -sqx + sqy - sqz + sqw); + } + + // Keep angles [0..360]. + if (Sign (yaw) < 0.f) + yaw = Deg2Rad (360.f) + yaw; + if (Sign (pitch) < 0.f) + pitch = Deg2Rad (360.f) + pitch; + if (Sign (roll) < 0.f) + roll = Deg2Rad (360.f) + roll; + + return Vector3f(roll, yaw, pitch); +} + +#endif + +std::vector<Vector3f> GetEquivalentEulerAngles (const Quaternionf& quat) +{ + Matrix3x3f m; + Vector3f rot; + + std::vector<Vector3f> euler_triples; + + QuaternionToMatrix (quat, m); + MatrixToEuler (m, rot); + + euler_triples.push_back(rot); + + euler_triples.push_back(Vector3f(rot.x + 180.0f, -rot.y, rot.z + 180.0f)); + euler_triples.push_back(Vector3f(rot.x - 180.0f, -rot.y, rot.z - 180.0f)); + euler_triples.push_back(Vector3f(-rot.x, rot.y + 180.0f, -rot.z)); + euler_triples.push_back(Vector3f(-rot.x, rot.y - 180.0f, -rot.z)); + + return euler_triples; +} + +void QuaternionToMatrix (const Quaternionf& q, Matrix3x3f& m) +{ + // If q is guaranteed to be a unit quaternion, s will always + // be 1. In that case, this calculation can be optimized out. 
+ #if DEBUGMODE + if (!CompareApproximately (SqrMagnitude (q), 1.0F, Vector3f::epsilon)) + { + AssertString(Format("Quaternion To Matrix conversion failed because input Quaternion is invalid {%f, %f, %f, %f} l=%f", q.x, q.y, q.z, q.w, SqrMagnitude(q))); + } + #endif + //float norm = GetNorm (q); + //float s = (norm > 0.0) ? 2.0/norm : 0; + + // Precalculate coordinate products + float x = q.x * 2.0F; + float y = q.y * 2.0F; + float z = q.z * 2.0F; + float xx = q.x * x; + float yy = q.y * y; + float zz = q.z * z; + float xy = q.x * y; + float xz = q.x * z; + float yz = q.y * z; + float wx = q.w * x; + float wy = q.w * y; + float wz = q.w * z; + + // Calculate 3x3 matrix from orthonormal basis + m.m_Data[0] = 1.0f - (yy + zz); + m.m_Data[1] = xy + wz; + m.m_Data[2] = xz - wy; + + m.m_Data[3] = xy - wz; + m.m_Data[4] = 1.0f - (xx + zz); + m.m_Data[5] = yz + wx; + + m.m_Data[6] = xz + wy; + m.m_Data[7] = yz - wx; + m.m_Data[8] = 1.0f - (xx + yy); +} + + +void QuaternionToMatrix (const Quaternionf& q, Matrix4x4f& m) +{ + // If q is guaranteed to be a unit quaternion, s will always + // be 1. In that case, this calculation can be optimized out. + #if DEBUGMODE + if (!CompareApproximately (SqrMagnitude (q), 1.0F, Vector3f::epsilon)) + { + AssertString(Format("Quaternion To Matrix conversion failed because input Quaternion is invalid {%f, %f, %f, %f} l=%f", q.x, q.y, q.z, q.w, SqrMagnitude(q))); + } + #endif + + //float norm = GetNorm (q); + //float s = (norm > 0.0) ? 
2.0/norm : 0; + + // Precalculate coordinate products + float x = q.x * 2.0F; + float y = q.y * 2.0F; + float z = q.z * 2.0F; + float xx = q.x * x; + float yy = q.y * y; + float zz = q.z * z; + float xy = q.x * y; + float xz = q.x * z; + float yz = q.y * z; + float wx = q.w * x; + float wy = q.w * y; + float wz = q.w * z; + + // Calculate 3x3 matrix from orthonormal basis + m.m_Data[0] = 1.0f - (yy + zz); + m.m_Data[1] = xy + wz; + m.m_Data[2] = xz - wy; + m.m_Data[3] = 0.0F; + + m.m_Data[4] = xy - wz; + m.m_Data[5] = 1.0f - (xx + zz); + m.m_Data[6] = yz + wx; + m.m_Data[7] = 0.0F; + + m.m_Data[8] = xz + wy; + m.m_Data[9] = yz - wx; + m.m_Data[10] = 1.0f - (xx + yy); + m.m_Data[11] = 0.0F; + + m.m_Data[12] = 0.0F; + m.m_Data[13] = 0.0F; + m.m_Data[14] = 0.0F; + m.m_Data[15] = 1.0F; +} + +void MatrixToQuaternion (const Matrix4x4f& m, Quaternionf& q) { + Matrix3x3f mat ( + m.Get(0,0), m.Get(0,1), m.Get(0,2), + m.Get(1,0), m.Get(1,1), m.Get(1,2), + m.Get(2,0), m.Get(2,1), m.Get(2,2)); + + MatrixToQuaternion (mat, q); +// mat.Get(0,0) = m.Get(0,0); mat.Get(0,1) = m.Get(0,1); mat.Get(0,2) = m.Get(0,2); +// mat.Get(1,0) = m.Get(1,0); mat.Get(1,1) = m.Get(1,1); mat.Get(1,2) = m.Get(1,2); +// mat.Get(2,0) = m.Get(2,0); mat.Get(2,1) = m.Get(2,1); mat.Get(2,2) = m.Get(2,2); +} + +void MatrixToQuaternion (const Matrix3x3f& kRot, Quaternionf& q) +{ + // Algorithm in Ken Shoemake's article in 1987 SIGGRAPH course notes + // article "Quaternionf Calculus and Fast Animation". 
+ #if DEBUGMODE + float det = kRot.GetDeterminant (); + AssertIf (!CompareApproximately (det, 1.0F, .005f)); + #endif + float fTrace = kRot.Get (0, 0) + kRot.Get (1, 1) + kRot.Get (2, 2); + float fRoot; + + if ( fTrace > 0.0f ) + { + // |w| > 1/2, may as well choose w > 1/2 + fRoot = sqrt (fTrace + 1.0f); // 2w + q.w = 0.5f*fRoot; + fRoot = 0.5f/fRoot; // 1/(4w) + q.x = (kRot.Get (2, 1) - kRot.Get (1, 2))*fRoot; + q.y = (kRot.Get (0, 2) - kRot.Get (2, 0))*fRoot; + q.z = (kRot.Get (1, 0) - kRot.Get (0, 1))*fRoot; + } + else + { + // |w| <= 1/2 + int s_iNext[3] = { 1, 2, 0 }; + int i = 0; + if ( kRot.Get (1, 1) > kRot.Get (0, 0) ) + i = 1; + if ( kRot.Get (2, 2) > kRot.Get (i, i) ) + i = 2; + int j = s_iNext[i]; + int k = s_iNext[j]; + + fRoot = sqrt (kRot.Get (i, i) - kRot.Get (j, j) - kRot.Get (k, k) + 1.0f); + float* apkQuat[3] = { &q.x, &q.y, &q.z }; + AssertIf (fRoot < Vector3f::epsilon); + *apkQuat[i] = 0.5f*fRoot; + fRoot = 0.5f / fRoot; + q.w = (kRot.Get (k, j) - kRot.Get (j, k)) * fRoot; + *apkQuat[j] = (kRot.Get (j, i) + kRot.Get (i, j))*fRoot; + *apkQuat[k] = (kRot.Get (k, i) + kRot.Get (i, k))*fRoot; + } + q = Normalize (q); +} + +bool LookRotationToQuaternion (const Vector3f& viewVec, const Vector3f& upVec, Quaternionf* res) +{ + Matrix3x3f m; + if (!LookRotationToMatrix (viewVec, upVec, &m)) + return false; + MatrixToQuaternion (m, *res); + return true; +} + +Quaternionf FromToQuaternionSafe (const Vector3f& lhs, const Vector3f& rhs) +{ + float lhsMag = Magnitude (lhs); + float rhsMag = Magnitude (rhs); + if (lhsMag < Vector3f::epsilon || rhsMag < Vector3f::epsilon) + return Quaternionf::identity (); + else + return FromToQuaternion (lhs / lhsMag, rhs / rhsMag); +} + +Quaternionf FromToQuaternion (const Vector3f& from, const Vector3f& to) +{ + Matrix3x3f m; + m.SetFromToRotation (from, to); + Quaternionf q; + MatrixToQuaternion (m, q); + return q; +/* + AssertIf (!CompareApproximately (SqrMagnitude (from), 1.0F)); + AssertIf (!CompareApproximately 
(SqrMagnitude (to), 1.0F)); + float dot = Dot (from, to); + // almost the same + if (dot > 1.0F - Vector3f::epsilon) + { + return Quaternionf::identity (); + } + else if (dot < -1.0F + Vector3f::epsilon) + { + Vector3f axis = OrthoNormalVector (from); + Quaternionf q; + AxisAngleToQuaternion (axis, pi, &q); + return q; + } + // normal case + else + { + Vector3f axis = Normalize (Cross (from, to)); + Quaternionf q; + float angle = acos (dot); + AxisAngleToQuaternion (axis, angle, &q); + return q; + } +*/ +} diff --git a/Runtime/Math/Quaternion.h b/Runtime/Math/Quaternion.h new file mode 100644 index 0000000..2cabe9c --- /dev/null +++ b/Runtime/Math/Quaternion.h @@ -0,0 +1,405 @@ +#ifndef QUATERNION_H +#define QUATERNION_H + +#include "Matrix3x3.h" +#include "Matrix4x4.h" +#include "Vector3.h" +#include "FloatConversion.h" +#include <algorithm> +#include <vector> +#include "Runtime/Modules/ExportModules.h" + +class Quaternionf +{ + public: + + float x, y, z, w; + + DEFINE_GET_TYPESTRING_IS_ANIMATION_CHANNEL (Quaternionf) + template<class TransferFunction> void Transfer (TransferFunction& transfer); + + Quaternionf () {} + Quaternionf (float inX, float inY, float inZ, float inW); + explicit Quaternionf (const float* array) { x = array[0]; y = array[1]; z = array[2]; w = array[3]; } + + // methods + + const float* GetPtr ()const { return &x; } + float* GetPtr () { return &x; } + + const float& operator [] (int i)const { return GetPtr ()[i]; } + float& operator [] (int i) { return GetPtr ()[i]; } + + void Set (float inX, float inY, float inZ, float inW); + void Set (const Quaternionf& aQuat); + void Set (const float* array) { x = array[0]; y = array[1]; z = array[2]; w = array[3]; } + + friend Quaternionf Normalize(const Quaternionf& q) { return q / Magnitude (q); } + friend Quaternionf NormalizeSafe(const Quaternionf& q); + + friend Quaternionf Conjugate(const Quaternionf& q); + friend Quaternionf Inverse (const Quaternionf& q); + + friend float SqrMagnitude (const 
Quaternionf& q); + friend float Magnitude (const Quaternionf& q); + + bool operator == (const Quaternionf& q)const { return x == q.x && y == q.y && z == q.z && w == q.w; } + bool operator != (const Quaternionf& q)const { return x != q.x || y != q.y || z != q.z || w != q.w; } + + Quaternionf& operator += (const Quaternionf& aQuat); + Quaternionf& operator -= (const Quaternionf& aQuat); + Quaternionf& operator *= (const float aScalar); + Quaternionf& operator *= (const Quaternionf& aQuat); + Quaternionf& operator /= (const float aScalar); + + friend Quaternionf operator + (const Quaternionf& lhs, const Quaternionf& rhs) + { + Quaternionf q (lhs); + return q += rhs; + } + + friend Quaternionf operator - (const Quaternionf& lhs, const Quaternionf& rhs) + { + Quaternionf t (lhs); + return t -= rhs; + } + + Quaternionf operator - () const + { + return Quaternionf(-x, -y, -z, -w); + } + + Quaternionf operator * (const float s) const + { + return Quaternionf (x*s, y*s, z*s, w*s); + } + + friend Quaternionf operator * (const float s, const Quaternionf& q) + { + Quaternionf t (q); + return t *= s; + } + + friend Quaternionf operator / (const Quaternionf& q, const float s) + { + Quaternionf t (q); + return t /= s; + } + + inline friend Quaternionf operator * (const Quaternionf& lhs, const Quaternionf& rhs) + { + return Quaternionf ( + lhs.w*rhs.x + lhs.x*rhs.w + lhs.y*rhs.z - lhs.z*rhs.y, + lhs.w*rhs.y + lhs.y*rhs.w + lhs.z*rhs.x - lhs.x*rhs.z, + lhs.w*rhs.z + lhs.z*rhs.w + lhs.x*rhs.y - lhs.y*rhs.x, + lhs.w*rhs.w - lhs.x*rhs.x - lhs.y*rhs.y - lhs.z*rhs.z); + } + + static Quaternionf identity () { return Quaternionf (0.0F, 0.0F, 0.0F, 1.0F); } +}; + +bool CompareApproximately (const Quaternionf& q1, const Quaternionf& q2, float epsilon = Vector3f::epsilon); + +Quaternionf Lerp( const Quaternionf& q1, const Quaternionf& q2, float t ); + +Quaternionf EXPORT_COREMODULE Slerp( const Quaternionf& q1, const Quaternionf& q2, float t ); + +float Dot( const Quaternionf& q1, const 
Quaternionf& q2 ); + +Vector3f EXPORT_COREMODULE QuaternionToEuler (const Quaternionf& quat); + +std::vector<Vector3f> GetEquivalentEulerAngles (const Quaternionf& quat); + +Quaternionf EulerToQuaternion (const Vector3f& euler); + +void EXPORT_COREMODULE QuaternionToMatrix (const Quaternionf& q, Matrix3x3f& m); + +void EXPORT_COREMODULE MatrixToQuaternion (const Matrix3x3f& m, Quaternionf& q); +void EXPORT_COREMODULE MatrixToQuaternion (const Matrix4x4f& m, Quaternionf& q); + +void QuaternionToMatrix (const Quaternionf& q, Matrix4x4f& m); + +void QuaternionToAxisAngle (const Quaternionf& q, Vector3f* axis, float* targetAngle); + +Quaternionf AxisAngleToQuaternion (const Vector3f& axis, float angle); + +/// Generates a Right handed Quat from a look rotation. Returns if conversion was successful. +bool LookRotationToQuaternion (const Vector3f& viewVec, const Vector3f& upVec, Quaternionf* res); + + +inline Vector3f RotateVectorByQuat (const Quaternionf& lhs, const Vector3f& rhs) +{ +// Matrix3x3f m; +// QuaternionToMatrix (lhs, &m); +// Vector3f restest = m.MultiplyVector3 (rhs); + float x = lhs.x * 2.0F; + float y = lhs.y * 2.0F; + float z = lhs.z * 2.0F; + float xx = lhs.x * x; + float yy = lhs.y * y; + float zz = lhs.z * z; + float xy = lhs.x * y; + float xz = lhs.x * z; + float yz = lhs.y * z; + float wx = lhs.w * x; + float wy = lhs.w * y; + float wz = lhs.w * z; + + Vector3f res; + res.x = (1.0f - (yy + zz)) * rhs.x + (xy - wz) * rhs.y + (xz + wy) * rhs.z; + res.y = (xy + wz) * rhs.x + (1.0f - (xx + zz)) * rhs.y + (yz - wx) * rhs.z; + res.z = (xz - wy) * rhs.x + (yz + wx) * rhs.y + (1.0f - (xx + yy)) * rhs.z; + +// AssertIf (!CompareApproximately (restest, res)); + return res; +} + +// operator overloads +// inlines + +inline Quaternionf::Quaternionf(float inX, float inY, float inZ, float inW) +{ + x = inX; + y = inY; + z = inZ; + w = inW; +} + +template<class TransferFunction> inline +void Quaternionf::Transfer (TransferFunction& transfer) +{ + 
transfer.AddMetaFlag (kTransferUsingFlowMappingStyle); + TRANSFER (x); + TRANSFER (y); + TRANSFER (z); + TRANSFER (w); +} + +inline void Quaternionf::Set (float inX, float inY, float inZ, float inW) +{ + x = inX; + y = inY; + z = inZ; + w = inW; +} + +inline void Quaternionf::Set (const Quaternionf& aQuat ) +{ + x = aQuat.x; + y = aQuat.y; + z = aQuat.z; + w = aQuat.w; +} + +inline Quaternionf Conjugate (const Quaternionf& q) +{ + return Quaternionf (-q.x, -q.y, -q.z, q.w); +} + +inline Quaternionf Inverse (const Quaternionf& q) +{ + // Is it necessary to divide by SqrMagnitude??? + Quaternionf res = Conjugate (q); + return res; +} + +inline float Magnitude(const Quaternionf& q) +{ + return SqrtImpl (SqrMagnitude (q)); +} + +inline float SqrMagnitude(const Quaternionf& q) +{ + return Dot (q, q); +} + +inline Quaternionf& Quaternionf::operator+= (const Quaternionf& aQuat) +{ + x += aQuat.x; + y += aQuat.y; + z += aQuat.z; + w += aQuat.w; + return *this; +} + +inline Quaternionf& Quaternionf::operator-= (const Quaternionf& aQuat) +{ + x -= aQuat.x; + y -= aQuat.y; + z -= aQuat.z; + w -= aQuat.w; + return *this; +} + +inline Quaternionf& Quaternionf::operator *= (float aScalar) +{ + x *= aScalar; + y *= aScalar; + z *= aScalar; + w *= aScalar; + return *this; +} + +inline Quaternionf& Quaternionf::operator /= (const float aScalar) +{ + AssertIf (CompareApproximately (aScalar, 0.0F)); + x /= aScalar; + y /= aScalar; + z /= aScalar; + w /= aScalar; + return *this; +} + +inline Quaternionf& Quaternionf::operator *= (const Quaternionf& rhs) +{ + float tempx = w*rhs.x + x*rhs.w + y*rhs.z - z*rhs.y; + float tempy = w*rhs.y + y*rhs.w + z*rhs.x - x*rhs.z; + float tempz = w*rhs.z + z*rhs.w + x*rhs.y - y*rhs.x; + float tempw = w*rhs.w - x*rhs.x - y*rhs.y - z*rhs.z; + x = tempx; y = tempy; z = tempz; w = tempw; + return *this; +} + +inline Quaternionf Lerp( const Quaternionf& q1, const Quaternionf& q2, float t ) +{ + Quaternionf tmpQuat; + // if (dot < 0), q1 and q2 are more 
than 360 deg apart. + // The problem is that quaternions are 720deg of freedom. + // so we - all components when lerping + if (Dot (q1, q2) < 0.0F) + { + tmpQuat.Set(q1.x + t * (-q2.x - q1.x), + q1.y + t * (-q2.y - q1.y), + q1.z + t * (-q2.z - q1.z), + q1.w + t * (-q2.w - q1.w)); + } + else + { + tmpQuat.Set(q1.x + t * (q2.x - q1.x), + q1.y + t * (q2.y - q1.y), + q1.z + t * (q2.z - q1.z), + q1.w + t * (q2.w - q1.w)); + } + return Normalize (tmpQuat); +} + +inline float Dot( const Quaternionf& q1, const Quaternionf& q2 ) +{ + return (q1.x*q2.x + q1.y*q2.y + q1.z*q2.z + q1.w*q2.w); +} + +float AngularDistance (const Quaternionf& lhs, const Quaternionf& rhs); + + +inline void QuaternionToAxisAngle (const Quaternionf& q, Vector3f* axis, float* targetAngle) +{ + AssertIf (! CompareApproximately(SqrMagnitude (q), 1.0F)); + *targetAngle = 2.0f* acos(q.w); + if (CompareApproximately (*targetAngle, 0.0F)) + { + *axis = Vector3f::xAxis; + return; + } + + float div = 1.0f / sqrt(1.0f - Sqr (q.w)); + axis->Set( q.x*div, q.y*div, q.z*div ); +} + +inline Quaternionf AxisAngleToQuaternion (const Vector3f& axis, float angle) +{ + Quaternionf q; + AssertIf (!CompareApproximately (SqrMagnitude (axis), 1.0F)); + float halfAngle = angle * 0.5F; + float s = sin (halfAngle); + + q.w = cos (halfAngle); + q.x = s * axis.x; + q.y = s * axis.y; + q.z = s * axis.z; + return q; +} + +inline Quaternionf AngularVelocityToQuaternion (const Vector3f& axis, float deltaTime) +{ + float w = Magnitude(axis); + if (w > Vector3f::epsilon) + { + float v = deltaTime * w * 0.5f; + float q = cos(v); + float s = sin(v) / w; + + Quaternionf integrated; + integrated.w = q; + integrated.x = s * axis.x; + integrated.y = s * axis.y; + integrated.z = s * axis.z; + + return NormalizeSafe(integrated); + } + else + { + return Quaternionf::identity(); + } +} + +inline Quaternionf AxisAngleToQuaternionSafe (const Vector3f& axis, float angle) +{ + Quaternionf q; + float mag = Magnitude (axis); + if (mag > 0.000001F) + 
{ + float halfAngle = angle * 0.5F; + + q.w = cos (halfAngle); + + float s = sin (halfAngle) / mag; + q.x = s * axis.x; + q.y = s * axis.y; + q.z = s * axis.z; + return q; + } + else + { + return Quaternionf::identity (); + } +} + +// Generates a quaternion that rotates lhs into rhs. +Quaternionf FromToQuaternionSafe (const Vector3f& lhs, const Vector3f& rhs); +// from and to are assumed to be normalized +Quaternionf FromToQuaternion (const Vector3f& from, const Vector3f& to); + + +inline bool CompareApproximately (const Quaternionf& q1, const Quaternionf& q2, float epsilon) +{ + //return SqrMagnitude (q1 - q2) < epsilon * epsilon; + return (SqrMagnitude (q1 - q2) < epsilon * epsilon) || (SqrMagnitude (q1 + q2) < epsilon * epsilon); + //return Abs (Dot (q1, q2)) > (1 - epsilon * epsilon); +} + +inline Quaternionf NormalizeSafe (const Quaternionf& q) +{ + float mag = Magnitude (q); + if (mag < Vector3f::epsilon) + return Quaternionf::identity (); + else + return q / mag; +} + +inline Quaternionf NormalizeFastEpsilonZero (const Quaternionf& q) +{ + float m = SqrMagnitude (q); + if (m < Vector3f::epsilon) + return Quaternionf(0.0F, 0.0F, 0.0F, 0.0F); + else + return q * FastInvSqrt(m); +} + + +inline bool IsFinite (const Quaternionf& f) +{ + return IsFinite(f.x) & IsFinite(f.y) & IsFinite(f.z) & IsFinite(f.w); +} + + +#endif diff --git a/Runtime/Math/Random/Random.h b/Runtime/Math/Random/Random.h new file mode 100644 index 0000000..ccd8c62 --- /dev/null +++ b/Runtime/Math/Random/Random.h @@ -0,0 +1,184 @@ +#ifndef RANDOM_H +#define RANDOM_H + +#include "rand.h" +#include "Runtime/Math/Vector2.h" +#include "Runtime/Math/Quaternion.h" +#include "Runtime/Math/FloatConversion.h" + +inline float RangedRandom (Rand& r, float min, float max) +{ + float t = r.GetFloat (); + t = min * t + (1.0F - t) * max; + return t; +} + +inline float Random01 (Rand& r) +{ + return r.GetFloat (); +} + +inline int RangedRandom (Rand& r, int min, int max) +{ + int dif; + if (min < max) + { + 
dif = max - min; + int t = r.Get () % dif; + t += min; + return t; + } + else if (min > max) + { + dif = min - max; + int t = r.Get () % dif; + t = min - t; + return t; + } + else + { + return min; + } +} + +inline Vector3f RandomUnitVector (Rand& rand) +{ + float z = RangedRandom (rand, -1.0f, 1.0f); + float a = RangedRandom (rand, 0.0f, 2.0F * kPI); + + float r = sqrt (1.0f - z*z); + + float x = r * cos (a); + float y = r * sin (a); + + return Vector3f (x, y, z); +} + +inline Vector2f RandomUnitVector2 (Rand& rand) +{ + float a = RangedRandom (rand, 0.0f, 2.0F * kPI); + + float x = cos (a); + float y = sin (a); + + return Vector2f (x, y); +} + + +inline Quaternionf RandomQuaternion (Rand& rand) +{ + Quaternionf q; + q.x = RangedRandom (rand, -1.0f, 1.0f); + q.y = RangedRandom (rand, -1.0f, 1.0f); + q.z = RangedRandom (rand, -1.0f, 1.0f); + q.w = RangedRandom (rand, -1.0f, 1.0f); + q = NormalizeSafe (q); + if (Dot (q, Quaternionf::identity ()) < 0.0f) + return -q; + else + return q; +} + +inline Quaternionf RandomQuaternionUniformDistribution (Rand& rand) +{ + const float two_pi = 2.0F * kPI; + + // Employs Hopf fibration to uniformly distribute quaternions + float u1 = RangedRandom( rand, 0.0f, 1.0f ); + float theta = RangedRandom( rand, 0.0f, two_pi ); + float rho = RangedRandom( rand, 0.0f, two_pi ); + + float i = sqrt( 1.0f - u1 ); + float j = sqrt( u1 ); + + // We do not need to normalize the generated quaternion, because the probability density corresponds to the Haar measure. + // This means that a random rotation is obtained by picking a point at random on S^3, and forming the unit quaternion. 
/// Returns a random point inside the unit circle.
inline Vector2f RandomPointInsideUnitCircle (Rand& r)
{
	Vector2f v = RandomUnitVector2 (r);
	// The area of a disc grows with radius^2, so to distribute points
	// uniformly over the area the radius is drawn as x^(1/2).
	// (The "volume of the sphere / x^(1/3)" wording used by the 3D
	// variants in this file does not apply to the 2D case.)
	v *= pow (RangedRandom (r, 0.0F, 1.0F), 1.0F / 2.0F);
	return v;
}
+ return v * range; +} + +/// Builds a random Barycentric coordinate which can be used to generate random points on a triangle: +/// Vector3f point = v0 * barycentric.x + v1 * barycentric.y + v2 * barycentric.z; +inline Vector3f RandomBarycentricCoord (Rand& rand) +{ +// Was told that this leads to bad distribution because of the 1.0F - s +// float s = gRand.GetFloat (); +// float t = RangedRandom (gRand, 0.0F, 1.0F - s); +// float r = (1.0F - s - t); +// Vector3f positionOnMesh = r * vertices[face.v1] + s * vertices[face.v2] + t * vertices[face.v3]; +// return positionOnMesh; + float u = rand.GetFloat (); + float v = rand.GetFloat (); + if (u + v > 1.0F) + { + u = 1.0F - u; + v = 1.0F - v; + } + float w = 1.0F - u - v; + return Vector3f (u, v, w); +} + +#endif diff --git a/Runtime/Math/Random/rand.h b/Runtime/Math/Random/rand.h new file mode 100644 index 0000000..1971ab4 --- /dev/null +++ b/Runtime/Math/Random/rand.h @@ -0,0 +1,81 @@ +#ifndef RAND_H +#define RAND_H + +/* +Some random generator timings: +MacBook Pro w/ Core 2 Duo 2.4GHz. 
// Xorshift-128 pseudo-random number generator (see the Xorshift paper
// linked in the file header). State is four 32-bit words.
class Rand {
public:

	// Seeds the generator. A seed of 0 is acceptable because SetSeed
	// expands the single seed into four distinct state words.
	Rand (UInt32 seed = 0)
	{
		SetSeed (seed);
	}

	// Advances the state and returns the next raw 32-bit value.
	UInt32 Get ()
	{
		UInt32 t;
		t = x ^ (x << 11);
		x = y; y = z; z = w;
		return w = (w ^ (w >> 19)) ^ (t ^ (t >> 8));
	}

	// Maps a raw 32-bit value to a float in [0, 1] (both ends inclusive).
	inline static float GetFloatFromInt (UInt32 value)
	{
		// take 23 bits of integer, and divide by 2^23-1
		return float(value & 0x007FFFFF) * (1.0f / 8388607.0f);
	}

	inline static UInt8 GetByteFromInt (UInt32 value)
	{
		// take the most significant byte from the 23-bit value
		return UInt8(value >> (23 - 8));
	}

	// random number between 0.0 and 1.0
	float GetFloat ()
	{
		return GetFloatFromInt (Get ());
	}

	// random number between -1.0 and 1.0
	float GetSignedFloat ()
	{
		return GetFloat() * 2.0f - 1.0f;
	}

	// Expands the seed into the four state words with an LCG-style
	// recurrence (the same multiplier appears in Mersenne Twister seeding).
	void SetSeed (UInt32 seed)
	{
		x = seed;
		y = x * 1812433253U + 1;
		z = y * 1812433253U + 1;
		w = z * 1812433253U + 1;
	}

	// NOTE: returns the current first state word; this equals the value
	// passed to SetSeed only until Get() is first called, since Get()
	// rotates the state.
	UInt32 GetSeed () const { return x; }

private:
	UInt32 x, y, z, w;
};
	// Serializes the rectangle as its four components, in
	// x, y, width, height order. (Rectf has a versioned specialization
	// further down in this file.)
	template <class TransferFunction>
	void Transfer (TransferFunction& transfer)
	{
		TRANSFER (x);
		TRANSFER (y);
		TRANSFER (width);
		TRANSFER (height);
	}
+ inline bool IsEmpty () const { return width <= 0 || height <= 0; } + + inline void SetPosition(const Vector2f& position) { x = position.x; y = position.y; } + inline Vector2f GetPosition() const { return Vector2f(x, y); } + + inline void SetSize(const Vector2f& size) { width = size.x; height = size.y; } + inline Vector2f GetSize() const { return Vector2f(width, height); } + /// Resets the rectangle + inline void Reset() { x = y = width = height = 0; } + + /// Sets the rectangle + inline void Set(T inX, T inY, T iWidth, T iHeight) + { + x = inX; width = iWidth; + y = inY; height = iHeight; + } + + inline void Scale (T dx, T dy) { x *= dx; width *= dx; y *= dy; height *= dy;} + + /// Set Center position of rectangle (size stays the same) + void SetCenterPos (T cx, T cy) { x = cx - width / 2; y = cy - height / 2; } + Vector2f GetCenterPos() const { return Vector2f(x + (BaseType)width / 2, y + (BaseType)height / 2); } + + /// Ensure this is inside the rect r. + void Clamp (const RectType &r) + { + T x2 = x + width; + T y2 = y + height; + T rx2 = r.x + r.width; + T ry2 = r.y + r.height; + + if (x < r.x) x = r.x; + if (x2 > rx2) x2 = rx2; + if (y < r.y) y = r.y; + if (y2 > ry2) y2 = ry2; + + width = x2 - x; + if (width < 0) width = 0; + + height = y2 - y; + if (height < 0) height = 0; + } + + /// Move rectangle by deltaX, deltaY. + inline void Move (T dX, T dY) { x += dX; y += dY; } + + /// Return the width of rectangle. + inline T Width () const { return width; } + + /// Return the height of rectangle. + inline T Height () const { return height; } + + /// Return true if a point lies within rectangle bounds. + inline bool Contains (T px, T py) const { return (px >= x) && (px < x + width) && (py >= y) && (py < y + height); } + inline bool Contains (const Vector2f& p) const { return Contains(p.x, p.y); } + /// Return true if a relative point lies within rectangle bounds. 
+ inline bool ContainsRel (T x, T y) const + { return (x >= 0) && (x < Width ()) && (y >= 0) && (y < Height ()); } + + inline bool Intersects(const RectType& r) const + { + // Rects are disjoint if there's at least one separating axis + bool disjoint = x + width < r.x; + disjoint |= r.x + r.width < x; + disjoint |= y + height < r.y; + disjoint |= r.y + r.height < y; + return !disjoint; + } + + /// Normalize a rectangle such that xmin <= xmax and ymin <= ymax. + inline void Normalize () + { + width = std::max<T>(width, 0); + height = std::max<T>(height, 0); + } + + bool operator == (const RectType& r)const { return x == r.x && y == r.y && width == r.width && height == r.height; } + bool operator != (const RectType& r)const { return x != r.x || y != r.y || width != r.width || height != r.height; } +}; + +typedef RectT<float> Rectf; +typedef RectT<int> RectInt; + +template<> inline const char* Rectf::GetTypeString () { return "Rectf"; } +template<> inline const char* RectInt::GetTypeString () { return "RectInt"; } + +inline bool CompareApproximately (const Rectf& lhs, const Rectf& rhs) +{ + return CompareApproximately (lhs.x, rhs.x) && CompareApproximately (lhs.y, rhs.y) && + CompareApproximately (lhs.width, rhs.width) && CompareApproximately (lhs.height, rhs.height); +} + +/// Make a rect with width & height +template<typename T> +inline RectT<T> MinMaxRect (T minx, T miny, T maxx, T maxy) { return RectT<T> (minx, miny, maxx - minx, maxy - miny); } + +// RectT<float> specialization +template<> +inline bool Rectf::IsEmpty () const { return width <= 0.00001F || height <= 0.00001F; } + +template<> +template<class TransferFunction> inline +void Rectf::Transfer (TransferFunction& transfer) +{ + transfer.SetVersion(2); + + TRANSFER (x); + TRANSFER (y); + TRANSFER (width); + TRANSFER (height); + + #if UNITY_EDITOR + if (transfer.IsOldVersion(1)) + { + float xmax=0.0F, ymax=0.0F, ymin=0.0F, xmin=0.0F; + TRANSFER (xmin); + TRANSFER (ymin); + TRANSFER (xmax); + TRANSFER 
// Core 4x4 matrix product on SIMD registers. Each output vector rmJ is the
// linear combination  m10*m2J[0] + m11*m2J[1] + m12*m2J[2] + m13*m2J[3],
// built by splatting one lane of m2J at a time and accumulating with
// multiply-add. The four products are interleaved by stage so the splat,
// mul and madd operations of independent outputs can overlap in the pipeline.
static void MultiplyMatrices4x4NATIVE (const Simd128& m10, const Simd128& m11, const Simd128& m12, const Simd128& m13, const Simd128& m20, const Simd128& m21, const Simd128& m22, const Simd128& m23, Simd128& rm0, Simd128& rm1, Simd128& rm2, Simd128& rm3)
{
	// Stage X: start each accumulator with m10 scaled by lane 0.
	const Simd128 m20_X = V4Splat( m20, 0 );
	const Simd128 m21_X = V4Splat( m21, 0 );
	const Simd128 m22_X = V4Splat( m22, 0 );
	const Simd128 m23_X = V4Splat( m23, 0 );
	const Simd128 rm0_0 = V4Mul( m20_X, m10 );
	const Simd128 rm1_0 = V4Mul( m21_X, m10 );
	const Simd128 rm2_0 = V4Mul( m22_X, m10 );
	const Simd128 rm3_0 = V4Mul( m23_X, m10 );
	// Stage Y: accumulate m11 scaled by lane 1.
	const Simd128 m20_Y = V4Splat(m20, 1 );
	const Simd128 m21_Y = V4Splat(m21, 1 );
	const Simd128 m22_Y = V4Splat(m22, 1 );
	const Simd128 m23_Y = V4Splat(m23, 1 );
	const Simd128 rm0_1 = V4MulAdd( m20_Y, m11, rm0_0 );
	const Simd128 rm1_1 = V4MulAdd( m21_Y, m11, rm1_0 );
	const Simd128 rm2_1 = V4MulAdd( m22_Y, m11, rm2_0 );
	const Simd128 rm3_1 = V4MulAdd( m23_Y, m11, rm3_0 );
	// Stage Z: accumulate m12 scaled by lane 2.
	const Simd128 m20_Z = V4Splat(m20, 2 );
	const Simd128 m21_Z = V4Splat(m21, 2 );
	const Simd128 m22_Z = V4Splat(m22, 2 );
	const Simd128 m23_Z = V4Splat(m23, 2 );
	const Simd128 rm0_2 = V4MulAdd( m20_Z, m12, rm0_1 );
	const Simd128 rm1_2 = V4MulAdd( m21_Z, m12, rm1_1 );
	const Simd128 rm2_2 = V4MulAdd( m22_Z, m12, rm2_1 );
	const Simd128 rm3_2 = V4MulAdd( m23_Z, m12, rm3_1 );
	// Stage W: accumulate m13 scaled by lane 3 straight into the outputs.
	const Simd128 m20_W = V4Splat(m20, 3 );
	const Simd128 m21_W = V4Splat(m21, 3 );
	const Simd128 m22_W = V4Splat(m22, 3 );
	const Simd128 m23_W = V4Splat(m23, 3 );
	rm0 = V4MulAdd( m20_W, m13 , rm0_2 );
	rm1 = V4MulAdd( m21_W, m13 , rm1_2 );
	rm2 = V4MulAdd( m22_W, m13 , rm2_2 );
	rm3 = V4MulAdd( m23_W, m13 , rm3_2 );
}
// Transposes a 4x4 matrix with two rounds of element interleaving
// (V4MergeH/V4MergeL): the first round interleaves vectors 0/2 and 1/3,
// the second interleaves those partial results into the transposed rows.
// All four loads complete before any store, but lhs and res are declared
// __restrict, so callers must not pass the same matrix for both.
static void DECLARE_SIMD_FUNC(TransposeMatrix4x4) (const Matrix4x4f* __restrict lhs, Matrix4x4f* __restrict res)
{
	const float* m0 = lhs->m_Data;
	float* m = res->m_Data;

	const Simd128 m00 = V4LoadUnaligned(m0, 0x0);
	const Simd128 m01 = V4LoadUnaligned(m0, 0x4);
	const Simd128 m02 = V4LoadUnaligned(m0, 0x8);
	const Simd128 m03 = V4LoadUnaligned(m0, 0xC);

	// Round 1: pairwise interleave of vectors (0,2) and (1,3).
	const Simd128 xxyy1 = V4MergeH(m00, m02);
	const Simd128 zzww1 = V4MergeL(m00, m02);
	const Simd128 xxyy2 = V4MergeH(m01, m03);
	const Simd128 zzww2 = V4MergeL(m01, m03);
	// Round 2: interleave the partial results into the transposed vectors.
	const Simd128 t00 = V4MergeH(xxyy1,xxyy2);
	const Simd128 t01 = V4MergeL(xxyy1,xxyy2);
	const Simd128 t02 = V4MergeH(zzww1,zzww2);
	const Simd128 t03 = V4MergeL(zzww1,zzww2);

	V4StoreUnaligned(t00, m, 0x0);
	V4StoreUnaligned(t01, m, 0x4);
	V4StoreUnaligned(t02, m, 0x8);
	V4StoreUnaligned(t03, m, 0xC);
}
V4LoadUnaligned( m1, 0xC ); + const Simd128 m20 = V4LoadUnaligned( m2, 0x0 ); + const Simd128 m21 = V4LoadUnaligned( m2, 0x4 ); + const Simd128 m22 = V4LoadUnaligned( m2, 0x8 ); + const Simd128 m23 = V4LoadUnaligned( m2, 0xC ); + + Simd128 b20, b21, b22, b23, rb0, rb1, rb2, rb3; + MultiplyMatrices4x4NATIVE(m10, m11, m12, m13, m20, m21, m22, m23, b20, b21, b22, b23); + MultiplyMatrices4x4NATIVE(base0, base1, base2, base3, b20, b21, b22, b23, rb0, rb1, rb2, rb3); + + V4StoreUnaligned(rb0, m, 0x0 ); + V4StoreUnaligned(rb1, m, 0x4 ); + V4StoreUnaligned(rb2, m, 0x8 ); + V4StoreUnaligned(rb3, m, 0xC ); + } +} + + +#if UNITY_AUTO_DETECT_VECTOR_UNIT && UNITY_SUPPORTS_SSE +# define MultiplyMatrices4x4(a,b,c) CPUInfo::HasSSESupport() ? MultiplyMatrices4x4Simd(a,b,c) : MultiplyMatrices4x4REF(a,b,c) +# define CopyMatrix(a,b) CPUInfo::HasSSESupport() ? CopyMatrixSimd(a,b) : CopyMatrixREF(a,b) +# define TransposeMatrix4x4(a,b) CPUInfo::HasSSESupport() ? TransposeMatrix4x4Simd(a,b) : TransposeMatrix4x4REF(a,b) +# define MultiplyMatrixArrayWithBase4x4(base,a,b,res,count) CPUInfo::HasSSESupport() ? MultiplyMatrixArrayWithBase4x4Simd(base,a,b,res,count) : MultiplyMatrixArrayWithBase4x4REF(base,a,b,res,count) +#endif + +#define MultiplyMatrixArray4x4 MultiplyMatrixArray4x4REF + +#endif //MATRIX4X4SIMD_H
\ No newline at end of file diff --git a/Runtime/Math/Simd/SimdMath.h b/Runtime/Math/Simd/SimdMath.h new file mode 100644 index 0000000..99fd118 --- /dev/null +++ b/Runtime/Math/Simd/SimdMath.h @@ -0,0 +1,240 @@ +#pragma once + +#include "Runtime/Utilities/Prefetch.h" +#include "Runtime/Misc/CPUInfo.h" + +#if UNITY_SUPPORTS_VMX + +# define VMX_0X 0 +# define VMX_0Y 1 +# define VMX_0Z 2 +# define VMX_0W 3 + +# define VMX_1X 4 +# define VMX_1Y 5 +# define VMX_1Z 6 +# define VMX_1W 7 + +# define V4BuildPermuteMask(a,b,c,d) { \ + (((a)<<2)|((a)<<10)|((a)<<18)|((a)<<26))+0x00010203,\ + (((b)<<2)|((b)<<10)|((b)<<18)|((b)<<26))+0x00010203,\ + (((c)<<2)|((c)<<10)|((c)<<18)|((c)<<26))+0x00010203,\ + (((d)<<2)|((d)<<10)|((d)<<18)|((d)<<26))+0x00010203} + + +# if UNITY_XENON +# include <Xtl.h> +# define ALIGN16 __declspec(align(16)) + typedef __vector4 Simd128; + +# define vec_splat __vspltw +# define vec_ste(vec, off, addr) __stvebx(vec, addr, off) + +# define V4Splat(v0, i) __vspltw((v0), (i)) + +# elif UNITY_PS3 +# include <ppu_intrinsics.h> +# define ALIGN16 __attribute__((aligned(16))) +# define __forceinline __attribute__((always_inline)) + typedef vec_float4 Simd128; + static const vec_float4 __vsignedzero = {-0.f,-0.f,-0.f,-0.f}; + + +# define __vzero() ((vec_float4)vec_splat_u32(0)) + +# define __lvx(base, offset) vec_lvx(offset, base) +# define __lvlx(base, offset) vec_lvlx(offset, base) +# define __lvrx(base, offset) vec_lvrx(offset, base) + +# define __stvx(value, base, offset) vec_stvx((value), (offset), (float*)(base)) +# define __stvlx(value, base, offset) vec_stvlx((value), (offset), (float*)(base)) +# define __stvrx(value, base, offset) vec_stvrx((value), (offset), (float*)(base)) + +# define __vmrglw(v0, v1) vec_mergel((vec_float4)(v0), (vec_float4)(v1)) +# define __vmrghw(v0, v1) vec_mergeh((vec_float4)(v0), (vec_float4)(v1)) + +# define __vmulfp(a, b) vec_madd( a, b, __vsignedzero) + +# define __vand vec_and +# define __vandc vec_andc +# define __vor 
	// Four-component dot product of v0 and v1, with the scalar result
	// broadcast to all four lanes. After the element-wise multiply, the
	// horizontal sum is done as a two-step reduction: vec_sld by 8 bytes
	// (two lanes) adds the halves, vec_sld by 4 bytes (one lane) adds the
	// remaining pair.
	__forceinline static Simd128 __vmsum4fp(Simd128 v0, Simd128 v1)
	{
		const Simd128 m0 = vec_madd(v0, v1, __vsignedzero);
		const Simd128 m1 = vec_sld(m0, m0, 8);
		const Simd128 m2 = vec_add(m0, m1);
		const Simd128 m3 = vec_sld(m2, m2, 4);
		return vec_add(m2, m3);
	}
v1) __vmaxfp((v0), (v1)) +# define V4Rcp(v0) __vrefp((v0)) +# define V4Rsqrt(v0) __vrsqrtefp((v0)) +# define V3Dot(v0, v1) __vmsum3fp((v0), (v1)) +# define V4Dot(v0, v1) __vmsum4fp((v0), (v1)) + + // Shuffling / Permuting / Splatting / Merging +# define V4Splat(v0, i) __vspltw((v0), (i)) +# define V4MergeL(v0, v1) __vmrglw((v0), (v1)) +# define V4MergeH(v0, v1) __vmrghw((v0), (v1)) + + __forceinline static Simd128 V3Cross(Simd128 v0, Simd128 v1) + { + const static Simd128i maskYZXW = V4BuildPermuteMask(VMX_0Y, VMX_0Z, VMX_0X, VMX_0W); + const Simd128Mask p = (Simd128Mask)maskYZXW.v; + const Simd128 m0 = __vperm(v1, v1, p); + const Simd128 m1 = __vperm(v0, v0, p); + const Simd128 m2 = __vmulfp(v0, m0); + const Simd128 m3 = __vnmsubfp(m1, v1, m2); + return __vperm(m3, m3, p); + } + + + +#elif UNITY_SUPPORTS_SSE + +# if UNITY_WIN +# include <intrin.h> +# define ALIGN16 __declspec(align(16)) +# else +# include <xmmintrin.h> +# define ALIGN16 __attribute__((aligned(16))) +# define __forceinline inline __attribute__((always_inline)) +# endif + + typedef __m128 Simd128; + + // Load / Save +# define V4Load(base, offset) _mm_load_ps((base)+(offset)) +# define V4LoadUnaligned(base, offset) _mm_loadu_ps((base)+(offset)) +# define V4Store(value, base, offset) _mm_store_ps((base)+(offset), value) +# define V4StoreUnaligned(value, base, offset) _mm_storeu_ps((base)+(offset), value) + + // Math functions +# define V4Zero() _mm_setzero_ps() +# define V4Add(v0, v1) _mm_add_ps((v0), (v1)) +# define V4Sub(v0, v1) _mm_sub_ps((v0), (v1)) +# define V4Mul(v0, v1) _mm_mul_ps((v0), (v1)) +# define V4MulAdd(v0, v1, v2) _mm_add_ps(_mm_mul_ps((v0), (v1)), (v2)) +# define V4Min(v0, v1) _mm_min_ps((v0), (v1)) +# define V4Max(v0, v1) _mm_max_ps((v0), (v1)) +# define V4Rcp(v0) _mm_rcp_ps((v0)) +# define V4Rsqrt(v0) _mm_rsqrt_ps((v0)) + + __forceinline static Simd128 V3Dot(Simd128 v0, Simd128 v1) + { + const Simd128 m0 = _mm_mul_ps(v0, v1); + const Simd128 m1 = _mm_shuffle_ps(m0, m0, 
	// Cross product v0 x v1 using three shuffles: forms
	// (v0 * v1.yzxw) - (v1 * v0.yzxw) and then rotates the lanes yzxw once
	// more, which yields the standard cross-product components in xyz.
	// The w lane comes out as v0.w*v1.w - v1.w*v0.w == 0.
	__forceinline static Simd128 V3Cross(Simd128 v0, Simd128 v1)
	{
		const Simd128 m0 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3,0,2,1));
		const Simd128 m1 = _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(3,0,2,1));
		const Simd128 m2 = _mm_mul_ps(v1, m1);
		const Simd128 m3 = _mm_sub_ps(_mm_mul_ps(m0, v0), m2);
		return _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3,0,2,1));
	}
// Stores only the x, y, z lanes of value to base+offset (offset counts
// floats), leaving the following float in memory untouched, via a
// byte-masked store whose mask covers the first 12 bytes.
// NOTE(review): _mm_maskmoveu_si128 (MASKMOVDQU) carries a non-temporal
// hint, i.e. it may bypass the cache — confirm this is intended for the
// call sites, as it can be slow for data that is read back soon.
__forceinline static void V3StoreUnaligned(Simd128 value, float* base, const UInt32 offset)
{
	typedef union {
		UInt32 u[4];
		__m128i v;
	} m128u;

	// First three 32-bit lanes enabled, fourth lane masked off.
	static const m128u store_mask={{0xffffffff,0xffffffff,0xffffffff,0}};
	_mm_maskmoveu_si128(*(__m128i *)&value, store_mask.v, (char*)(base+offset));
}
value.x(); + CHECK_CLOSE(1, a.x().tofloat(), epsilon); + CHECK_CLOSE(0, a.y().tofloat(), epsilon); + CHECK_CLOSE(0, a.z().tofloat(), epsilon); + CHECK_CLOSE(0, a.w().tofloat(), epsilon); + + + + a.y() = value.y(); + CHECK_CLOSE(1, a.x().tofloat(), epsilon); + CHECK_CLOSE(2, a.y().tofloat(), epsilon); + CHECK_CLOSE(0, a.z().tofloat(), epsilon); + CHECK_CLOSE(0, a.w().tofloat(), epsilon); + + a.z() = value.z(); + CHECK_CLOSE(1, a.x().tofloat(), epsilon); + CHECK_CLOSE(2, a.y().tofloat(), epsilon); + CHECK_CLOSE(3, a.z().tofloat(), epsilon); + CHECK_CLOSE(0, a.w().tofloat(), epsilon); + + a.w() = value.w(); + CHECK_CLOSE(1, a.x().tofloat(), epsilon); + CHECK_CLOSE(2, a.y().tofloat(), epsilon); + CHECK_CLOSE(3, a.z().tofloat(), epsilon); + CHECK_CLOSE(4, a.w().tofloat(), epsilon); + + a.z() = value.y(); + CHECK_CLOSE(1, a.x().tofloat(), epsilon); + CHECK_CLOSE(2, a.y().tofloat(), epsilon); + CHECK_CLOSE(2, a.z().tofloat(), epsilon); + CHECK_CLOSE(4, a.w().tofloat(), epsilon); + + float1 f = value.w(); + CHECK_CLOSE(4, f.tofloat(), epsilon); + + a = value.wxzy(); + CHECK( all(a == float4(4,1,3,2) ) ); + + float4 g(value.wxzy()); + CHECK( all(g == float4(4,1,3,2) ) ); + + + a = value.xzwy(); + CHECK( all(a == float4(1,3,4,2) ) ); + + a = value.xwyz(); + CHECK( all(a == float4(1,4,2,3) ) ); + + a = value.wyxz(); + CHECK( all(a == float4(4,2,1,3) ) ); + + a = value.zywx(); + CHECK( all(a == float4(3,2,4,1) ) ); + + a = value.ywzx(); + CHECK( all(a == float4(2,4,3,1) ) ); + + a = value.yzxw(); + CHECK( all(a == float4(2,3,1,4) ) ); + + a = value.zxyw(); + CHECK( all(a == float4(3,1,2,4) ) ); + + a = value.zwxy(); + CHECK( all(a == float4(3,4,1,2) ) ); + + a = value.wwwz(); + CHECK( all(a == float4(4,4,4,3) ) ); + + a = value.wwzz(); + CHECK( all(a == float4(4,4,3,3) ) ); + + a = value.wzyx(); + CHECK( all(a == float4(4,3,2,1) ) ); + + a = value.yxwz(); + CHECK( all(a == float4(2,1,4,3) ) ); + + } + + TEST_FIXTURE(SimdFixture, float1_op) + { + float ATTRIBUTE_ALIGN(ALIGN4F) 
v[4]; + + float1 b(3.f); + cvec4f(two, 2.f, 2.f, 2.f, 2.f); + float1 c(two); + float4 cx; + float1 e; + + { + Vstorepf(b.eval(), v, 0); + CHECK_CLOSE(3.f, v[0], epsilon); + CHECK_CLOSE(3.f, v[1], epsilon); + CHECK_CLOSE(3.f, v[2], epsilon); + CHECK_CLOSE(3.f, v[3], epsilon); + + Vstorepf(c.eval(), v, 0); + CHECK_CLOSE(2.f, v[0], epsilon); + CHECK_CLOSE(2.f, v[1], epsilon); + CHECK_CLOSE(2.f, v[2], epsilon); + CHECK_CLOSE(2.f, v[3], epsilon); + + cx = float4(10.f,2.f,3.f,4.f); + + float1 d(cx.x()); + + Vstorepf(d.eval(), v, 0); + CHECK_CLOSE(10.f, v[0], epsilon); + CHECK_CLOSE(10.f, v[1], epsilon); + CHECK_CLOSE(10.f, v[2], epsilon); + CHECK_CLOSE(10.f, v[3], epsilon); + + e = cx.y(); + + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(2.f, v[0], epsilon); + CHECK_CLOSE(2.f, v[1], epsilon); + CHECK_CLOSE(2.f, v[2], epsilon); + CHECK_CLOSE(2.f, v[3], epsilon); + + e = float1(4.f); + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(4.f, v[0], epsilon); + CHECK_CLOSE(4.f, v[1], epsilon); + CHECK_CLOSE(4.f, v[2], epsilon); + CHECK_CLOSE(4.f, v[3], epsilon); + + e = float1(cx.x()); + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(10.f, v[0], epsilon); + CHECK_CLOSE(10.f, v[1], epsilon); + CHECK_CLOSE(10.f, v[2], epsilon); + CHECK_CLOSE(10.f, v[3], epsilon); + + e = cx.z(); + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(3.f, v[0], epsilon); + CHECK_CLOSE(3.f, v[1], epsilon); + CHECK_CLOSE(3.f, v[2], epsilon); + CHECK_CLOSE(3.f, v[3], epsilon); + + e += cx.w(); + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(7.f, v[0], epsilon); + CHECK_CLOSE(7.f, v[1], epsilon); + CHECK_CLOSE(7.f, v[2], epsilon); + CHECK_CLOSE(7.f, v[3], epsilon); + + e -= cx.x(); + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(-3.f, v[0], epsilon); + CHECK_CLOSE(-3.f, v[1], epsilon); + CHECK_CLOSE(-3.f, v[2], epsilon); + CHECK_CLOSE(-3.f, v[3], epsilon); + + e *= cx.y(); + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(-6.f, v[0], epsilon); + CHECK_CLOSE(-6.f, v[1], epsilon); + CHECK_CLOSE(-6.f, v[2], epsilon); + CHECK_CLOSE(-6.f, v[3], epsilon); + + 
e /= cx.z(); + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(-2.f, v[0], epsilon); + CHECK_CLOSE(-2.f, v[1], epsilon); + CHECK_CLOSE(-2.f, v[2], epsilon); + CHECK_CLOSE(-2.f, v[3], epsilon); + } + { + float1 f = e++; + Vstorepf(f.eval(), v, 0); + CHECK_CLOSE(-2.f, v[0], epsilon); + CHECK_CLOSE(-2.f, v[1], epsilon); + CHECK_CLOSE(-2.f, v[2], epsilon); + CHECK_CLOSE(-2.f, v[3], epsilon); + + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(-1.f, v[0], epsilon); + CHECK_CLOSE(-1.f, v[1], epsilon); + CHECK_CLOSE(-1.f, v[2], epsilon); + CHECK_CLOSE(-1.f, v[3], epsilon); + + float1 g = ++e; + Vstorepf(g.eval(), v, 0); + CHECK_CLOSE(0.f, v[0], epsilon); + CHECK_CLOSE(0.f, v[1], epsilon); + CHECK_CLOSE(0.f, v[2], epsilon); + CHECK_CLOSE(0.f, v[3], epsilon); + + Vstorepf(e.eval(), v, 0); + CHECK_CLOSE(0.f, v[0], epsilon); + CHECK_CLOSE(0.f, v[1], epsilon); + CHECK_CLOSE(0.f, v[2], epsilon); + CHECK_CLOSE(0.f, v[3], epsilon); + + + float1 h(float1::zero()); + Vstorepf(h.eval(), v, 0); + CHECK_CLOSE(0.f, v[0], epsilon); + CHECK_CLOSE(0.f, v[1], epsilon); + CHECK_CLOSE(0.f, v[2], epsilon); + CHECK_CLOSE(0.f, v[3], epsilon); + + float1 i(float1::one()); + Vstorepf(i.eval(), v, 0); + CHECK_CLOSE(1.f, v[0], epsilon); + CHECK_CLOSE(1.f, v[1], epsilon); + CHECK_CLOSE(1.f, v[2], epsilon); + CHECK_CLOSE(1.f, v[3], epsilon); + + float1 j(4.f); + float1 l(3.f); + + float1 m = j + l; + Vstorepf(m.eval(), v, 0); + CHECK_CLOSE(7.f, v[0], epsilon); + CHECK_CLOSE(7.f, v[1], epsilon); + CHECK_CLOSE(7.f, v[2], epsilon); + CHECK_CLOSE(7.f, v[3], epsilon); + + float1 n = j - l; + Vstorepf(n.eval(), v, 0); + CHECK_CLOSE(1.f, v[0], epsilon); + CHECK_CLOSE(1.f, v[1], epsilon); + CHECK_CLOSE(1.f, v[2], epsilon); + CHECK_CLOSE(1.f, v[3], epsilon); + + float1 o = j * l; + Vstorepf(o.eval(), v, 0); + CHECK_CLOSE(12.f, v[0], epsilon); + CHECK_CLOSE(12.f, v[1], epsilon); + CHECK_CLOSE(12.f, v[2], epsilon); + CHECK_CLOSE(12.f, v[3], epsilon); + + float1 p = j / l; + Vstorepf(p.eval(), v, 0); + CHECK_CLOSE(4.f/3.f, 
v[0], epsilon); + CHECK_CLOSE(4.f/3.f, v[1], epsilon); + CHECK_CLOSE(4.f/3.f, v[2], epsilon); + CHECK_CLOSE(4.f/3.f, v[3], epsilon); + } + + bool1 bvalue = float1(4.f) < float1(3.f); + CHECK( (bool)!bvalue ); + + bvalue = float1(4.f) < float1(4.f); + CHECK( (bool)!bvalue ); + + bvalue = float1(4.f) < float1(5.f); + CHECK( (bool)bvalue ); + + bvalue = float1(4.f) <= float1(3.f); + CHECK( (bool)!bvalue ); + + bvalue = float1(4.f) <= float1(4.f); + CHECK( (bool)bvalue ); + + bvalue = float1(4.f) <= float1(5.f); + CHECK( (bool)bvalue ); + + bvalue = float1(4.f) > float1(3.f); + CHECK( (bool)bvalue ); + + bvalue = float1(4.f) > float1(4.f); + CHECK( (bool)!bvalue ); + + bvalue = float1(4.f) > float1(5.f); + CHECK( (bool)!bvalue ); + + bvalue = float1(4.f) >= float1(3.f); + CHECK( (bool)bvalue ); + + bvalue = float1(4.f) >= float1(4.f); + CHECK( (bool)bvalue ); + + bvalue = float1(4.f) >= float1(5.f); + CHECK( (bool)!bvalue ); + + bvalue = float1(10.f) == float1(5.f); + CHECK( (bool)!bvalue ); + + bvalue = float1(10.f) == float1(10.f); + CHECK( (bool)bvalue ); + + bvalue = float1(10.f) != float1(5.f); + CHECK( (bool)bvalue ); + + bvalue = float1(10.f) != float1(10.f); + CHECK( (bool)!bvalue ); + + } + + TEST_FIXTURE(SimdFixture, Operator1) + { + float ATTRIBUTE_ALIGN(ALIGN4F) v[4]; + + float4 a(1,2,3,4); + float4 b(4,3,2,1); + float4 e(54,3,42,2); + + float4 c = a+b; + CHECK_CLOSE( 5.f, c.x().tofloat(), epsilon); + CHECK_CLOSE( 5.f, c.y().tofloat(), epsilon); + CHECK_CLOSE( 5.f, c.z().tofloat(), epsilon); + CHECK_CLOSE( 5.f, c.w().tofloat(), epsilon); + + c = a+b.wwwz(); + CHECK_CLOSE( 2.f, c.x().tofloat(), epsilon); + CHECK_CLOSE( 3.f, c.y().tofloat(), epsilon); + CHECK_CLOSE( 4.f, c.z().tofloat(), epsilon); + CHECK_CLOSE( 6.f, c.w().tofloat(), epsilon); + + c = a+b.z(); + CHECK_CLOSE( 3.f, c.x().tofloat(), epsilon); + CHECK_CLOSE( 4.f, c.y().tofloat(), epsilon); + CHECK_CLOSE( 5.f, c.z().tofloat(), epsilon); + CHECK_CLOSE( 6.f, c.w().tofloat(), epsilon); + + c = 
a+b.wwwz()+e.y(); + CHECK_CLOSE( 5.f, c.x().tofloat(), epsilon); + CHECK_CLOSE( 6.f, c.y().tofloat(), epsilon); + CHECK_CLOSE( 7.f, c.z().tofloat(), epsilon); + CHECK_CLOSE( 9.f, c.w().tofloat(), epsilon); + + float4 d = a; + CHECK_CLOSE( 1.f, d.x().tofloat(), epsilon); + CHECK_CLOSE( 2.f, d.y().tofloat(), epsilon); + CHECK_CLOSE( 3.f, d.z().tofloat(), epsilon); + CHECK_CLOSE( 4.f, d.w().tofloat(), epsilon); + + float1 a1 = float1(10.f); + + d = a+a1; + CHECK_CLOSE( 11.f, d.x().tofloat(), epsilon); + CHECK_CLOSE( 12.f, d.y().tofloat(), epsilon); + CHECK_CLOSE( 13.f, d.z().tofloat(), epsilon); + CHECK_CLOSE( 14.f, d.w().tofloat(), epsilon); + + a.x() = 0; + CHECK( all(a == float4(0,2,3,4)) ); + + a.y() = float1(12); + CHECK( all(a == float4(0,12,3,4)) ); + + a = float4(1,2,3,4); + + c = a+b; + CHECK( all(c == float4(5,5,5,5)) ); + + c = a*b; + CHECK( all(c == float4(4.f,6.f,6.f,4.f)) ); + + c = a/b; + CHECK( all(c == float4(1.f/4.f,2.f/3.f,3.f/2.f,4.f/1.f)) ); + + c = ++a; + CHECK( all(c == float4(2.f,3.f,4.f,5.f)) ); + CHECK( all(a == float4(2.f,3.f,4.f,5.f)) ); + + c = a++; + CHECK( all(c == float4(2.f,3.f,4.f,5.f)) ); + CHECK( all(a == float4(3.f,4.f,5.f,6.f)) ); + + c += b; + CHECK( all(c == float4(6.f,6.f,6.f,6.f)) ); + + c -= a; + CHECK( all(c == float4(3.f,2.f,1.f,0.f)) ); + + c += 5.f; + CHECK( all(c == float4(8.f,7.f,6.f,5.f)) ); + + c *= b; + CHECK( all(c == float4(32.f,21.f,12.f,5.f)) ); + + c /= b; + CHECK( all(c == float4(8.f,7.f,6.f,5.f)) ); + + c = -c; + CHECK( all(c == float4(-8.f,-7.f,-6.f,-5.f)) ); + + c -= .5f; + CHECK( all(c == float4(-8.5f,-7.5f,-6.5f,-5.5f)) ); + + c *= 2.f; + CHECK( all(c == float4(-17.f,-15.f,-13.f,-11.f)) ); + + c /= 3.f; + Vstorepf(c.eval(), v, 0); + CHECK_CLOSE(-17.f/3.f, v[0], epsilon); + CHECK_CLOSE(-15.f/3.f, v[1], epsilon); + CHECK_CLOSE(-13.f/3.f, v[2], epsilon); + CHECK_CLOSE(-11.f/3.f, v[3], epsilon); + } + + TEST_FIXTURE( SimdFixture, vecexpr1_operator ) + { + float ATTRIBUTE_ALIGN(ALIGN4F) v[4]; + + 
constant_float4(c,-1,2,-3,4); + float4 t(5,6,7,8); + + t.x() *= float1(-1.f); + CHECK( all(t == float4(-5.f, 6.f, 7.f, 8.f))); + + t.y() += float1(4.f); + CHECK( all(t == float4(-5.f, 10.f, 7.f, 8.f))); + + t.z() -= float1(-2.f); + CHECK( all(t == float4(-5.f, 10.f, 9.f, 8.f))); + + t.w() /= float1(-2.f); + CHECK( all(t == float4(-5.f, 10.f, 9.f, -4.f))); + + t.x() *= c.w(); + CHECK( all(t == float4(-20.f, 10.f, 9.f, -4.f))); + + t.y() /= c.z(); + Vstorepf(t.eval(), v, 0); + CHECK_CLOSE(-20.f, v[0], epsilon); + CHECK_CLOSE(10.f/-3.f, v[1], epsilon); + CHECK_CLOSE(9.f, v[2], epsilon); + CHECK_CLOSE(-4.f, v[3], epsilon); + + t.w() += c.y(); + Vstorepf(t.eval(), v, 0); + CHECK_CLOSE(-20.f, v[0], epsilon); + CHECK_CLOSE(10.f/-3.f, v[1], epsilon); + CHECK_CLOSE(9.f, v[2], epsilon); + CHECK_CLOSE(-2.f, v[3], epsilon); + + t.z() -= c.x(); + Vstorepf(t.eval(), v, 0); + CHECK_CLOSE(-20.f, v[0], epsilon); + CHECK_CLOSE(10.f/-3.f, v[1], epsilon); + CHECK_CLOSE(10.f, v[2], epsilon); + CHECK_CLOSE(-2.f, v[3], epsilon); + + float x = -c.x().tofloat(); + CHECK( x == 1.f ); + } + + + TEST_FIXTURE( SimdFixture, generic ) + { + float ATTRIBUTE_ALIGN(ALIGN4F) v[4]; + + float4 a(-1.f, -.263f, 345.f, 0.f); + float4 b(5.f, 2.34f, -12.76f, 54.f); + float4 c; + + float1 s; + + c = abs(a); + CHECK( all(c == float4(1.f, .263f, 345.f, 0.f))); + + c = math::clamp(c, float4(0.f, 1.f, 100.f, -2.f), float4(2.f, 3.f, 200.f, -10.f)); + CHECK( all(c == float4(1.f, 1.f, 200.f, -10.f))); + + c = cond(bool4(true), a, b); + CHECK( all(c == a)); + + c = cond(bool4(false), a, b); + CHECK( all(c == b)); + + c = cond(a<b, a, b); + CHECK( all(c == float4(-1.f, -.263f, -12.76f, 0.f))); + + + a = float4(-1.f, 0.f, 0.f, 0.f); + b = float4(0.f, 1.f, 0.f, 0.f); + c = cross(a, b); + CHECK( all(c == float4(0.f, 0.f, -1.f, 0.f))); + + a = float4(-1.f, 2.f, -4.f, 1.f); + b = float4(4.f, 1.f, -3.f, 1.f); + c = cross(a, b); + CHECK( all(c == float4(-2.f, -19.f, -9.f, 0.f))); + + c = degrees(float4( float(M_PIf), 
float(M_PI_2f), float(M_PI_4f), 0.f)); + CHECK( all(c == float4(180.f, 90.f, 45.f, 0.f))); + + c = radians(float4(180.f, 90.f, 45.f, 0.f)); + CHECK( all(c == float4( float(M_PIf), float(M_PI_2f), float(M_PI_4f), 0.f))); + + float1 teta = dot(float4(1,0,0,0), float4(0,1,0,0)); + CHECK_CLOSE( 0.f, teta.tofloat(), epsilon); + + teta = dot(float4(1,0,0,0), float4(1,0,0,0)); + CHECK_CLOSE( 1.f, teta.tofloat(), epsilon); + + teta = dot(float4(1,0,0,0), normalize(float4(1,1,0,0))); + CHECK_CLOSE( 0.70710f, teta.tofloat(), epsilon); + + s = dot(float4( 10.f, 5.f, 2.f, 0.f)); + CHECK_CLOSE( 129.0f, s.tofloat(), epsilon); + + s = length( float4( 1.f, 0.f, 0.f, 0.f)); + CHECK_CLOSE( 1.f, s.tofloat(), epsilon ); + + s = length( float4( 10.f, 5.f, 2.f, 0.f)); + CHECK_CLOSE( 11.357816f, s.tofloat(), epsilon); + + s = length( float4( 0.f, 0.f, 0.f, 0.f)); + CHECK_CLOSE( 0.f, s.tofloat(), epsilon ); + + s = lerp(float1(3), float1(6), float1(.3333333f)); + CHECK_CLOSE( 4.f, s.tofloat(), epsilon); + + c = lerp(float4(1,2,3,4), float4(3,4,5,6), float1(.5f)); + CHECK_CLOSE( 2.f, c.x().tofloat(), epsilon); + CHECK_CLOSE( 3.f, c.y().tofloat(), epsilon); + CHECK_CLOSE( 4.f, c.z().tofloat(), epsilon); + CHECK_CLOSE( 5.f, c.w().tofloat(), epsilon); + + c = lerp(float4(1,2,3,4), float4(3,4,5,6), float4(-.5f,0,1.0,1.5f)); + CHECK_CLOSE( 0.f, c.x().tofloat(), epsilon); + CHECK_CLOSE( 2.f, c.y().tofloat(), epsilon); + CHECK_CLOSE( 5.f, c.z().tofloat(), epsilon); + CHECK_CLOSE( 7.f, c.w().tofloat(), epsilon); + + s = maximum(float4(-1.f, -.263f, 345.f, 0.f)); + CHECK_CLOSE( 345.f, s.tofloat(), epsilon); + + s = minimum(float4(-1.f, -.263f, 345.f, 0.f)); + CHECK_CLOSE( -1.f, s.tofloat(), epsilon); + + c = normalize(float4( 0.f, 0.f, 0.f, 1.f)); + CHECK_CLOSE( 0.f, c.x().tofloat(), epsilon); + CHECK_CLOSE( 0.f, c.y().tofloat(), epsilon); + CHECK_CLOSE( 0.f, c.z().tofloat(), epsilon); + CHECK_CLOSE( 1.f, c.w().tofloat(), epsilon); + + c = normalize(float4( 10.f, 5.f, 2.f, 0.f)); + CHECK_CLOSE( 
0.880451f, c.x().tofloat(), epsilon); + CHECK_CLOSE( 0.440225f, c.y().tofloat(), epsilon); + CHECK_CLOSE( 0.176090f, c.z().tofloat(), epsilon); + CHECK_CLOSE( 0.f, c.w().tofloat(), epsilon); + + c = rcp(float4( -25.f, 45.f, .5f, 1.f)); + CHECK_CLOSE( 1.f/-25.f, c.x().tofloat(), epsilon); + CHECK_CLOSE( 1.f/45.f, c.y().tofloat(), epsilon); + CHECK_CLOSE( 1.f/.5f, c.z().tofloat(), epsilon); + CHECK_CLOSE( 1.f, c.w().tofloat(), epsilon); + + //float4 rsqrt(float4 const& r); + + float f = saturate(-2.f); + CHECK_CLOSE( 0.f, f, epsilon); + + f = saturate(.5f); + CHECK_CLOSE( .5f, f, epsilon); + + f = saturate(1.5f); + CHECK_CLOSE( 1.f, f, epsilon); + + c = saturate(float4( -25.f, 0.f, .5f, 1.5f)); + CHECK( all(c == float4( 0.f, 0.f, .5f, 1.f))); + + f = sgn(-25.f); + CHECK_CLOSE( -1.f, f, epsilon); + + f = sgn(0.f); + CHECK_CLOSE( 1.f, f, epsilon); + + f = sgn(3.f); + CHECK_CLOSE( 1.f, f, epsilon); + + c = sgn(float4( -25.f, 0.f, .5f, 1.5f)); + CHECK( all(c == float4( -1.f, 1.f, 1.f, 1.f))); + + c = sgn(float4( 25.f, 0.f, -.5f, -1.5f)); + CHECK( all(c == float4( 1.f, 1.f, -1.f, -1.f))); + + // inconsistant how sgn of -0 is interpreted. 
should not matter in any real world scenarios +/* c = sgn(float4( -25.f, -0.f, .5f, -1.5f)); + CHECK( all(c == float4( -1.f, -1.f, 1.f, -1.f))); +*/ + f = sign(-25.f); + CHECK_CLOSE( -1.f, f, epsilon); + + f = sign(0.f); + CHECK_CLOSE( 0.f, f, epsilon); + + f = sign(3.f); + CHECK_CLOSE( 1.f, f, epsilon); + + c = sign(float4( -25.f, 0.f, .5f, 1.5f)); + CHECK( all(c == float4( -1.f, 0.f, 1.f, 1.f))); + + c = sign(float4( 25.f, -0.f, .5f, -1.5f)); + CHECK( all(c == float4( 1.f, 0.f, 1.f, -1.f))); + + c = sqrt(float4( 9.f, 81.f, 49.f, 74.f)); + Vstorepf(c.eval(), v, 0); + CHECK_CLOSE(3.f, v[0], epsilon); + CHECK_CLOSE(9.f, v[1], epsilon); + CHECK_CLOSE(7.f, v[2], epsilon); + CHECK_CLOSE(8.602325f, v[3], epsilon); + + s = sum(float4( 9.f, 81.f, 49.f, 74.f)); + CHECK_CLOSE( 213.f, s.tofloat(), epsilon); + + c = math::vector(float4( -25.f, 0.f, .5f, 1.5f)); + CHECK( all(c == float4( -25.f, 0.f, .5f, 0.f))); + + a = float4(-1.f, -4.f, 8.f, 0.f); + b = float4(5.f, 2.f, -2.f, 54.f); + c = float4( -25.f, 0.f, .5f, 1.5f); + float4 d = float4(Vmadd(a.eval(),b.eval(),c.eval())); + CHECK(all(d == float4(-30.f,-8.f,-15.5f,1.5f))); + + d = float4(Vmsub(a.eval(),b.eval(),c.eval())); + CHECK(all(d == float4(20.f,-8.f,-16.5f,-1.5f))); + + bool4 bv = bool4(0, 0, 0, 0); + CHECK( any(bv) == false ); + + bv = bool4(0, 1, 0, 0); + CHECK( any(bv) == true ); + + bv = bool4(1, 1, 1, 1); + CHECK( any(bv) == true ); + } + + TEST_FIXTURE( SimdFixture, quaternion ) + { + float epsilon = 1e-4f; + + float4 qx(1,0,0,0); + float4 qy(0,1,0,0); + float4 qz(0,0,1,0); + + float4 vz(0,0,1,0); + + float4 v; + + v = quatMulVec(qy, vz); + CHECK( all(v == float4(0.f, 0.f, -1.f, 0.f))); + + v = quatMulVec(qz, vz); + CHECK( all(v == float4(0.f, 0.f, 1.f, 0.f))); + + v = quatMulVec(qx, vz); + CHECK( all(v == float4(0.f, 0.f, -1.f, 0.f))); + + float4 euler(radians(-38.22f), radians(16.16f), radians(-45.96f), 0.f ); + float4 q = quatEulerToQuat(euler); + v = quatMulVec(q, float4(32.21f, 61.03f, -11.19f, 0.f) ); + 
CHECK_CLOSE( 41.990852f, v.x().tofloat(), epsilon); + CHECK_CLOSE( 15.592499f, v.y().tofloat(), epsilon); + CHECK_CLOSE( -53.674984f, v.z().tofloat(), epsilon); + CHECK_CLOSE( 0.f, v.w().tofloat(), epsilon); + + float4 q1(radians(-38.22f), radians(16.16f), radians(-45.96f), 0.f ); + float4 q2(radians(79.24f), radians(-1.61f), radians(-33.15f), 0.f ); + float4 q3(radians(38.40f), radians(-6.50f), radians(-70.45f), 0.f); + q3 = quatEulerToQuat(q3); + q = quatMul(quatEulerToQuat(q1), quatEulerToQuat(q2)); + CHECK_CLOSE( q.x().tofloat(), q3.x().tofloat(), epsilon); + CHECK_CLOSE( q.y().tofloat(), q3.y().tofloat(), epsilon); + CHECK_CLOSE( q.z().tofloat(), q3.z().tofloat(), epsilon); + CHECK_CLOSE( q.w().tofloat(), q3.w().tofloat(), epsilon); + + q3 = quatConj(q3); + CHECK_CLOSE( -q.x().tofloat(), q3.x().tofloat(), epsilon); + CHECK_CLOSE( -q.y().tofloat(), q3.y().tofloat(), epsilon); + CHECK_CLOSE( -q.z().tofloat(), q3.z().tofloat(), epsilon); + CHECK_CLOSE( q.w().tofloat(), q3.w().tofloat(), epsilon); + + Axes cAxes; + q3 = ToAxes(cAxes,q2); + CHECK_CLOSE( 3.121f, q3.x().tofloat(), epsilon); + CHECK_CLOSE( 0.792092f, q3.y().tofloat(), epsilon); + CHECK_CLOSE( -0.0492416f, q3.z().tofloat(), epsilon); + CHECK_CLOSE( 0.0f, q3.w().tofloat(), epsilon); + + q3 = FromAxes(cAxes,q3); + CHECK_CLOSE( 0.922363f, q3.x().tofloat(), epsilon); + CHECK_CLOSE( -0.0187406f, q3.y().tofloat(), epsilon); + CHECK_CLOSE( -0.38587f, q3.z().tofloat(), epsilon); + CHECK_CLOSE( 0.0f, q3.w().tofloat(), epsilon); + + Axes aAxiz (float4(0,-0.268f,-0.364f,1),float4(0,-2,1,0),float4(-1,-1,1,1),17,math::kZYRoll); + q3 = ToAxes(aAxiz,q2); + CHECK_CLOSE( 1.28582f, q3.x().tofloat(), epsilon); + CHECK_CLOSE( 1.69701f, q3.y().tofloat(), epsilon); + CHECK_CLOSE( -0.652772f, q3.z().tofloat(), epsilon); + CHECK_CLOSE( 0.0f, q3.w().tofloat(), epsilon); + + q3 = FromAxes(aAxiz,q3); + CHECK_CLOSE( 0.922363f, q3.x().tofloat(), epsilon); + CHECK_CLOSE( -0.0187405f, q3.y().tofloat(), epsilon); + CHECK_CLOSE( 
-0.38587f, q3.z().tofloat(), epsilon); + CHECK_CLOSE( -0.0f, q3.w().tofloat(), epsilon); + + + + + /* float4 left(0.999886f, 0.011893f, -0.006366f, 0); + float4 up(-0.012257f, 0.998085f, -0.060634f, 0); + float4 front(0.005632f, 0.060705f, 0.998116f, 0); + float4 rootX( -0.068884f, -0.000000f, -0.000000f, 0.997625f); + + math::float4 ret = math::normalize(math::quatMul(math::quatMatrixToQuat(left,up,front),math::quatConj(rootX))); + CHECK_CLOSE( 0.278572f, ret.x().tofloat(), epsilon); + CHECK_CLOSE( 0.247044f, ret.y().tofloat(), epsilon); + CHECK_CLOSE( 0.216015f, ret.z().tofloat(), epsilon); + CHECK_CLOSE( 0.902610f, ret.w().tofloat(), epsilon); + */ + } + + TEST_FIXTURE( SimdFixture, trigonometric ) + { + + int degree; + for(degree=-90;degree<90;degree++) + { + float rad = radians( static_cast<float>(degree) ); + + float sin_stl = math::sin(rad); + math::float4 sin_unity = math::sin_est( math::float4(rad) ); + + CHECK_CLOSE( sin_stl, sin_unity.x().tofloat(), 9.2e-5f); + } + + for(degree=-90;degree<90;degree++) + { + float rad = radians( static_cast<float>(degree) ); + + float cos_stl = math::cos(rad); + math::float4 cos_unity = math::cos_est( math::float4(rad) ); + + CHECK_CLOSE( cos_stl, cos_unity.x().tofloat(), 9.0e-4); + } + + /* + float sin_stl = 0; + ABSOLUTE_TIME time_stl = START_TIME; + + for(int i=0;i<1000;i++) + { + + for(degree=-90;degree<90;degree++) + { + float rad = radians( static_cast<float>(degree) ); + sin_stl += math::sin(rad); + } + } + + time_stl = ELAPSED_TIME(time_stl); + + math::float4 sin_unity = math::float4::zero(); + ABSOLUTE_TIME time_unity = START_TIME; + + for(int i=0;i<1000;i++) + { + for(degree=-90;degree<90;degree++) + { + float rad = radians( static_cast<float>(degree) ); + sin_unity += math::sin_est( math::float4(rad) ); + } + } + + time_unity = ELAPSED_TIME(time_unity); + + CHECK_CLOSE( sin_stl, sin_unity.x().tofloat(), 9.0e-4); + CHECK_CLOSE( time_stl, time_unity, 1); + */ + } +} + +#endif diff --git 
a/Runtime/Math/Simd/bool1.h b/Runtime/Math/Simd/bool1.h
new file mode 100644
index 0000000..d195a4c
--- /dev/null
+++ b/Runtime/Math/Simd/bool1.h
@@ -0,0 +1,42 @@
#ifndef SIMD_BOOL1_H
#define SIMD_BOOL1_H

#include "Runtime/Math/Simd/intrinsic.h"

namespace math
{

// A scalar boolean kept in a SIMD register so that float1 comparison results
// can stay in vector registers instead of round-tripping through a GPR.
struct ATTRIBUTE_ALIGN(ALIGN4F) bool1
{
	enum {
		RHS_SWZ = kXYZW	// swizzle applied when this value is read as an expression operand
	};

	typedef bool scalar_type;
	typedef vec4bs packed_type;

	packed_type s;

	// Intentionally uninitialized, matching plain bool semantics.
	MECANIM_FORCE_INLINE bool1()
	{}

	MECANIM_FORCE_INLINE explicit bool1(packed_type x) : s(x)
	{}

	// Splat a scalar bool into the packed register.
	MECANIM_FORCE_INLINE explicit bool1(scalar_type x) : s(Vloadsb(x))
	{}

	// Read back as a plain bool.
	MECANIM_FORCE_INLINE operator scalar_type() const
	{ return Vstoresb(s); }

	MECANIM_FORCE_INLINE bool1 &operator=(const bool1 &r)
	{ s = r.s; return *this; }

	// unary operators
	MECANIM_FORCE_INLINE bool1 operator!() const
	{ bool1 r = bool1(Vnot(s)); return r; }
};

}

#endif
diff --git a/Runtime/Math/Simd/bool4.h b/Runtime/Math/Simd/bool4.h
new file mode 100644
index 0000000..05afc1d
--- /dev/null
+++ b/Runtime/Math/Simd/bool4.h
@@ -0,0 +1,61 @@
#ifndef SIMD_BOOL4_H
#define SIMD_BOOL4_H


#include "Runtime/Math/Simd/intrinsic.h"
#include "Runtime/Math/Simd/bool1.h"

namespace math
{

template<typename T> struct vecexp4;

// Four boolean lanes in a SIMD register; the result type of float4 comparisons.
struct ATTRIBUTE_ALIGN(ALIGN4F) bool4
{
	typedef bool scalar_type;
	typedef vec4b packed_type;

	packed_type v;

	// Intentionally uninitialized.
	MECANIM_FORCE_INLINE bool4() {}

	MECANIM_FORCE_INLINE bool4(const bool4 &r):v(r.v) { }

	MECANIM_FORCE_INLINE bool4(const packed_type &r):v(r) { }

	// Splat one bool into all four lanes.
	explicit MECANIM_FORCE_INLINE bool4(bool s):v(Vloadsb(s)) { }

	explicit MECANIM_FORCE_INLINE bool4(bool1 const& r):v(r.s) { }

	// Build from four independent lane values.
	MECANIM_FORCE_INLINE bool4(bool x, bool y, bool z, bool w):v(Vload4sb(x,y,z,w)) { }

	MECANIM_FORCE_INLINE bool4 &operator=(const bool4 &r) { v = r.v; return *this; }

	MECANIM_FORCE_INLINE bool4 &operator=(bool s) { v = Vloadsb(s); return *this; }

	// Lane-wise logical not.
	MECANIM_FORCE_INLINE bool4 operator!() const { bool4 r = 
Vnot(v); return r; }
};

// Lane-wise logical operators on bool4.
// Equality is xnor: a lane compares equal when both inputs are true or both false.
static MECANIM_FORCE_INLINE bool4 operator==(bool4 const& l, bool4 const& r)
{
	return bool4(Vxnor(l.v, r.v) );
}

static MECANIM_FORCE_INLINE bool4 operator!=(bool4 const& l, bool4 const& r)
{
	return bool4(Vxor(l.v, r.v) );
}

static MECANIM_FORCE_INLINE bool4 operator&&(bool4 const& l, bool4 const& r)
{
	return bool4(Vand(l.v, r.v) );
}

static MECANIM_FORCE_INLINE bool4 operator||(bool4 const& l, bool4 const& r)
{
	return bool4(Vor(l.v, r.v) );
}

}

#endif
diff --git a/Runtime/Math/Simd/float1.h b/Runtime/Math/Simd/float1.h
new file mode 100644
index 0000000..90c17ae
--- /dev/null
+++ b/Runtime/Math/Simd/float1.h
@@ -0,0 +1,232 @@
#ifndef SIMD_FLOAT1_H
#define SIMD_FLOAT1_H

#include "Runtime/Serialize/TransferFunctions/SerializeTransfer.h"

#include "Runtime/Math/Simd/intrinsic.h"

#include "Runtime/Math/Simd/bool1.h"

namespace math
{

struct float1;

// Expression wrapper for a single-lane SIMD value. T supplies the packed
// storage plus the LHS_SWZ/RHS_SWZ masks that say which lane this expression
// reads from / writes to, so assignments only touch their own lane.
template<typename T> struct ATTRIBUTE_ALIGN(ALIGN4F) vecexp1 : T
{
	typedef T value_type;
	typedef typename T::scalar_type scalar_type;
	typedef typename T::packed_type packed_type;

	MECANIM_FORCE_INLINE vecexp1() {}

	MECANIM_FORCE_INLINE vecexp1(vecexp1 const& e):value_type(e) { }

	explicit MECANIM_FORCE_INLINE vecexp1(scalar_type s):value_type(s){ }

	explicit MECANIM_FORCE_INLINE vecexp1(packed_type vector):value_type(vector) { }

	MECANIM_FORCE_INLINE vecexp1(value_type const& r):value_type(r) { }

	MECANIM_FORCE_INLINE vecexp1(scalar_type x, scalar_type y, scalar_type z, scalar_type w):value_type(x,y,z,w) { }

	// Construct from another expression type: evaluate it and write it through
	// this expression's left-hand-side swizzle.
	template<typename R> MECANIM_FORCE_INLINE vecexp1(const vecexp1<R> &e){
		value_type::v = Vswizzle<value_type::LHS_SWZ>::lhs(value_type::v, e.eval() );
	}

	inline bool IsFinite ()const
	{
		return ::IsFinite(Vstoresf(value_type::v));
	}

	MECANIM_FORCE_INLINE vecexp1 &operator=(const vecexp1 &e) {
		SIMD_ASSERT_IF(!e.IsFinite());
		value_type::v = Vswizzle<value_type::LHS_SWZ>::lhs(value_type::v, e.eval() 
);
		return *this;
	}

	// Assign from a different expression type; only this expression's lane is written.
	template<typename R> MECANIM_FORCE_INLINE vecexp1 &operator=(const vecexp1<R> &e){
		SIMD_ASSERT_IF(!e.IsFinite());
		value_type::v = Vswizzle<value_type::LHS_SWZ>::lhs(value_type::v, e.eval() );
		return *this;
	}

	MECANIM_FORCE_INLINE vecexp1 &operator=(scalar_type s) {
		SIMD_ASSERT_IF(!::IsFinite(s));
		value_type::operator =(s);
		return *this; }

	MECANIM_FORCE_INLINE const vecexp1 &operator+() const { return *this; }
	MECANIM_FORCE_INLINE vecexp1 operator-() const { return vecexp1(Vneg( value_type::eval() ) ); }

	// Compound assignment: evaluate both sides, combine, then write the result
	// back through the left-hand-side swizzle so other lanes are preserved.
	template<typename R>MECANIM_FORCE_INLINE vecexp1 &operator+=(const vecexp1<R> &r) {
		value_type::v = Vswizzle<value_type::LHS_SWZ>::lhs(value_type::v, Vadd( value_type::eval(), r.eval() )); return *this;
	}

	template<typename R> MECANIM_FORCE_INLINE vecexp1 &operator-=(const vecexp1<R> &r) {
		value_type::v = Vswizzle<value_type::LHS_SWZ>::lhs(value_type::v, Vsub( value_type::eval(), r.eval() )); return *this;
	}

	template<typename R> MECANIM_FORCE_INLINE vecexp1 &operator*=(const vecexp1<R> &r) {
		value_type::v = Vswizzle<value_type::LHS_SWZ>::lhs(value_type::v, Vmul(value_type::eval(), r.eval())); return *this;
	}

	template<typename R> MECANIM_FORCE_INLINE vecexp1 &operator/=(const vecexp1<R> &r) {
		value_type::v = Vswizzle<value_type::LHS_SWZ>::lhs(value_type::v, Vdiv(value_type::eval(), r.eval())); return *this;
	}
};

// Storage policy: one scalar replicated across a 4-lane SIMD register.
struct ATTRIBUTE_ALIGN(ALIGN4F) vec1
{
	typedef float scalar_type;
	typedef vec4f packed_type;
	typedef vec4f const& const_reference_packed_type;
	typedef vec4f& reference_packed_type;

	enum{
		RHS_SWZ = kXYZW,
		LHS_SWZ = kXYZW
	};

	MECANIM_FORCE_INLINE vec1() {}

	// Splat the scalar into all four lanes.
	MECANIM_FORCE_INLINE vec1(scalar_type scalar):v(Vloadsf(scalar)) { }

	MECANIM_FORCE_INLINE vec1(packed_type vector):v(vector){ }

	template<typename T> MECANIM_FORCE_INLINE vec1(const T& vector) { v = vector.eval(); }

	// Extract the scalar value (all lanes are equal by construction).
	MECANIM_FORCE_INLINE scalar_type tofloat() const { return Vstoresf( v ); }

	MECANIM_FORCE_INLINE vec1 &operator=(const vec1 &l) { v = l.v; return *this; }

	MECANIM_FORCE_INLINE packed_type eval()const{ return v; }

protected:
	packed_type v;
};

// A scalar float kept splatted in a SIMD register; mixes freely with float4
// expressions without leaving vector registers.
struct ATTRIBUTE_ALIGN(ALIGN4F) float1 : public vecexp1<vec1>
{
	DEFINE_GET_TYPESTRING(float1)

	typedef vec1 value_type;
	typedef vec1::scalar_type scalar_type;
	typedef vec1::packed_type packed_type;

	MECANIM_FORCE_INLINE float1() {}

	MECANIM_FORCE_INLINE explicit float1(scalar_type s):vecexp1<value_type>(s) {}

	MECANIM_FORCE_INLINE explicit float1(packed_type const& v):vecexp1<value_type>(v) { }

	MECANIM_FORCE_INLINE float1(const float1 &f):vecexp1<value_type>(f) { }

	template<typename R> MECANIM_FORCE_INLINE float1(const vecexp1<R> &e){ value_type::v = e.eval(); }

	MECANIM_FORCE_INLINE float1 &operator=(const float1 &f) { value_type::v = f.eval(); return *this; }

	template<typename R> MECANIM_FORCE_INLINE float1 &operator=(const vecexp1<R> &e) { value_type::v = e.eval(); return *this; }

	// Compound assignment; unlike vecexp1, all lanes hold the scalar so no
	// write-back swizzle is needed.
	template<typename R>MECANIM_FORCE_INLINE float1 &operator+=(const vecexp1<R> &r) {
		value_type::v = Vadd( value_type::eval(), r.eval() ); return *this;
	}

	template<typename R> MECANIM_FORCE_INLINE float1 &operator-=(const vecexp1<R> &r) {
		value_type::v = Vsub( value_type::eval(), r.eval() ); return *this;
	}

	template<typename R> MECANIM_FORCE_INLINE float1 &operator*=(const vecexp1<R> &r) {
		value_type::v = Vmul(value_type::eval(), r.eval() ); return *this;
	}

	template<typename R> MECANIM_FORCE_INLINE float1 &operator/=(const vecexp1<R> &r) {
		value_type::v = Vdiv(value_type::eval(), r.eval() ); return *this;
	}


	MECANIM_FORCE_INLINE float1 &operator++() { value_type::v = Vinc(value_type::v); return *this; }
	MECANIM_FORCE_INLINE float1 operator++(int) { float1 r = *this; value_type::v = Vinc(value_type::v); return r; }

	MECANIM_FORCE_INLINE float1 &operator--() { value_type::v = Vdec(value_type::v); return *this; }
	MECANIM_FORCE_INLINE float1 operator--(int) { float1 r = *this; value_type::v = Vdec(value_type::v); return r; }

	static float1 zero() {return float1(Vzero()); } // 0
	static float1 one() {return float1(Vone());} // 1

	// Serialize as a single named float "x".
	template<class TransferFunction>
	MECANIM_FORCE_INLINE void Transfer (TransferFunction& transfer)
	{
		/////@TODO: This is wrong. It will not work for SafeBinaryRead
		/////// Probably other places in the code too!

		float x;
		if(transfer.IsReading())
		{
			transfer.Transfer(x, "x");
			*this = float1(x);
		}
		else if(transfer.IsWriting())
		{
			x = tofloat();

			transfer.Transfer(x, "x");
		}
		else
		{
			// Metadata-only pass: x is never read, only its name/type is recorded.
			transfer.Transfer(x, "x");
		}
	}
};

// vecexp1 Arithmetic: binary operators materialize a fresh vec1 result.
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp1<vec1> operator+(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return vecexp1<vec1>( Vadd( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp1<vec1> operator-(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return vecexp1<vec1>( Vsub( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp1<vec1> operator*(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return vecexp1<vec1>( Vmul( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp1<vec1> operator/(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return vecexp1<vec1>( Vdiv( l.eval(), r.eval() ));
}

// vecexp1 logic: comparisons yield bool1 (result kept in a vector register).
template <typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool1 operator<(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return bool1( Vcmplt( l.eval(), r.eval() ) );
}
template <typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool1 operator<=(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return bool1( Vcmple( l.eval(), r.eval() ) );
}
template <typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool1 operator==(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return 
	bool1( Vcmpeq( l.eval(), r.eval() ) );
}
// Lane-wise inequality of two scalar expressions; result replicated into a bool1.
template <typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool1 operator!=(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return bool1( Vcmpneq( l.eval(), r.eval() ) );
}
// Greater-or-equal comparison of two scalar expressions.
template <typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool1 operator>=(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return bool1( Vcmpge( l.eval(), r.eval() ));
}
// Strictly-greater comparison of two scalar expressions.
template <typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool1 operator>(const vecexp1<LHS> &l, const vecexp1<RHS> &r)
{
	return bool1( Vcmpgt( l.eval(), r.eval() ) );
}

}


#endif
diff --git a/Runtime/Math/Simd/float4.h b/Runtime/Math/Simd/float4.h
new file mode 100644
index 0000000..7b653e6
--- /dev/null
+++ b/Runtime/Math/Simd/float4.h
@@ -0,0 +1,397 @@
#ifndef SIMD_FLOAT4_H
#define SIMD_FLOAT4_H

#include "Runtime/Serialize/TransferFunctions/SerializeTransfer.h"

#include "Runtime/Math/Simd/intrinsic.h"

#include "Runtime/Math/Simd/bool4.h"
#include "Runtime/Math/Simd/float1.h"

namespace math
{

// Expression-template node for a 4-wide SIMD value. T supplies the storage
// (packed_type v) plus eval(); vecexp4 layers construction/assignment glue on
// top so heterogeneous expression nodes can be combined without leaving
// SIMD registers.
template<typename T> struct ATTRIBUTE_ALIGN(ALIGN4F) vecexp4 : T
{
	typedef T value_type;
	typedef typename T::scalar_type scalar_type;
	typedef typename T::packed_type packed_type;

	MECANIM_FORCE_INLINE vecexp4() {}

	MECANIM_FORCE_INLINE vecexp4(vecexp4 const& e):value_type(e) { }

	template<typename R> MECANIM_FORCE_INLINE vecexp4(const vecexp4<R> &e):value_type(e){ }

	MECANIM_FORCE_INLINE vecexp4(scalar_type const& x, scalar_type const& y, scalar_type const& z, scalar_type const& w):value_type(x,y,z,w) { }

	MECANIM_FORCE_INLINE vecexp4(value_type const& r):value_type(r) { }

	//template<typename R1, typename R2, typename R3, typename R4> MECANIM_FORCE_INLINE vecexp4(const vecexp1<R1> &x, const vecexp1<R2> &y, const vecexp1<R3> &z, const vecexp1<R4> &w) {
	//	value_type::v = Vcombine( x.eval(), y.eval(), z.eval(), w.eval() );
	//}

	// Build a packed vector from four scalar expressions (takes each one's x lane).
	MECANIM_FORCE_INLINE vecexp4(const float1 &x, const float1 &y, const float1 &z, const float1 &w) {
		value_type::v = Vcombine( x.eval(), y.eval(), z.eval(), w.eval() );
	}

	MECANIM_FORCE_INLINE vecexp4 &operator=(const vecexp4 &l) { value_type::operator =(l); return *this; }

	// Assigning any other expression forces its evaluation into our storage.
	template<typename R> MECANIM_FORCE_INLINE vecexp4 &operator=(const vecexp4<R> &e) {
		value_type::v = e.eval(); return *this;
	}

	template<typename R> MECANIM_FORCE_INLINE vecexp4 &operator=(const vecexp1<R> &e) {
		value_type::v = e.eval(); return *this;
	}

	MECANIM_FORCE_INLINE const vecexp4 &operator+() const { return *this; }
	MECANIM_FORCE_INLINE vecexp4 operator-() const { return vecexp4(Vneg( value_type::eval() )); }
};

// Single-lane (scalar) view of a packed vector. Reading (eval/tofloat)
// broadcasts the selected lane via RHS_MASK; writing a scalar merges it back
// through LHS_MASK. RHS_VECTOR/LHS_VECTOR may be value or reference types so
// the same template serves both const and mutable access (see vec4::x() et al).
template<typename SCALAR, typename RHS_VECTOR, typename LHS_VECTOR, int RHS_MASK, int LHS_MASK> struct ATTRIBUTE_ALIGN(ALIGN4F) swizzle1
{
	typedef SCALAR scalar_type;
	typedef LHS_VECTOR packed_type;
	typedef RHS_VECTOR rhs_packed_type;

	enum{
		RHS_SWZ = RHS_MASK,
		LHS_SWZ = LHS_MASK
	};


	packed_type v;

	MECANIM_FORCE_INLINE swizzle1(packed_type vector):v(vector) {}

	// Write the scalar into the lane this view designates, leaving other lanes intact.
	MECANIM_FORCE_INLINE swizzle1 &operator=(const scalar_type &s) { v = Vswizzle<LHS_SWZ>::lhs(v, Vloadsf(s)); return *this; }

	MECANIM_FORCE_INLINE scalar_type tofloat() { return Vstoresf( Vswizzle<RHS_SWZ>::rhs(v) ); }

	MECANIM_FORCE_INLINE rhs_packed_type eval()const{ return Vswizzle<RHS_SWZ>::rhs(v); }
private:


	// A lane view cannot be rebound to another lane view; only scalar assignment is allowed.
	MECANIM_FORCE_INLINE swizzle1 &operator=(const swizzle1 &s) {return *this;}
};

// Read-only reordered view of a packed vector; eval() applies MASK.
template<typename SCALAR, typename VECTOR, int MASK> struct ATTRIBUTE_ALIGN(ALIGN4F) swizzle
{
	typedef SCALAR scalar_type;
	typedef VECTOR packed_type;

	enum{
		RHS_SWZ = MASK
	};

	MECANIM_FORCE_INLINE swizzle(packed_type const& vector):v(vector) {}

	MECANIM_FORCE_INLINE packed_type eval()const{ return Vswizzle<RHS_SWZ>::rhs(v); }

protected:
	packed_type v;
};

// Concrete storage node: owns one packed 4-float register and exposes
// lane accessors (x/y/z/w) plus the reordered views used by quaternion math.
struct ATTRIBUTE_ALIGN(ALIGN4F) vec4
{
	typedef float scalar_type;
	typedef vec4f packed_type;
	typedef vec4f const& const_reference_packed_type;
	typedef vec4f& reference_packed_type;

	enum{
		RHS_SWZ = kXYZW
	};

	MECANIM_FORCE_INLINE vec4() {}

	MECANIM_FORCE_INLINE vec4(scalar_type x, scalar_type y, scalar_type z, scalar_type w):v(Vload4sf(x,y,z,w)) { }

	// Broadcast a single scalar into all four lanes.
	MECANIM_FORCE_INLINE vec4(scalar_type s):v(Vloadsf(s)) { }

	MECANIM_FORCE_INLINE vec4(packed_type vector):v(vector){ }

	template<typename T> MECANIM_FORCE_INLINE vec4(const T& vector):v(vector.eval()){ }

	MECANIM_FORCE_INLINE vec4 &operator=(const vec4 &l) {
		SIMD_ASSERT_IF(!l.IsFinite());
		v = l.v;
		return *this;
	}

	MECANIM_FORCE_INLINE packed_type eval()const{ return v; }

	// True when every lane is a finite float (no NaN/Inf).
	inline bool IsFinite ()const
	{
		return ::IsFinite( x().tofloat() ) & ::IsFinite( y().tofloat() ) & ::IsFinite( z().tofloat() ) & ::IsFinite( w().tofloat() );
	}

	// Const lane accessors: read broadcasts the lane; the LHS mask is the merge
	// permutation used when the mutable overloads below write a lane back.
	MECANIM_FORCE_INLINE vecexp1< swizzle1<scalar_type, packed_type, packed_type, kXXXX, kXYZW> > x()const { return vecexp1< swizzle1<scalar_type, packed_type, packed_type, kXXXX, kXYZW> >(v); }
	MECANIM_FORCE_INLINE vecexp1< swizzle1<scalar_type, packed_type, packed_type, kYYYY, kYXZW> > y()const { return vecexp1< swizzle1<scalar_type, packed_type, packed_type, kYYYY, kYXZW> >(v); }
	MECANIM_FORCE_INLINE vecexp1< swizzle1<scalar_type, packed_type, packed_type, kZZZZ, kZYXW> > z()const { return vecexp1< swizzle1<scalar_type, packed_type, packed_type, kZZZZ, kZYXW> >(v); }
	MECANIM_FORCE_INLINE vecexp1< swizzle1<scalar_type, packed_type, packed_type, kWWWW, kWYZX> > w()const { return vecexp1< swizzle1<scalar_type, packed_type, packed_type, kWWWW, kWYZX> >(v); }

	// Mutable lane accessors: hold a reference to v so `vec.x() = s` writes through.
	MECANIM_FORCE_INLINE vecexp1< swizzle1<scalar_type, packed_type, reference_packed_type, kXXXX, kXYZW> > x() { return vecexp1< swizzle1<scalar_type, packed_type, reference_packed_type, kXXXX, kXYZW> >(v); }
	MECANIM_FORCE_INLINE vecexp1< swizzle1<scalar_type, packed_type, reference_packed_type, kYYYY, kYXZW> > y() { return vecexp1< swizzle1<scalar_type, packed_type, reference_packed_type, kYYYY, kYXZW> >(v); }
	MECANIM_FORCE_INLINE vecexp1< swizzle1<scalar_type, packed_type, reference_packed_type, kZZZZ, kZYXW> > z() { return vecexp1< swizzle1<scalar_type, packed_type, reference_packed_type, kZZZZ, kZYXW> >(v); }
	MECANIM_FORCE_INLINE vecexp1< swizzle1<scalar_type, packed_type, reference_packed_type, kWWWW, kWYZX> > w() { return vecexp1< swizzle1<scalar_type, packed_type, reference_packed_type, kWWWW, kWYZX> >(v); }

	// Reordered read-only views (lazy: permutation happens at eval()).
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kWXZY> > wxzy()const { return vecexp4< swizzle<scalar_type, packed_type, kWXZY> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kXZWY> > xzwy()const { return vecexp4< swizzle<scalar_type, packed_type, kXZWY> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kXWYZ> > xwyz()const { return vecexp4< swizzle<scalar_type, packed_type, kXWYZ> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kWYXZ> > wyxz()const { return vecexp4< swizzle<scalar_type, packed_type, kWYXZ> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kZYWX> > zywx()const { return vecexp4< swizzle<scalar_type, packed_type, kZYWX> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kYWZX> > ywzx()const { return vecexp4< swizzle<scalar_type, packed_type, kYWZX> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kYZXW> > yzxw()const { return vecexp4< swizzle<scalar_type, packed_type, kYZXW> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kZXYW> > zxyw()const { return vecexp4< swizzle<scalar_type, packed_type, kZXYW> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kZWXY> > zwxy()const { return vecexp4< swizzle<scalar_type, packed_type, kZWXY> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kWWWZ> > wwwz()const { return vecexp4< swizzle<scalar_type, packed_type, kWWWZ> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kWWZZ> > wwzz()const { return vecexp4< swizzle<scalar_type, packed_type, kWWZZ> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kWZYX> > wzyx()const { return vecexp4< swizzle<scalar_type, packed_type, kWZYX> >(v); }
	MECANIM_FORCE_INLINE vecexp4< swizzle<scalar_type, packed_type, kYXWZ> > yxwz()const { return vecexp4< swizzle<scalar_type, packed_type, kYXWZ> >(v); }

protected:
	packed_type v;
};

// User-facing 4-float SIMD type: a vec4 storage node with compound
// arithmetic, increment/decrement and serialization support.
struct ATTRIBUTE_ALIGN(ALIGN4F) float4 : vecexp4<vec4>
{
	DEFINE_GET_TYPESTRING(float4)

	typedef vec4 value_type;
	typedef vec4::scalar_type scalar_type;
	typedef vec4::packed_type packed_type;

	MECANIM_FORCE_INLINE float4() {}

	MECANIM_FORCE_INLINE float4(float4 const& vector):vecexp4<value_type>(vector.v) { }

	explicit MECANIM_FORCE_INLINE float4(scalar_type s):vecexp4<value_type>(s) { }

	explicit MECANIM_FORCE_INLINE float4(packed_type const& vector):vecexp4<value_type>(vector) { }

	template<typename R> MECANIM_FORCE_INLINE float4(const vecexp4<R> &r):vecexp4<value_type>(r) { }

	MECANIM_FORCE_INLINE float4(scalar_type x, scalar_type y, scalar_type z, scalar_type w):vecexp4<value_type>(x,y,z,w) { }

	MECANIM_FORCE_INLINE float4(const float1 &x, const float1 &y, const float1 &z, const float1 &w):vecexp4<value_type>(x,y,z,w) { }

	MECANIM_FORCE_INLINE float4 &operator=(const float4 &r) { vecexp4<vec4>::operator =(r); return *this; }

	template<typename R> MECANIM_FORCE_INLINE float4 &operator=(const vecexp4<R> &r) { vecexp4<vec4>::operator =(r); return *this; }
	template<typename R> MECANIM_FORCE_INLINE float4 &operator=(const vecexp1<R> &r) { vecexp4<vec4>::operator =(r); return *this; }

	template<typename R> MECANIM_FORCE_INLINE float4 &operator+=(const vecexp4<R> &r) { value_type::v = Vadd(value_type::v, r.eval()); return *this; }
	template<typename R> MECANIM_FORCE_INLINE float4 &operator+=(const vecexp1<R> &r) { value_type::v = Vadd(value_type::v, r.eval()); return *this; }
	MECANIM_FORCE_INLINE float4 &operator+=(float r) { value_type::v = Vadd(value_type::v, Vloadsf(r)); return *this; }

	template<typename R> MECANIM_FORCE_INLINE float4 &operator-=(const vecexp4<R> &r) { value_type::v = Vsub(value_type::v, r.eval()); return *this; }
	template<typename R> MECANIM_FORCE_INLINE float4 &operator-=(const vecexp1<R> &r) { value_type::v = Vsub(value_type::v, r.eval()); return *this; }
	MECANIM_FORCE_INLINE float4 &operator-=(float r) { value_type::v = Vsub(value_type::v, Vloadsf(r)); return *this; }

	template<typename R> MECANIM_FORCE_INLINE float4 &operator*=(const vecexp4<R> &r) { value_type::v = Vmul(value_type::v, r.eval()); return *this; }
	template<typename R> MECANIM_FORCE_INLINE float4 &operator*=(const vecexp1<R> &r) { value_type::v = Vmul(value_type::v, r.eval()); return *this; }
	MECANIM_FORCE_INLINE float4 &operator*=(float r) { value_type::v = Vmul(value_type::v, Vloadsf(r)); return *this; }


	template<typename R> MECANIM_FORCE_INLINE float4 &operator/=(const vecexp4<R> &r) { value_type::v = Vdiv(value_type::v, r.eval()); return *this; }
	template<typename R> MECANIM_FORCE_INLINE float4 &operator/=(const vecexp1<R> &r) { value_type::v = Vdiv(value_type::v, r.eval()); return *this; }
	MECANIM_FORCE_INLINE float4 &operator/=(float r) { value_type::v = Vdiv(value_type::v, Vloadsf(r)); return *this; }

	// prefix increment
	MECANIM_FORCE_INLINE float4 &operator++() { value_type::v = Vinc(value_type::v); return *this; }
	// postfix increment
	MECANIM_FORCE_INLINE float4 operator++(int) { float4 r = *this; value_type::v = Vinc(value_type::eval() ); return r; }

	// prefix decrement
	MECANIM_FORCE_INLINE float4 &operator--() { value_type::v = Vdec(value_type::v); return *this; }
	// postfix decrement
	MECANIM_FORCE_INLINE float4 operator--(int) { float4 r = *this; value_type::v = Vdec(value_type::eval() ); return r; }


	static float4 zero() {return float4(Vzero()); } // 0
	static float4 one() {return float4(Vone());} // 1


	template<class TransferFunction>
	MECANIM_FORCE_INLINE void Transfer (TransferFunction& transfer)
	{
		/////@TODO: This is wrong. It will not work for SafeBinaryRead
		///////     Probably other places in the code too!

		// Serialize as four named scalars, staging through an aligned buffer
		// so the packed register is only touched with aligned loads/stores.
		float ATTRIBUTE_ALIGN(ALIGN4F) buf[4] = {0.0f, 0.0f, 0.0f, 0.0f};
		if(transfer.IsReading())
		{
			transfer.Transfer(buf[0], "x");
			transfer.Transfer(buf[1], "y");
			transfer.Transfer(buf[2], "z");
			transfer.Transfer(buf[3], "w");

			v = Vloadpf(buf, 0);
		}
		else if(transfer.IsWriting())
		{
			Vstorepf(v, buf, 0);

			transfer.Transfer(buf[0], "x");
			transfer.Transfer(buf[1], "y");
			transfer.Transfer(buf[2], "z");
			transfer.Transfer(buf[3], "w");
		}
		else
		{
			// Neither reading nor writing (e.g. layout/metadata pass): still
			// declare the four fields.
			transfer.Transfer(buf[0], "x");
			transfer.Transfer(buf[1], "y");
			transfer.Transfer(buf[2], "z");
			transfer.Transfer(buf[3], "w");
		}
	}
};

// vecexp4 Arithmetic
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator+(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return vecexp4<vec4>( Vadd( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator+(const vecexp4<LHS> &l, const vecexp1<RHS> &r)
{
	return vecexp4<vec4>( Vadd( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator+(const vecexp1<LHS> &l, const vecexp4<RHS> &r)
{
	return vecexp4<vec4>( Vadd( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator-(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return vecexp4<vec4>( Vsub( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator-(const vecexp4<LHS> &l, const vecexp1<RHS> &r)
{
	return vecexp4<vec4>( Vsub( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator-(const vecexp1<LHS> &l, const vecexp4<RHS> &r)
{
	return vecexp4<vec4>( Vsub( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator*(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return vecexp4<vec4>( Vmul( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator*(const vecexp4<LHS> &l,const vecexp1<RHS> &r)
{
	return vecexp4<vec4>( Vmul( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator*(const vecexp1<LHS> &l, const vecexp4<RHS> &r)
{
	return vecexp4<vec4>( Vmul( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator/(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return vecexp4<vec4>( Vdiv( l.eval(), r.eval() ));
}
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator/(const vecexp4<LHS> &l, const vecexp1<RHS> &r)
{
	return vecexp4<vec4>( Vdiv( l.eval(), r.eval() ));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE vecexp4<vec4> operator/(const vecexp1<LHS> &l, const vecexp4<RHS> &r)
{
	return vecexp4<vec4>( Vdiv( l.eval(), r.eval() ));
}


// vecexp4 logic: lane-wise comparisons yielding a bool4 mask.
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator<(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4(Vcmplt(l.eval(), r.eval()));
}
template<typename LHS> static MECANIM_FORCE_INLINE bool4 operator<(const vecexp4<LHS> &l, const float1 &r)
{
	return bool4(Vcmplt(l.eval(), r.eval()));
}
template<typename RHS> static MECANIM_FORCE_INLINE bool4 operator<(const float1 &l, const vecexp4<RHS> &r)
{
	return bool4(Vcmplt(l.eval(), r.eval()));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator<=(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4(Vcmple(l.eval(), r.eval()));
}
template<typename LHS> static MECANIM_FORCE_INLINE bool4 operator<=(const vecexp4<LHS> &l, const float1 &r)
{
	return bool4(Vcmple(l.eval(), r.eval()));
}
template<typename RHS> static MECANIM_FORCE_INLINE bool4 operator<=(const float1 &l, const vecexp4<RHS> &r)
{
	return bool4(Vcmple(l.eval(), r.eval()));
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator==(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4( Vcmpeq(l.eval(), r.eval()) );
}
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator==(const vecexp4<LHS> &l, const vecexp1<RHS> &r)
{
	return bool4( Vcmpeq(l.eval(), r.eval()) );
}
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator==(const vecexp1<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4( Vcmpeq(l.eval(), r.eval()) );
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator!=(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4( Vcmpneq(l.eval(), r.eval()) );
}
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator!=(const vecexp1<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4( Vcmpneq(l.eval(), r.eval()) );
}
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator!=(const vecexp4<LHS> &l, const vecexp1<RHS> &r)
{
	return bool4( Vcmpneq(l.eval(), r.eval()) );
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator>=(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4( Vcmpge(l.eval(), r.eval()) );
}
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator>=(const vecexp1<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4( Vcmpge(l.eval(), r.eval()) );
}
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator>=(const vecexp4<LHS> &l, const vecexp1<RHS> &r)
{
	return bool4( Vcmpge(l.eval(), r.eval()) );
}

template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator>(const vecexp4<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4( Vcmpgt(l.eval(), r.eval()) );
}
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator>(const vecexp4<LHS> &l, const vecexp1<RHS> &r)
{
	return bool4( Vcmpgt(l.eval(), r.eval()) );
}
template<typename LHS, typename RHS> static MECANIM_FORCE_INLINE bool4 operator>(const vecexp1<LHS> &l, const vecexp4<RHS> &r)
{
	return bool4( Vcmpgt(l.eval(), r.eval()) );
}

// Declares a file-scope constant float4 backed by a constant packed vector.
#define constant_float4(name, x,y,z,w) \
	cvec4f(c##name, x,y,z,w); \
	math::float4 const name(c##name); \

}

#endif
diff --git a/Runtime/Math/Simd/fpu.h b/Runtime/Math/Simd/fpu.h
new file mode 100644
index 0000000..20bd800
--- /dev/null
+++ b/Runtime/Math/Simd/fpu.h
@@ -0,0 +1,245 @@
#ifndef SIMD_FPU_H
#define SIMD_FPU_H

// Scalar (plain FPU) fallback implementation of the SIMD back-end contract
// described in intrinsic.h. The member name m128_f32 mirrors the SSE union
// field so code reads the same across back-ends.

// vector 4 packed
struct vec4f
{
	vec4f(float s = 0.0f){m128_f32[0] = m128_f32[1] = m128_f32[2] = m128_f32[3]= s;}
	vec4f(float x, float y, float z, float w){m128_f32[0] = x; m128_f32[1] = y; m128_f32[2] = z; m128_f32[3]= w;}

	float m128_f32[4];
};
typedef vec4f vec4fs; // vector 4 scalar

// vector 4 bool packed
struct vec4b
{
	vec4b(bool s = false){m128_f32[0] = m128_f32[1] = m128_f32[2] = m128_f32[3]= s;}
	vec4b(bool x, bool y, bool z, bool w){m128_f32[0] = x; m128_f32[1] = y; m128_f32[2] = z; m128_f32[3]= w;}

	bool m128_f32[4];
};

typedef vec4b vec4bs; // vector 4 bool scalar

#define cvec4f(name, x,y,z,w) static const vec4f name((x),(y),(z),(w))
#define cvec4b(name, x,y,z,w) static const vec4b name((x),(y),(z),(w))
#define cvec4fs(name, s) static const vec4fs name(s)


// Pack four 2-bit lane indices into one swizzle mask (x is least significant).
#define SWZ_MASK(x, y, z, w) (((w) << 6) | ((z) << 4) | ((y) << 2) | ((x)))
#define SWZ_X(MASK) (((MASK) >> 0) & 3)
#define SWZ_Y(MASK) (((MASK) >> 2) & 3)
#define SWZ_Z(MASK) (((MASK) >> 4) & 3)
#define SWZ_W(MASK) (((MASK) >> 6) & 3)
#define SWZ_I(MASK, I) (((MASK) >> (I*2)) & 3)

enum simd_mask
{
	kXYZW = SWZ_MASK(0,1,2,3),
	kXXXX = SWZ_MASK(0,0,0,0),
	kYYYY =
SWZ_MASK(1,1,1,1), + kZZZZ = SWZ_MASK(2,2,2,2), + kWWWW = SWZ_MASK(3,3,3,3), + + kXWYZ = SWZ_MASK(0,3,1,2), + kXZWY = SWZ_MASK(0,2,3,1), + + kYZWX = SWZ_MASK(1,2,3,0), + kYXZW = SWZ_MASK(1,0,2,3), + kYWZX = SWZ_MASK(1,3,2,0), + kYZXW = SWZ_MASK(1,2,0,3), + kYXWZ = SWZ_MASK(1,0,3,2), + + kZWXY = SWZ_MASK(2,3,0,1), + kZYXW = SWZ_MASK(2,1,0,3), + kZYWX = SWZ_MASK(2,1,3,0), + kZXYW = SWZ_MASK(2,0,1,3), + + kWYZX = SWZ_MASK(3,1,2,0), + kWXZY = SWZ_MASK(3,0,2,1), + kWYXZ = SWZ_MASK(3,1,0,2), + kWWWZ = SWZ_MASK(3,3,3,2), + kWWZZ = SWZ_MASK(3,3,2,2), + kWZYX = SWZ_MASK(3,2,1,0), +}; + +#define Vzero() vec4f(0.f) +#define Vone() vec4f(1.f) + +static MECANIM_FORCE_INLINE vec4f Vpermute(vec4f v, int mask) +{ + return vec4f(v.m128_f32[SWZ_X(mask)], v.m128_f32[SWZ_Y(mask)], v.m128_f32[SWZ_Z(mask)], v.m128_f32[SWZ_W(mask)]); +} + +#define Vmove(l, r) vec4f(r.m128_f32[0], l.m128_f32[1], l.m128_f32[2], l.m128_f32[3]) + +// This template is part of the back-end because some instruction set support some swizzle operation that could be specialized, like xbox vmx rotate instruction that is use in dot product +template<int SWZ> struct Vswizzle +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) + { + return Vpermute(r, SWZ); + } + + static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r) + { + return Vswizzle<SWZ>::rhs(Vmove(Vswizzle<SWZ>::rhs(l), r)); + } +}; + +template<> struct Vswizzle<kXYZW> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) + { + return r; + } + static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r) + { + return Vmove(l, r); + } +}; + + +// Aligned store, store vector at address base as 4 float +#define Vstorepf(v, base, offset) memcpy( (base)+(offset), &v.m128_f32[0], sizeof(v)) + +// Return component x as a float +#define Vstoresf(r) r.m128_f32[0] + +// Return component x as a bool +#define Vstoresb(r) r.m128_f32[0] + +// Aligned store, store vector at address base as 4 bool +static MECANIM_FORCE_INLINE void Vstorepb(vec4b v, bool* r) +{ + r[0] = 
v.m128_f32[0]; + r[1] = v.m128_f32[1]; + r[2] = v.m128_f32[2]; + r[3] = v.m128_f32[3]; +} + +// Aligned load, load 4 float at address v in vector register +static MECANIM_FORCE_INLINE vec4f Vloadpf(float const* v, int offset) +{ + float const* p = v+offset; + return vec4f( p[0], p[1], p[2], p[3] ); +} + +// Load float value in vector register and replicate value in all component +#define Vloadsf(s) vec4f(s) + +// Load bool value in vector register and replicate value in all component +#define Vloadsb(s) vec4bs(s) + +// Load 4 float value in vector register +#define Vload4sf(x, y, z, w) vec4f(x,y,z,w) + +// Load 4 bool value in vector register +#define Vload4sb(x, y, z, w) vec4b(x,y,z,w) + +static MECANIM_FORCE_INLINE vec4f Vadd(vec4f l, vec4f r) +{ + return vec4f(l.m128_f32[0]+r.m128_f32[0], l.m128_f32[1]+r.m128_f32[1], l.m128_f32[2]+r.m128_f32[2], l.m128_f32[3]+r.m128_f32[3]); +} + +static MECANIM_FORCE_INLINE vec4f Vsub(vec4f l, vec4f r) +{ + return vec4f(l.m128_f32[0]-r.m128_f32[0], l.m128_f32[1]-r.m128_f32[1], l.m128_f32[2]-r.m128_f32[2], l.m128_f32[3]-r.m128_f32[3]); +} + +static MECANIM_FORCE_INLINE vec4f Vmul( vec4f l, vec4f r) +{ + return vec4f(l.m128_f32[0]*r.m128_f32[0], l.m128_f32[1]*r.m128_f32[1], l.m128_f32[2]*r.m128_f32[2], l.m128_f32[3]*r.m128_f32[3]); +} + +static MECANIM_FORCE_INLINE vec4f Vdiv( vec4f l, vec4f r) +{ + return vec4f(l.m128_f32[0]/r.m128_f32[0], l.m128_f32[1]/r.m128_f32[1], l.m128_f32[2]/r.m128_f32[2], l.m128_f32[3]/r.m128_f32[3]); +} + +#define Vmadd( a, b, c) vec4f(a.m128_f32[0]*b.m128_f32[0] + c.m128_f32[0], a.m128_f32[1]*b.m128_f32[1] + c.m128_f32[1], a.m128_f32[2]*b.m128_f32[2] + c.m128_f32[2], a.m128_f32[3]*b.m128_f32[3] + c.m128_f32[3]) +#define Vmsub( a, b, c) vec4f(a.m128_f32[0]*b.m128_f32[0] - c.m128_f32[0], a.m128_f32[1]*b.m128_f32[1] - c.m128_f32[1], a.m128_f32[2]*b.m128_f32[2] - c.m128_f32[2], a.m128_f32[3]*b.m128_f32[3] - c.m128_f32[3]) +#define Vneg(r) vec4f(-r.m128_f32[0], -r.m128_f32[1], -r.m128_f32[2], 
-r.m128_f32[3]) + +// Vector sgn: return -1, 1 +#define Vsgn(r) vec4f( r.m128_f32[0] < 0 ? -1.f : 1.f, r.m128_f32[1] < 0 ? -1.f : 1.f, r.m128_f32[2] < 0 ? -1.f : 1.f, r.m128_f32[3] < 0 ? -1.f : 1.f) + +// Vector sgn: return -1, 0, 1 +static MECANIM_FORCE_INLINE vec4f Vsign(vec4f r) +{ + return vec4f( r.m128_f32[0] < 0 ? -1.f : r.m128_f32[0] > 0 ? 1.f : 0.f, + r.m128_f32[1] < 0 ? -1.f : r.m128_f32[1] > 0 ? 1.f : 0.f, + r.m128_f32[2] < 0 ? -1.f : r.m128_f32[2] > 0 ? 1.f : 0.f, + r.m128_f32[3] < 0 ? -1.f : r.m128_f32[3] > 0 ? 1.f : 0.f); +} + +#define Vinc(r) Vadd( (r), Vone()) +#define Vdec(r) Vsub( (r), Vone()) +#define Vabs(r) vec4f( abs(r.m128_f32[0]), abs(r.m128_f32[1]), abs(r.m128_f32[2]), abs(r.m128_f32[3])) +#define Vmax( l, r) vec4f( l.m128_f32[0] > r.m128_f32[0] ? l.m128_f32[0] : r.m128_f32[0], l.m128_f32[1] > r.m128_f32[1] ? l.m128_f32[1] : r.m128_f32[1], l.m128_f32[2] > r.m128_f32[2] ? l.m128_f32[2] : r.m128_f32[2], l.m128_f32[3] > r.m128_f32[3] ? l.m128_f32[3] : r.m128_f32[3]) +#define Vmin( l, r) vec4f( l.m128_f32[0] < r.m128_f32[0] ? l.m128_f32[0] : r.m128_f32[0], l.m128_f32[1] < r.m128_f32[1] ? l.m128_f32[1] : r.m128_f32[1], l.m128_f32[2] < r.m128_f32[2] ? l.m128_f32[2] : r.m128_f32[2], l.m128_f32[3] < r.m128_f32[3] ? 
l.m128_f32[3] : r.m128_f32[3]) + +// Return the largest of the 4 component +static MECANIM_FORCE_INLINE vec4fs Vlargest(vec4f r) +{ + r = Vmax(r, Vswizzle<kYZWX>::rhs(r)); + r = Vmax(r, Vswizzle<kZWXY>::rhs(r)); + return r.m128_f32[0]; +} + +// Return the smallest of the 4 component +static MECANIM_FORCE_INLINE vec4fs Vsmallest(vec4f r) +{ + r = Vmin(r, Vswizzle<kYZWX>::rhs(r)); + r = Vmin(r, Vswizzle<kZWXY>::rhs(r)); + return r.m128_f32[0];; +} + +static MECANIM_FORCE_INLINE vec4fs Vsum(vec4f r) +{ + r = Vadd(r, Vswizzle<kYZWX>::rhs(r) ); + r = Vadd(r, Vswizzle<kZWXY>::rhs(r) ); + r = Vswizzle<kXXXX>::rhs(r); + return r.m128_f32[0]; +} + +#define Vdot( l, r) Vsum( Vmul((l), (r)) ) + +#define Vsqrt(r) vec4f( sqrt(r.m128_f32[0]), sqrt(r.m128_f32[1]), sqrt(r.m128_f32[2]), sqrt(r.m128_f32[3])) + +static MECANIM_FORCE_INLINE vec4f Vrsqrt(vec4f r) +{ + vec4f const e = Vdiv(vec4f(1.f), Vsqrt(r)); + return Vmul(Vmul(e, Vsub(vec4f(3.0f), Vmul(Vmul(e,e),r))), vec4f(.5f)); +} + +static MECANIM_FORCE_INLINE vec4f Vrcp(vec4f r) +{ + return Vdiv(vec4f(1.f), r ); +} + + +// Merge 4 vector low bytes +#define Vcombine(x,y,z,w) vec4f(x.m128_f32[0], y.m128_f32[0], z.m128_f32[0], w.m128_f32[0]) + +// Vector comparison +#define Vcmpeq( a, b) vec4b(a.m128_f32[0] == b.m128_f32[0], a.m128_f32[1] == b.m128_f32[1], a.m128_f32[2] == b.m128_f32[2], a.m128_f32[3] == b.m128_f32[3]) +#define Vcmpneq( a, b) vec4b(a.m128_f32[0] != b.m128_f32[0], a.m128_f32[1] != b.m128_f32[1], a.m128_f32[2] != b.m128_f32[2], a.m128_f32[3] != b.m128_f32[3]) +#define Vcmpgt( a, b) vec4b(a.m128_f32[0] > b.m128_f32[0], a.m128_f32[1] > b.m128_f32[1], a.m128_f32[2] > b.m128_f32[2], a.m128_f32[3] > b.m128_f32[3]) +#define Vcmpge( a, b) vec4b(a.m128_f32[0] >= b.m128_f32[0], a.m128_f32[1] >= b.m128_f32[1], a.m128_f32[2] >= b.m128_f32[2], a.m128_f32[3] >= b.m128_f32[3]) +#define Vcmplt( a, b) vec4b(a.m128_f32[0] < b.m128_f32[0], a.m128_f32[1] < b.m128_f32[1], a.m128_f32[2] < b.m128_f32[2], a.m128_f32[3] < b.m128_f32[3]) 
+#define Vcmple( a, b) vec4b(a.m128_f32[0] <= b.m128_f32[0], a.m128_f32[1] <= b.m128_f32[1], a.m128_f32[2] <= b.m128_f32[2], a.m128_f32[3] <= b.m128_f32[3]) + +#define Vsel( c, a, b) vec4f(c.m128_f32[0] ? a.m128_f32[0] : b.m128_f32[0], c.m128_f32[1] ? a.m128_f32[1] : b.m128_f32[1], c.m128_f32[2] ? a.m128_f32[2] : b.m128_f32[2], c.m128_f32[3] ? a.m128_f32[3] : b.m128_f32[3]) + +// vector logics +#define Vnot(r) vec4b(!r.m128_f32[0], !r.m128_f32[1], !r.m128_f32[2], !r.m128_f32[3]) +#define Vxnor( a, b) vec4b(!(a.m128_f32[0] ^ b.m128_f32[0]), !(a.m128_f32[1] ^ b.m128_f32[1]), !(a.m128_f32[2] ^ b.m128_f32[2]), !(a.m128_f32[3] ^ b.m128_f32[3])) +#define Vxor( a, b) vec4b(a.m128_f32[0] ^ b.m128_f32[0], a.m128_f32[1] ^ b.m128_f32[1], a.m128_f32[2] ^ b.m128_f32[2], a.m128_f32[3] ^ b.m128_f32[3]) +#define Vand( a, b) vec4b(a.m128_f32[0] && b.m128_f32[0], a.m128_f32[1] && b.m128_f32[1], a.m128_f32[2] && b.m128_f32[2], a.m128_f32[3] && b.m128_f32[3]) +#define Vor( a, b) vec4b(a.m128_f32[0] || b.m128_f32[0], a.m128_f32[1] || b.m128_f32[1], a.m128_f32[2] || b.m128_f32[2], a.m128_f32[3] || b.m128_f32[3]) +#define Vall(a) (a.m128_f32[0] && a.m128_f32[1] && a.m128_f32[2] && a.m128_f32[3]) +#define Vany(a) (a.m128_f32[0] || a.m128_f32[1] || a.m128_f32[2] || a.m128_f32[3]) + +#endif // SIMD_FPU_H
\ No newline at end of file
diff --git a/Runtime/Math/Simd/intrinsic.h b/Runtime/Math/Simd/intrinsic.h
new file mode 100644
index 0000000..eb58c2e
--- /dev/null
+++ b/Runtime/Math/Simd/intrinsic.h
@@ -0,0 +1,184 @@
#ifndef SIMD_INTRINSIC_H
#define SIMD_INTRINSIC_H

/* Here is the Math library back-end interface.
   When you declare a function, always return results by value: you want to be sure that simd registers stay in registers, otherwise you may get poor performance if the CPU needs to push the register back into memory.
   Vector data is declared purely, ex: typedef __m128 vec4f. Most compilers won't recognize an encapsulated vector type in a class and thus generate more temporaries and push the vector back to memory.

   To support a new platform you need at least to support this function set:

	typedef __m128	vec4f;	// vector 4 float packed
	typedef __m128	vec4fs;	// vector 4 float scalar
	typedef __m128	vec4b;	// vector 4 bool packed
	typedef __m128	vec4bs;	// vector 4 bool scalar

	#define Vzero()
	#define Vone()
	#define Vpermute(v, mask)
	#define Vmove(l, r)

	// This template is part of the back-end because some instruction sets support swizzle operations that could be specialized, like the xbox vmx rotate instruction that is used in dot product
	template<int SWZ> struct Vswizzle
	{
		static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
		{
			return Vpermute(r, SWZ);
		}

		static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r)
		{
			return Vswizzle<SWZ>::rhs(Vmove(Vswizzle<SWZ>::rhs(l), r));
		}
	};


	// Aligned store, store vector at address base as 4 float
	#define Vstorepf(v, base, offset)

	// Return component x as a float
	#define Vstoresf(r)

	// Return component x as a bool
	#define Vstoresb(r)

	// Aligned store, store vector at address base as 4 bool
	#define Vstorepb(vec4f v, bool* r)

	// Aligned load, load 4 float at address v in vector register
	#define Vloadpf(v, offset)

	// Load float value in vector register and replicate value in all component
	#define Vloadsf(s)

	// Load bool value in vector register and replicate value in all component
	#define Vloadsb(s)

	// Load 4 float value in vector register
	#define Vload4sf(x, y, z, w)

	// Load 4 bool value in vector register
	#define Vload4sb( x, y, z, w)

	#define Vadd(l, r)
	#define Vsub( l, r)
	#define Vmul( l, r)
	#define Vdiv( l, r)
	#define Vmadd( a, b, c)
	#define Vmsub( a, b, c)
	#define Vneg(r)

	// Vector sgn: return -1, 1
	#define Vsgn(r)

	// Vector sgn: return -1, 0, 1
	#define Vsign(r)

	#define Vinc(r)
	#define Vdec(r)
	#define Vabs(r)
	#define Vmax( l, r)
	#define Vmin( l, r)

	// Return the largest of the 4 component
	#define Vlargest(r)

	// Return the smallest of the 4 component
	#define Vsmallest(r)
	#define Vsum(r)
	#define Vdot( l, r)
	#define Vsqrt(r)

	#define Vrsqrt(r)
	#define Vrcp(r)

	// Merge 4 vector low bytes
	#define Vcombine(x,y,z,w)

	// Vector comparison
	#define Vcmpeq( a, b)
	#define Vcmpneq( a, b)
	#define Vcmpgt( a, b)
	#define Vcmpge( a, b)
	#define Vcmplt( a, b)
	#define Vcmple( a, b)

	#define Vsel( c, a, b)

	// vector logics
	#define Vnot(r)
	#define Vxnor( a, b)
	#define Vxor( a, b)
	#define Vand( a, b)
	#define Vor( a, b)
	#define Vall(a)
	#define Vany( a)

*/
// Per-compiler definitions of alignment and force-inline attributes.
#if defined(__INTEL_COMPILER) || defined(__ICL) || defined(_MSC_VER)
	#include <cstddef>
	#define ATTRIBUTE_ALIGN(a) __declspec(align(a))
	#define ALIGN4F 16
	#define MECANIM_FORCE_INLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
	#include <cstddef>

	#ifndef __has_attribute
	#define __has_attribute(x) 0
	#endif

	// always_inline is available since gcc 3.1; in debug builds fall back to
	// plain inline so the debugger can step into these functions.
	#if ((__GNUC__ >= 3) && (__GNUC_MINOR__ >= 1)) || (__GNUC__ >= 4) || __has_attribute(always_inline)
		#ifdef _DEBUG
			#ifndef MECANIM_FORCE_INLINE
			#define MECANIM_FORCE_INLINE inline
			#endif
		#else
			#ifndef MECANIM_FORCE_INLINE
			#define MECANIM_FORCE_INLINE inline __attribute__((always_inline))
			#endif
		#endif
	#endif

	#if defined(__GNUC__) || __has_attribute(aligned)
		#define ATTRIBUTE_ALIGN(a) __attribute__ ((aligned(a)))
	#endif

	#define ALIGN4F 16
#endif

// Safe fallbacks for compilers not matched above.
#ifndef MECANIM_FORCE_INLINE
	#define MECANIM_FORCE_INLINE inline
#endif

#ifndef ATTRIBUTE_ALIGN
	#define ATTRIBUTE_ALIGN(a)
#endif

#ifndef ALIGN4F
	#define ALIGN4F 16
#endif

// Select the SIMD back-end for the target platform; fpu.h is the portable
// scalar fallback (and can be forced with UNITY_FORCE_FPU).
#if UNITY_FORCE_FPU
	#include "Runtime/Math/Simd/fpu.h"
#elif UNITY_XENON
	#include "Runtime/Math/Simd/xenon.h"
#elif UNITY_PS3
	#include "Runtime/Math/Simd/ppu.h"
#elif UNITY_WIN && UNITY_SUPPORTS_SSE
	#include "Runtime/Math/Simd/sse.h"
#elif UNITY_OSX
	#include "Runtime/Math/Simd/sse.h"
#elif UNITY_SUPPORTS_NEON && (!UNITY_ANDROID)
	#include "Runtime/Math/Simd/neon.h"
#else
	#include "Runtime/Math/Simd/fpu.h"
#endif

//#define DEBUG_SIMD_ASSERT_IF 1
#if DEBUG_SIMD_ASSERT_IF
	#define SIMD_ASSERT_IF(x) AssertIf(x)
#else
	#define SIMD_ASSERT_IF(x)
#endif

#endif

diff --git a/Runtime/Math/Simd/math.h b/Runtime/Math/Simd/math.h
new file mode 100644
index 0000000..43a8837
--- /dev/null
+++ b/Runtime/Math/Simd/math.h
@@ -0,0 +1,678 @@
#ifndef SIMD_MATH_H
#define SIMD_MATH_H

#include <cmath>

// Standard macro define in cmath
#ifndef M_EPSF
#define M_EPSF 1e-6f
#endif
#ifndef M_PIf
#define M_PIf 3.14159265358979323846f
#endif
#ifndef M_PI_2f
#define M_PI_2f 1.57079632679489661923f
#endif
#ifndef M_PI_4f
#define M_PI_4f 0.785398163397448309616f
#endif
#ifndef M_1_PIf
#define M_1_PIf 0.318309886183790671538f
#endif
#ifndef M_2_PIf
#define M_2_PIf 0.636619772367581343076f
#endif
#ifndef M_DEG_2_RADf
#define M_DEG_2_RADf 0.0174532925f
#endif
#ifndef M_RAD_2_DEGf
#define M_RAD_2_DEGf 57.295779513f
#endif

#include "Runtime/Math/Simd/float4.h"
#include "Runtime/Math/Simd/bool4.h"

namespace math
{

// Forward declarations of the scalar/SIMD math overload set defined below.
static inline bool all(bool4 const& r);
static inline bool any(bool4 const& r);
template <typename T> static inline T clamp(T const& v, T const& a, T const& b);
static inline float
cond(bool c, float const& a, float const& b); +static inline int cond(bool c, int const& a, int const& b); +static inline float cubic(float const& a, float const& b, float const& c, float const& d, float const& u); +static inline float4 cubic(float4 const& a, float4 const& b, float4 const& c, float4 const& d, float4 const& u); +static inline float4 cross(float4 const& a, float4 const& b); +static inline float degrees(float const& deg); +static inline float4 degrees(float4 const& deg); +static inline float1 dot(float4 const& l, float4 const& r); +static inline float1 dot(float4 const& r); +static inline float1 length(float4 const& r); +static inline float lerp(float const& a, float const& b, float x); +static inline float1 lerp(float1 const& a, float1 const& b, float1 const& x); +static inline float4 lerp(float4 const& a, float4 const& b, float1 const& x); +static inline float4 lerp(float4 const& a, float4 const& b, float4 const& x); +template <typename T> static inline T maximum(T const& a, T const& b); +template <typename T> static inline T minimum(T const& a, T const& b); +static inline float1 maximum(float4 const& r); +static inline float1 minimum(float4 const& r); +static inline float4 normalize(float4 const& r); +static inline float pow(float const& r, float const& e); +static inline float4 pow(float4 const& r, float1 const& e); +static inline float radians(float const& deg); +static inline float4 radians(float4 const& deg); +static inline float4 rcp(float4 const& r); +static inline float1 rcp(float1 const& r); +static inline float4 rsqrt(float4 const& r ); +static inline float saturate(float const& r); +static inline float1 saturate(float1 const& r); +static inline float4 saturate(float4 const& r); +static inline float4 scaleIdentity(); +static inline float4 scaleWeight(float4 const& s, float1 const& w); +static inline void sincos(float4 const& u, float4& s, float4& c); +static inline void sincos(float1 const& u, float1& s, float1& c); +static inline void 
sincose(float4 const& u, float4& s, float4& c); +static inline void sincose(float1 const& u, float1& s, float1& c); +static inline float sgn(float const& r); +static inline float1 sgn(float1 const& r); +template <typename T> static inline vecexp4<vec4> sgn(vecexp4<T> const& x); +template <typename T> static inline vecexp1<vec4> sgn(vecexp1<T> const& x); +static inline float sign(float const& r); +static inline float4 sign(float4 const& r); +static inline float smoothstep( float min, float max, float x); +static inline float smoothpulse( float minmin, float minmax, float maxmin, float maxmax, float x); +static inline float1 sqrt(float1 const& r); +static inline float4 sqrt(float4 const& r); +static inline float1 sum(float4 const& r); +static inline float4 vector(float4 const& v); +static inline float unrollangle(float angleRef, float angle); +static inline float4 load(float const* v); +static inline void store(float4 const& v, float* r); +static inline void store(bool4 const& v, bool* r); + + +static inline float abs(const float &r) +{ + return std::abs(r); +} + +static inline float cos(float const& theta) +{ + return std::cos(theta); +} + +static inline float rcp(const float &r) +{ + return 1.f/r; +} + +static inline float rsqrt(const float& r) +{ + return 1.f/std::sqrt(r); +} + +static inline float sin(float const& theta) +{ + return std::sin(theta); +} + +static inline void sincos(float const& u, float& s, float& c) +{ + s = sin(u); + c = cos(u); +} + +static inline float tan(float const& theta) +{ + return std::tan(theta); +} + +static inline float atan(float const& t) +{ + return std::atan(t); +} + +static inline float sqrt(const float& r) +{ + return std::sqrt(r); +} + +static inline float modf(float x, float &ip) +{ +#if UNITY_FLASH + float intPart; + __asm __volatile__("%[RES] = (%[FARG] < 0 ? 
Math.ceil(%[FARG]) : Math.floor(%[FARG]));//modf" : [RES] "=f" (intPart) : [FARG] "f" (x)); + ip = intPart; + return x-intPart; +#else + return std::modf(x, &ip); +#endif +} + +static inline float fmod(float x, float y) +{ + return std::fmod(x,y); +} + +static inline float pow(const float& x,const float& y) +{ + return std::pow(x,y); +} + +template <typename T> static inline vecexp4<vec4> abs(vecexp4<T> const& x) +{ + return vecexp4<vec4>( Vabs( x.eval() ) ); +} + +template <typename T> static inline vecexp1<T> abs(vecexp1<T> const& x) +{ + return vecexp1<T>( Vabs( x.eval() ) ); +} + +static inline float1 abs(float1 const& x) +{ + return float1( Vabs( x.eval() ) ); +} + +static inline bool all(bool4 const& r) +{ + return Vall(r.v); +} +static inline bool any(bool4 const& r) +{ + return Vany(r.v); +} + +// x clamped to the range [a, b] as follows: +// Returns a if x is less than a. +// Returns b if x is greater than b. +// Returns x otherwise. +template <typename T> static inline T clamp(T const& v, T const& a, T const& b) +{ + return minimum(b, maximum(a, v)); +} + +template <typename L, typename R> static inline vecexp4<vec4> cond(bool4 const& c, vecexp4<L> const& l, vecexp4<R> const& r) +{ + return vecexp4<vec4>( Vsel(c.v, l.eval(), r.eval()) ); +} + +template <typename L, typename R> static inline vecexp4<vec4> cond(bool1 const& c, vecexp4<L> const& l, vecexp4<R> const& r) +{ + return vecexp4<vec4>( Vsel(c.s, l.eval(), r.eval()) ); +} + +template <typename L, typename R> static inline vecexp1<vec4> cond(bool1 const& c, vecexp1<L> const& l, vecexp1<R> const& r) +{ + return vecexp4<vec4>( Vsel(c.s, l.eval(), r.eval()) ); +} + +static inline float cond(bool c, float const& a, float const& b) +{ + return c ? 
a : b; +} + +static inline int cond(bool c, int const& a, int const& b) +{ + return int(b + (-int(c) & (a - b))); +} + +static inline int cond(bool c, long int const& a, long int const& b) +{ + typedef long int long_int; + return long_int(b + (-long_int(c) & (a - b))); +} + +static inline unsigned long cond(bool c, unsigned long const& a, unsigned long const& b) +{ + return b + (- long(c) & (a - b)); +} + +static inline unsigned int cond(bool c, unsigned int const& a, unsigned int const& b) +{ + return b + (- int(c) & (a - b)); +} + +// De Casteljau construction of bezier +static inline float cubic(float const& a, float const& b, float const& c, float const& d, float const& u) +{ + const float ab = lerp(a,b,u); + const float bc = lerp(b,c,u); + const float cd = lerp(c,d,u); + const float abc = lerp(ab,bc,u); + const float bcd = lerp(bc,cd,u); + return lerp(abc, bcd, u); +} + +static inline float4 cubic(float4 const& a, float4 const& b, float4 const& c, float4 const& d, float4 const& u) +{ + const float4 ab = lerp(a,b,u); + const float4 bc = lerp(b,c,u); + const float4 cd = lerp(c,d,u); + const float4 abc = lerp(ab,bc,u); + const float4 bcd = lerp(bc,cd,u); + return lerp(abc, bcd, u); +} + +static inline float4 cross(float4 const& a, float4 const& b) +{ + return float4(a.yzxw()*b.zxyw() - a.zxyw()*b.yzxw()); +} + +static inline float degrees(float const& rad) +{ + return M_RAD_2_DEGf*rad; +} + +static inline float4 degrees(float4 const& rad) +{ + return float1(M_RAD_2_DEGf)*rad; +} + +static inline float1 degrees(float1 const& rad) +{ + return float1(M_RAD_2_DEGf)*rad; +} + +static inline float1 dot(float4 const& l, float4 const& r) +{ + return float1( Vdot(l.eval(), r.eval()) ); +} + +static inline float1 dot(float4 const& r) +{ + return float1( Vdot(r.eval(), r.eval()) ); +} + +static inline float1 length(float4 const& r) +{ + return float1(Vsqrt( Vdot(r.eval(), r.eval()) )); +} + +static inline float lerp(float const& a, float const& b, float x) +{ + return a + 
x*(b - a); +} + +static inline float1 lerp(float1 const& a, float1 const& b, float1 const& x) +{ + return a + x*(b - a); +} + +static inline float4 lerp(float4 const& a, float4 const& b, float1 const& x) +{ + return a + x*(b - a); +} + +static inline float4 lerp(float4 const& a, float4 const& b, float4 const& x) +{ + return a + x*(b - a); +} + +template <typename T> static inline T maximum(T const& a, T const& b) +{ + return cond(a > b, a, b); +} + +static inline float1 maximum(float4 const& r) +{ + return float1( Vlargest(r.eval()) ); +} + +template <typename T> static inline T minimum(T const& a, T const& b) +{ + return cond(a < b, a, b); +} + +static inline float1 minimum(float4 const& r) +{ + return float1( Vsmallest(r.eval()) ); +} + +static inline float4 normalize(float4 const& r) +{ + return float4( Vmul(r.eval(), Vrsqrt(Vdot(r.eval(), r.eval()) ) )); +} + +static inline float4 pow(float4 const& r, float1 const& e) +{ + float e1 = e.tofloat(); + + return float4( std::pow( r.x().tofloat(), e1), std::pow( r.y().tofloat(), e1), std::pow( r.z().tofloat(), e1), std::pow( r.w().tofloat(), e1)); +} + +static inline float radians(float const& deg) +{ + return M_DEG_2_RADf*deg; +} + +static inline float4 radians(float4 const& deg) +{ + return float1(M_DEG_2_RADf)*deg; +} + +static inline float4 rcp(float4 const& r) +{ + return float4(Vrcp(r.eval())); +} + +static inline float1 rcp(float1 const& r) +{ + return float1(Vrcp(r.eval())); +} + +static inline float4 rsqrt(float4 const& r) +{ + return float4(Vrsqrt(r.eval())); +} + +static inline float saturate(float const& r) +{ + return clamp(r, 0.f, 1.f); +} + +static inline float1 saturate(float1 const& r) +{ + return float1(Vmin( Vmax(r.eval(), Vzero()), Vone())); +} + +static inline float4 saturate(float4 const& r) +{ + return float4(Vmin( Vmax(r.eval(), Vzero()), Vone())); +} + +static inline float4 scaleIdentity() +{ + return float4::one(); +} + +static inline float4 scaleWeight(float4 const& s, float1 const& w) +{ + 
float4 s_abs = math::abs(s); + float4 s_sng = math::sgn(s); + + return s_sng * pow( s_abs, w); +} + +static inline float4 scaleBlend(float4 const& sa, float4 const& sb,float1 const& w) +{ + const float4 saw = scaleWeight(sa, float1::one() - w); + const float4 sbw = scaleWeight(sb, w); + const float4 s_sng = math::sgn( cond( w > float1(.5), sb, sa) ); + return s_sng * math::abs(saw * sbw); +} + +// return -1 if r < 0 +// return 1 if r >= 0 +static inline float sgn(float const& r) +{ + return cond(r >= 0.f, 1.f, -1.f); +} + +// return -1 if r < 0 +// return 1 if r >= 0 +static inline float1 sgn(float1 const& r) +{ + return float1(Vsgn(r.eval())); +} + +// return -1 if r < 0 +// return 1 if r >= 0 +template <typename T> static inline vecexp4<vec4> sgn(vecexp4<T> const& x) +{ + return vecexp4<vec4>( Vsgn( x.eval()) ); +} + +// return -1 if r < 0 +// return 1 if r >= 0 +template <typename T> static inline vecexp1<vec4> sgn(vecexp1<T> const& x) +{ + return vecexp1<vec4>( Vsgn( x.eval() ) ); +} + +// return -1 if r < 0 +// return 0 if r == 0 +// return 1 if r > 0 +static inline float sign(float const& r) +{ + return cond( r > 0, 1.f, cond( r < 0, -1.f, 0.f)); +} + +// return -1 if r < 0 +// return 0 if r == 0 +// return 1 if r > 0 +static inline float4 sign(float4 const& r) +{ + return float4(Vsign(r.eval())); +} + +static inline float4 smoothClamp(float4 const& v, float4 const& m, float1 const& r) +{ + return cond(v-m>float1::zero(),m+r*((v-m)/(v-m+r)),v); +} + +static inline float smoothstep( float min, float max, float x) +{ + x = math::clamp(x, min, max); + return -2.f * math::pow((x-min)/(max-min), 3.f) + 3.f * math::pow((x-min)/(max-min), 2.f); +} + +static inline float smoothpulse( float minmin, float minmax, float maxmin, float maxmax, float x) +{ + return smoothstep(minmin,minmax,x) - smoothstep(maxmin,maxmax,x); +} + +static inline float1 sqrt(float1 const& r) +{ + return float1(Vsqrt(r.eval())); +} + +static inline float4 sqrt(float4 const& r) +{ + return 
float4(Vsqrt(r.eval())); +} + +static inline float1 sum(float4 const& r) +{ + return float1(Vsum(r.eval())); +} + +static inline float1 triangleAngle(math::float1 const& aLen, math::float1 const& aLen1, math::float1 const& aLen2) +{ + math::float1 c = clamp<float1>((aLen1*aLen1 + aLen2*aLen2 - aLen*aLen) / (aLen1*aLen2) / float1(2.f), -float1::one() , float1::one()); + return math::float1(acos(c.tofloat())); +} + +static inline float4 vector(float4 const& v) +{ + constant_float4( mask, 1.f,1.f,1.f,0.f); + return v*mask; +} + +static inline float4 vector(float const& x, float const& y, float const& z) +{ + return float4(x, y, z, 0); +} + +static inline float unrollangle(float angleRef, float angle) +{ + float i; + float f = math::modf( (angleRef-angle)/(2.f*M_PIf), i); + return angle + ( (i+(math::abs(f) > 0.5f ? math::sgn(f) * 1 : 0)) * 2.f * M_PIf); +} + +static inline float4 doubleAtan(float4 const& v) +{ + float ATTRIBUTE_ALIGN(ALIGN4F) av[4]; + + store(v, av); + + return float4(2.0f*atan(av[0]),2.0f*atan(av[1]),2.0f*atan(av[2]),2.0f*atan(av[3])); +} + + +// between range [-pi/2, pi/2] the maximum error is 8.186e-4 +static inline vec4f cos_estimate(vec4f x) +{ + // cos(x) = 1 - (c2*x^2) + (c4*x^4) - (c6*x^6) + // cos(x) = 1 + (-c2*x^2) + (c4*x^4) + (-c6*x^6) // 3 mul and 3 mul add + // let's bake sign into constant to remove some complexity + cvec4fs(c2, -0.5f); + cvec4fs(c4, 4.166666666667e-2f); + cvec4fs(c6, -1.38888889e-3f); + + // Use horner form to reduce the polynomial instruction count + // cos(x) = 1 + x^2*(c2 + x^2*(c4 + x^2*(c6))) // 1 mul and 3 mul add + vec4f x2 = Vmul(x,x); + return Vmadd(Vmadd(Vmadd(c6, x2, c4), x2, c2), x2, Vone()); +} + + +// between range [-pi/2, pi/2] the maximum error is 9.1e-5 +static inline vec4f sin_estimate(vec4f x) +{ + // sin(x) = x - (c3*x^3) + (c5*x^5) - (c7*x^7) + // sin(x) = x + (-c3*x^3) + (c5*x^5) + (-c7*x^7) // 4 mul and 3 mul add + // let's bake sign into constant to remove some complexity + cvec4fs(c3, 
-0.166666567325592041015625f); + cvec4fs(c5, 8.33220803e-3f); + cvec4fs(c7, -1.95168955e-4f); + + // Use horner form to reduce the polynomial instruction count + // sin(x) = x * ( 1 + x^2*(c3 + x^2*(c5 + x^2*c7))) // 2 mul and 3 mul add + vec4f x2 = Vmul(x,x); + return Vmul(x, Vmadd(Vmadd(Vmadd(c7, x2, c5), x2, c3), x2, Vone())); +} + +static inline float4 sin_est(float4 const& x) +{ + return float4( sin_estimate( x.eval() ) ); +} + +static inline float4 cos_est(float4 const& x) +{ + return float4( cos_estimate( x.eval() ) ); +} + +static inline void sincos(float4 const& u, float4& s, float4& c) +{ + float ATTRIBUTE_ALIGN(ALIGN4F) sv[4]; + float ATTRIBUTE_ALIGN(ALIGN4F) cv[4]; + float ATTRIBUTE_ALIGN(ALIGN4F) uv[4]; + + store(u, uv); + + sincos(uv[0], sv[0], cv[0]); + sincos(uv[1], sv[1], cv[1]); + sincos(uv[2], sv[2], cv[2]); + sincos(uv[3], sv[3], cv[3]); + + s = load(sv); + c = load(cv); +} + +static inline void sincos(float1 const& u, float1& s, float1& c) +{ + float sv; + float cv; + + sincos(u.tofloat(), sv, cv); + + s = float1(sv); + c = float1(cv); +} + +static inline void sincos_est(float4 const& u, float4& s, float4& c) +{ + s = sin_est(u); + c = cos_est(u); +} + +static inline void sincos_est(float1 const& u, float1& s, float1& c) +{ + s = float1( sin_estimate( u.eval() ) ); + c = float1( cos_estimate( u.eval() ) ); +} + +static inline float4 tan(float4 const& x) +{ + vec4f x2,x3; + + // Compute x^2 and x^3 + // + x2 = Vmul(x.eval(),x.eval()); + x3 = Vmul(x2,x.eval()); + + // Compute both the sin and cos of the angles + // using a polynomial expression: + // cx = 1.0f + x2 * (C0 * x2 + C1), and + // sx = xl + x3 * S0 + // + cvec4fs(c0, 0.0097099364f); + cvec4fs(c1, -0.4291161787f); + cvec4fs(s0, -0.0957822992f); + + vec4f ct2 = Vmadd(c0,x2,c1); + + vec4f cx = Vmadd(ct2,x2, Vone()); + vec4f sx = Vmadd(s0,x3, x.eval()); + + return float4(Vdiv(sx,cx)); +} + +static inline float4 atan(float4 const& x) +{ + //x - (x^3)/3 + (x^5)/5 - (x^7)/7 + ... 
+ + cvec4fs(c3, 3.f); + cvec4fs(c5, 5.f); + cvec4fs(c7, 7.f); + vec4f x2 = Vmul(x.eval(),x.eval()); + vec4f x3 = Vmul(x2,x.eval()); + vec4f x5 = Vmul(x3,x2); + vec4f x7 = Vmul(x5,x2); + + return float4(Vsub( x.eval(), Vadd(Vdiv( x3, c3), Vsub(Vdiv(x5,c5), Vdiv(x7,c7))))); +} + +static inline float halfTan(float a) +{ + //float x = math::fmod(0.5f*abs(a)+M_PI_2,float(M_PI)); + float x1 = (0.5f*abs(a)+M_PI_2f); + return tan(clamp(sign(a)*(x1-M_PI_2f),-M_PI_2f+M_EPSF,M_PI_2f-M_EPSF)); +} + +static inline float4 halfTan(float4 const& a) +{ + static const float4 nM_PI_2(-M_PI_2f+M_EPSF); + static const float4 pM_PI_2( M_PI_2f+M_EPSF); + + float4 x = float1(0.5f) * abs(a) + float1(M_PI_2f); + return tan( math::clamp<float4>( sign(a) * (x-float4(M_PI_2f)), nM_PI_2, pM_PI_2 )); +} + +static inline float4 mirror(float4 const& t) +{ + constant_float4(mirrorT,-1,1,1,1); + return t * mirrorT; +} + +static inline float4 load(float const* v) +{ + return float4(Vloadpf(v, 0)); +} + +static inline void store(float4 const& v, float* r) +{ + Vstorepf(v.eval(), r, 0); +} + +static inline void store(bool4 const& v, bool* r) +{ + Vstorepb(v.v, r); +} + +} + +#endif + diff --git a/Runtime/Math/Simd/neon.h b/Runtime/Math/Simd/neon.h new file mode 100644 index 0000000..08196f9 --- /dev/null +++ b/Runtime/Math/Simd/neon.h @@ -0,0 +1,548 @@ +#ifndef SIMD_NEON_H +#define SIMD_NEON_H + +#include <arm_neon.h> + +typedef float32x4_t vec4f; +typedef float32x4_t vec4fs; +typedef uint32x4_t vec4b; +typedef uint32x4_t vec4bs; + +#define SWZ_MASK(x, y, z, w) (((w) << 6) | ((z) << 4) | ((y) << 2) | ((x))) +#define SWZ_X(MASK) (((MASK) >> 0) & 3) +#define SWZ_Y(MASK) (((MASK) >> 2) & 3) +#define SWZ_Z(MASK) (((MASK) >> 4) & 3) +#define SWZ_W(MASK) (((MASK) >> 6) & 3) + +//VPERMWI_CONST(x, y, z, w) +#if UNITY_WINRT +#define cvec4f(name, x,y,z,w) static const vec4f name = Vload4sf(x, y, z, w) +#define cvec4b(name, x,y,z,w) static const vec4b name = Vload4sb(x, y, z, w) +#define cvec4fs(name, s) static 
const vec4fs name = Vloadsf(s)
#else
#define cvec4f(name, x,y,z,w) static const vec4f name = {(x),(y),(z),(w)}
#define cvec4b(name, x,y,z,w) static const vec4b name = {(x),(y),(z),(w)}
#define cvec4fs(name, s) static const vec4fs name = {(s),(s),(s),(s)}
#endif

// Swizzle selectors: SWZ_MASK packs four lane indices into one constant.
enum simd_mask
{
	kXYZW = SWZ_MASK(0,1,2,3),
	kXXXX = SWZ_MASK(0,0,0,0),
	kYYYY = SWZ_MASK(1,1,1,1),
	kZZZZ = SWZ_MASK(2,2,2,2),
	kWWWW = SWZ_MASK(3,3,3,3),

	kXWYZ = SWZ_MASK(0,3,1,2),
	kXZWY = SWZ_MASK(0,2,3,1),

	kYZWX = SWZ_MASK(1,2,3,0),
	kYXZW = SWZ_MASK(1,0,2,3),
	kYWZX = SWZ_MASK(1,3,2,0),
	kYZXW = SWZ_MASK(1,2,0,3),
	kYXWZ = SWZ_MASK(1,0,3,2),

	kZWXY = SWZ_MASK(2,3,0,1),
	kZYXW = SWZ_MASK(2,1,0,3),
	kZYWX = SWZ_MASK(2,1,3,0),
	kZXYW = SWZ_MASK(2,0,1,3),

	kWYZX = SWZ_MASK(3,1,2,0),
	kWXZY = SWZ_MASK(3,0,2,1),
	kWYXZ = SWZ_MASK(3,1,0,2),
	kWWWZ = SWZ_MASK(3,3,3,2),
	kWWZZ = SWZ_MASK(3,3,2,2),
	kWZYX = SWZ_MASK(3,2,1,0),
};

#define Vzero() vdupq_n_f32(0.0f)
#define Vone() vdupq_n_f32(1.0f)

#define Vfalse() vdupq_n_u32(0)
// Fix: an all-true boolean mask must be an all-bits-set uint32x4_t (vec4b),
// consistent with Vfalse(); the original vdupq_n_f32(0xFFFFFFFF) produced a
// FLOAT vector holding 4294967295.0f — wrong type and wrong bit pattern.
#define Vtrue() vdupq_n_u32(0xFFFFFFFF)

// Scratch union used by the swizzle specializations to view a quad as
// 2x2 float pairs, a full float32x4, raw bytes, or 4 scalars.
union U { float32x2x2_t f2x2; float32x4_t f4; uint8x8x2_t b8x2; float32_t f[4]; };

// Derives lhs() (write-swizzle) from a specialization's rhs() (read-swizzle):
// swizzle l, splice in lane 0 of r, then swizzle back.
#define LHS_FUNTION() \
	static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r)\
	{\
		vec4f m = Vmove(rhs(l), r);\
		return rhs(m);\
	}

//#define Vpermute(v, mask) v
//#define Vmove(l, r) vextq_f32(l, r, 0)
// Replace lane 0 of l with lane 0 of r, keep the other lanes of l.
MECANIM_FORCE_INLINE vec4f Vmove(vec4f l, vec4f r)
{
	uint32x4_t sel = Vfalse();
	sel = vsetq_lane_u32(0xFFFFFFFF,sel,0);
	return vbslq_f32(sel, r, l);
}
template<int SWZ> struct Vswizzle;
/*
{
	static MECANIM_FORCE_INLINE vec4f rhs(vec4f r)
	{
		::uint32_t lanes[4];
		uint32x4_t u = vreinterpretq_u32_f32(r);
		uint32x4_t result;

		lanes[0] = vgetq_lane_u32(u, 0);
		lanes[1] = vgetq_lane_u32(u, 1);
		lanes[2] = vgetq_lane_u32(u, 2);
		lanes[3] = vgetq_lane_u32(u, 3);

		result = vdupq_n_u32(lanes[SWZ_X(SWZ)]);
		result = vsetq_lane_u32(lanes[SWZ_Y(SWZ)], result, 1);
		result =
vsetq_lane_u32(lanes[SWZ_Z(SWZ)], result, 2); + result = vsetq_lane_u32(lanes[SWZ_W(SWZ)], result, 3); + + return vreinterpretq_f32_u32(result); + } + + static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r) + { + vec4f m = Vmove(Vswizzle<SWZ>::rhs(l), r); + return Vswizzle<SWZ>::rhs(m); + } +}; +*/ + +template<> struct Vswizzle<kXYZW> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) + { + return r; + } + static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r) + { + return Vmove(l, r); + } +}; +template<> struct Vswizzle<kXXXX> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) + { + return vdupq_lane_f32(vget_low_f32(r),0); + } + + LHS_FUNTION() +}; +template<> struct Vswizzle<kYYYY> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) + { + return vdupq_lane_f32(vget_low_f32(r),1); + } + + LHS_FUNTION() +}; +template<> struct Vswizzle<kZZZZ> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) + { + return vdupq_lane_f32(vget_high_f32(r),0); + } + LHS_FUNTION() +}; +template<> struct Vswizzle<kWWWW> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) + { + return vdupq_lane_f32(vget_high_f32(r),1); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kXWYZ> + { + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + U u; u.f2x2 = vtrn_f32(vget_low_f32(p), vrev64_f32(vget_high_f32(p))); return u.f4; + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kXZWY> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0], vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1])); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kYZWX> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vreinterpretq_f32_u32(vextq_u32(vreinterpretq_u32_f32(p), vreinterpretq_u32_f32(p), 1)); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kYXZW> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vrev64_f32(vget_low_f32(p)), vget_high_f32(p)); + } + 
LHS_FUNTION() +}; + +template<> struct Vswizzle<kYWZX> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1], vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0])); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kYZXW> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + U u; + u.f2x2 = vtrn_f32(vrev64_f32(vget_low_f32(p)), vget_high_f32(p)); + return u.f4; + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kYXWZ> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vrev64q_f32(p); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kZWXY> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vget_high_f32(p), vget_low_f32(p)); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kZYXW> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vrev64_f32(vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_low_f32(p)), vreinterpret_u32_f32(vget_high_f32(p)), 1))), vrev64_f32(vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_high_f32(p)), vreinterpret_u32_f32(vget_low_f32(p)), 1)))); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kZYWX> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vrev64_f32(vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_low_f32(p)), vreinterpret_u32_f32(vget_high_f32(p)), 1))), vreinterpret_f32_u32(vext_u32(vreinterpret_u32_f32(vget_high_f32(p)), vreinterpret_u32_f32(vget_low_f32(p)), 1))); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kZXYW> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0]), vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1]); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kWYZX> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + U u; + u.f4 = vrev64q_f32(p); + u.f2x2 = vtrn_f32(u.f2x2.val[1], 
u.f2x2.val[0]); + return u.f4; + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kWXZY> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + U u; u.f2x2 = vtrn_f32(vrev64_f32(vget_high_f32(p)), vget_low_f32(p)); return u.f4; + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kWYXZ> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vrev64_f32(vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[1]), vtrn_f32(vget_low_f32(p), vget_high_f32(p)).val[0]); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kWWWZ> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vdup_lane_f32(vget_high_f32(p), 1), vrev64_f32(vget_high_f32(p))); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kWWZZ> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + U u; u.f2x2 = vtrn_f32(vget_high_f32(p), vget_high_f32(p)); + return vreinterpretq_f32_u32(vextq_u32(vreinterpretq_u32_f32(u.f4), vreinterpretq_u32_f32(u.f4), 2)); + } + LHS_FUNTION() +}; + +template<> struct Vswizzle<kWZYX> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f p) + { + return vcombine_f32(vrev64_f32(vget_high_f32(p)), vrev64_f32(vget_low_f32(p))); + } + LHS_FUNTION() +}; + +static MECANIM_FORCE_INLINE float Vstoresf(vec4f r) +{ + return vgetq_lane_f32(r, 0); +} + +static MECANIM_FORCE_INLINE bool Vstoresb(vec4b r) +{ + return (vgetq_lane_u32(r, 0) > 0) ? true : false; +} + +// Aligned store +#define Vstorepf(v, base, offset) vst1q_f32((float32_t*)((base)+(offset)),v); + +static MECANIM_FORCE_INLINE void Vstorepb(const vec4b v, bool* r) +{ + ::uint32_t u; + vst1q_lane_u32(&u, v, 0); + r[0] = (u > 0) ? true : false; + vst1q_lane_u32(&u, v, 1); + r[1] = (u > 0) ? true : false; + vst1q_lane_u32(&u, v, 2); + r[2] = (u > 0) ? true : false; + vst1q_lane_u32(&u, v, 3); + r[3] = (u > 0) ? 
true : false; +} + +static MECANIM_FORCE_INLINE vec4f Vloadsf(float s) +{ + return vmovq_n_f32(s); +} + +static MECANIM_FORCE_INLINE vec4b Vloadsb(bool s) +{ + const ::uint32_t false_true[2] = { 0, 0xFFFFFFFF }; + return vdupq_n_u32(false_true[s ? 1 : 0]); +} + +static MECANIM_FORCE_INLINE vec4f Vload4sf(float x, float y, float z, float w) +{ + float32x4_t result; + result = vdupq_n_f32(x); + result = vsetq_lane_f32(y, result, 1); + result = vsetq_lane_f32(z, result, 2); + result = vsetq_lane_f32(w, result, 3); + return result; +} + +static MECANIM_FORCE_INLINE vec4b Vload4sb(bool x, bool y, bool z, bool w) +{ + const ::uint32_t val[4] = + { + x ? 0xffffffff : 0x00, + y ? 0xffffffff : 0x00, + z ? 0xffffffff : 0x00, + w ? 0xffffffff : 0x00 + }; + + return vld1q_u32(&val[0]); +} + +static MECANIM_FORCE_INLINE vec4f Vloadpf(float const* buf, int offset) +{ + return vld1q_f32((float32_t const*)buf + offset); +} + +#define Vadd(l, r) vaddq_f32(l, r) +#define Vsub(l, r) vsubq_f32(l, r) +#define Vmul(l, r) vmulq_f32(l, r) + + +// return a*b+c : be aware that vmlaq does a+b*c +#define Vmadd(a, b, c) vmlaq_f32(c, a, b) +// return a*b-c : be aware that vmlaq does a-b*c +#define Vmsub(a, b, c) Vneg(vmlsq_f32(c, a, b)) + +static MECANIM_FORCE_INLINE vec4f Vneg(vec4f r) +{ + uint32x4_t sign_constant = vdupq_n_u32(0x80000000); + uint32x4_t negated = veorq_u32(vreinterpretq_u32_f32(r), sign_constant); + return vreinterpretq_f32_u32(negated); +} + +// vector sgn: return -1, 1 +static MECANIM_FORCE_INLINE vec4f Vsgn(vec4f r) +{ + uint32x4_t sign_constant = vdupq_n_u32(0x80000000); + uint32x4_t signs = vandq_u32(vreinterpretq_u32_f32(r), sign_constant); + uint32x4_t ones = vdupq_n_u32 (0x3f800000); + + return vreinterpretq_f32_u32(vorrq_u32(signs,ones)); +/* float32x4_t ones = Vone(); + float32x4_t nones = Vneg(ones); + uint32x4_t cmp = vcltq_f32(r,Vzero()); + return vbslq_f32(cmp,nones,ones);*/ +} + +// vector sgn: return -1, 0, 1 +static MECANIM_FORCE_INLINE vec4f Vsign(vec4f r) 
+{ + uint32x4_t sign_constant = vdupq_n_u32(0x80000000); + uint32x4_t signs = vandq_u32(vreinterpretq_u32_f32(r), sign_constant); + uint32x4_t ones = vdupq_n_u32 (0x3f800000); + + return vreinterpretq_f32_u32(vorrq_u32( signs, vandq_u32( vmvnq_u32( vceqq_f32( r, Vzero())), ones))); +} + +#define Vinc(r) Vadd( (r), Vone()) +#define Vdec(r) Vsub( (r), Vone()) + +static MECANIM_FORCE_INLINE vec4f Vabs(vec4f r) +{ + return vabsq_f32(r); +} + +#define Vmax( l, r) vmaxq_f32(l, r) +#define Vmin( l, r) vminq_f32(l, r) + +static MECANIM_FORCE_INLINE vec4fs Vlargest(vec4f r) +{ + float32x2_t temp = vpmax_f32 ( vget_high_f32(r), vget_low_f32(r) ); + temp = vpmax_f32(temp, temp); + return vcombine_f32(temp,temp); +} + +static MECANIM_FORCE_INLINE vec4fs Vsmallest(vec4f r) +{ + float32x2_t temp = vpmin_f32 ( vget_high_f32(r), vget_low_f32(r) ); + temp = vpmin_f32(temp, temp); + return vcombine_f32(temp,temp); +} + +static MECANIM_FORCE_INLINE vec4fs Vsum(vec4f r) +{ + float32x2_t temp = vpadd_f32 ( vget_high_f32(r), vget_low_f32(r) ); + temp = vpadd_f32(temp, temp); + return vcombine_f32(temp,temp); +} + +#define Vdot( l, r) Vsum( Vmul((l), (r)) ) + +static MECANIM_FORCE_INLINE vec4f Vrsqrt(vec4f r) +{ + float32x4_t e = vrsqrteq_f32(r); + float32x4_t s = vmulq_f32(e, r); + float32x4_t v = vrsqrtsq_f32(s, e); + + e = vmulq_f32(e,v); + s = vmulq_f32(e, r); + v = vrsqrtsq_f32(s, e); + + return vmulq_f32(e,v); +} + +static MECANIM_FORCE_INLINE vec4f Vrcp(vec4f r) +{ + cvec4fs(C0,-3.402823466e+38f); + cvec4fs(C1, 3.402823466e+38f); + + float32x4_t R0 = vrecpeq_f32(r); + R0 = vmaxq_f32(R0, C0); + R0 = vminq_f32(R0, C1); + + float32x4_t R1 = vrecpsq_f32(r, R0); + R0 = vmulq_f32(R0, R1); + R0 = vmaxq_f32(R0, C0); + R0 = vminq_f32(R0, C1); + R1 = vrecpsq_f32(r, R0); + return vmulq_f32(R0, R1); + + //float32x4_t inv = vrecpeq_f32(r); + //float32x4_t step = vrecpsq_f32(r, inv); + //return vmulq_f32(step, inv); +} + +static MECANIM_FORCE_INLINE vec4f Vdiv(const vec4f l, const vec4f r) +{ + 
return Vmul(l, Vrcp(r));
}

// Gathers the w lane of x,y,z,w... actually builds a quad from the HIGH halves
// via transpose pairs; used to merge per-lane results back into one vector.
static MECANIM_FORCE_INLINE vec4f Vcombine(vec4f x, vec4f y, vec4f z, vec4f w)
{
	float32x2x2_t temp1 = vtrn_f32(vget_high_f32(x), vget_high_f32(y));
	float32x2x2_t temp2 = vtrn_f32(vget_high_f32(z), vget_high_f32(w));
	return vcombine_f32(temp1.val[0], temp2.val[0]);
}

// Vector comparison
#define Vcmpeq( a, b) vceqq_f32(a, b)
#define Vcmpneq( a, b) Vnot(vceqq_f32(a, b))
#define Vcmpgt( a, b) vcgtq_f32(a, b)
#define Vcmpge( a, b) vcgeq_f32(a, b)
#define Vcmplt( a, b) vcltq_f32(a, b)
#define Vcmple( a, b) vcleq_f32(a, b)

// Lane-wise select: a where mask c is set, b elsewhere.
static MECANIM_FORCE_INLINE vec4f Vsel(vec4b c, vec4f a, vec4f b)
{
	return vbslq_f32(c, a, b);
}

// sqrt(r) = r * 1/sqrt(r), with the zero lanes forced to 0 to avoid 0 * inf = NaN.
#define Vsqrt(r) Vsel( Vcmpeq(r, Vzero()), Vzero(), Vmul(r,Vrsqrt(r)))

// vector logics
#define Vnot(r) vmvnq_u32(r)
#define Vxnor(a, b) Vnot(veorq_u32(a, b))
#define Vxor(a, b) veorq_u32(a, b)
#define Vand(a, b) vandq_u32(a, b)
#define Vor(a, b) vorrq_u32(a, b)

// True only if every lane of the mask is set (lanes are 0 or 0xFFFFFFFF,
// so the bitwise AND is nonzero iff all lanes are true).
static MECANIM_FORCE_INLINE bool Vall(const vec4b a)
{
	::uint32_t u[4];

	vst1q_lane_u32(&u[0], a, 0);
	vst1q_lane_u32(&u[1], a, 1);
	vst1q_lane_u32(&u[2], a, 2);
	vst1q_lane_u32(&u[3], a, 3);

	return (u[0] & u[1] & u[2] & u[3]);
}
// Fix: dropped the stray ';' after the function body (empty declaration
// at namespace scope); same below for Vany.

// True if any lane of the mask is set.
static MECANIM_FORCE_INLINE bool Vany(const vec4b a)
{
	::uint32_t u[4];

	vst1q_lane_u32(&u[0], a, 0);
	vst1q_lane_u32(&u[1], a, 1);
	vst1q_lane_u32(&u[2], a, 2);
	vst1q_lane_u32(&u[3], a, 3);

	return (u[0] | u[1] | u[2] | u[3]);
}

#endif
diff --git a/Runtime/Math/Simd/ppu.h b/Runtime/Math/Simd/ppu.h
new file mode 100644
index 0000000..bfa8832
--- /dev/null
+++ b/Runtime/Math/Simd/ppu.h
@@ -0,0 +1,1944 @@
#ifndef SIMD_PPU
#define SIMD_PPU

#include "Runtime/Math/Simd/SimdMath.h"
#define USE_WPERMWI_EQUIVALENT 1
typedef vec_float4 vec4f;
typedef vec_bint4 vec4b;
typedef vec_bint4 vec4bs;


#if USE_WPERMWI_EQUIVALENT
#	define SWZ_MASK(x, y, z, w) (((x&3)<<6) | ((y&3)<<4) | ((z&3)<<2) | (w&3))
#else
#	define SWZ_MASK(x, y, z, w) (((x)<<24) | ((y)<<16) | \
((z)<<8) | (w)) +#endif + +#define cvec4f(name, x,y,z,w) static const vec4f name = {(x),(y),(z),(w)} +#define cvec4b(name, x,y,z,w) static const vec4b name = {(x),(y),(z),(w)} +#define cvec4fs(name, s) static const vec4f name = {(s),(s),(s),(s)} + +enum simd_mask +{ + kXYZW = SWZ_MASK(0,1,2,3), + kXXXX = SWZ_MASK(0,0,0,0), + kYYYY = SWZ_MASK(1,1,1,1), + kZZZZ = SWZ_MASK(2,2,2,2), + kWWWW = SWZ_MASK(3,3,3,3), + + kXWYZ = SWZ_MASK(0,3,1,2), + kXZWY = SWZ_MASK(0,2,3,1), + + kYZWX = SWZ_MASK(1,2,3,0), + kYXZW = SWZ_MASK(1,0,2,3), + kYWZX = SWZ_MASK(1,3,2,0), + kYZXW = SWZ_MASK(1,2,0,3), + kYXWZ = SWZ_MASK(1,0,3,2), + + kZWXY = SWZ_MASK(2,3,0,1), + kZYXW = SWZ_MASK(2,1,0,3), + kZYWX = SWZ_MASK(2,1,3,0), + kZXYW = SWZ_MASK(2,0,1,3), + + kWYZX = SWZ_MASK(3,1,2,0), + kWXZY = SWZ_MASK(3,0,2,1), + kWYXZ = SWZ_MASK(3,1,0,2), + kWWWZ = SWZ_MASK(3,3,3,2), + kWWZZ = SWZ_MASK(3,3,2,2), + kWZYX = SWZ_MASK(3,2,1,0), +}; + +#define Vzero() __vzero() +#define Vone() vec_ctf(vec_splat_u32(1), 0) + + +#if USE_WPERMWI_EQUIVALENT +# define Vpermute(v, mask) __vpermwi2<mask>( (v) ) +#else +# define Vpermute(v, mask) __vpermwi3( (v), (mask) ) +#endif + +#if USE_WPERMWI_EQUIVALENT + +template <const int i> +vec_float4 __vpermwi2(vec_float4 v0a) +{ +#if 1 + if (i == SWZ_MASK(0,0,0,0)) + { + return vec_splat( v0a, 0 ); + } + else if (i == SWZ_MASK(1,0,0,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v1a, v2a, 12 ); + } + else if (i == SWZ_MASK(2,0,0,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v1a, v2a, 12 ); + } + else if (i == SWZ_MASK(3,0,0,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + return vec_sld( v0a, v1a, 12 ); + } + else if (i == SWZ_MASK(0,1,0,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v1a, v2a, 8 ); + } + else if (i == SWZ_MASK(1,1,0,0)) + { + vec_float4 v1a = vec_mergeh( v0a, v0a ); + 
return vec_sld( v1a, v1a, 8 ); + } + else if (i == SWZ_MASK(2,1,0,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + vec_float4 v3a = vec_mergeh( v1a, v0a ); + return vec_sld( v3a, v2a, 8 ); + } + else if (i == SWZ_MASK(3,1,0,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v0a, v1a ); + return vec_sld( v2a, v0a, 4 ); + } + else if (i == SWZ_MASK(0,2,0,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_mergeh( v1a, v2a ); + } + else if (i == SWZ_MASK(1,2,0,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v1a, v2a, 8 ); + } + else if (i == SWZ_MASK(2,2,0,0)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v1a, v2a, 8 ); + } + else if (i == SWZ_MASK(3,2,0,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_sld( v1a, v0a, 12 ); + return vec_mergeh( v1a, v2a ); + } + else if (i == SWZ_MASK(0,3,0,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergel( v1a, v2a ); + } + else if (i == SWZ_MASK(1,3,0,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v2a, 12 ); + } + else if (i == SWZ_MASK(2,3,0,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + return vec_sld( v0a, v1a, 8 ); + } + else if (i == SWZ_MASK(3,3,0,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + return vec_mergel( v1a, v1a ); + } + else if (i == SWZ_MASK(0,0,1,0)) + { + vec_float4 v1a = vec_mergeh( v0a, v0a ); + return vec_mergeh( v0a, v1a ); + } + else if (i == SWZ_MASK(1,0,1,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_mergel( v2a, v1a ); + } + else if (i == SWZ_MASK(2,0,1,0)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v1a, v0a, 8 ); + 
return vec_sld( v2a, v0a, 4 ); + } + else if (i == SWZ_MASK(3,0,1,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + return vec_sld( v1a, v0a, 4 ); + } + else if (i == SWZ_MASK(0,1,1,0)) + { + vec_float4 v1a = vec_mergeh( v0a, v0a ); + return vec_sld( v1a, v0a, 4 ); + } + else if (i == SWZ_MASK(1,1,1,0)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + return vec_sld( v1a, v0a, 4 ); + } + else if (i == SWZ_MASK(2,1,1,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v2a, 8 ); + } + else if (i == SWZ_MASK(3,1,1,0)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v1a, v0a, 8 ); + return vec_sld( v0a, v2a, 12 ); + } + else if (i == SWZ_MASK(0,2,1,0)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v1a, v0a, 12 ); + return vec_mergeh( v0a, v2a ); + } + else if (i == SWZ_MASK(1,2,1,0)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v1a, v0a, 12 ); + return vec_sld( v2a, v2a, 8 ); + } + else if (i == SWZ_MASK(2,2,1,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + vec_float4 v3a = vec_sld( v1a, v0a, 12 ); + return vec_sld( v2a, v3a, 8 ); + } + else if (i == SWZ_MASK(3,2,1,0)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v1a, v0a, 12 ); + vec_float4 v3a = vec_mergel( v0a, v2a ); + return vec_sld( v3a, v2a, 8 ); + } + else if (i == SWZ_MASK(0,3,1,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + return vec_mergeh( v0a, v1a ); + } + else if (i == SWZ_MASK(1,3,1,0)) + { + vec_float4 v1a = vec_mergeh( v0a, v0a ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergel( v1a, v2a ); + } + else if (i == SWZ_MASK(2,3,1,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v0a, v2a, 8 ); + } + else if (i == SWZ_MASK(3,3,1,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_sld( v0a, 
v1a, 4 ); + return vec_mergel( v2a, v1a ); + } + else if (i == SWZ_MASK(0,0,2,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v1a, v2a, 12 ); + } + else if (i == SWZ_MASK(1,0,2,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v2a, v1a ); + } + else if (i == SWZ_MASK(2,0,2,0)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_mergel( v1a, v2a ); + } + else if (i == SWZ_MASK(3,0,2,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v0a, v2a, 12 ); + } + else if (i == SWZ_MASK(0,1,2,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + return vec_sld( v1a, v0a, 4 ); + } + else if (i == SWZ_MASK(1,1,2,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v0a, v1a ); + return vec_sld( v2a, v0a, 4 ); + } + else if (i == SWZ_MASK(2,1,2,0)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v2a, 8 ); + } + else if (i == SWZ_MASK(3,1,2,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v2a, v2a, 8 ); + } + else if (i == SWZ_MASK(0,2,2,0)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + vec_float4 v3a = vec_sld( v1a, v0a, 8 ); + return vec_sld( v2a, v3a, 12 ); + } + else if (i == SWZ_MASK(1,2,2,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v1a, v1a ); + return vec_sld( v2a, v0a, 4 ); + } + else if (i == SWZ_MASK(2,2,2,0)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + return vec_sld( v1a, v0a, 4 ); + } + else if (i == SWZ_MASK(3,2,2,0)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + vec_float4 v2a = vec_sld( v0a, v1a, 8 ); + return vec_sld( v2a, v0a, 4 ); + } + else if (i == SWZ_MASK(0,3,2,0)) + { + vec_float4 v1a = vec_sld( v0a, 
v0a, 4 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v1a, v2a, 12 ); + } + else if (i == SWZ_MASK(1,3,2,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_sld( v0a, v0a, 12 ); + return vec_mergel( v2a, v1a ); + } + else if (i == SWZ_MASK(2,3,2,0)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergel( v1a, v2a ); + } + else if (i == SWZ_MASK(3,3,2,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v0a, v2a, 12 ); + } + else if (i == SWZ_MASK(0,0,3,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v2a, v1a, 8 ); + } + else if (i == SWZ_MASK(1,0,3,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v0a, v1a ); + return vec_sld( v2a, v1a, 8 ); + } + else if (i == SWZ_MASK(2,0,3,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + return vec_mergel( v0a, v1a ); + } + else if (i == SWZ_MASK(3,0,3,0)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_mergel( v2a, v1a ); + } + else if (i == SWZ_MASK(0,1,3,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_sld( v0a, v0a, 12 ); + return vec_sld( v1a, v2a, 8 ); + } + else if (i == SWZ_MASK(1,1,3,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_sld( v2a, v1a, 8 ); + } + else if (i == SWZ_MASK(2,1,3,0)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v1a, v0a, 4 ); + return vec_mergel( v0a, v2a ); + } + else if (i == SWZ_MASK(3,1,3,0)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v2a, 8 ); + } + else if (i == SWZ_MASK(0,2,3,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v2a, v1a, 12 ); + } + else if (i == 
SWZ_MASK(1,2,3,0)) + { + return vec_sld( v0a, v0a, 4 ); + } + else if (i == SWZ_MASK(2,2,3,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v2a, v1a, 12 ); + } + else if (i == SWZ_MASK(3,2,3,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + return vec_sld( v0a, v1a, 12 ); + } + else if (i == SWZ_MASK(0,3,3,0)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v1a, v1a ); + return vec_sld( v1a, v2a, 12 ); + } + else if (i == SWZ_MASK(1,3,3,0)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_sld( v1a, v0a, 8 ); + return vec_sld( v2a, v2a, 12 ); + } + else if (i == SWZ_MASK(2,3,3,0)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + return vec_sld( v1a, v0a, 4 ); + } + else if (i == SWZ_MASK(3,3,3,0)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + return vec_sld( v1a, v0a, 4 ); + } + else if (i == SWZ_MASK(0,0,0,1)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + return vec_sld( v1a, v0a, 8 ); + } + else if (i == SWZ_MASK(1,0,0,1)) + { + vec_float4 v1a = vec_mergeh( v0a, v0a ); + return vec_sld( v1a, v1a, 12 ); + } + else if (i == SWZ_MASK(2,0,0,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v0a, v0a ); + return vec_sld( v1a, v2a, 12 ); + } + else if (i == SWZ_MASK(3,0,0,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + return vec_mergeh( v1a, v0a ); + } + else if (i == SWZ_MASK(0,1,0,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + return vec_sld( v1a, v0a, 8 ); + } + else if (i == SWZ_MASK(1,1,0,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + return vec_sld( v1a, v0a, 8 ); + } + else if (i == SWZ_MASK(2,1,0,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v0a, 8 ); + } + else if (i == SWZ_MASK(3,1,0,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergel( v2a, v1a ); + } + else if (i == 
SWZ_MASK(0,2,0,1)) + { + vec_float4 v1a = vec_mergeh( v0a, v0a ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_mergeh( v1a, v2a ); + } + else if (i == SWZ_MASK(1,2,0,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + return vec_sld( v1a, v0a, 8 ); + } + else if (i == SWZ_MASK(2,2,0,1)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + return vec_sld( v1a, v0a, 8 ); + } + else if (i == SWZ_MASK(3,2,0,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v2a, v0a, 8 ); + } + else if (i == SWZ_MASK(0,3,0,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v2a, v1a, 12 ); + } + else if (i == SWZ_MASK(1,3,0,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_sld( v2a, v1a, 12 ); + } + else if (i == SWZ_MASK(2,3,0,1)) + { + return vec_sld( v0a, v0a, 8 ); + } + else if (i == SWZ_MASK(3,3,0,1)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + return vec_sld( v1a, v0a, 8 ); + } + else if (i == SWZ_MASK(0,0,1,1)) + { + return vec_mergeh( v0a, v0a ); + } + else if (i == SWZ_MASK(1,0,1,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + return vec_mergeh( v1a, v0a ); + } + else if (i == SWZ_MASK(2,0,1,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v0a, v1a ); + return vec_sld( v2a, v2a, 12 ); + } + else if (i == SWZ_MASK(3,0,1,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_sld( v1a, v2a, 4 ); + } + else if (i == SWZ_MASK(0,1,1,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + return vec_mergeh( v0a, v1a ); + } + else if (i == SWZ_MASK(1,1,1,1)) + { + return vec_splat( v0a, 1 ); + } + else if (i == SWZ_MASK(2,1,1,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_sld( v1a, v2a, 12 ); + } + else if (i == SWZ_MASK(3,1,1,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 
); + return vec_sld( v0a, v1a, 12 ); + } + else if (i == SWZ_MASK(0,2,1,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v1a, 4 ); + } + else if (i == SWZ_MASK(1,2,1,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_sld( v1a, v2a, 8 ); + } + else if (i == SWZ_MASK(2,2,1,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v2a, v1a, 8 ); + } + else if (i == SWZ_MASK(3,2,1,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + vec_float4 v3a = vec_mergel( v0a, v1a ); + return vec_sld( v3a, v2a, 8 ); + } + else if (i == SWZ_MASK(0,3,1,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v1a, 12 ); + return vec_mergeh( v0a, v2a ); + } + else if (i == SWZ_MASK(1,3,1,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v1a, 4 ); + return vec_mergel( v1a, v2a ); + } + else if (i == SWZ_MASK(2,3,1,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + return vec_sld( v0a, v1a, 8 ); + } + else if (i == SWZ_MASK(3,3,1,1)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_sld( v1a, v2a, 8 ); + } + else if (i == SWZ_MASK(0,0,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_mergeh( v2a, v0a ); + } + else if (i == SWZ_MASK(1,0,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v1a, v0a ); + } + else if (i == SWZ_MASK(2,0,2,1)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + return vec_mergeh( v1a, v0a ); + } + else if (i == SWZ_MASK(3,0,2,1)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v0a, v1a, 12 ); + return vec_mergeh( v2a, v0a ); + } + else if (i == SWZ_MASK(0,1,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + 
return vec_sld( v1a, v2a, 4 ); + } + + else if (i == SWZ_MASK(1,1,2,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v2a, v1a ); + } + + else if (i == SWZ_MASK(2,1,2,1)) + { + vec_float4 v1a = vec_mergeh( v0a, v0a ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(3,1,2,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v0a, v2a, 12 ); + } + + else if (i == SWZ_MASK(0,2,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + vec_float4 v3a = vec_sld( v2a, v1a, 8 ); + return vec_sld( v1a, v3a, 12 ); + } + + else if (i == SWZ_MASK(1,2,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v1a, v1a ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(2,2,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(3,2,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v2a, v2a, 8 ); + } + + else if (i == SWZ_MASK(0,3,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + vec_float4 v3a = vec_sld( v1a, v2a, 8 ); + return vec_sld( v3a, v1a, 4 ); + } + + else if (i == SWZ_MASK(1,3,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_sld( v0a, v1a, 12 ); + return vec_mergeh( v1a, v2a ); + } + + else if (i == SWZ_MASK(2,3,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v0a, v2a, 8 ); + } + + else if (i == SWZ_MASK(3,3,2,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + vec_float4 v3a = vec_mergel( v0a, v1a ); + return vec_sld( v2a, v3a, 8 ); + } + + else if (i == SWZ_MASK(0,0,3,1)) + { + 
vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v0a, v1a ); + return vec_sld( v2a, v2a, 12 ); + } + + else if (i == SWZ_MASK(1,0,3,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(2,0,3,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + return vec_mergel( v0a, v1a ); + } + + else if (i == SWZ_MASK(3,0,3,1)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + return vec_mergeh( v1a, v0a ); + } + + else if (i == SWZ_MASK(0,1,3,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + vec_float4 v3a = vec_mergel( v2a, v1a ); + return vec_sld( v3a, v3a, 8 ); + } + + else if (i == SWZ_MASK(1,1,3,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v1a, 12 ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(2,1,3,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + return vec_mergel( v0a, v1a ); + } + + else if (i == SWZ_MASK(3,1,3,1)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_mergel( v1a, v2a ); + } + + else if (i == SWZ_MASK(0,2,3,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_sld( v0a, v1a, 8 ); + return vec_sld( v1a, v2a, 12 ); + } + + else if (i == SWZ_MASK(1,2,3,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + return vec_sld( v0a, v1a, 4 ); + } + + else if (i == SWZ_MASK(2,2,3,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v2a, v2a, 4 ); + } + + else if (i == SWZ_MASK(3,2,3,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v1a, 8 ); + return vec_sld( v0a, v2a, 12 ); + } + + else if (i == SWZ_MASK(0,3,3,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + vec_float4 v3a = vec_sld( v0a, v1a, 4 ); + return vec_mergel( v2a, v3a ); + } + + else if (i 
== SWZ_MASK(1,3,3,1)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v1a, 12 ); + vec_float4 v3a = vec_mergel( v1a, v0a ); + return vec_sld( v3a, v2a, 8 ); + } + + else if (i == SWZ_MASK(2,3,3,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v0a, v0a ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(3,3,3,1)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(0,0,0,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(1,0,0,2)) + { + vec_float4 v1a = vec_mergeh( v0a, v0a ); + vec_float4 v2a = vec_splat( v0a, 2 ); + vec_float4 v3a = vec_sld( v1a, v1a, 8 ); + return vec_sld( v3a, v2a, 4 ); + } + + else if (i == SWZ_MASK(2,0,0,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v1a, v0a, 4 ); + vec_float4 v3a = vec_mergeh( v0a, v1a ); + return vec_sld( v2a, v3a, 8 ); + } + + else if (i == SWZ_MASK(3,0,0,2)) + { + vec_float4 v1a = vec_mergeh( v0a, v0a ); + vec_float4 v2a = vec_sld( v0a, v1a, 8 ); + return vec_sld( v2a, v2a, 4 ); + } + + else if (i == SWZ_MASK(0,1,0,2)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v1a, v2a ); + } + + else if (i == SWZ_MASK(1,1,0,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v2a, 12 ); + } + + else if (i == SWZ_MASK(2,1,0,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_sld( v1a, v0a, 4 ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(3,1,0,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_sld( v0a, v0a, 12 ); + return vec_mergel( v1a, v2a ); + } + + else if (i == SWZ_MASK(0,2,0,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 
v2a = vec_splat( v0a, 0 ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(1,2,0,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v2a, 12 ); + } + + else if (i == SWZ_MASK(2,2,0,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v1a, v0a, 4 ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(3,2,0,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(0,3,0,2)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v2a, v2a, 8 ); + } + + else if (i == SWZ_MASK(1,3,0,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v2a, v2a, 8 ); + } + + else if (i == SWZ_MASK(2,3,0,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v1a, v2a, 4 ); + } + + else if (i == SWZ_MASK(3,3,0,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v2a, v2a, 4 ); + } + + else if (i == SWZ_MASK(0,0,1,2)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + return vec_sld( v1a, v0a, 12 ); + } + + else if (i == SWZ_MASK(1,0,1,2)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + return vec_sld( v1a, v0a, 12 ); + } + + else if (i == SWZ_MASK(2,0,1,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + return vec_sld( v1a, v0a, 12 ); + } + + else if (i == SWZ_MASK(3,0,1,2)) + { + return vec_sld( v0a, v0a, 12 ); + } + + else if (i == SWZ_MASK(0,1,1,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v0a, v1a ); + } + + else if (i == SWZ_MASK(1,1,1,2)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v1a, v2a ); + } + + else if (i == SWZ_MASK(2,1,1,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 
); + vec_float4 v2a = vec_mergel( v1a, v1a ); + return vec_sld( v1a, v2a, 12 ); + } + + else if (i == SWZ_MASK(3,1,1,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergeh( v1a, v1a ); + return vec_sld( v0a, v2a, 12 ); + } + + else if (i == SWZ_MASK(0,2,1,2)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + return vec_mergeh( v0a, v1a ); + } + + else if (i == SWZ_MASK(1,2,1,2)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_mergel( v1a, v2a ); + } + + else if (i == SWZ_MASK(2,2,1,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v2a, v1a, 8 ); + } + + else if (i == SWZ_MASK(3,2,1,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v0a, v1a, 4 ); + return vec_sld( v2a, v2a, 8 ); + } + + else if (i == SWZ_MASK(0,3,1,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v0a, v1a, 12 ); + return vec_mergeh( v0a, v2a ); + } + + else if (i == SWZ_MASK(1,3,1,2)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v2a, v2a, 8 ); + } + + else if (i == SWZ_MASK(2,3,1,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + return vec_sld( v0a, v1a, 8 ); + } + + else if (i == SWZ_MASK(3,3,1,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v2a, v1a, 8 ); + } + + else if (i == SWZ_MASK(0,0,2,2)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(1,0,2,2)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + vec_float4 v3a = vec_mergel( v1a, v0a ); + return vec_mergeh( v2a, v3a ); + } + + else if (i == SWZ_MASK(2,0,2,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v1a, v0a, 4 ); + return vec_sld( v2a, v1a, 8 ); + } + + else if (i == 
SWZ_MASK(3,0,2,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(0,1,2,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(1,1,2,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + return vec_mergel( v1a, v1a ); + } + + else if (i == SWZ_MASK(2,1,2,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v1a, v2a ); + } + + else if (i == SWZ_MASK(3,1,2,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v0a, v2a, 12 ); + } + + else if (i == SWZ_MASK(0,2,2,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v1a, v2a, 12 ); + } + + else if (i == SWZ_MASK(1,2,2,2)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v2a, v1a ); + } + + else if (i == SWZ_MASK(2,2,2,2)) + { + return vec_splat( v0a, 2 ); + } + + else if (i == SWZ_MASK(3,2,2,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + return vec_sld( v0a, v1a, 12 ); + } + + else if (i == SWZ_MASK(0,3,2,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + vec_float4 v3a = vec_mergel( v1a, v0a ); + return vec_sld( v3a, v2a, 8 ); + } + + else if (i == SWZ_MASK(1,3,2,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v2a, v2a, 4 ); + } + + else if (i == SWZ_MASK(2,3,2,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + return vec_sld( v0a, v1a, 8 ); + } + + else if (i == SWZ_MASK(3,3,2,2)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + return vec_sld( v1a, v1a, 8 ); + } + + else if (i == SWZ_MASK(0,0,3,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 
0 ); + vec_float4 v3a = vec_sld( v0a, v1a, 12 ); + return vec_sld( v2a, v3a, 8 ); + } + + else if (i == SWZ_MASK(1,0,3,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + vec_float4 v3a = vec_sld( v2a, v0a, 8 ); + return vec_mergeh( v3a, v2a ); + } + + else if (i == SWZ_MASK(2,0,3,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_mergeh( v1a, v2a ); + } + + else if (i == SWZ_MASK(3,0,3,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(0,1,3,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_sld( v0a, v1a, 12 ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(1,1,3,2)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v0a, v0a, 12 ); + vec_float4 v3a = vec_mergel( v1a, v0a ); + return vec_mergel( v3a, v2a ); + } + + else if (i == SWZ_MASK(2,1,3,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + return vec_mergel( v0a, v1a ); + } + + else if (i == SWZ_MASK(3,1,3,2)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v1a, v2a ); + } + + else if (i == SWZ_MASK(0,2,3,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + vec_float4 v3a = vec_sld( v0a, v1a, 8 ); + return vec_sld( v2a, v3a, 12 ); + } + + else if (i == SWZ_MASK(1,2,3,2)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + return vec_sld( v0a, v1a, 4 ); + } + + else if (i == SWZ_MASK(2,2,3,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + return vec_mergel( v0a, v1a ); + } + + else if (i == SWZ_MASK(3,2,3,2)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(0,3,3,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v1a, v0a 
); + return vec_sld( v2a, v2a, 8 ); + } + + else if (i == SWZ_MASK(1,3,3,2)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + vec_float4 v2a = vec_splat( v0a, 1 ); + vec_float4 v3a = vec_sld( v1a, v1a, 8 ); + return vec_sld( v2a, v3a, 12 ); + } + + else if (i == SWZ_MASK(2,3,3,2)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + return vec_sld( v1a, v1a, 4 ); + } + + else if (i == SWZ_MASK(3,3,3,2)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(0,0,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(1,0,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v0a, v1a ); + return vec_sld( v2a, v2a, 8 ); + } + + else if (i == SWZ_MASK(2,0,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v0a, v1a ); + vec_float4 v3a = vec_sld( v1a, v0a, 12 ); + return vec_mergeh( v3a, v2a ); + } + + else if (i == SWZ_MASK(3,0,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v1a, v1a ); + return vec_sld( v2a, v2a, 4 ); + } + + else if (i == SWZ_MASK(0,1,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_sld( v1a, v0a, 4 ); + return vec_sld( v2a, v2a, 4 ); + } + + else if (i == SWZ_MASK(1,1,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + vec_float4 v3a = vec_mergeh( v0a, v1a ); + return vec_sld( v2a, v3a, 8 ); + } + + else if (i == SWZ_MASK(2,1,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v0a, v1a ); + vec_float4 v3a = vec_sld( v2a, v1a, 8 ); + return vec_sld( v1a, v3a, 12 ); + } + + else if (i == SWZ_MASK(3,1,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v0a, v1a ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == 
SWZ_MASK(0,2,0,3)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + return vec_mergel( v1a, v0a ); + } + + else if (i == SWZ_MASK(1,2,0,3)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_sld( v1a, v0a, 4 ); + return vec_mergel( v2a, v0a ); + } + + else if (i == SWZ_MASK(2,2,0,3)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v1a, v0a, 4 ); + return vec_mergel( v2a, v0a ); + } + + else if (i == SWZ_MASK(3,2,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + return vec_mergel( v1a, v0a ); + } + + else if (i == SWZ_MASK(0,3,0,3)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(1,3,0,3)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v2a, 12 ); + } + + else if (i == SWZ_MASK(2,3,0,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v1a, v2a, 4 ); + } + + else if (i == SWZ_MASK(3,3,0,3)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(0,0,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(1,0,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + vec_float4 v3a = vec_sld( v1a, v0a, 8 ); + return vec_sld( v3a, v2a, 4 ); + } + + else if (i == SWZ_MASK(2,0,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_sld( v1a, v0a, 8 ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(3,0,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v1a, v2a, 4 ); + } + + else if (i == SWZ_MASK(0,1,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergeh( 
v0a, v0a ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(1,1,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_sld( v2a, v1a, 4 ); + } + + else if (i == SWZ_MASK(2,1,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + vec_float4 v3a = vec_sld( v2a, v1a, 8 ); + return vec_sld( v1a, v3a, 12 ); + } + + else if (i == SWZ_MASK(3,1,1,3)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + vec_float4 v3a = vec_sld( v0a, v1a, 8 ); + return vec_sld( v3a, v2a, 4 ); + } + + else if (i == SWZ_MASK(0,2,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + return vec_mergel( v1a, v0a ); + } + + else if (i == SWZ_MASK(1,2,1,3)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + return vec_mergel( v1a, v0a ); + } + + else if (i == SWZ_MASK(2,2,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v1a, v2a, 12 ); + } + + else if (i == SWZ_MASK(3,2,1,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v0a, v2a, 12 ); + } + + else if (i == SWZ_MASK(0,3,1,3)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + return vec_mergeh( v0a, v1a ); + } + + else if (i == SWZ_MASK(1,3,1,3)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_mergel( v1a, v2a ); + } + + else if (i == SWZ_MASK(2,3,1,3)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(3,3,1,3)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_mergel( v2a, v1a ); + } + + else if (i == SWZ_MASK(0,0,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 0 ); + return vec_sld( v2a, v1a, 8 ); + } + + else if (i == SWZ_MASK(1,0,2,3)) + { + 
vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_sld( v1a, v0a, 4 ); + return vec_sld( v2a, v1a, 8 ); + } + + else if (i == SWZ_MASK(2,0,2,3)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + vec_float4 v3a = vec_mergel( v2a, v1a ); + return vec_sld( v3a, v3a, 4 ); + } + + else if (i == SWZ_MASK(3,0,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(1,1,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 1 ); + return vec_sld( v2a, v1a, 12 ); + } + + else if (i == SWZ_MASK(2,1,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 2 ); + return vec_sld( v2a, v1a, 12 ); + } + + else if (i == SWZ_MASK(3,1,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + return vec_sld( v0a, v1a, 12 ); + } + + else if (i == SWZ_MASK(0,2,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v0a, v0a ); + return vec_sld( v1a, v2a, 12 ); + } + + else if (i == SWZ_MASK(1,2,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 12 ); + return vec_mergel( v1a, v0a ); + } + + else if (i == SWZ_MASK(2,2,2,3)) + { + vec_float4 v1a = vec_splat( v0a, 2 ); + return vec_mergel( v1a, v0a ); + } + + else if (i == SWZ_MASK(3,2,2,3)) + { + vec_float4 v1a = vec_mergel( v0a, v0a ); + return vec_sld( v0a, v1a, 12 ); + } + + else if (i == SWZ_MASK(0,3,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_sld( v0a, v1a, 12 ); + return vec_sld( v2a, v2a, 12 ); + } + + else if (i == SWZ_MASK(1,3,2,3)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_sld( v0a, v0a, 4 ); + return vec_mergeh( v2a, v1a ); + } + + else if (i == SWZ_MASK(2,3,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + return vec_sld( v0a, v1a, 8 ); + } + + else if (i == SWZ_MASK(3,3,2,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + 
vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v2a, v1a, 8 ); + } + + else if (i == SWZ_MASK(0,0,3,3)) + { + vec_float4 v1a = vec_splat( v0a, 0 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(1,0,3,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + vec_float4 v3a = vec_sld( v1a, v0a, 4 ); + return vec_sld( v3a, v2a, 8 ); + } + + else if (i == SWZ_MASK(2,0,3,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_sld( v2a, v2a, 4 ); + } + + else if (i == SWZ_MASK(3,0,3,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(0,1,3,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(1,1,3,3)) + { + vec_float4 v1a = vec_splat( v0a, 1 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v1a, v2a, 8 ); + } + + else if (i == SWZ_MASK(2,1,3,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_mergel( v1a, v0a ); + return vec_mergel( v0a, v2a ); + } + + else if (i == SWZ_MASK(3,1,3,3)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + vec_float4 v2a = vec_mergeh( v1a, v0a ); + return vec_sld( v2a, v1a, 8 ); + } + + else if (i == SWZ_MASK(0,2,3,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_mergel( v0a, v1a ); + return vec_sld( v1a, v2a, 12 ); + } + + else if (i == SWZ_MASK(1,2,3,3)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + return vec_sld( v0a, v1a, 4 ); + } + + else if (i == SWZ_MASK(2,2,3,3)) + { + return vec_mergel( v0a, v0a ); + } + + else if (i == SWZ_MASK(3,2,3,3)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + return vec_mergel( v1a, v0a ); + } + + else if (i == SWZ_MASK(0,3,3,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 4 ); + vec_float4 v2a = vec_splat( 
v0a, 3 ); + return vec_sld( v1a, v2a, 12 ); + } + + else if (i == SWZ_MASK(1,3,3,3)) + { + vec_float4 v1a = vec_sld( v0a, v0a, 8 ); + vec_float4 v2a = vec_splat( v0a, 3 ); + return vec_sld( v1a, v2a, 12 ); + } + else if (i == SWZ_MASK(2,3,3,3)) + { + vec_float4 v1a = vec_splat( v0a, 3 ); + return vec_sld( v0a, v1a, 8 ); + } + else if (i == SWZ_MASK(3,3,3,3)) + { + return vec_splat( v0a, 3 ); + } + else if (i == SWZ_MASK(0,1,2,3)) + { + return v0a; + } + return v0a; +#endif +} + +#else + +MECANIM_FORCE_INLINE vec4f __vpermwi3(vec4f v0a, const unsigned int mask) +{ + + unsigned int w = (mask >> 0 ) & 0xff; + unsigned int z = (mask >> 8 ) & 0xff; + unsigned int y = (mask >> 16 ) & 0xff; + unsigned int x = (mask >> 24 ) & 0xff; + + vec_uint4 wmask = V4BuildPermuteMask(x,y,z,w); + vec_float4 v = vec_perm(v0a, v0a, (vec_uchar16)wmask); + return v; + +} +#endif + + + +MECANIM_FORCE_INLINE vec4f Vmove(vec4f l, vec4f r) + { + static const vec_uint4 vu32 = {0xFFFFFFFF,0,0,0}; + vec_float4 v = __vsel(l, r, vu32); + return v; + } + +template<int SWZ> struct Vswizzle +{ + static inline vec4f rhs(vec4f r) + { + return Vpermute(r, SWZ); + } + + static inline vec4f lhs(vec4f l, vec4f r) + { + vec4f m = Vmove(Vswizzle<SWZ>::rhs(l), r); + vec4f v = Vswizzle<SWZ>::rhs(m); + return v; + } +}; +template<> struct Vswizzle<kXYZW> +{ + static inline vec4f rhs(vec4f r) + { + return r; + } + static inline vec4f lhs(vec4f l, vec4f r) + { + return Vmove(l, r); + } +}; + +MECANIM_FORCE_INLINE float Vstoresf(vec4f r) +{ + float f; + vec_ste(__vspltw(r, 0), 0, &f); + return f; +} + +MECANIM_FORCE_INLINE bool Vstoresb(vec4b r) +{ + r=__vspltw(r, 0); + return !vec_all_eq((vec4f)r, Vzero()); +} + +// Aligned store +#define Vstorepf(v, base, offset) __stvx((v), (base), (offset)) + +MECANIM_FORCE_INLINE void Vstorepb(vec4b v, bool* r) +{ + union { + vec4b v; + int i[4]; + } a; a.v = v; + r[0] = a.i[0] != 0; + r[1] = a.i[1] != 0; + r[2] = a.i[2] != 0; + r[3] = a.i[3] != 0; +} + +MECANIM_FORCE_INLINE 
vec4f Vloadsf(float s) +{ + vec4f v = {s,s,s,s}; + return v; +} + +MECANIM_FORCE_INLINE vec4b Vloadsb(bool s) +{ + vec4b vTrue = (vec4b)vec_splat_u32(0xffffffff); + + return s ? vTrue : (vec4b)Vzero(); +} + +MECANIM_FORCE_INLINE vec4f Vload4sf(float x, float y, float z, float w) +{ + vec4f v = {x,y,z,w}; + return v; +} + +static MECANIM_FORCE_INLINE vec4b Vload4sb(bool x, bool y, bool z, bool w) +{ + static const unsigned int false_true[2] = {0,~0}; + + vec4b v = (vec4b)(vec_uint4) { false_true[x], false_true[y], false_true[z], false_true[w] }; + return v; +} + +#define Vloadpf(v, offset) __lvx((v), (offset)) + +#define Vadd(l, r) __vaddfp((l), (r)) + +#define Vsub( l, r) __vsubfp((l), (r)) + +#define Vmul( l, r) __vmulfp((l), (r)) + +MECANIM_FORCE_INLINE vec4f Vrcp(vec4f r) +{ + // This function does two iterations of Newton's method! (taken from XMVector) + vec_float4 Reciprocal = vec_re(r); + + // First refinement iteration (Newton-Raphson) for 1.0 / x + // y0 = reciprocal_estimate(x) + // y1 = y0 + y0 * (1.0 - x * y0) + + vec_float4 vone = Vone(); + vec_float4 Scale = vec_nmsub(r, Reciprocal, vone); + vec_float4 Result = vec_madd(Reciprocal, Scale, Reciprocal); + + // Second refinement iteration + // y2 = y1 + y1 * (1.0 - x * y1) + + Scale = vec_nmsub(r, Result, vone); + vec_bint4 Refine = vec_cmpeq(Result, Result); + Result = vec_madd(Result, Scale, Result); + return (vec_sel(Reciprocal, Result, Refine)); +} + +MECANIM_FORCE_INLINE vec4f Vdiv(vec4f l, vec4f r) +{ + // This function does two iterations of Newton's method! 
+ return Vmul(l, Vrcp(r)); +} + +#define Vmadd( a, b, c) __vmaddfp((a), (b), (c)) + +#define Vmsub( a, b, c) Vneg(__vnmsubfp((a), (b), (c))) + +#define Vneg(r) __vxor( (r), __vsignedzero) + + +// vector sgn: return -1, 1 +#define Vsgn(r) __vor(Vone(), __vand(__vsignedzero, (r) )) + +// vector sgn: return -1, 0, 1 +static MECANIM_FORCE_INLINE vec4f Vsign(vec4f r) +{ + vec4f c = (vec4f)__vcmpeqfp(r, Vzero()); + return __vor( __vand(vec_nor(c,c), Vone()), __vand(__vsignedzero, r )); +} + +#define Vinc(r) Vadd( (r), Vone()) +#define Vdec(r) Vsub( (r), Vone()) +#define Vabs(r) __vandc((r), __vsignedzero) +#define Vmax( l, r) __vmaxfp((l), (r)) +#define Vmin( l, r) __vminfp((l), (r)) + +MECANIM_FORCE_INLINE vec4f Vlargest(vec4f r) +{ + r = Vmax(r, Vswizzle<kYZWX>::rhs(r)); + r = Vmax(r, Vswizzle<kZWXY>::rhs(r)); + return r; +} + +MECANIM_FORCE_INLINE vec4f Vsmallest(vec4f r) +{ + r = Vmin(r, Vswizzle<kYZWX>::rhs(r)); + r = Vmin(r, Vswizzle<kZWXY>::rhs(r)); + return r; +} + +MECANIM_FORCE_INLINE vec4f Vsum(vec4f r) +{ + r = Vadd(r, Vswizzle<kYZWX>::rhs(r) ); + r = Vadd(r, Vswizzle<kZWXY>::rhs(r) ); + return Vswizzle<kXXXX>::rhs(r); +} + +#define Vdot( l, r) __vmsum4fp( (l), (r) ) + +MECANIM_FORCE_INLINE vec4f Vrsqrt(vec4f r) +{ + static const vec4f three = {3.f,3.f,3.f,3.f}; + static const vec4f a = {0.5f,0.5f,0.5f,0.5f}; + + vec4f const e = __vrsqrtefp(r); + return Vmul( Vmul(e, Vsub(three, Vmul( Vmul(e,e),r))), a); +} + + +#define Vsqrt(r) __vsel( Vmul(r, Vrsqrt(r)), Vzero(), __vcmpeqfp(r, Vzero())) + +#define Vcombine(x,y,z,w) __vmrghw(__vmrghw((x), (z)), __vmrghw((y), (w))) + +// Vector comparison +#define Vcmpeq( a, b) (vec_bint4)__vcmpeqfp((a), (b)) + +MECANIM_FORCE_INLINE vec4b Vcmpneq( vec4f a, vec4f b) +{ + vec4f c = (vec4f)Vcmpeq(a, b); + return (vec4b)__vnor(c, c); +} + +#define Vcmpgt( a, b) (vec_bint4)__vcmpgtfp((a), (b)) +#define Vcmpge( a, b) (vec_bint4)__vcmpgefp((a), (b)) +#define Vcmplt( a, b) (vec_bint4)__vcmpgtfp((b), (a)) +#define Vcmple( a, b) 
(vec_bint4)__vcmpgefp((b), (a)) + +#define Vsel( c, a, b) __vxor(b, __vand(__vxor(a, b), c)) + +// vector logics +#define Vnot(r) __vnor( (r), (r) ) +#define Vxnor( a, b) Vnot(__vxor((a), (b))) +#define Vxor( a, b) __vxor((a), (b)) +#define Vand( a, b) __vand((a), (b)) +#define Vor( a, b) __vor((a), (b)) + +MECANIM_FORCE_INLINE bool Vall(vec4b a) +{ + return vec_all_ne((vec4f)a, Vzero()); +} + +MECANIM_FORCE_INLINE bool Vany(vec4b a) +{ + // Not all words equal to 0 + return vec_any_ne((vec4f)a, Vzero()); +} + +#endif diff --git a/Runtime/Math/Simd/quaternion.h b/Runtime/Math/Simd/quaternion.h new file mode 100644 index 0000000..ccb5077 --- /dev/null +++ b/Runtime/Math/Simd/quaternion.h @@ -0,0 +1,253 @@ +#ifndef SIMD_QUATERNION_H +#define SIMD_QUATERNION_H + +#include "Runtime/Math/Simd/math.h" + +namespace math +{ + +static inline float4 quatIdentity() +{ + cvec4f(id, 0,0,0,1); + return float4(id); +} + +static inline float4 quatConj(float4 const& q) +{ + cvec4f(conj, -1,-1,-1,1); + return float4(Vmul(q.eval(), conj)); +} + +static inline float4 quatMul(float4 const& a, float4 const& b) +{ + return quatConj(a.zxyw()*b.yzxw() - a.ywzx()*b.zywx() - a.wyxz()*b.xwyz() - a.xzwy()*b.wxzy()); +} + +static inline float4 quatMulVec(float4 const& q, float4 const& v1) +{ + const float4 v = math::vector(v1 + v1); + const float4 qv = q*v; + return q.w()*(cross(q, v) + q.w()*v) + q*(qv + qv.yzxw() + qv.zxyw()) + quatConj(v1); +} + +static inline float4 quatLerp(float4 const& p, float4 const& q, float1 const& blend) +{ + return normalize(p + blend*(q*sgn(dot(p, q)) - p)); +} + +static inline float4 quatArcRotate(float4 const& a, float4 const& b) +{ + float4 q = cross(a, b); + q.w() = dot(a, b) + math::sqrt( float1(dot(a)*dot(b)) ); + return q; +} + +static inline float4 quatArcRotateX(float4 const& n) +{ + return float4(float1::zero(),-n.z(),n.y(),n.x()+float1::one()); +} + +static inline float4 quatXcos(float4 const &qn) +{ + const float4 qw = qn.w()*qn - float4(0, 0, 0, .5f); 
+ const float4 u = qn.x()*qn + float4(1, 1, -1, -1) * qw.wzyx(); + return u + u; +} + +static inline float4 quatYcos(float4 const &qn) +{ + const float4 qw = qn.w()*qn - float4(0, 0, 0, .5f); + const float4 v = qn.y()*qn + float4(-1, 1, 1, -1)*qw.zwxy(); + return v + v; +} + +static inline float4 quatZcos(float4 const &qn) +{ + const float4 qw = qn.w()*qn - float4(0, 0, 0, .5f); + const float4 w = qn.z()*qn + float4(1, -1, 1, -1)*qw.yxwz(); + + return w + w; +} + +static inline float4 quatEulerToQuat(float4 const& euler) +{ + float4 s, c; sincos( float1(0.5f)*euler, s, c); + + const float4 t = float4(s.x()*c.z(), s.x()*s.z(), c.x()*s.z(), c.x()*c.z()); + + constant_float4( mask, -1.f, 1.f, -1.f, 1.f); + return c.y()*t + s.y()*mask*t.zwxy(); +} + +static inline float4 quatQuatToEuler(float4 const& q) +{ + float4 euler; + + const float4 x = q.x()*q; + const float4 y = q.y()*q; + const float1 discr = x.z() - y.w(); + + if(discr >= float1(.5f - M_EPSF)) + { + float1 _y = x.w() - y.z(); + float1 _x = -x.z() - y.w(); + + euler = float4( atan2( _y.tofloat(), _x.tofloat() ), -M_PI_2f, 0.f, 0.f); + } + else + { + const float4 w = q.wwwz()*q.wwzz() - float4(.5f, 0.f, 0.f, 0.f); + if(discr <= float1(M_EPSF - .5f)) + { + float1 _y = x.y() - w.z(); + float1 _x = y.y() + w.x(); + euler = float4( atan2( _y.tofloat(), _x.tofloat() ), M_PI_2f, 0.f, 0.f); + } + else + { + float1 _yX = x.w() + y.z(); + float1 _xX = w.w() + w.x(); + float1 discr2 = discr + discr; + float1 _yZ = x.y() + w.z(); + float1 _xZ = x.x() + w.x(); + + euler = float4( atan2( _yX.tofloat(), _xX.tofloat() ), -asin( discr2.tofloat() ), atan2( _yZ.tofloat(), _xZ.tofloat() ), 0.f); + } + } + return euler; +} + +// get unit quaternion from rotation matrix +static inline float4 quatMatrixToQuat(float4 const& u, float4 const& v, float4 const& w) +{ + float4 q; + if(u.x() >= float1::zero()) + { + const float1 t = v.y() + w.z(); + if(t >= float1::zero()) + { + float1 x(v.z() - w.y()); + float1 y(w.x() - u.z()); + float1 
z(u.y() - v.x()); + float1 ww(float1::one() + u.x() + t); + q = float4(x, y, z, ww); + // Android doesn't like this expression, it does generate the wrong assembly + //q = float4(v.z() - w.y(), w.x() - u.z(), u.y() - v.x(), float1::one() + u.x() + t); + } + else + { + float1 x(float1::one() + u.x() - t); + float1 y(u.y() + v.x()); + float1 z(w.x() + u.z()); + float1 ww(v.z() - w.y()); + q = float4(x, y, z, ww); + // Android doesn't like this expression, it does generate the wrong assembly + //q = float4(float1::one() + u.x() - t, u.y() + v.x(), w.x() + u.z(), v.z() - w.y()); + } + } + else + { + const float1 t = v.y() - w.z(); + if(t >= float1::zero()) + { + float1 x(u.y() + v.x()); + float1 y(float1::one() - u.x() + t); + float1 z(v.z() + w.y()); + float1 ww(w.x() - u.z()); + q = float4(x, y, z, ww); + // Android doesn't like this expression, it does generate the wrong assembly + //q = float4(u.y() + v.x(), float1::one() - u.x() + t, v.z() + w.y(), w.x() - u.z()); + } + else + { + float1 x(w.x() + u.z()); + float1 y(v.z() + w.y()); + float1 z(float1::one() - u.x() - t); + float1 ww(u.y() - v.x()); + q = float4(x, y, z, ww); + // Android doesn't like this expression, it does generate the wrong assembly + //q = float4(w.x() + u.z(), v.z() + w.y(), float1::one() - u.x() - t, u.y() - v.x()); + } + } + return normalize(q); +} + +static inline float4 quatProjOnYPlane(float4 const& q) +{ + constant_float4(yMask, 0,1,0,0); + constant_float4(ywMask, 0,1,0,1); + + const float4 lQAlignUp = quatArcRotate(quatYcos(q), yMask ); + + return normalize( quatMul(lQAlignUp,q) * ywMask); +} + +static inline float4 quatClamp(const float4 &q,float maxAngle) +{ + float4 ret = q; + + float1 halfCosMaxAnlge = float1(math::cos(0.5f*maxAngle)); + + float4 qn = normalize(q); + qn = cond(qn.w() < float1::zero(), -qn, qn); + + if(qn.w() < halfCosMaxAnlge) + { + float1 fact = (halfCosMaxAnlge - qn.w()) / halfCosMaxAnlge; + + ret = qn * (float1::one() - fact); + ret.w() = lerp(halfCosMaxAnlge, 
float1::one(), fact); + } + + return ret; +} + +static inline float4 quatWeight(float4 const& q, float1 const& w) +{ + return normalize(float4(q.x()*w,q.y()*w,q.z()*w,q.w())); +} + +static inline float4 quat2Qtan(float4 const& q) +{ + float1 w = q.w(); + w = cond(w == float1::zero(), float1(M_EPSF), w); + return math::vector(q/w); +} + +static inline float4 qtan2Quat(float4 const& q) +{ + float4 qn = q; + qn.w() = float1::one(); + return normalize(qn); +} + +static inline float4 ZYRoll2Quat(float4 const& zyroll) +{ + return normalize(float4(zyroll.x(),zyroll.y()+zyroll.x()*zyroll.z(),zyroll.z()-zyroll.x()*zyroll.y(), float1::one())); +} + +static inline float4 quat2ZYRoll(float4 const& q) +{ + const float4 qtan = quat2Qtan(q); + const float1 qtanx = qtan.x(); + const float1 x2p1 = float1::one()+qtanx*qtanx; + return float4(qtanx,(qtan.y()-qtanx*qtan.z())/x2p1,(qtan.z()+qtanx*qtan.y())/x2p1,float1::zero()); +} + +static inline float4 RollZY2Quat(float4 const& zyroll) +{ + return normalize(float4(zyroll.x(),zyroll.y()-zyroll.x()*zyroll.z(),zyroll.z()+zyroll.x()*zyroll.y(),float1::one())); +} + +static inline float4 quat2RollZY(float4 const& q) +{ + const float4 qtan = quat2Qtan(q); + const float1 qtanx = qtan.x(); + const float1 x2p1 = float1::one()+qtanx*qtanx; + return float4(qtanx,(qtan.y()+qtanx*qtan.z())/x2p1,(qtan.z()-qtanx*qtan.y())/x2p1,float1::zero()); +} + +} + +#endif + diff --git a/Runtime/Math/Simd/sse.h b/Runtime/Math/Simd/sse.h new file mode 100644 index 0000000..59796d7 --- /dev/null +++ b/Runtime/Math/Simd/sse.h @@ -0,0 +1,237 @@ +#ifndef SIMD_SSE_H +#define SIMD_SSE_H + +#include <xmmintrin.h> +#include <emmintrin.h> + +typedef __m128 vec4f; // vector 4 packed +typedef __m128 vec4fs; // vector 4 scalar +typedef __m128 vec4b; // vector 4 bool packed +typedef __m128 vec4bs; // vector 4 bool scalar + +#define SWZ_MASK(x, y, z, w) _MM_SHUFFLE(w,z,y,x) + +#define cvec4f(name, x,y,z,w) static const vec4f name = {x,y,z,w} +#define cvec4b(name, x,y,z,w) 
static const vec4b name = {x,y,z,w} +#define cvec4fs(name, s) static const vec4f name = {s,s,s,s} + +enum simd_mask +{ + kXYZW = SWZ_MASK(0,1,2,3), + kXXXX = SWZ_MASK(0,0,0,0), + kYYYY = SWZ_MASK(1,1,1,1), + kZZZZ = SWZ_MASK(2,2,2,2), + kWWWW = SWZ_MASK(3,3,3,3), + + kXWYZ = SWZ_MASK(0,3,1,2), + kXZWY = SWZ_MASK(0,2,3,1), + + kYZWX = SWZ_MASK(1,2,3,0), + kYXZW = SWZ_MASK(1,0,2,3), + kYWZX = SWZ_MASK(1,3,2,0), + kYZXW = SWZ_MASK(1,2,0,3), + kYXWZ = SWZ_MASK(1,0,3,2), + + kZWXY = SWZ_MASK(2,3,0,1), + kZYXW = SWZ_MASK(2,1,0,3), + kZYWX = SWZ_MASK(2,1,3,0), + kZXYW = SWZ_MASK(2,0,1,3), + + kWYZX = SWZ_MASK(3,1,2,0), + kWXZY = SWZ_MASK(3,0,2,1), + kWYXZ = SWZ_MASK(3,1,0,2), + kWWWZ = SWZ_MASK(3,3,3,2), + kWWZZ = SWZ_MASK(3,3,2,2), + kWZYX = SWZ_MASK(3,2,1,0), +}; + +#define Vzero() _mm_setzero_ps() + +#define Vone() _mm_set1_ps(1.f) + +#define Vpermute(v, mask) _mm_shuffle_ps( (v), (v), (mask) ) + +#define Vmove(l, r) _mm_move_ss( (l), (r) ) + + +template<int SWZ> struct Vswizzle +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) + { + return Vpermute(r, SWZ); + } + + static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r) + { + vec4f m = Vmove(Vswizzle<SWZ>::rhs(l), r); + return Vswizzle<SWZ>::rhs(m); + } +}; + +template<> struct Vswizzle<kXYZW> +{ + static MECANIM_FORCE_INLINE vec4f rhs(vec4f r) + { + return r; + } + static MECANIM_FORCE_INLINE vec4f lhs(vec4f l, vec4f r) + { + return Vmove(l, r); + } +}; + +static MECANIM_FORCE_INLINE float Vstoresf(vec4f r) +{ + float f; _mm_store_ss(&f, r); return f; +} + +#define Vstoresb(r) ( _mm_movemask_ps((r)) & 0x1<<0) != 0 + +// Aligned store +#define Vstorepf(v, base, offset) _mm_store_ps((base)+(offset), (v)) + +static MECANIM_FORCE_INLINE void Vstorepb(vec4f v, bool* r) +{ + r[0] = ( _mm_movemask_ps(v) & 0x1<<0) != 0; + r[1] = ( _mm_movemask_ps(v) & 0x1<<1) != 0; + r[2] = ( _mm_movemask_ps(v) & 0x1<<2) != 0; + r[3] = ( _mm_movemask_ps(v) & 0x1<<3) != 0; +} + +#define Vloadsf(s) _mm_set1_ps(s) + +static 
MECANIM_FORCE_INLINE vec4f Vloadsb(bool s) +{ + union { + int b[4]; + vec4f v; + } static const false_true[2] = { + {0,0,0,0}, + {~0,~0,~0,~0} + }; + + return false_true[s].v; +} + +#define Vload4sf(x, y, z, w) _mm_set_ps(w, z, y, x) + +static MECANIM_FORCE_INLINE vec4f Vload4sb(bool x, bool y, bool z, bool w) +{ + union { + int b; + float f; + } static const false_true[2] = { + 0,~0 + }; + + return _mm_set_ps(false_true[w].f, false_true[z].f, false_true[y].f, false_true[x].f); +} + +#define Vloadpf(v, offset) _mm_load_ps( (v)+(offset)) + +#define Vadd(l, r) _mm_add_ps((l), (r)) + +#define Vsub( l, r) _mm_sub_ps((l), (r)) + +#define Vmul( l, r) _mm_mul_ps((l), (r)) + +#define Vdiv( l, r) _mm_div_ps((l), (r)) + +#define Vmadd( a, b, c) _mm_add_ps(_mm_mul_ps((a), (b)), (c)) + +#define Vmsub( a, b, c) _mm_sub_ps(_mm_mul_ps((a), (b)), (c)) + + +static MECANIM_FORCE_INLINE vec4f Vneg(vec4f r) +{ + static const vec4f sign_constant = {-0.f,-0.f,-0.f,-0.f}; + return _mm_xor_ps( (r), sign_constant); +} + +// vector sgn: return -1, 1 +static MECANIM_FORCE_INLINE vec4f Vsgn(vec4f r) +{ + static const vec4f sign_constant = {-0.f,-0.f,-0.f,-0.f}; + return _mm_or_ps(Vone(), _mm_and_ps(sign_constant, (r) )); +} + +// vector sgn: return -1, 0, 1 +static MECANIM_FORCE_INLINE vec4f Vsign(vec4f r) +{ + static const vec4f sign_constant = {-0.f,-0.f,-0.f,-0.f}; + return _mm_or_ps( _mm_and_ps( _mm_cmpneq_ps(r, Vzero()), Vone()), _mm_and_ps(sign_constant, r )); +} + +#define Vinc(r) Vadd( (r), Vone()) +#define Vdec(r) Vsub( (r), Vone()) + +static MECANIM_FORCE_INLINE vec4f Vabs(vec4f r) +{ + static const vec4f sign_constant = {-0.f,-0.f,-0.f,-0.f}; + return _mm_andnot_ps(sign_constant, (r)); +} + +#define Vmax( l, r) _mm_max_ps((l), (r)) +#define Vmin( l, r) _mm_min_ps((l), (r)) + +static MECANIM_FORCE_INLINE vec4fs Vlargest(vec4f r) +{ + r = Vmax(r, Vswizzle<kYZWX>::rhs(r)); + r = Vmax(r, Vswizzle<kZWXY>::rhs(r)); + return r; +} + +static MECANIM_FORCE_INLINE vec4fs Vsmallest(vec4f r) 
+{ + r = Vmin(r, Vswizzle<kYZWX>::rhs(r)); + r = Vmin(r, Vswizzle<kZWXY>::rhs(r)); + return r; +} + +static MECANIM_FORCE_INLINE vec4fs Vsum(vec4f r) +{ + r = Vadd(r, Vswizzle<kYZWX>::rhs(r) ); + r = Vadd(r, Vswizzle<kZWXY>::rhs(r) ); + return Vswizzle<kXXXX>::rhs(r); +} + +#define Vdot( l, r) Vsum( Vmul((l), (r)) ) +#define Vsqrt(r) _mm_sqrt_ps((r)) + +static MECANIM_FORCE_INLINE vec4f Vrsqrt(vec4f r) +{ + vec4f const e = _mm_rsqrt_ps(r); + return Vmul(Vmul(e, Vsub(_mm_set1_ps(3.0f), Vmul(Vmul(e,e),r))), _mm_set1_ps(.5f)); +} + +static MECANIM_FORCE_INLINE vec4f Vrcp(vec4f r) +{ + vec4f e = _mm_rcp_ps( r ); + return Vsub( Vadd(e, e), Vmul(r, Vmul(e, e))); +} + +#define Vcombine(x,y,z,w) _mm_movelh_ps(_mm_unpacklo_ps( (x), (y) ), _mm_unpacklo_ps((z), (w))) + +// Vector comparison +#define Vcmpeq( a, b) _mm_cmpeq_ps((a), (b)) +#define Vcmpneq( a, b) _mm_cmpneq_ps((a), (b)) +#define Vcmpgt( a, b) _mm_cmpgt_ps((a), (b)) +#define Vcmpge( a, b) _mm_cmpge_ps((a), (b)) +#define Vcmplt( a, b) _mm_cmplt_ps((a), (b)) +#define Vcmple( a, b) _mm_cmple_ps((a), (b)) + +static MECANIM_FORCE_INLINE vec4f Vsel( vec4f c, vec4f a, vec4f b) +{ + return _mm_xor_ps(b, _mm_and_ps(_mm_xor_ps(a, b), c)); +} + +// vector logics +#define Vnot(r) _mm_cmpeq_ps( (r), Vzero() ) +#define Vxnor( a, b) Vnot(_mm_xor_ps((a), (b))) +#define Vxor( a, b) _mm_xor_ps((a), (b)) +#define Vand( a, b) _mm_and_ps((a), (b)) +#define Vor( a, b) _mm_or_ps((a), (b)) +#define Vall(a) (_mm_movemask_ps((a)) & 0xf) == 0xf +#define Vany( a) _mm_movemask_ps((a)) != 0 + +#endif diff --git a/Runtime/Math/Simd/xenon.h b/Runtime/Math/Simd/xenon.h new file mode 100644 index 0000000..5ddfc68 --- /dev/null +++ b/Runtime/Math/Simd/xenon.h @@ -0,0 +1,275 @@ +#ifndef SIMD_XENON +#define SIMD_XENON + +#include <vectorintrinsics.h> +#include <xnamath.h> + +typedef __vector4 vec4f; // vector 4 packed +typedef __vector4 vec4fs; // vector 4 scalar +typedef __vector4 vec4b; // vector 4 bool packed +typedef __vector4 vec4bs; // vector 4 
bool scalar + +#define SWZ_MASK(x, y, z, w) VPERMWI_CONST(x, y, z, w) + +#define cvec4f(name, x,y,z,w) static const vec4f name = {(x),(y),(z),(w)} +#define cvec4b(name, x,y,z,w) static const vec4b name = {(x),(y),(z),(w)} +#define cvec4fs(name, s) static const vec4fs name = {(s),(s),(s),(s)} + +enum simd_mask +{ + kXYZW = SWZ_MASK(0,1,2,3), + kXXXX = SWZ_MASK(0,0,0,0), + kYYYY = SWZ_MASK(1,1,1,1), + kZZZZ = SWZ_MASK(2,2,2,2), + kWWWW = SWZ_MASK(3,3,3,3), + + kXWYZ = SWZ_MASK(0,3,1,2), + kXZWY = SWZ_MASK(0,2,3,1), + + kYZWX = SWZ_MASK(1,2,3,0), + kYXZW = SWZ_MASK(1,0,2,3), + kYWZX = SWZ_MASK(1,3,2,0), + kYZXW = SWZ_MASK(1,2,0,3), + kYXWZ = SWZ_MASK(1,0,3,2), + + kZWXY = SWZ_MASK(2,3,0,1), + kZYXW = SWZ_MASK(2,1,0,3), + kZYWX = SWZ_MASK(2,1,3,0), + kZXYW = SWZ_MASK(2,0,1,3), + + kWYZX = SWZ_MASK(3,1,2,0), + kWXZY = SWZ_MASK(3,0,2,1), + kWYXZ = SWZ_MASK(3,1,0,2), + kWWWZ = SWZ_MASK(3,3,3,2), + kWWZZ = SWZ_MASK(3,3,2,2), + kWZYX = SWZ_MASK(3,2,1,0), +}; + +#define Vzero() __vzero() +#define Vone() __vupkd3d(__vspltisw(0), VPACK_D3DCOLOR) + +#define Vpermute(v, mask) __vpermwi( (v), (mask) ) + + MECANIM_FORCE_INLINE vec4f Vmove(vec4f l, vec4f r) + { + static const XMVECTORU32 vu32 = {0xFFFFFFFF,0,0,0}; + return __vsel(l, r, vu32.v); + } + +template<int SWZ> struct Vswizzle +{ + static inline vec4f rhs(vec4f r) + { + return Vpermute(r, SWZ); + } + + static inline vec4f lhs(vec4f l, vec4f r) + { + vec4f m = Vmove(Vswizzle<SWZ>::rhs(l), r); + return Vswizzle<SWZ>::rhs(m); + } +}; +template<> struct Vswizzle<kXYZW> +{ + static inline vec4f rhs(vec4f r) + { + return r; + } + static inline vec4f lhs(vec4f l, vec4f r) + { + return Vmove(l, r); + } +}; + +MECANIM_FORCE_INLINE float Vstoresf(vec4f r) +{ + float f; __stvewx(__vspltw(r, 0), &f, 0); return f; +} + +MECANIM_FORCE_INLINE bool Vstoresb(vec4f r) +{ + union { + vec4f v; + int i[4]; + } a; a.v = r; + return a.i[0] != 0; +} + +// Aligned store +#define Vstorepf(v, base, offset) __stvx((v), (base), (offset)) + 
+MECANIM_FORCE_INLINE void Vstorepb(vec4f v, bool* r) +{ + union { + vec4f v; + int i[4]; + } a; a.v = v; + r[0] = a.i[0] != 0; + r[1] = a.i[1] != 0; + r[2] = a.i[2] != 0; + r[3] = a.i[3] != 0; +} + +MECANIM_FORCE_INLINE vec4f Vloadsf(float s) +{ + vec4f v = {s,s,s,s}; + return v; +} + +MECANIM_FORCE_INLINE vec4f Vloadsb(bool s) +{ + union { + int b[4]; + vec4f v; + } static const false_true[2] = { + {0,0,0,0}, + {~0,~0,~0,~0} + }; + + return false_true[s].v; +} + +MECANIM_FORCE_INLINE vec4f Vload4sf(float x, float y, float z, float w) +{ + vec4f v = {x,y,z,w}; + return v; +} + +static MECANIM_FORCE_INLINE vec4f Vload4sb(bool x, bool y, bool z, bool w) +{ + union { + int b; + float f; + } static const false_true[2] = { + 0,~0 + }; + + vec4f v = {false_true[x].f,false_true[y].f,false_true[z].f,false_true[w].f}; + return v; +} + +#define Vloadpf(v, offset) __lvx( (v), (offset)) + +#define Vadd(l, r) __vaddfp((l), (r)) + +#define Vsub( l, r) __vsubfp((l), (r)) + +#define Vmul( l, r) __vmulfp((l), (r)) + +MECANIM_FORCE_INLINE vec4f Vrcp(vec4f r) +{ + // This function does two iterations of Newton's method! + return XMVectorReciprocal(r); +} + +MECANIM_FORCE_INLINE vec4f Vdiv(vec4f l, vec4f r) +{ + // This function does two iterations of Newton's method! 
+ return XMVectorDivide(l, r); +} + +#define Vmadd( a, b, c) __vmaddfp((a), (b), (c)) + +#define Vmsub( a, b, c) Vneg(__vnmsubfp((a), (b), (c))) + +static const vec4f sign_constant = {-0.f,-0.f,-0.f,-0.f}; + +#define Vneg(r) __vxor( (r), sign_constant) + + +// vector sgn: return -1, 1 +#define Vsgn(r) __vor(Vone(), __vand(sign_constant, (r) )) + +// vector sgn: return -1, 0, 1 +static MECANIM_FORCE_INLINE vec4f Vsign(vec4f r) +{ + vec4f c = __vcmpeqfp(r, Vzero()); + return __vor( __vand(__vnor(c,c), Vone()), __vand(sign_constant, r )); +} + +#define Vinc(r) Vadd( (r), Vone()) +#define Vdec(r) Vsub( (r), Vone()) +#define Vabs(r) __vandc((r), sign_constant) +#define Vmax( l, r) __vmaxfp((l), (r)) +#define Vmin( l, r) __vminfp((l), (r)) + +MECANIM_FORCE_INLINE vec4fs Vlargest(vec4f r) +{ + r = Vmax(r, Vswizzle<kYZWX>::rhs(r)); + r = Vmax(r, Vswizzle<kZWXY>::rhs(r)); + return r; +} + +MECANIM_FORCE_INLINE vec4fs Vsmallest(vec4f r) +{ + r = Vmin(r, Vswizzle<kYZWX>::rhs(r)); + r = Vmin(r, Vswizzle<kZWXY>::rhs(r)); + return r; +} + +MECANIM_FORCE_INLINE vec4fs Vsum(vec4f r) +{ + r = Vadd(r, Vswizzle<kYZWX>::rhs(r) ); + r = Vadd(r, Vswizzle<kZWXY>::rhs(r) ); + return Vswizzle<kXXXX>::rhs(r); +} + +#define Vdot( l, r) __vmsum4fp( (l), (r) ) + +MECANIM_FORCE_INLINE vec4f Vrsqrt(vec4f r) +{ + static const vec4f three = {3.f,3.f,3.f,3.f}; + static const vec4f a = {0.5f,0.5f,0.5f,0.5f}; + + vec4f const e = __vrsqrtefp(r); + return Vmul( Vmul(e, Vsub(three, Vmul( Vmul(e,e),r))), a); +} + + +#define Vsqrt(r) __vsel( Vmul(r, Vrsqrt(r)), Vzero(), __vcmpeqfp(r, Vzero())) + +#define Vcombine(x,y,z,w) __vmrghw(__vmrghw((x), (z)), __vmrghw((y), (w))) + +// Vector comparison +#define Vcmpeq( a, b) __vcmpeqfp((a), (b)) + +MECANIM_FORCE_INLINE vec4f Vcmpneq( vec4f a, vec4f b) +{ + vec4f c = __vcmpeqfp(a, b); + return __vnor(c, c); +} + +#define Vcmpgt( a, b) __vcmpgtfp((a), (b)) +#define Vcmpge( a, b) __vcmpgefp((a), (b)) +#define Vcmplt( a, b) __vcmpgtfp((b), (a)) +#define Vcmple( a, b) 
__vcmpgefp((b), (a))

#define Vsel( c, a, b) __vxor(b, __vand(__vxor(a, b), c))

// vector logics
#define Vnot(r) __vnor( (r), (r) )
#define Vxnor( a, b) Vnot(__vxor((a), (b)))
#define Vxor( a, b) __vxor((a), (b))
#define Vand( a, b) __vand((a), (b))
#define Vor( a, b) __vor((a), (b))

// Returns true when every 32-bit lane of 'a' is all-ones (all four lanes hold
// a "true" comparison mask). The record-form VMX compare writes a summary
// flag word; bit 0x00000080 is set when all lanes compared equal.
MECANIM_FORCE_INLINE bool Vall(vec4b a)
{
	// All words equal
	static const XMVECTORU32 vu32 = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
	const int All = 0x00000080;
	unsigned int compareResult;
	__vcmpequwR(a, vu32.v, &compareResult);
	return (compareResult & All) == All;
}

// Returns true when at least one 32-bit lane of 'a' is non-zero: if the
// "all lanes equal zero" summary flag is NOT set, some lane held a true mask.
MECANIM_FORCE_INLINE bool Vany(vec4b a)
{
	// Not all words equal to 0
	const int All = 0x00000080;
	unsigned int compareResult;
	__vcmpequwR(a, Vzero(), &compareResult);
	return (compareResult & All) == 0;
}

#endif
diff --git a/Runtime/Math/Simd/xform.h b/Runtime/Math/Simd/xform.h new file mode 100644 index 0000000..0bfbc13 --- /dev/null +++ b/Runtime/Math/Simd/xform.h @@ -0,0 +1,153 @@
#ifndef SIMD_XFORM_H
#define SIMD_XFORM_H

#include "Runtime/Serialize/TransferFunctions/SerializeTransfer.h"

#include "Runtime/Math/Simd/float4.h"
#include "Runtime/Math/Simd/quaternion.h"

namespace math
{

// TRS transform: translation, rotation quaternion and per-axis scale, each
// stored as an aligned SIMD float4. Default-constructs to identity.
struct ATTRIBUTE_ALIGN(ALIGN4F) xform
{
	DEFINE_GET_TYPESTRING(xform)

	inline xform():t(float4::zero()),q(quatIdentity()),s(float4::one()){}

	inline xform( xform const& x)
	{ t = x.t; q = x.q; s = x.s; }

	inline xform &operator=(xform const& x)
	{ t = x.t; q = x.q; s = x.s; return *this; }

	inline xform(float4 const& t, float4 const& q, float4 const& s)
	{ this->t = t; this->q = q; this->s = s; }

	float4 t;	// translation
	float4 q;	// rotation (quaternion)
	float4 s;	// scale

	// Serializes the three components in t, q, s order.
	template<class TransferFunction>
	inline void Transfer (TransferFunction& transfer)
	{
		TRANSFER(t);
		TRANSFER(q);
		TRANSFER(s);
	}
};

// Identity transform: zero translation, identity rotation, unit scale.
static inline xform xformIdentity()
{
	return xform( float4::zero(), quatIdentity(), float4::one() );
}

// Transforms point v by x: scale first, then rotate, then translate.
static inline float4 xformMulVec(xform const& x, float4 const& v)
{
	return x.t + quatMulVec(x.q, v * x.s );
}

// Transforms point v by the inverse of x (undo translate, rotate, scale).
static inline float4 xformInvMulVec(xform const& x, float4 const& v)
{
	return quatMulVec(quatConj(x.q), v - x.t) / x.s;
}

// Like xformInvMulVec but ignores scale ("NS" = no scale).
static inline float4 xformInvMulVecNS(xform const& x, float4 const& v)
{
	return quatMulVec(quatConj(x.q), v - x.t);
}

// Composition a*b: applying the result is equivalent to applying b, then a.
// The combined rotation is re-normalized to counter drift.
static inline xform xformMul(xform const& a, xform const& b)
{
	return xform( xformMulVec(a, b.t), normalize(quatMul(a.q, b.q)), a.s * b.s);
}

// Composition inverse(a)*b.
static inline xform xformInvMul(xform const& a, xform const& b)
{
	return xform(xformInvMulVec(a, b.t), normalize( quatMul( quatConj(a.q), b.q)), b.s / a.s);
}

// Composition inverse(a)*b ignoring scale; result scale is one.
static inline xform xformInvMulNS(xform const& a, xform const& b)
{
	return xform(xformInvMulVecNS(a, b.t), normalize( quatMul( quatConj(a.q), b.q)), float4::one());
}

// Composition a*inverse(b).
static inline xform xformMulInv(xform const& a, xform const& b)
{
	const float4 qinv = quatConj(b.q);
	const float4 sinv = rcp(b.s);

	return xform(xformMulVec(a, quatMulVec( qinv, -b.t) * sinv),normalize( quatMul( a.q, qinv)), a.s * sinv);
}

// Interpolates from a (w=0) to b (w=1): lerp translation, quaternion lerp for
// rotation, scaleBlend for scale (see float4.h/quaternion.h for semantics).
static inline xform xformBlend(xform const &a, xform const &b, float1 const& w)
{
	return xform(lerp(a.t, b.t, w),quatLerp(a.q, b.q, w),scaleBlend(a.s, b.s, w));
}

// Exact component-wise equality of all three parts.
static inline bool operator==(xform const& l, xform const& r)
{
	return all(l.t == r.t) && all(l.q == r.q) && all(l.s == r.s);
}

// Weights a transform by w — presumably for weighted/additive blending;
// rotation and scale use their dedicated weighting helpers (see quaternion.h).
static inline xform xformWeight(xform const& x, float1 const& w)
{
	return xform(x.t*w,quatWeight(x.q,w),scaleWeight(x.s,w));
}

// "Addition" of transforms: translations add, rotations add in quaternion
// tangent space (quat2Qtan/qtan2Quat), scales multiply.
static inline xform xformAdd(xform const& a, xform const& b)
{
	return xform(a.t+b.t,qtan2Quat(quat2Qtan(a.q)+quat2Qtan(b.q)),a.s*b.s);
}

// Inverse of xformAdd: translations subtract, rotations subtract in
// quaternion tangent space, scales divide.
static inline xform xformSub(xform const& a, xform const& b)
{
	return xform(a.t-b.t,qtan2Quat(quat2Qtan(a.q)-quat2Qtan(b.q)),a.s/b.s);
}

// Weighted blend of aCount transforms. Each quaternion is sign-flipped when
// it points away from the running sum (dot < 0) so all rotations blend on the
// same hemisphere. If the weights sum to less than 1, the remaining weight
// pulls the rotation toward identity (the (0,0,0,1-sumW) term) before the
// final normalize.
static inline xform xformBlend(xform const* apXFormArray, float const* apWeightArray, unsigned int aCount)
{
	xform ret;

	ret.t = float4::zero();
	ret.q = float4::zero();
	ret.s = float4::one();

	float sumW = 0;

	unsigned int i;
	for(i = 0; i < aCount ; i++)
	{
		float w = apWeightArray[i];
		math::float1 w1(w);
		sumW += w;

		ret.t += apXFormArray[i].t*w1;
		ret.q += cond(dot(ret.q,apXFormArray[i].q) < float1::zero(),apXFormArray[i].q * -w1,apXFormArray[i].q * w1);
		ret.s *= scaleWeight(apXFormArray[i].s,w1);
	}

	float4 q(0,0,0,saturate(1.0f-sumW));
	ret.q = normalize(ret.q+q);

	return ret;
}

// Mirrors a transform: translation goes through float4 mirror() (see
// float4.h), and the quaternion's y/z components are negated via the
// (1,-1,-1,1) constant.
static inline xform mirror(xform const& x)
{
	constant_float4(mirrorQ,1,-1,-1,1);

	xform ret = x;
	ret.t = mirror(ret.t);
	ret.q *= mirrorQ;
	return ret;
}

// Scalar select: a when c is true, b otherwise.
static inline xform cond(bool c, xform const& a, xform const& b)
{
	return c?a:b;
}

}

#endif
diff --git a/Runtime/Math/SphericalHarmonics.h b/Runtime/Math/SphericalHarmonics.h new file mode 100644 index 0000000..539d316 --- /dev/null +++ b/Runtime/Math/SphericalHarmonics.h @@ -0,0 +1,83 @@
#pragma once


// Evaluates the 9 (order-3) SH basis functions for direction (x,y,z) into
// outsh. Constant-folded version of the reference implementation below;
// assumes (x,y,z) is normalized — TODO confirm with callers.
static inline void SHEvalDirection9 (float x, float y, float z, float outsh[9])
{
	// Core i7 920, VS2008 Release, FPU code:

	// 114 clocks
	//D3DXSHEvalDirection (outsh, 3, &D3DXVECTOR3(x,y,z));

	/*
	// 314 clocks
	// Reference implementation from Stupid Spherical Harmonics Tricks
	// http://www.ppsloan.org/publications/StupidSH36.pdf
	const float kSqrtPI = sqrtf(kPI);
	const float kSqrt3 = sqrtf(3.0f);
	const float kSqrt15 = sqrtf(15.0f);
	outsh[0] = 1.0f / (2.0f * kSqrtPI);
	outsh[1] = -(kSqrt3 * y) / (2.0f * kSqrtPI);
	outsh[2] = (kSqrt3 * z) / (2.0f * kSqrtPI);
	outsh[3] = -(kSqrt3 * x) / (2.0f * kSqrtPI);
	outsh[4] = (kSqrt15 * x * y) / (2.0f * kSqrtPI);
	outsh[5] = -(kSqrt15 * y * z) / (2.0f * kSqrtPI);
	outsh[6] = (sqrtf(5.0f) * (3.0f*z*z-1.0f)) / (4.0f * kSqrtPI);
	outsh[7] = -(kSqrt15 * x * z) / (2.0f * kSqrtPI);
	outsh[8] = (kSqrt15 * (x*x - y*y)) / (4.0f * kSqrtPI);
	*/

	// 86 clocks
	// Make sure all constants are never computed at runtime
	const float kInv2SqrtPI = 0.28209479177387814347403972578039f; // 1 / (2*sqrt(kPI))
	const float kSqrt3Div2SqrtPI = 0.48860251190291992158638462283835f; // sqrt(3) / (2*sqrt(kPI))
	const float kSqrt15Div2SqrtPI = 1.0925484305920790705433857058027f; // sqrt(15) / (2*sqrt(kPI))
	const float k3Sqrt5Div4SqrtPI = 0.94617469575756001809268107088713f; // 3 * sqrtf(5) / (4*sqrt(kPI))
	const float kSqrt15Div4SqrtPI = 0.54627421529603953527169285290135f; // sqrt(15) / (4*sqrt(kPI))
	const float kOneThird = 0.3333333333333333333333f; // 1.0/3.0
	outsh[0] = kInv2SqrtPI;
	outsh[1] = - y * kSqrt3Div2SqrtPI;
	outsh[2] = z * kSqrt3Div2SqrtPI;
	outsh[3] = - x * kSqrt3Div2SqrtPI;
	outsh[4] = x * y * kSqrt15Div2SqrtPI;
	outsh[5] = - y * z * kSqrt15Div2SqrtPI;
	outsh[6] = (z*z-kOneThird) * k3Sqrt5Div4SqrtPI;
	outsh[7] = - x * z * kSqrt15Div2SqrtPI;
	outsh[8] = (x*x-y*y) * kSqrt15Div4SqrtPI;
}


// Projects a directional light of the given color onto order-3 SH: evaluates
// the basis for the light direction, then scales each of the 9 coefficients
// per channel by color * normalization. Overwrites outR/outG/outB.
static inline void SHEvalDirectionalLight9 (
	float x, float y, float z,
	float colorR, float colorG, float colorB,
	float outR[9], float outG[9], float outB[9])
{
	// Core i7 920, VS2008 Release, FPU code:

	// 397 clocks
	//D3DXSHEvalDirectionalLight (3, &D3DXVECTOR3(x,y,z), colorR, colorG, colorB, outR, outG, outB);

	// 300 clocks
	float sh[9];
	SHEvalDirection9 (x, y, z, sh);
	// Normalization factor from http://www.ppsloan.org/publications/StupidSH36.pdf
	const float kNormalization = 2.9567930857315701067858823529412f; // 16*kPI/17
	float rscale = colorR * kNormalization;
	float gscale = colorG * kNormalization;
	float bscale = colorB * kNormalization;
	for (int i = 0; i < 9; ++i)
	{
		float c = sh[i];
		outR[i] = c * rscale;
		outG[i] = c * gscale;
		outB[i] = c * bscale;
	}
}

// Evaluates the 0th (constant) SH coefficient per channel for a constant
// ambient color. NOTE: writes out[0..2]; it does not accumulate into them,
// despite what the old comment said — callers add the result themselves.
static inline void SHEvalAmbientLight(const ColorRGBAf& ambient, float out[3])
{
	const float k2SqrtPI = 3.54490770181103205459633496668229f; // 2*sqrt(kPI)
	out[0] = ambient.r * k2SqrtPI;
	out[1] = ambient.g * k2SqrtPI;
	out[2] = ambient.b * k2SqrtPI;
}
diff --git a/Runtime/Math/Vector2.cpp b/Runtime/Math/Vector2.cpp new
file mode 100644 index 0000000..3c73465 --- /dev/null +++ b/Runtime/Math/Vector2.cpp @@ -0,0 +1,13 @@ +#include "UnityPrefix.h" +#include "Vector2.h" +#include <limits> + +using namespace std; + +const float Vector2f::epsilon = 0.00001F; +const float Vector2f::infinity = numeric_limits<float>::infinity (); +const Vector2f Vector2f::infinityVec = Vector2f (numeric_limits<float>::infinity (), numeric_limits<float>::infinity ()); + +const Vector2f Vector2f::zero = Vector2f (0, 0); +const Vector2f Vector2f::xAxis = Vector2f (1, 0); +const Vector2f Vector2f::yAxis = Vector2f (0, 1); diff --git a/Runtime/Math/Vector2.h b/Runtime/Math/Vector2.h new file mode 100644 index 0000000..7a5a813 --- /dev/null +++ b/Runtime/Math/Vector2.h @@ -0,0 +1,126 @@ +#ifndef VECTOR2_H +#define VECTOR2_H + +#include <algorithm> +#include "FloatConversion.h" +#include "Runtime/Serialize/SerializeUtility.h" +#include "Runtime/Serialize/SerializationMetaFlags.h" +#include "Runtime/Utilities/LogAssert.h" +#include "Runtime/Modules/ExportModules.h" + +class Vector2f +{ + public: + float x, y; + + DECLARE_SERIALIZE_OPTIMIZE_TRANSFER (Vector2f) + + Vector2f () : x(0.f) , y(0.f) {} + Vector2f (float inX, float inY) { x = inX; y = inY; } + explicit Vector2f (const float* p) { x = p[0]; y = p[1]; } + + void Set (float inX, float inY) { x = inX; y = inY; } + + float* GetPtr () { return &x; } + const float* GetPtr ()const { return &x; } + float& operator[] (int i) { DebugAssertIf (i < 0 || i > 1); return (&x)[i]; } + const float& operator[] (int i)const { DebugAssertIf (i < 0 || i > 1); return (&x)[i]; } + + Vector2f& operator += (const Vector2f& inV) { x += inV.x; y += inV.y; return *this; } + Vector2f& operator -= (const Vector2f& inV) { x -= inV.x; y -= inV.y; return *this; } + Vector2f& operator *= (const float s) { x *= s; y *= s; return *this; } + Vector2f& operator /= (const float s) { DebugAssertIf (CompareApproximately (s, 0.0F)); x /= s; y /= s; return *this; } + bool operator == (const 
Vector2f& v)const { return x == v.x && y == v.y; } + bool operator != (const Vector2f& v)const { return x != v.x || y != v.y; } + + + Vector2f operator - () const { return Vector2f (-x, -y); } + + Vector2f& Scale (const Vector2f& inV) { x *= inV.x; y *= inV.y; return *this;} + + static const float epsilon; + static const float infinity; + static const Vector2f infinityVec; + static EXPORT_COREMODULE const Vector2f zero; + static const Vector2f xAxis; + static const Vector2f yAxis; +}; + +inline Vector2f Scale (const Vector2f& lhs, const Vector2f& rhs) { return Vector2f (lhs.x * rhs.x, lhs.y * rhs.y); } + +inline Vector2f operator + (const Vector2f& lhs, const Vector2f& rhs) { return Vector2f (lhs.x + rhs.x, lhs.y + rhs.y); } +inline Vector2f operator - (const Vector2f& lhs, const Vector2f& rhs) { return Vector2f (lhs.x - rhs.x, lhs.y - rhs.y); } +inline float Dot (const Vector2f& lhs, const Vector2f& rhs) { return lhs.x * rhs.x + lhs.y * rhs.y; } + +inline float SqrMagnitude (const Vector2f& inV) { return Dot (inV, inV); } +inline float Magnitude (const Vector2f& inV) { return SqrtImpl(Dot (inV, inV)); } + +inline float Angle (const Vector2f& lhs, const Vector2f& rhs) { return acos (std::min (1.0f, std::max (-1.0f, Dot (lhs, rhs) / (Magnitude (lhs) * Magnitude (rhs))))); } + +inline Vector2f operator * (const Vector2f& inV, float s) { return Vector2f (inV.x * s, inV.y * s); } +inline Vector2f operator * (const float s, const Vector2f& inV) { return Vector2f (inV.x * s, inV.y * s); } +inline Vector2f operator / (const Vector2f& inV, float s) { Vector2f temp (inV); temp /= s; return temp; } +inline Vector2f Inverse (const Vector2f& inVec) { return Vector2f (1.0F / inVec.x, 1.0F / inVec.y); } + +// Normalizes a vector, asserts if the vector can be normalized +inline Vector2f Normalize (const Vector2f& inV) { return inV / Magnitude (inV); } +// Normalizes a vector, returns default vector if it can't be normalized +inline Vector2f NormalizeSafe (const Vector2f& inV, 
const Vector2f& defaultV = Vector2f::zero); + +inline Vector2f Lerp (const Vector2f& from, const Vector2f& to, float t) { return to * t + from * (1.0f - t); } + +// Returns a vector with the smaller of every component from v0 and v1 +inline Vector2f min (const Vector2f& lhs, const Vector2f& rhs) { return Vector2f (std::min (lhs.x, rhs.x), std::min (lhs.y, rhs.y)); } +// Returns a vector with the larger of every component from v0 and v1 +inline Vector2f max (const Vector2f& lhs, const Vector2f& rhs) { return Vector2f (std::max (lhs.x, rhs.x), std::max (lhs.y, rhs.y)); } + +bool CompareApproximately (const Vector2f& inV0, const Vector2f& inV1, float inMaxDist = Vector2f::epsilon); + +inline bool CompareApproximately (const Vector2f& inV0, const Vector2f& inV1, float inMaxDist) +{ + return SqrMagnitude (inV1 - inV0) < inMaxDist * inMaxDist; +} + +inline bool IsNormalized (const Vector2f& vec, float epsilon = Vector2f::epsilon) +{ + return CompareApproximately (SqrMagnitude (vec), 1.0F, epsilon); +} + +/// Returns the abs of every component of the vector +inline Vector2f Abs (const Vector2f& v) { return Vector2f (Abs (v.x), Abs (v.y)); } + +inline bool IsFinite (const Vector2f& f) +{ + return IsFinite(f.x) & IsFinite(f.y); +} + +template<class TransferFunction> inline +void Vector2f::Transfer (TransferFunction& t) +{ + t.AddMetaFlag (kTransferUsingFlowMappingStyle); + t.Transfer (x, "x"); + t.Transfer (y, "y"); +} + +inline Vector2f NormalizeFast (const Vector2f& inV) +{ + float m = SqrMagnitude (inV); + // GCC version of __frsqrte: + // static inline double __frsqrte (double x) { + // double y; + // asm ( "frsqrte %0, %1" : /*OUT*/ "=f" (y) : /*IN*/ "f" (x) ); + // return y; + // } + return inV * FastInvSqrt (m); +} + +inline Vector2f NormalizeSafe (const Vector2f& inV, const Vector2f& defaultV) +{ + float mag = Magnitude (inV); + if (mag > Vector2f::epsilon) + return inV / Magnitude (inV); + else + return defaultV; +} + +#endif diff --git a/Runtime/Math/Vector3.cpp 
b/Runtime/Math/Vector3.cpp new file mode 100644 index 0000000..3195ef2 --- /dev/null +++ b/Runtime/Math/Vector3.cpp @@ -0,0 +1,361 @@
#include "UnityPrefix.h"
#include "Vector3.h"
#include "Matrix3x3.h"
#include <limits>

// Enables the flush-tiny-components-to-zero workarounds in NormalizeRobust.
#define FPFIXES 1

using namespace std;

const float Vector3f::epsilon = 0.00001F;
const float Vector3f::infinity = numeric_limits<float>::infinity ();
const Vector3f Vector3f::infinityVec = Vector3f (numeric_limits<float>::infinity (), numeric_limits<float>::infinity (), numeric_limits<float>::infinity ());

const Vector3f Vector3f::zero = Vector3f (0, 0, 0);
const Vector3f Vector3f::one = Vector3f (1.0F, 1.0F, 1.0F);
const Vector3f Vector3f::xAxis = Vector3f (1, 0, 0);
const Vector3f Vector3f::yAxis = Vector3f (0, 1, 0);
const Vector3f Vector3f::zAxis = Vector3f (0, 0, 1);


// Gram-Schmidt orthonormalization of three vectors, without the degenerate
// checks of OrthoNormalize: assumes an orthonormal basis can be formed
// (Normalize asserts on zero vectors).
void OrthoNormalizeFast (Vector3f* inU, Vector3f* inV, Vector3f* inW)
{
	// compute u0
	*inU = Normalize (*inU);

	// compute u1
	float dot0 = Dot (*inU, *inV);
	*inV -= dot0 * *inU;
	*inV = Normalize (*inV);

	// compute u2
	float dot1 = Dot (*inV, *inW);
	dot0 = Dot (*inU, *inW);
	*inW -= dot0 * *inU + dot1 * *inV;
	*inW = Normalize (*inW);
}

// Orthonormalizes two vectors. Degenerate inputs are replaced: a near-zero
// inU becomes the x axis, and a near-zero (or parallel) inV becomes an
// arbitrary vector orthogonal to inU.
void OrthoNormalize (Vector3f* inU, Vector3f* inV)
{
	// compute u0
	float mag = Magnitude (*inU);
	if (mag > Vector3f::epsilon)
		*inU /= mag;
	else
		*inU = Vector3f (1.0F, 0.0F, 0.0F);

	// compute u1
	float dot0 = Dot (*inU, *inV);
	*inV -= dot0 * *inU;
	mag = Magnitude (*inV);
	if (mag < Vector3f::epsilon)
		*inV = OrthoNormalVectorFast (*inU);
	else
		*inV /= mag;
}

// Orthonormalizes three vectors with the same degenerate handling as the
// two-vector overload; a degenerate inW falls back to Cross(inU, inV).
void OrthoNormalize (Vector3f* inU, Vector3f* inV, Vector3f* inW)
{
	// compute u0
	float mag = Magnitude (*inU);
	if (mag > Vector3f::epsilon)
		*inU /= mag;
	else
		*inU = Vector3f (1.0F, 0.0F, 0.0F);

	// compute u1
	float dot0 = Dot (*inU, *inV);
	*inV -= dot0 * *inU;
	mag = Magnitude (*inV);
	if (mag > Vector3f::epsilon)
		*inV /= mag;
	else
		*inV = OrthoNormalVectorFast (*inU);

	// compute u2
	float dot1 = Dot (*inV, *inW);
	dot0 = Dot (*inU, *inW);
	*inW -= dot0 * *inU + dot1 * *inV;
	mag = Magnitude (*inW);
	if (mag > Vector3f::epsilon)
		*inW /= mag;
	else
		*inW = Cross (*inU, *inV);
}

#define k1OverSqrt2 float(0.7071067811865475244008443621048490)

// Returns a unit vector orthogonal to n. Picks the construction plane (y-z or
// x-y) based on n's dominant component to avoid a degenerate cross product.
// Assumes n is normalized — TODO confirm with callers.
Vector3f OrthoNormalVectorFast (const Vector3f& n)
{
	Vector3f res;
	if (Abs (n.z) > k1OverSqrt2)
	{
		// choose p in y-z plane
		float a = n.y*n.y + n.z*n.z;
		float k = 1.0F / sqrt (a);
		res.x = 0;
		res.y = -n.z*k;
		res.z = n.y*k;
	}
	else
	{
		// choose p in x-y plane
		float a = n.x*n.x + n.y*n.y;
		float k = 1.0F / sqrt (a);
		res.x = -n.y*k;
		res.y = n.x*k;
		res.z = 0;
	}
	return res;
}

/* from chris hecker (Generates Orthonormal basis)
void
DextralBases(real32 const *XAxis, real32 *YAxis, real32 *ZAxis)
{
	real32 CrossVector[3] = {1.0f, 1.0f, 1.0f};

	real32 MaximumElement = 0.0f;

	int MaximumElementIndex = 0;
	{for(int ElementIndex = 0;
		ElementIndex < 3;
		++ElementIndex)
	{
		real32 ElementValue = AbsoluteValue(XAxis[ElementIndex]);
		if(ElementValue > MaximumElement)
		{
			MaximumElement = ElementValue;
			MaximumElementIndex = ElementIndex;
		}
	}}

	CrossVector[MaximumElementIndex] = 0.0f;

	VectorCrossProduct3(YAxis, CrossVector, XAxis);
	Normalize3(YAxis);

	VectorCrossProduct3(ZAxis, XAxis, YAxis);
	Normalize3(ZAxis);
}

*/

/// Returns a Vector3 that moves lhs towards rhs by a maximum of clampedDistance
Vector3f MoveTowards (const Vector3f& lhs, const Vector3f& rhs, float clampedDistance)
{
	Vector3f delta = rhs - lhs;
	float sqrDelta = SqrMagnitude (delta);
	float sqrClampedDistance = clampedDistance * clampedDistance;
	if (sqrDelta > sqrClampedDistance)
	{
		float deltaMag = sqrt (sqrDelta);
		if (deltaMag > Vector3f::epsilon)
			return lhs + delta / deltaMag * clampedDistance;
		else
			return lhs;
	}
	else
		return rhs;
}

// Moves lhs towards rhs by at most clampedDelta, never overshooting.
static inline float ClampedMove (float lhs, float rhs, float clampedDelta)
{
	float delta = rhs - lhs;
	if (delta > 0.0F)
		return lhs + min (delta, clampedDelta);
	else
		return lhs - min (-delta, clampedDelta);
}

// Rotates lhs's direction towards rhs by at most angleMove radians while
// moving its magnitude towards rhs's by at most magnitudeMove. Near-parallel
// directions degrade to MoveTowards; near-opposite directions rotate about an
// arbitrary orthogonal axis (the rotation axis is ill-defined there).
Vector3f RotateTowards (const Vector3f& lhs, const Vector3f& rhs, float angleMove, float magnitudeMove)
{
	float lhsMag = Magnitude (lhs);
	float rhsMag = Magnitude (rhs);

	// both vectors are non-zero
	if (lhsMag > Vector3f::epsilon && rhsMag > Vector3f::epsilon)
	{
		Vector3f lhsNorm = lhs / lhsMag;
		Vector3f rhsNorm = rhs / rhsMag;

		float dot = Dot (lhsNorm, rhsNorm);
		// direction is almost the same
		if (dot > 1.0F - Vector3f::epsilon)
		{
			return MoveTowards (lhs, rhs, magnitudeMove);
		}
		// directions are almost opposite
		else if (dot < -1.0F + Vector3f::epsilon)
		{
			Vector3f axis = OrthoNormalVectorFast (lhsNorm);
			Matrix3x3f m;
			m.SetAxisAngle (axis, angleMove);
			Vector3f rotated = m.MultiplyPoint3 (lhsNorm);
			rotated *= ClampedMove (lhsMag, rhsMag, magnitudeMove);
			return rotated;
		}
		// normal case
		else
		{
			float angle = acos (dot);
			Vector3f axis = Normalize (Cross (lhsNorm, rhsNorm));
			Matrix3x3f m;
			m.SetAxisAngle (axis, min (angleMove, angle));
			Vector3f rotated = m.MultiplyPoint3 (lhsNorm);
			rotated *= ClampedMove (lhsMag, rhsMag, magnitudeMove);
			return rotated;
		}
	}
	// at least one of the vectors is almost zero
	else
	{
		return MoveTowards (lhs, rhs, magnitudeMove);
	}
}


// Spherically interpolates direction and linearly interpolates magnitude.
// Falls back to Lerp for near-zero or near-parallel inputs; for near-opposite
// inputs the rotation axis is chosen arbitrarily (orthogonal to lhs).
Vector3f Slerp (const Vector3f& lhs, const Vector3f& rhs, float t) {

	float lhsMag = Magnitude (lhs);
	float rhsMag = Magnitude (rhs);

	if (lhsMag < Vector3f::epsilon || rhsMag < Vector3f::epsilon)
		return Lerp (lhs, rhs, t);

	float lerpedMagnitude = Lerp (lhsMag, rhsMag, t);

	float dot = Dot (lhs, rhs) / (lhsMag * rhsMag);
	// direction is almost the same
	if (dot > 1.0F - Vector3f::epsilon)
	{
		return Lerp (lhs, rhs, t);
	}
	// directions are almost opposite
	else if (dot < -1.0F + Vector3f::epsilon)
	{
		Vector3f lhsNorm = lhs / lhsMag;
		Vector3f axis = OrthoNormalVectorFast (lhsNorm);
		Matrix3x3f m;
		m.SetAxisAngle (axis, kPI * t);
		Vector3f slerped = m.MultiplyPoint3 (lhsNorm);
		slerped *= lerpedMagnitude;
		return slerped;
	}
	// normal case
	else
	{
		Vector3f axis = Cross (lhs, rhs);
		Vector3f lhsNorm = lhs / lhsMag;
		axis = Normalize (axis);
		float angle = acos (dot) * t;

		Matrix3x3f m;
		m.SetAxisAngle (axis, angle);
		Vector3f slerped = m.MultiplyPoint3 (lhsNorm);
		slerped *= lerpedMagnitude;
		return slerped;
	}
}

// Normalizes a, robust to extremely small components: scales by the largest
// absolute component first so intermediate squares cannot underflow to zero.
// Outputs l (the InvSqrt factor) and div (the scaling divisor) so callers can
// reconstruct 1/|a| as l/div. Returns (0,1,0) for the zero vector (l=0).
inline static Vector3f NormalizeRobust (const Vector3f& a, float &l, float &div)
{
	float a0,a1,a2,aa0,aa1,aa2;
	a0 = a[0];
	a1 = a[1];
	a2 = a[2];

#if FPFIXES
	// Flush denormal-scale components to exactly zero before scaling.
	if ( CompareApproximately( a0, 0.0F, 0.00001F ) )
		a0 = aa0 = 0;
	else
#endif
	{
		aa0 = Abs (a0);
	}

#if FPFIXES
	if ( CompareApproximately( a1, 0.0F, 0.00001F ) )
		a1 = aa1 =0;
	else
#endif
	{
		aa1 = Abs (a1);
	}

#if FPFIXES
	if ( CompareApproximately( a2, 0.0F, 0.00001F ) )
		a2 = aa2 = 0;
	else
#endif
	{
		aa2 = Abs (a2);
	}

	// Divide through by the largest |component|; the dominant component's
	// result is CopySignf(l, ...) since its scaled value is exactly +/-1.
	if (aa1 > aa0)
	{
		if (aa2 > aa1)
		{
			a0 /= aa2;
			a1 /= aa2;
			l = InvSqrt (a0*a0 + a1*a1 + 1.0F);
			div = aa2;
			return Vector3f (a0*l, a1*l, CopySignf (l,a2));
		}
		else
		{
			// aa1 is largest
			a0 /= aa1;
			a2 /= aa1;
			l = InvSqrt (a0*a0 + a2*a2 + 1.0F);
			div = aa1;
			return Vector3f (a0*l, CopySignf (l, a1), a2*l);
		}
	}
	else
	{
		if (aa2 > aa0)
		{
			// aa2 is largest
			a0 /= aa2;
			a1 /= aa2;
			l = InvSqrt (a0*a0 + a1*a1 + 1.0F);
			div = aa2;
			return Vector3f (a0*l, a1*l, CopySignf (l,a2));
		}
		else
		{
			// aa0 is largest
			if (aa0 <= 0)
			{
				// zero vector: return an arbitrary unit vector
				l = 0;
				div = 1;
				return Vector3f (0.0F, 1.0F, 0.0F);
			}

			a1 /= aa0;
			a2 /= aa0;
			l = InvSqrt (a1*a1 + a2*a2 + 1.0F);
			div = aa0;
			return Vector3f (CopySignf (l,a0), a1*l, a2*l);
		}
	}
}

// Public wrapper: robust normalize discarding the scale factors.
Vector3f NormalizeRobust (const Vector3f& a)
{
	float l, div;
	return NormalizeRobust(a, l, div);
}

// Robust normalize that also returns 1/|a| (0 for the zero vector), saving a
// second InvSqrt when the caller needs the original length.
Vector3f NormalizeRobust (const Vector3f& a, float &invOriginalLength)
{
	float l, div;
	const Vector3f &n = NormalizeRobust(a, l, div);
	invOriginalLength = l/div;
	// guard for NaNs
	Assert (n == n);
	Assert (invOriginalLength == invOriginalLength);
	Assert (IsNormalized(n));
	return n;
}
\ No newline at end of file diff --git a/Runtime/Math/Vector3.h b/Runtime/Math/Vector3.h new file mode 100644 index 0000000..ea4bbc4 --- /dev/null +++ b/Runtime/Math/Vector3.h @@ -0,0 +1,205 @@ +#ifndef VECTOR3_H +#define VECTOR3_H + +#include <algorithm> +#include "FloatConversion.h" +#include "Runtime/Serialize/SerializeUtility.h" +#include "Runtime/Serialize/SerializationMetaFlags.h" +#include "Runtime/Utilities/LogAssert.h" +#include "Runtime/Modules/ExportModules.h" + +class Vector3f +{ + public: + + float x, y, z; + + DEFINE_GET_TYPESTRING_IS_ANIMATION_CHANNEL (Vector3f) + template<class TransferFunction> void Transfer (TransferFunction& transfer); + + Vector3f () : x(0.f), y(0.f), z(0.f) {} + Vector3f (float inX, float inY, float inZ) { x = inX; y = inY; z = inZ; } + explicit Vector3f (const float* array) { x = array[0]; y = array[1]; z = array[2]; } + void Set (float inX, float inY, float inZ) { x = inX; y = inY; z = inZ; } + void Set (const float* array) { x = array[0]; y = array[1]; z = array[2]; } + + float* GetPtr () { return &x; } + const float* GetPtr ()const { return &x; } + float& operator[] (int i) { DebugAssertIf (i < 0 || i > 2); return (&x)[i]; } + const float& operator[] (int i)const { DebugAssertIf (i < 0 || i > 2); return (&x)[i]; } + + bool operator == (const Vector3f& v)const { return x == v.x && y == v.y && z == v.z; } + bool operator != (const Vector3f& v)const { return x != v.x || y != v.y || z != v.z; } + + Vector3f& operator += (const Vector3f& inV) { x += inV.x; y += inV.y; z += inV.z; return *this; } + Vector3f& operator -= (const Vector3f& inV) { x -= inV.x; y -= inV.y; z -= inV.z; return *this; } + Vector3f& operator *= (float s) { x *= s; y *= s; z *= s; return *this; } + Vector3f& operator /= (float s); + + Vector3f operator - () const { return Vector3f (-x, -y, -z); } + + Vector3f& Scale (const Vector3f& inV) { x *= inV.x; y *= inV.y; z *= inV.z; return *this;} + + + EXPORT_COREMODULE static const float epsilon; + 
EXPORT_COREMODULE static const float infinity; + EXPORT_COREMODULE static const Vector3f infinityVec; + EXPORT_COREMODULE static const Vector3f zero; + EXPORT_COREMODULE static const Vector3f one; + EXPORT_COREMODULE static const Vector3f xAxis; + EXPORT_COREMODULE static const Vector3f yAxis; + EXPORT_COREMODULE static const Vector3f zAxis; +}; + +inline Vector3f Scale (const Vector3f& lhs, const Vector3f& rhs) { return Vector3f (lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z); } + +inline Vector3f operator + (const Vector3f& lhs, const Vector3f& rhs) { return Vector3f (lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z); } +inline Vector3f operator - (const Vector3f& lhs, const Vector3f& rhs) { return Vector3f (lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z); } +inline Vector3f Cross (const Vector3f& lhs, const Vector3f& rhs); +inline float Dot (const Vector3f& lhs, const Vector3f& rhs) { return lhs.x * rhs.x + lhs.y * rhs.y + lhs.z * rhs.z; } + +inline Vector3f operator * (const Vector3f& inV, const float s) { return Vector3f (inV.x * s, inV.y * s, inV.z * s); } +inline Vector3f operator * (const float s, const Vector3f& inV) { return Vector3f (inV.x * s, inV.y * s, inV.z * s); } +inline Vector3f operator / (const Vector3f& inV, const float s) { Vector3f temp (inV); temp /= s; return temp; } +inline Vector3f Inverse (const Vector3f& inVec) { return Vector3f (1.0F / inVec.x, 1.0F / inVec.y, 1.0F / inVec.z); } + +inline float SqrMagnitude (const Vector3f& inV) { return Dot (inV, inV); } +inline float Magnitude (const Vector3f& inV) {return SqrtImpl(Dot (inV, inV));} + +// Normalizes a vector, asserts if the vector can be normalized +inline Vector3f Normalize (const Vector3f& inV) { return inV / Magnitude (inV); } +// Normalizes a vector, returns default vector if it can't be normalized +inline Vector3f NormalizeSafe (const Vector3f& inV, const Vector3f& defaultV = Vector3f::zero); + +inline Vector3f ReflectVector (const Vector3f& inDirection, const Vector3f& inNormal) { return 
-2.0F * Dot (inNormal, inDirection) * inNormal + inDirection; } + +inline Vector3f Lerp (const Vector3f& from, const Vector3f& to, float t) { return to * t + from * (1.0F - t); } +Vector3f Slerp (const Vector3f& from, const Vector3f& to, float t); + +// Returns a vector with the smaller of every component from v0 and v1 +inline Vector3f min (const Vector3f& lhs, const Vector3f& rhs) { return Vector3f (FloatMin (lhs.x, rhs.x), FloatMin (lhs.y, rhs.y), FloatMin (lhs.z, rhs.z)); } +// Returns a vector with the larger of every component from v0 and v1 +inline Vector3f max (const Vector3f& lhs, const Vector3f& rhs) { return Vector3f (FloatMax (lhs.x, rhs.x), FloatMax (lhs.y, rhs.y), FloatMax (lhs.z, rhs.z)); } + +/// Project one vector onto another. +inline Vector3f Project (const Vector3f& v1, const Vector3f& v2) { return v2* Dot (v1, v2)/ Dot (v2, v2); } + + +/// Returns the abs of every component of the vector +inline Vector3f Abs (const Vector3f& v) { return Vector3f (Abs (v.x), Abs (v.y), Abs (v.z)); } + +bool CompareApproximately (const Vector3f& inV0, const Vector3f& inV1, const float inMaxDist = Vector3f::epsilon); +// Orthonormalizes the three vectors, assuming that a orthonormal basis can be formed +void OrthoNormalizeFast (Vector3f* inU, Vector3f* inV, Vector3f* inW); +// Orthonormalizes the three vectors, returns false if no orthonormal basis could be formed. +EXPORT_COREMODULE void OrthoNormalize (Vector3f* inU, Vector3f* inV, Vector3f* inW); +// Orthonormalizes the two vectors. inV is taken as a hint and will try to be as close as possible to inV. +EXPORT_COREMODULE void OrthoNormalize (Vector3f* inU, Vector3f* inV); + +// Calculates a vector that is orthonormal to n. 
+// Assumes that n is normalized +Vector3f OrthoNormalVectorFast (const Vector3f& n); + +// Rotates lhs towards rhs by no more than max Angle +// Moves the magnitude of lhs towards rhs by no more than maxMagnitude +Vector3f RotateTowards (const Vector3f& lhs, const Vector3f& rhs, float maxAngle, float maxMagnitude); + +// Spherically interpolates the direction of two vectors +// and interpolates the magnitude of the two vectors +Vector3f Slerp (const Vector3f& lhs, const Vector3f& rhs, float t); + +/// Returns a Vector3 that moves lhs towards rhs by a maximum of clampedDistance +Vector3f MoveTowards (const Vector3f& lhs, const Vector3f& rhs, float clampedDistance); + +inline bool IsNormalized (const Vector3f& vec, float epsilon = Vector3f::epsilon) +{ + return CompareApproximately (SqrMagnitude (vec), 1.0F, epsilon); +} + +inline Vector3f Cross (const Vector3f& lhs, const Vector3f& rhs) +{ + return Vector3f ( + lhs.y * rhs.z - lhs.z * rhs.y, + lhs.z * rhs.x - lhs.x * rhs.z, + lhs.x * rhs.y - lhs.y * rhs.x); +} + +inline Vector3f NormalizeSafe (const Vector3f& inV, const Vector3f& defaultV) +{ + float mag = Magnitude (inV); + if (mag > Vector3f::epsilon) + return inV / Magnitude (inV); + else + return defaultV; +} + +/// - Handles zero vector correclty +inline Vector3f NormalizeFast (const Vector3f& inV) +{ + float m = SqrMagnitude (inV); + // GCC version of __frsqrte: + // static inline double __frsqrte (double x) { + // double y; + // asm ( "frsqrte %0, %1" : /*OUT*/ "=f" (y) : /*IN*/ "f" (x) ); + // return y; + // } + return inV * FastInvSqrt (m); +} + +/// - low precision normalize +/// - nan for zero vector +inline Vector3f NormalizeFastest (const Vector3f& inV) +{ + float m = SqrMagnitude (inV); + // GCC version of __frsqrte: + // static inline double __frsqrte (double x) { + // double y; + // asm ( "frsqrte %0, %1" : /*OUT*/ "=f" (y) : /*IN*/ "f" (x) ); + // return y; + // } + return inV * FastestInvSqrt (m); +} + +inline bool IsFinite (const Vector3f& f) +{ 
+ return IsFinite(f.x) & IsFinite(f.y) & IsFinite(f.z); +} + + + +inline bool CompareApproximately (const Vector3f& inV0, const Vector3f& inV1, const float inMaxDist) +{ + return SqrMagnitude (inV1 - inV0) < inMaxDist * inMaxDist; +} + +inline Vector3f& Vector3f::operator /= (float s) +{ + DebugAssertIf (CompareApproximately (s, 0.0F)); + x /= s; + y /= s; + z /= s; + return *this; +} + +template<class TransferFunction> +inline void Vector3f::Transfer (TransferFunction& t) +{ + t.AddMetaFlag (kTransferUsingFlowMappingStyle); + t.Transfer (x, "x"); + t.Transfer (y, "y"); + t.Transfer (z, "z"); +} + +// this may be called for vectors `a' with extremely small magnitude, for +// example the result of a cross product on two nearly perpendicular vectors. +// we must be robust to these small vectors. to prevent numerical error, +// first find the component a[i] with the largest magnitude and then scale +// all the components by 1/a[i]. then we can compute the length of `a' and +// scale the components by 1/l. this has been verified to work with vectors +// containing the smallest representable numbers. +Vector3f NormalizeRobust (const Vector3f& a); +// This also returns vector's inverse original length, to avoid duplicate +// invSqrt calculations when needed. If a is a zero vector, invOriginalLength will be 0. 
+Vector3f NormalizeRobust (const Vector3f& a, float &invOriginalLength); + +#endif diff --git a/Runtime/Math/Vector4.h b/Runtime/Math/Vector4.h new file mode 100644 index 0000000..97f2ec0 --- /dev/null +++ b/Runtime/Math/Vector4.h @@ -0,0 +1,55 @@ +#pragma once + +#include "Vector3.h" + +class Vector4f +{ +public: + Vector4f() {} + Vector4f( float inX, float inY, float inZ, float inW ) : x(inX), y(inY), z(inZ), w(inW) {} + explicit Vector4f( const Vector3f& v, float inW ) : x(v.x), y(v.y), z(v.z), w(inW) {} + explicit Vector4f( const float* v ) : x(v[0]), y(v[1]), z(v[2]), w(v[3]) {} + + void Set( float inX, float inY, float inZ, float inW ) { x = inX; y = inY; z = inZ; w = inW; } + void Set( const float* array ) { x = array[0]; y = array[1]; z = array[2]; w = array[3]; } + + float* GetPtr() { return &x; } + const float* GetPtr() const { return &x; } + + float& operator[] (int i) { DebugAssertIf (i < 0 || i > 3); return (&x)[i]; } + const float& operator[] (int i)const { DebugAssertIf (i < 0 || i > 3); return (&x)[i]; } + + bool operator == (const Vector4f& v) const { return x == v.x && y == v.y && z == v.z && w == v.w; } + bool operator != (const Vector4f& v) const { return x != v.x || y != v.y || z != v.z || w != v.w; } + bool operator == (const float v[4]) const { return x == v[0] && y == v[1] && z == v[2] && w == v[3]; } + bool operator != (const float v[4]) const { return x != v[0] || y != v[1] || z != v[2] || w != v[3]; } + + Vector4f operator - () const { return Vector4f (-x, -y, -z, -w); } + + DEFINE_GET_TYPESTRING_IS_ANIMATION_CHANNEL (Vector4f) + template<class TransferFunction> void Transfer (TransferFunction& transfer); + + float x; + float y; + float z; + float w; +}; + + +inline Vector4f operator * (const Vector4f& lhs, const Vector4f& rhs) { return Vector4f (lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z, lhs.w * rhs.w); } +inline Vector4f operator * (const Vector4f& inV, const float s) { return Vector4f (inV.x * s, inV.y * s, inV.z * s, inV.w * s); } 
+inline Vector4f operator + (const Vector4f& lhs, const Vector4f& rhs) { return Vector4f (lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z, lhs.w + rhs.w); } +inline Vector4f operator - (const Vector4f& lhs, const Vector4f& rhs) { return Vector4f (lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z, lhs.w - rhs.w); } +inline float Dot (const Vector4f& lhs, const Vector4f& rhs) { return lhs.x * rhs.x + lhs.y * rhs.y + lhs.z * rhs.z + lhs.w * rhs.w; } + +inline Vector4f Lerp (const Vector4f& from, const Vector4f& to, float t) { return to * t + from * (1.0F - t); } + +template<class TransferFunction> +inline void Vector4f::Transfer (TransferFunction& t) +{ + t.AddMetaFlag (kTransferUsingFlowMappingStyle); + t.Transfer (x, "x"); + t.Transfer (y, "y"); + t.Transfer (z, "z"); + t.Transfer (w, "w"); +} |