diff options
author | chai <chaifix@163.com> | 2019-08-14 22:50:43 +0800 |
---|---|---|
committer | chai <chaifix@163.com> | 2019-08-14 22:50:43 +0800 |
commit | 15740faf9fe9fe4be08965098bbf2947e096aeeb (patch) | |
tree | a730ec236656cc8cab5b13f088adfaed6bb218fb /Runtime/Filters/Mesh/MeshSkinningGenericSIMD.h |
Diffstat (limited to 'Runtime/Filters/Mesh/MeshSkinningGenericSIMD.h')
-rw-r--r-- | Runtime/Filters/Mesh/MeshSkinningGenericSIMD.h | 212 |
1 files changed, 212 insertions, 0 deletions
// diff --git a/Runtime/Filters/Mesh/MeshSkinningGenericSIMD.h b/Runtime/Filters/Mesh/MeshSkinningGenericSIMD.h new file mode 100644 index 0000000..0b17b42 --- /dev/null +++ b/Runtime/Filters/Mesh/MeshSkinningGenericSIMD.h @@ -0,0 +1,212 @@
#if 0

/*
    Everything in this header is compiled out by the surrounding #if 0.

    NOTE(review): the original header note (mircea@INFO) stated that this path
    "doesn't do normalization". The body below does rescale every skinned
    normal/tangent by the reciprocal square root of its own squared length, so
    either that note is stale or the rescale is unintended - please confirm.
 */

#include "Runtime/Math/Simd/Matrix4x4Simd.h"

/*
    Skins info.vertexCount vertices from info.inVertices into info.outVertices.

    Per vertex, a blended pose matrix is assembled from info.cachedPose using the
    bone indices / weights stored in info.compactSkin. Then:
      - the position at byte offset 0 is transformed as a point and three floats
        are written to the output vertex;
      - if skinNormal, the vector at the normal offset is transformed, multiplied
        by rsqrt(dot(n, n)) and three floats are written back;
      - if skinTangent, the same happens at the tangent offset, and the single
        float located sizeof(Vector3f) bytes past the tangent start is copied
        from input to output unchanged.

    Template parameters:
      transformInstruction   - not referenced anywhere in this body.
      bonesPerVertexCount    - 1, 2 or 4. Selects how info.compactSkin is read
                               (plain int index, BoneInfluence2, BoneInfluence).
                               Any other value leaves the blended matrix
                               uninitialised.
      skinNormal/skinTangent - compile-time switches for the optional streams.
      copy8BytesAt24Offset   - forces the normal byte offset to 12 and takes part
                               in the debug assert only.
                               NOTE(review): no 8-byte copy at offset 24 is
                               performed in this function - confirm whether the
                               caller is expected to do it.

    NOTE(review): a zero-length normal/tangent feeds 0 into V4Rsqrt; what that
    produces depends on the SIMD backend, which is not visible from this file.
 */
template<TransformInstruction transformInstruction, int bonesPerVertexCount,
bool skinNormal, bool skinTangent, bool copy8BytesAt24Offset>
void SkinGenericSimd (SkinMeshInfo& info)
{
    DebugAssertIf( copy8BytesAt24Offset && (!info.skinNormals || info.normalOffset != 12) );

    // Input / output vertex streams, advanced by one stride per iteration.
    const UInt8* src = (const UInt8*)info.inVertices;
    UInt8* dst = (UInt8*)info.outVertices;
    const int srcStride = info.inStride;
    const int dstStride = info.outStride;
    const int vertexTotal = info.vertexCount;

    // The same skin buffer viewed three ways; only the view that matches
    // bonesPerVertexCount is ever dereferenced or advanced.
    const int* oneBone = reinterpret_cast<const int*> (info.compactSkin);
    const BoneInfluence2* twoBones = reinterpret_cast<const BoneInfluence2*> (info.compactSkin);
    const BoneInfluence* fourBones = reinterpret_cast<const BoneInfluence*> (info.compactSkin);

    const Matrix4x4f* poses = info.cachedPose;

    // Byte offsets turned into float indices (>> 2), the unit the load/store
    // helpers are called with elsewhere in this function (0x0/0x4/0x8/0xC on a
    // 16-float matrix).
    const int normalIndex = (copy8BytesAt24Offset ? 12 : info.normalOffset) >> 2;
    const int tangentIndex = info.tangentOffset >> 2;

    // Four 4-float slices of the blended pose matrix for the current vertex.
    Simd128 blend0, blend1, blend2, blend3;

    for (int i = 0; i < vertexTotal; ++i)
    {
        ALIGN_LOOP_OPTIMIZATION

        // Blend the bone matrices first and transform everything with the single
        // blended matrix afterwards. The original author measured this as faster
        // on XCode/Intel (11.3 -> 12.00 FPS in the skin4 bench) and on
        // MSVC/Windows (9.6 -> 12.4 FPS).
        switch (bonesPerVertexCount)
        {
        case 1:
        {
            // Single bone: the pose matrix is used as-is, no weighting.
            const float* m = poses[*oneBone].m_Data;

            Prefetch(m);

            blend0 = V4LoadUnaligned( m, 0x0 );
            blend1 = V4LoadUnaligned( m, 0x4 );
            blend2 = V4LoadUnaligned( m, 0x8 );
            blend3 = V4LoadUnaligned( m, 0xC );
            break;
        }
        case 2:
        {
            Prefetch(twoBones);

            Simd128 packedWeights = {twoBones->weight[0], twoBones->weight[1], 0, 0};

            const float* m0 = poses[twoBones->boneIndex[0]].m_Data;
            const float* m1 = poses[twoBones->boneIndex[1]].m_Data;

            Prefetch(m0);
            Prefetch(m1);

            Simd128 w0 = V4Splat(packedWeights, 0);
            Simd128 w1 = V4Splat(packedWeights, 1);

            // First bone seeds the sum ...
            blend0 = V4Mul(V4LoadUnaligned( m0, 0x0 ), w0);
            blend1 = V4Mul(V4LoadUnaligned( m0, 0x4 ), w0);
            blend2 = V4Mul(V4LoadUnaligned( m0, 0x8 ), w0);
            blend3 = V4Mul(V4LoadUnaligned( m0, 0xC ), w0);

            // ... second bone is accumulated on top.
            blend0 = V4MulAdd(V4LoadUnaligned( m1, 0x0 ), w1, blend0);
            blend1 = V4MulAdd(V4LoadUnaligned( m1, 0x4 ), w1, blend1);
            blend2 = V4MulAdd(V4LoadUnaligned( m1, 0x8 ), w1, blend2);
            blend3 = V4MulAdd(V4LoadUnaligned( m1, 0xC ), w1, blend3);
            break;
        }
        case 4:
        {
            Prefetch(fourBones);

            Simd128 packedWeights = {fourBones->weight[0], fourBones->weight[1], fourBones->weight[2], fourBones->weight[3]};

            const float* m0 = poses[fourBones->boneIndex[0]].m_Data;
            const float* m1 = poses[fourBones->boneIndex[1]].m_Data;
            const float* m2 = poses[fourBones->boneIndex[2]].m_Data;
            const float* m3 = poses[fourBones->boneIndex[3]].m_Data;

            Prefetch(m0);
            Prefetch(m1);
            Prefetch(m2);
            Prefetch(m3);

            Simd128 w0 = V4Splat(packedWeights, 0);
            Simd128 w1 = V4Splat(packedWeights, 1);
            Simd128 w2 = V4Splat(packedWeights, 2);
            Simd128 w3 = V4Splat(packedWeights, 3);

            // Accumulate bone by bone; per slice the operation order is
            // mul(bone0), then muladd(bone1), muladd(bone2), muladd(bone3).
            blend0 = V4Mul(V4LoadUnaligned( m0, 0x0 ), w0);
            blend1 = V4Mul(V4LoadUnaligned( m0, 0x4 ), w0);
            blend2 = V4Mul(V4LoadUnaligned( m0, 0x8 ), w0);
            blend3 = V4Mul(V4LoadUnaligned( m0, 0xC ), w0);

            blend0 = V4MulAdd(V4LoadUnaligned( m1, 0x0 ), w1, blend0);
            blend1 = V4MulAdd(V4LoadUnaligned( m1, 0x4 ), w1, blend1);
            blend2 = V4MulAdd(V4LoadUnaligned( m1, 0x8 ), w1, blend2);
            blend3 = V4MulAdd(V4LoadUnaligned( m1, 0xC ), w1, blend3);

            blend0 = V4MulAdd(V4LoadUnaligned( m2, 0x0 ), w2, blend0);
            blend1 = V4MulAdd(V4LoadUnaligned( m2, 0x4 ), w2, blend1);
            blend2 = V4MulAdd(V4LoadUnaligned( m2, 0x8 ), w2, blend2);
            blend3 = V4MulAdd(V4LoadUnaligned( m2, 0xC ), w2, blend3);

            blend0 = V4MulAdd(V4LoadUnaligned( m3, 0x0 ), w3, blend0);
            blend1 = V4MulAdd(V4LoadUnaligned( m3, 0x4 ), w3, blend1);
            blend2 = V4MulAdd(V4LoadUnaligned( m3, 0x8 ), w3, blend2);
            blend3 = V4MulAdd(V4LoadUnaligned( m3, 0xC ), w3, blend3);
            break;
        }
        }

        Prefetch(src);

        // Float views of the current input / output vertex.
        const float* srcFloats = reinterpret_cast<const float*>(src);
        float* dstFloats = reinterpret_cast<float*>(dst);

        // Position: loaded from the start of the vertex and transformed in place.
        Simd128 position = V4LoadUnaligned(srcFloats, 0);
        TransformPoint3NATIVE(blend0, blend1, blend2, blend3, position, position);

        // skinNormal / skinTangent are template constants, so the branches below
        // are resolved at compile time.
        if (skinNormal || skinTangent)
        {
            Simd128 normal, tangent;

            // Squared lengths default to zero for a stream that is not skinned.
            Simd128 normalLenSq = V4Zero();
            Simd128 tangentLenSq = V4Zero();

            if (skinNormal)
            {
                normal = V4LoadUnaligned(srcFloats, normalIndex);
                TransformVector3NATIVE(blend0, blend1, blend2, blend3, normal, normal);
                normalLenSq = V3Dot(normal, normal);
            }

            if (skinTangent)
            {
                tangent = V4LoadUnaligned(srcFloats, tangentIndex);
                TransformVector3NATIVE(blend0, blend1, blend2, blend3, tangent, tangent);
                tangentLenSq = V3Dot(tangent, tangent);
            }

            // Both reciprocal square roots are taken with one V4Rsqrt; lane 0 is
            // then used for the normal and lane 1 for the tangent.
            Simd128 invLen = V4Rsqrt(V4MergeH(normalLenSq, tangentLenSq));

            if (skinNormal)
            {
                normal = V4Mul(normal, V4Splat(invLen, 0));
                V3StoreUnaligned(normal, dstFloats, normalIndex);
            }

            if (skinTangent)
            {
                tangent = V4Mul(tangent, V4Splat(invLen, 1));
                V3StoreUnaligned(tangent, dstFloats, tangentIndex);
            }
        }

        V3StoreUnaligned(position, dstFloats, 0);

        if (skinTangent)
        {
            // Only three tangent floats were stored above; the float that sits
            // sizeof(Vector3f) bytes past the tangent start is passed through
            // from the input untouched.
            const UInt8* srcTangentW = src + (tangentIndex << 2) + sizeof(Vector3f);
            UInt8* dstTangentW = dst + (tangentIndex << 2) + sizeof(Vector3f);
            *reinterpret_cast<float*>(dstTangentW) = *reinterpret_cast<const float*>(srcTangentW);
        }

        dst += dstStride;
        src += srcStride;

        // Step whichever skin view is in use to the next vertex's influences.
        switch (bonesPerVertexCount)
        {
        case 1: oneBone++; break;
        case 2: twoBones++; break;
        case 4: fourBones++; break;
        }
    }
}
#endif