Diffstat (limited to 'Runtime/Filters/Mesh/MeshSkinningSSE2.asm')
-rw-r--r-- | Runtime/Filters/Mesh/MeshSkinningSSE2.asm | 323
1 file changed, 323 insertions, 0 deletions
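
The new file implements linear-blend skinning with SSE2: each exported SkinSSE2_* entry point blends up to four 4x4 bone matrices per vertex and transforms the position, and optionally the normal and tangent. In the row-vector convention the code uses (rows in xmm4-xmm7), with weights w_i and bone indices b_i read from weightsAndIndices:

    M    = w0*M[b0] + w1*M[b1]  (+ w2*M[b2] + w3*M[b3] for the 4-bone variants)
    pos' = (x, y, z, 1) * M
    n'   = normalize((nx, ny, nz) * M3x3)

The single-bone variants skip the weighting and use one matrix directly. The tangent is transformed like the normal, and its W component (handedness) is carried through unchanged; the reciprocal lengths used for normalization come from sqrtps followed by rcpps, i.e. an approximate 1/len that trades a little precision for throughput.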
diff --git a/Runtime/Filters/Mesh/MeshSkinningSSE2.asm b/Runtime/Filters/Mesh/MeshSkinningSSE2.asm
new file mode 100644
index 0000000..395bf16
--- /dev/null
+++ b/Runtime/Filters/Mesh/MeshSkinningSSE2.asm
@@ -0,0 +1,323 @@
+;; MeshSkinningSSE2.asm
+;;
+;; Created by Kaspar Daugaard on 1/12/11.
+;; Copyright 2011 Unity Technologies. All rights reserved.
+
+bits 32
+
+section .text align=32
+
+%define normalOffset 12
+%define tangentOffset 24
+
+%macro SkinSSE2_Generic 3
+    ; %1 numBones
+    ; %2 hasNormals
+    ; %3 hasTangents
+    ; [ebp + 8]  inVertices
+    ; [ebp + 12] outVertices
+    ; [ebp + 16] numVertices
+    ; [ebp + 20] boneMatrices
+    ; [ebp + 24] weightsAndIndices
+    ; [ebp + 28] inputStride
+    ; [ebp + 32] outputStride
+
+    push ebp
+    mov ebp, esp
+    pushad
+
+    ; Local variables (32 byte aligned)
+    ; [esp + 0]  MaskW
+    ; [esp + 16] MaskVec3
+    ; [esp + 32] savedEcx
+    sub esp, 16*3
+    and esp, ~31
+
+    ; Create bitmasks on stack
+    sub eax, eax
+    mov [esp + 0], eax      ; MaskW = { 0, 0, 0, ~0 }
+    mov [esp + 4], eax
+    mov [esp + 8], eax
+    dec eax
+    mov [esp + 12], eax
+    mov [esp + 16], eax     ; MaskVec3 = { ~0, ~0, ~0, 0 }
+    mov [esp + 20], eax
+    mov [esp + 24], eax
+    inc eax
+    mov [esp + 28], eax
+
+    mov esi, [ebp + 8]      ; inVertices
+    mov edi, [ebp + 12]     ; outVertices
+    mov ecx, [ebp + 16]     ; numVertices
+    mov edx, [ebp + 24]     ; weightsAndIndices
+
+    ; Prefetch blend data and input vertices
+    prefetchnta [edx]
+    prefetchnta [esi]
+    prefetchnta [esi + 32]
+
+    align 32
+
+%%SkinSSE2_loop:
+    prefetchnta [esi + 64]
+
+    mov ebx, [ebp + 20]     ; boneMatrices
+    mov [esp + 32], ecx     ; savedEcx
+
+    ; Load first bone index
+%if %1 == 1
+    ; Single bone, no weight
+    mov eax, [edx]
+    shl eax, 6
+%else
+    ; Indices come after weights
+    mov eax, [edx + %1*4]
+    shl eax, 6
+    prefetchnta [ebx + eax]
+    prefetchnta [ebx + eax + 32]
+
+    ; Load second bone index
+    mov ecx, [edx + %1*4 + 4]
+    shl ecx, 6
+    prefetchnta [ebx + ecx]
+    prefetchnta [ebx + ecx + 32]
+
+    ; Load all weights into xmm0
+    movups xmm0, [edx]
+%endif
+
+    ; Load first matrix into xmm4-xmm7
+    movaps xmm4, [ebx + eax]
+    movaps xmm5, [ebx + eax + 16]
+    movaps xmm6, [ebx + eax + 32]
+    movaps xmm7, [ebx + eax + 48]
+
+%if %1 >= 2
+    ; Multiply first matrix by the first weight
+    movaps xmm1, xmm0
+    shufps xmm1, xmm1, 0x00
+    mulps xmm4, xmm1
+    mulps xmm5, xmm1
+    mulps xmm6, xmm1
+    mulps xmm7, xmm1
+%endif
+
+%if %1 >= 3
+    ; Load third bone index
+    mov eax, [edx + %1*4 + 8]
+    shl eax, 6
+    prefetchnta [ebx + eax]
+    prefetchnta [ebx + eax + 32]
+%endif
+
+%if %1 >= 2
+    ; Load first two rows of the second matrix into xmm2-xmm3
+    movaps xmm2, [ebx + ecx]
+    movaps xmm3, [ebx + ecx + 16]
+    ; Shuffle second weight to all elements of xmm1
+    movaps xmm1, xmm0
+    shufps xmm1, xmm1, 0x55
+    ; Multiply first two rows of the second matrix by the second weight
+    mulps xmm2, xmm1
+    mulps xmm3, xmm1
+    ; Add
+    addps xmm4, xmm2
+    addps xmm5, xmm3
+
+    ; Load last two rows of the second matrix into xmm2-xmm3
+    movaps xmm2, [ebx + ecx + 32]
+    movaps xmm3, [ebx + ecx + 48]
+    ; Multiply last two rows of the second matrix by the second weight
+    mulps xmm2, xmm1
+    mulps xmm3, xmm1
+    ; Add
+    addps xmm6, xmm2
+    addps xmm7, xmm3
+%endif
+
+%if %1 >= 4
+    ; Load fourth bone index
+    mov ecx, [edx + %1*4 + 12]
+    shl ecx, 6
+    prefetchnta [ebx + ecx]
+    prefetchnta [ebx + ecx + 32]
+%endif
+
+%if %1 >= 3
+    ; Load first two rows of the third matrix into xmm2-xmm3
+    movaps xmm2, [ebx + eax]
+    movaps xmm3, [ebx + eax + 16]
+    ; Shuffle third weight to all elements of xmm1
+    movaps xmm1, xmm0
+    shufps xmm1, xmm1, 0xaa
+    ; Multiply first two rows of the third matrix by the third weight
+    mulps xmm2, xmm1
+    mulps xmm3, xmm1
+    ; Add
+    addps xmm4, xmm2
+    addps xmm5, xmm3
+
+    ; Load last two rows of the third matrix into xmm2-xmm3
+    movaps xmm2, [ebx + eax + 32]
+    movaps xmm3, [ebx + eax + 48]
+    ; Multiply last two rows of the third matrix by the third weight
+    mulps xmm2, xmm1
+    mulps xmm3, xmm1
+    ; Add
+    addps xmm6, xmm2
+    addps xmm7, xmm3
+%endif
+
+%if %1 >= 4
+    ; Load first two rows of the fourth matrix into xmm2-xmm3
+    movaps xmm2, [ebx + ecx]
+    movaps xmm3, [ebx + ecx + 16]
+    ; Shuffle fourth weight to all elements of xmm1
+    movaps xmm1, xmm0
+    shufps xmm1, xmm1, 0xff
+    ; Multiply first two rows of the fourth matrix by the fourth weight
+    mulps xmm2, xmm1
+    mulps xmm3, xmm1
+    ; Add
+    addps xmm4, xmm2
+    addps xmm5, xmm3
+
+    ; Load last two rows of the fourth matrix into xmm2-xmm3
+    movaps xmm2, [ebx + ecx + 32]
+    movaps xmm3, [ebx + ecx + 48]
+    ; Multiply last two rows of the fourth matrix by the fourth weight
+    mulps xmm2, xmm1
+    mulps xmm3, xmm1
+    ; Add
+    addps xmm6, xmm2
+    addps xmm7, xmm3
+%endif
+
+    ; Blended matrix is in xmm4-xmm7
+    ; Transform position by 4x4 matrix in xmm4-xmm7
+    movups xmm0, [esi]
+    movaps xmm1, xmm0
+    movaps xmm2, xmm0
+    shufps xmm1, xmm1, 0x55
+    shufps xmm2, xmm2, 0xaa
+    shufps xmm0, xmm0, 0x00
+    mulps xmm1, xmm5
+    mulps xmm2, xmm6
+    mulps xmm0, xmm4
+    addps xmm1, xmm2
+    addps xmm0, xmm7
+    addps xmm0, xmm1
+    ; Store vertex position to the output vertex (xyz only, via masked store)
+    movaps xmm7, [esp + 16] ; MaskVec3
+    maskmovdqu xmm0, xmm7
+
+%if %2 ; Has normal
+    ; Transform vector by 3x3 matrix in xmm4-xmm6
+    movups xmm0, [esi + normalOffset]
+    movaps xmm1, xmm0
+    movaps xmm2, xmm0
+    shufps xmm1, xmm1, 0x55
+    shufps xmm2, xmm2, 0xaa
+    shufps xmm0, xmm0, 0x00
+    mulps xmm1, xmm5
+    mulps xmm2, xmm6
+    mulps xmm0, xmm4
+    addps xmm1, xmm2
+    addps xmm0, xmm1
+%endif
+
+%if %3 ; Has tangent
+    ; Transform vector by 3x3 matrix in xmm4-xmm6
+    movups xmm1, [esi + tangentOffset]
+    movaps xmm2, xmm1
+    movaps xmm3, xmm1
+    shufps xmm2, xmm2, 0x55
+    shufps xmm3, xmm3, 0xaa
+    mulps xmm2, xmm5
+    mulps xmm3, xmm6
+    movaps xmm6, xmm1       ; Save original tangent's W in xmm6
+    shufps xmm1, xmm1, 0x00
+    andps xmm6, [esp + 0]   ; MaskW
+    mulps xmm1, xmm4
+    addps xmm2, xmm3
+    addps xmm1, xmm2
+%endif
+
+%if %2 || %3 ; Has normal or tangent
+    ; Calculate lengths and normalize
+    movaps xmm2, xmm0
+    movaps xmm5, xmm1
+    mulps xmm2, xmm2
+    mulps xmm5, xmm5
+    movaps xmm3, xmm2
+    movaps xmm4, xmm2
+    shufps xmm3, xmm5, 0x55
+    shufps xmm4, xmm5, 0xaa
+    shufps xmm2, xmm5, 0x00
+    addps xmm3, xmm4
+    addps xmm2, xmm3
+    sqrtps xmm2, xmm2
+    rcpps xmm2, xmm2
+    movaps xmm3, xmm2
+    shufps xmm2, xmm2, 0x00
+    shufps xmm3, xmm3, 0xaa
+    mulps xmm0, xmm2
+    mulps xmm1, xmm3
+%endif
+
+%if %2 ; Write normal
+    add edi, normalOffset
+    maskmovdqu xmm0, xmm7   ; MaskVec3
+    sub edi, normalOffset
+%endif
+
+%if %3 ; Write tangent
+    andps xmm1, xmm7        ; MaskVec3
+    orps xmm1, xmm6         ; Restore original W
+    movups [edi + tangentOffset], xmm1
+%endif
+
+%if %1 == 1
+    ; Indices only
+    add edx, 4
+%else
+    ; Indices and weights
+    add edx, %1 * 8
+%endif
+
+    add esi, [ebp + 28]     ; inputStride
+    add edi, [ebp + 32]     ; outputStride
+    mov ecx, [esp + 32]     ; savedEcx
+    dec ecx
+    jnz %%SkinSSE2_loop
+
+    ; Remove local variables from stack
+    lea esp, [ebp-32]
+
+    popad
+    pop ebp
+    ret
+    align 16
+%endmacro
+
+
+global SkinSSE2_1Bone_Pos
+global SkinSSE2_2Bones_Pos
+global SkinSSE2_4Bones_Pos
+global SkinSSE2_1Bone_PosNormal
+global SkinSSE2_2Bones_PosNormal
+global SkinSSE2_4Bones_PosNormal
+global SkinSSE2_1Bone_PosNormalTan
+global SkinSSE2_2Bones_PosNormalTan
+global SkinSSE2_4Bones_PosNormalTan
+
+SkinSSE2_1Bone_Pos:           SkinSSE2_Generic 1, 0, 0
+SkinSSE2_2Bones_Pos:          SkinSSE2_Generic 2, 0, 0
+SkinSSE2_4Bones_Pos:          SkinSSE2_Generic 4, 0, 0
+SkinSSE2_1Bone_PosNormal:     SkinSSE2_Generic 1, 1, 0
+SkinSSE2_2Bones_PosNormal:    SkinSSE2_Generic 2, 1, 0
+SkinSSE2_4Bones_PosNormal:    SkinSSE2_Generic 4, 1, 0
+SkinSSE2_1Bone_PosNormalTan:  SkinSSE2_Generic 1, 1, 1
+SkinSSE2_2Bones_PosNormalTan: SkinSSE2_Generic 2, 1, 1
+SkinSSE2_4Bones_PosNormalTan: SkinSSE2_Generic 4, 1, 1
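
For reference, here is a minimal C-side sketch of what the exported entry points appear to expect, inferred from the [ebp + N] comments and the addressing in the macro above. The parameter names follow those comments; the types and the BoneInfluence4 struct name are hypothetical, not taken from the Unity headers.

    /* Hypothetical declaration, inferred from the macro. cdecl, 32-bit.
       Bone matrices are 64 bytes apart (index << 6), i.e. one 4x4 float
       matrix per bone, and are read with movaps, so boneMatrices must be
       16-byte aligned. Vertex streams are read with movups and written
       with maskmovdqu/movups, so they need no particular alignment. */
    void SkinSSE2_4Bones_PosNormalTan(
        const void  *inVertices,        /* pos at 0, normal at +12, tangent at +24  */
        void        *outVertices,       /* same layout; tangent W copied unchanged  */
        int          numVertices,       /* must be > 0 (dec/jnz loop)               */
        const float *boneMatrices,      /* 16 floats per bone, 16-byte aligned      */
        const void  *weightsAndIndices, /* per-vertex blend data, see below         */
        int          inputStride,       /* bytes from one input vertex to the next  */
        int          outputStride);     /* bytes from one output vertex to the next */

    /* Per-vertex blend data, as consumed by the loop:
       1 bone:  { int boneIndex; }                      -> edx advances by 4
       N bones: { float weight[N]; int boneIndex[N]; }  -> edx advances by N*8 */
    struct BoneInfluence4 { float weight[4]; int boneIndex[4]; };

The other eight entry points share this signature; only the number of influences read per vertex and the normal/tangent handling differ, as selected by the three macro parameters.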