o3de/Gems/Atom/Feature/Common/Assets/Shaders/SkinnedMesh/LinearSkinningCS.azsl

/*
 * Copyright (c) Contributors to the Open 3D Engine Project.
 * For complete copyright and license terms please see the LICENSE at the root of this distribution.
 *
 * SPDX-License-Identifier: Apache-2.0 OR MIT
 *
 */
#include "LinearSkinningPassSRG.azsli"
#include <Atom/Features/MorphTargets/MorphTargetCompression.azsli>
#include <Atom/Features/MatrixUtility.azsli>
#include <Atom/RPI/Math.azsli>


// Shader options read by MainCS.

// Selects which skinning routine MainCS calls: SkinVertexLinear or SkinVertexDualQuaternion.
option enum class SkinningMethod { LinearSkinning, DualQuaternion } o_skinningMethod = SkinningMethod::LinearSkinning;
// When true, MainCS applies morph target deltas to the position, normal, tangent and bitangent before skinning.
option bool o_applyMorphTargets = false;
// When true, MainCS applies the morph target color delta to the source color and writes the result to the target color stream.
option bool o_applyColorMorphTargets = false;

//! Reads three consecutive floats from a float buffer and returns them as a float3.
//! Element 'index' occupies the buffer slots [index * 3, index * 3 + 2].
float3 ReadFloat3FromFloatBuffer(Buffer<float> buffer, uint index)
{
    const uint firstComponent = index * 3;
    return float3(
        buffer[firstComponent],
        buffer[firstComponent + 1],
        buffer[firstComponent + 2]);
}

//! Apply a morph target delta with three components.
//! Reads the three integer-encoded delta components stored for the vertex in PassSrg::m_skinnedMeshOutputStream,
//! resets those three stream elements to zero, then decodes the delta and adds it to modifiedValue.
//! @param streamOffset   Element index in the output stream where this delta stream begins.
//! @param vertexIndex    Index of the vertex; each vertex occupies three consecutive elements.
//! @param modifiedValue  Value the decoded delta is added to.
void ApplyMorphTargetDelta(uint streamOffset, uint vertexIndex, inout float3 modifiedValue)
{
    // Get the start of the current delta
    uint offset = streamOffset + vertexIndex * 3;

    // Read in the encoded deltas.
    // asint() reinterprets the stored bits as signed integers, so they are kept in a signed vector:
    // a negative delta must remain negative when it is handed to the decoder.
    int3 encodedInts;
    encodedInts.x = asint(PassSrg::m_skinnedMeshOutputStream[offset]);
    encodedInts.y = asint(PassSrg::m_skinnedMeshOutputStream[offset + 1]);
    encodedInts.z = asint(PassSrg::m_skinnedMeshOutputStream[offset + 2]);

    // Since we're done reading, re-set the accumulation to 0 for the next frame
    PassSrg::m_skinnedMeshOutputStream[offset] = asfloat(0);
    PassSrg::m_skinnedMeshOutputStream[offset + 1] = asfloat(0);
    PassSrg::m_skinnedMeshOutputStream[offset + 2] = asfloat(0);

    // Now decode and apply the delta
    float3 decodedFloats = DecodeIntsToFloats(encodedInts, InstanceSrg::m_morphTargetDeltaInverseIntegerEncoding);
    modifiedValue += decodedFloats;
}

//! Apply a morph target delta with four components.
//! Reads the four integer-encoded delta components stored for the vertex in PassSrg::m_skinnedMeshOutputStream,
//! resets those four stream elements to zero, then decodes the delta and adds it to modifiedValue.
//! @param streamOffset   Element index in the output stream where this delta stream begins.
//! @param vertexIndex    Index of the vertex; each vertex occupies four consecutive elements.
//! @param modifiedValue  Value the decoded delta is added to.
void ApplyMorphTargetDelta(uint streamOffset, uint vertexIndex, inout float4 modifiedValue)
{
    // Get the start of the current delta
    uint offset = streamOffset + vertexIndex * 4;

    // Read in the encoded deltas.
    // asint() reinterprets the stored bits as signed integers, so they are kept in a signed vector:
    // a negative delta must remain negative when it is handed to the decoder.
    int4 encodedInts;
    encodedInts.x = asint(PassSrg::m_skinnedMeshOutputStream[offset]);
    encodedInts.y = asint(PassSrg::m_skinnedMeshOutputStream[offset + 1]);
    encodedInts.z = asint(PassSrg::m_skinnedMeshOutputStream[offset + 2]);
    encodedInts.w = asint(PassSrg::m_skinnedMeshOutputStream[offset + 3]);

    // Since we're done reading, re-set the accumulation to 0 for the next frame
    PassSrg::m_skinnedMeshOutputStream[offset] = asfloat(0);
    PassSrg::m_skinnedMeshOutputStream[offset + 1] = asfloat(0);
    PassSrg::m_skinnedMeshOutputStream[offset + 2] = asfloat(0);
    PassSrg::m_skinnedMeshOutputStream[offset + 3] = asfloat(0);

    // Now decode and apply the delta
    float4 decodedFloats = DecodeIntsToFloats(encodedInts, InstanceSrg::m_morphTargetDeltaInverseIntegerEncoding);
    modifiedValue += decodedFloats;
}


//! Transforms a vertex normal, tangent and bitangent by the given 3x3 matrices and normalizes the results.
//! Structured like ConstructTBN from TangentSpace.azsli, but takes a float3x3 for the localToWorld matrix.
//! The w component of the tangent is ignored: the bitangent is not flipped here, so the input bitangent
//! is assumed to be oriented correctly already.
//! @param localToWorld                  Applied to the tangent and the bitangent.
//! @param localToWorldInverseTranspose  Applied to the normal.
void ConstructTBN(float3 vertexNormal, float4 vertexTangent, float3 vertexBitangent, float3x3 localToWorld, float3x3 localToWorldInverseTranspose, out float3 normalWS, out float3 tangentWS, out float3 bitangentWS)
{
    // The normal goes through the inverse transpose, the other two vectors through the plain matrix
    const float3 transformedNormal = mul(localToWorldInverseTranspose, vertexNormal);
    const float3 transformedTangent = mul(localToWorld, vertexTangent.xyz);
    const float3 transformedBitangent = mul(localToWorld, vertexBitangent);

    normalWS = normalize(transformedNormal);
    tangentWS = normalize(transformedTangent);
    bitangentWS = normalize(transformedBitangent);
}

//! Transforms the normal, tangent and bitangent in place with the given 3x3 skinning matrix.
//! Only tangent.xyz is rewritten; tangent.w keeps its incoming value.
void SkinTBN(float3x3 skinToWorldMatrix, inout float3 normal, inout float4 tangent, inout float3 bitangent)
{
    float3 skinnedNormal;
    float3 skinnedTangent;
    float3 skinnedBitangent;

    // For non-uniform scaling the normal needs the inverse scale, hence the inverse transpose matrix
    ConstructTBN(
        normal, tangent, bitangent,
        skinToWorldMatrix, InverseTransposeMatrixFast(skinToWorldMatrix),
        skinnedNormal, skinnedTangent, skinnedBitangent);

    normal = skinnedNormal;
    tangent.xyz = skinnedTangent;
    bitangent = skinnedBitangent;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Skinning support
//! Linear blend skinning of one vertex.
//! Sums the four bone transforms selected by 'indices', each scaled by the matching entry of 'weights',
//! then transforms the position with the blended matrix and the normal/tangent/bitangent with its 3x3 part.
void SkinVertexLinear(int4 indices, float4 weights, inout float3 position, inout float3 normal, inout float4 tangent, inout float3 bitangent)
{
    float3x4 blendedTransform = (float3x4)0;

    [unroll]
    for(uint influence = 0; influence < 4; ++influence)
    {
        blendedTransform += InstanceSrg::m_boneTransformsLinear[ indices[influence] ] * weights[influence];
    }

    // w = 1 so the translation column of the blended matrix is applied to the position
    const float4 homogeneousPosition = float4(position, 1.0);
    position = mul(blendedTransform, homogeneousPosition);

    // Only the upper-left 3x3 (rotation and scale) is needed when computing the TBN
    const float3x3 blendedRotationScale = (float3x3)blendedTransform;
    SkinTBN(blendedRotationScale, normal, tangent, bitangent);
}

//! Accumulates rhs * weight into lhs.
//! When the first rows of lhs and rhs have a negative dot product, rhs is negated before it is added.
void AddWeightedDualQuaternion(inout float2x4 lhs, float2x4 rhs, float weight)
{
    float hemisphereSign = 1.0f;
    if (dot(lhs[0], rhs[0]) < 0.0f)
    {
        hemisphereSign = -1.0f;
    }

    lhs += rhs * weight * hemisphereSign;
}

//! Scales both rows of the dual quaternion by the inverse length of its first row.
//! NOTE(review): there is no guard for a zero-length first row - presumably callers never pass one; confirm.
void NormalizeDualQuaternion(inout float2x4 dualQuaternion)
{
    const float4 firstRow = dualQuaternion[0];
    const float firstRowLengthSquared = dot(firstRow, firstRow);
    dualQuaternion *= rsqrt(firstRowLengthSquared);
}

//! Dual quaternion skinning of one vertex.
//! Blends the four bone dual quaternions selected by 'indices' using 'weights', normalizes the blend,
//! then rotates and translates the position and rotates (and re-normalizes) the normal, tangent.xyz and bitangent.
//! tangent.w is left untouched.
void SkinVertexDualQuaternion(int4 indices, float4 weights, inout float3 position, inout float3 normal, inout float4 tangent, inout float3 bitangent)
{
    float2x4 blendedDualQuaternion = (float2x4)0;

    [unroll]
    for(uint influence = 0; influence < 4; ++influence)
    {
        AddWeightedDualQuaternion(blendedDualQuaternion, InstanceSrg::m_boneTransformsDualQuaternion[ indices[influence] ], weights[influence]);
    }
    NormalizeDualQuaternion(blendedDualQuaternion);

    // Row 0 is used as the rotation quaternion, row 1 as the quaternion the translation is extracted from
    const float4 rotation = blendedDualQuaternion[0];
    const float4 translation = blendedDualQuaternion[1];

    // Translation vector recovered from the two rows
    const float3 translationOffset =
        (rotation.w * translation.xyz - translation.w * rotation.xyz + cross(rotation.xyz, translation.xyz)) * 2;

    position.xyz = RotateVectorByQuaternion(position.xyz, rotation) + translationOffset;

    // Directions are only rotated, never translated
    normal = normalize(RotateVectorByQuaternion(normal, rotation));
    tangent.xyz = normalize(RotateVectorByQuaternion(tangent.xyz, rotation));
    bitangent = normalize(RotateVectorByQuaternion(bitangent, rotation));
}

//! Skinning compute shader entry point. Each thread processes one vertex:
//!  1. copies the position written last frame into the history slot of the output stream,
//!  2. returns early if all four blend weights are zero,
//!  3. reads the source position/normal/tangent/bitangent and the blend indices,
//!  4. optionally applies morph target deltas (o_applyMorphTargets / o_applyColorMorphTargets),
//!  5. skins the vertex with the method selected by o_skinningMethod,
//!  6. writes position, normal, tangent and bitangent to PassSrg::m_skinnedMeshOutputStream.
[numthreads(256,1,1)]
void MainCS(uint3 thread_id: SV_DispatchThreadID)
{
    // Each thread is responsible for one vertex
    // The total number of threads in a per-ActorInstance dispatch item matches the total number of vertices in the skinned mesh

    // The thread id for each dimension is limited to uint16_t max, so to support more than 65535 vertices we get the real index from both the x and y dimensions
    const uint i = (thread_id.x) + InstanceSrg::m_totalNumberOfThreadsX * (thread_id.y);
    if(i < InstanceSrg::m_numVertices)
    {
        // Moving current vertex position updated last frame to a predefined location to maintain a vertex history between two frames.
        // The history slot of vertex i sits m_numVertices float3 elements after its current-position slot.
        // This is done exactly once, before the early-out below, so it also happens for vertices that skip skinning.
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetPositions + (InstanceSrg::m_numVertices + i) * 3] = PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetPositions + i * 3];
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetPositions + (InstanceSrg::m_numVertices + i) * 3 + 1] = PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetPositions + i * 3 + 1];
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetPositions + (InstanceSrg::m_numVertices + i) * 3 + 2] = PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetPositions + i * 3 + 2];

        float4 blendWeights;
        blendWeights.x = InstanceSrg::m_sourceBlendWeights[i * 4];
        blendWeights.y = InstanceSrg::m_sourceBlendWeights[i * 4 + 1];
        blendWeights.z = InstanceSrg::m_sourceBlendWeights[i * 4 + 2];
        blendWeights.w = InstanceSrg::m_sourceBlendWeights[i * 4 + 3];

        // [TODO ATOM-15288]
        // Temporary workaround. When all the blend weights of a vertex are zero it means its data is set by the CPU directly
        // and skinning must be skipped to not overwrite it (e.g. cloth simulation).
        if(!any(blendWeights))
        {
            return;
        }

        float3 position = ReadFloat3FromFloatBuffer(InstanceSrg::m_sourcePositions, i);
        float3 normal = ReadFloat3FromFloatBuffer(InstanceSrg::m_sourceNormals, i);
        float4 tangent = InstanceSrg::m_sourceTangents[i];
        float3 bitangent = ReadFloat3FromFloatBuffer(InstanceSrg::m_sourceBiTangents, i);

        // Four indices, 16-bits each, stored in 2 32-bit uints (8 bytes per vertex)
        uint2 rawIndices = InstanceSrg::m_sourceBlendIndices.Load2(i * 8);

        uint4 blendIndices;
        // Although the first index in each 32-bit pair is in the most significant bits in the cpu buffer,
        // the indices get swapped within the 32-bit uint when they are loaded so we treat them
        // as if the first index is in the least significant bits
        // NOTE(review): the code below takes the first index of each pair from the HIGH 16 bits, which does not
        // match the wording above. Behavior is intentionally left unchanged - confirm against the CPU-side packing.
        blendIndices.x = rawIndices.x >> 16;
        blendIndices.y = rawIndices.x & 0x0000FFFF;
        blendIndices.z = rawIndices.y >> 16;
        blendIndices.w = rawIndices.y & 0x0000FFFF;

        // Morph target deltas are applied to the un-skinned (source) values, before skinning
        if(o_applyMorphTargets)
        {
            ApplyMorphTargetDelta(InstanceSrg::m_morphTargetPositionDeltaOffset, i, position);
            ApplyMorphTargetDelta(InstanceSrg::m_morphTargetNormalDeltaOffset, i, normal);
            ApplyMorphTargetDelta(InstanceSrg::m_morphTargetTangentDeltaOffset, i, tangent.xyz);
            ApplyMorphTargetDelta(InstanceSrg::m_morphTargetBitangentDeltaOffset, i, bitangent);
        }

        // Colors are not skinned: the morphed color goes straight to the target color stream
        if (o_applyColorMorphTargets)
        {
            float4 color = InstanceSrg::m_sourceColors[i];
            ApplyMorphTargetDelta(InstanceSrg::m_morphTargetColorDeltaOffset, i, color);
            PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetColors + i * 4] = color.r;
            PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetColors + i * 4 + 1] = color.g;
            PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetColors + i * 4 + 2] = color.b;
            PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetColors + i * 4 + 3] = color.a;
        }

        switch(o_skinningMethod)
        {
        case SkinningMethod::LinearSkinning:
            SkinVertexLinear(blendIndices, blendWeights, position, normal, tangent, bitangent);
            break;
        case SkinningMethod::DualQuaternion:
            SkinVertexDualQuaternion(blendIndices, blendWeights, position, normal, tangent, bitangent);
            break;
        }

        // Write the skinned vertex. Positions, normals and bitangents use 3 floats per vertex, tangents use 4.
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetPositions + i * 3] = position.x;
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetPositions + i * 3 + 1] = position.y;
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetPositions + i * 3 + 2] = position.z;

        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetNormals + i * 3] = normal.x;
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetNormals + i * 3 + 1] = normal.y;
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetNormals + i * 3 + 2] = normal.z;

        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetTangents + i * 4] = tangent.x;
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetTangents + i * 4 + 1] = tangent.y;
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetTangents + i * 4 + 2] = tangent.z;
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetTangents + i * 4 + 3] = tangent.w;

        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetBiTangents + i * 3] = bitangent.x;
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetBiTangents + i * 3 + 1] = bitangent.y;
        PassSrg::m_skinnedMeshOutputStream[InstanceSrg::m_targetBiTangents + i * 3 + 2] = bitangent.z;

    }
}