CGPROGRAM
#define SKINNING_COMPUTE_THREADCOUNT 64

#include "common.inc"

// Per-bone transform stored as three float4 values (48 bytes per element).
// Skinning() treats pose0/pose1/pose2 as the three rows of an affine
// transform: the x, y and z of a transformed position are the 4-component
// dot products of the input position with pose0, pose1 and pose2.
struct BoneMatrix
{
    float4 pose0; // row producing the transformed x
	float4 pose1; // row producing the transformed y
	float4 pose2; // row producing the transformed z
};

// Element type of _InPositionArray: one source vertex plus its skinning
// influences (six 16-byte fields, 96 bytes per element).
// The kernel in this file reads only vertex, boneIndices and boneWeight;
// normal, binoraml and tangent are carried but not used here.
// NOTE(review): "binoraml" looks like a misspelling of "binormal"; the name is
// kept as-is because other code may refer to it.
struct SkinVert
{
    float4 vertex;      // position passed to Skinning() as a full float4 (w included)
    float4 normal;
    float4 binoraml;
    float4 tangent;
    uint4  boneIndices; // four indices into the group-shared bone list
    float4 boneWeight;  // blend weight for each of the four indices
};

// Element type of _OutPositionArray: one skinned output vertex
// (five 16-byte fields, 80 bytes per element).
// The kernel in this file writes only the vertex field; the remaining
// fields are left untouched by it.
struct OutputVert
{
    float4 vertex;    // skinned position, written with w = 1.0
    float4 normal;
    float4 binoraml;
    float4 tangent;
    float4 texcoord0;
};

// Read-only bone transforms; each thread copies entry [SV_GroupThreadID.x]
// into LDS_BoneList at kernel start.
StructuredBuffer<BoneMatrix>  		_BoneMatrixArray;
// Read-only source vertices, indexed by SV_DispatchThreadID.x.
StructuredBuffer<SkinVert>    		_InPositionArray;
// Skinned results, indexed by SV_DispatchThreadID.x.
RWStructuredBuffer<OutputVert>    _OutPositionArray;
// Disabled alternative: raw byte-address output buffer.
//RWByteAddressBuffer 							_OutPositionArray;
// Per-thread-group cache of bone transforms, one slot per thread in the group.
// NOTE(review): only SKINNING_COMPUTE_THREADCOUNT (64) bones are cached and
// Skinning() indexes this array with unchecked bone indices, so any index
// >= 64 would address past the end of the array - confirm that callers never
// supply more than 64 bones.
groupshared BoneMatrix LDS_BoneList[SKINNING_COMPUTE_THREADCOUNT];

// Linear-blend skinning of a single position.
//
//   pos     - position to transform; all four components take part, so w
//             scales the fourth (translation) column of each bone transform.
//   boneInd - four indices into the group-shared LDS_BoneList.
//   weight  - blend weight applied to each of the four bone results.
//
// Returns the weighted sum of the four bone-transformed positions (xyz).
// LDS_BoneList must already be populated and synchronised by the caller.
float3 Skinning(float4 pos, uint4 boneInd, float4 weight)
{
    // Constant bottom row that completes each stored 3x4 transform to 4x4.
    const float4 affineRow = float4(0.0, 0.0, 0.0, 1.0);

    float3 blended = float3(0.0, 0.0, 0.0);

    [loop]
    for (uint influence = 0; influence < 4; ++influence)
    {
        const BoneMatrix bone = LDS_BoneList[boneInd[influence]];

        // Rows are pose0..pose2, so treating pos as a column vector yields
        // (dot(pose0, pos), dot(pose1, pos), dot(pose2, pos)) in xyz.
        const float4x4 boneTransform = float4x4(bone.pose0, bone.pose1, bone.pose2, affineRow);

        blended += mul(boneTransform, pos).xyz * weight[influence];
    }

    return blended;
}

#pragma compute main

// Skinning kernel: one thread per vertex, SKINNING_COMPUTE_THREADCOUNT threads
// per group.
//
//   DTid.x - index of the vertex read from _InPositionArray and written to
//            _OutPositionArray.
//   GTid.x - slot of the bone this thread copies into the group-shared cache.
//
// Only the position is skinned and stored; the other OutputVert fields are
// not written by this kernel.
[numthreads(SKINNING_COMPUTE_THREADCOUNT, 1, 1)]
void main(uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
{
    const uint vertexIndex = DTid.x;
    const uint laneIndex   = GTid.x;

    const SkinVert source = _InPositionArray[vertexIndex];

    // Each thread stages one bone; the barrier makes the whole cache visible
    // to every thread in the group before any of them starts skinning.
    LDS_BoneList[laneIndex] = _BoneMatrixArray[laneIndex];
    GroupMemoryBarrierWithGroupSync();

    const float3 skinned = Skinning(source.vertex, source.boneIndices, source.boneWeight);
    _OutPositionArray[vertexIndex].vertex = float4(skinned, 1.0);
}
ENDCG