CGPROGRAM
#pragma compute CSMain
// Edge length of one thread group: CSMain is declared with
// NUM_THREADS x NUM_THREADS x NUM_THREADS threads per group.
#define NUM_THREADS 8

// Grid description as used by CSMain:
//   xyz - compared against the thread id to detect the last cell on each axis,
//   x   - also the flat-index step for one move along y,
//   w   - the flat-index step for one move along z
//         (presumably x * y -- confirm in the host code that sets _Size).
float4 _Size;

// Output: receives -divergence / 6 for every cell.
RWStructuredBuffer<float> _WriteP;
// Output: receives the divergence computed for every cell.
RWStructuredBuffer<float> _Write;
// Input (only read by CSMain): per-cell velocity; the kernel reads x, y and z.
RWStructuredBuffer<float4> _Velocity;
// Input (only read by CSMain): per-cell obstacle data. x > 0 marks the cell as
// solid; y, z and w are then used in place of the velocity x, y and z.
RWStructuredBuffer<float4> _Obstacles;

// Computes the velocity divergence of every grid cell and a pressure value
// derived from it.
//
// For the cell addressed by the dispatch thread id, the six axis neighbours
// are sampled (clamped to the cell itself at the grid boundary), velocities
// of solid neighbours are replaced by the obstacle velocity, and the
// divergence is formed with central differences (the 0.5 factor corresponds
// to a cell spacing of 1). The result is written to _Write, and
// -divergence / 6 is written to _WriteP (presumably the first Jacobi step of
// a pressure solve starting from zero -- confirm against the solver kernel).
//
// id: dispatch thread id, interpreted as the (x, y, z) cell coordinate.
[numthreads(NUM_THREADS,NUM_THREADS,NUM_THREADS)]
void CSMain (int3 id : SV_DispatchThreadID)
{
	// Work in integers: float index math is only exact up to 2^24 and
	// forces an int<->float round trip for every neighbour index.
	int3 gridSize = (int3)_Size.xyz;
	int strideY = gridSize.x;
	int strideZ = (int)_Size.w;

	// Threads beyond the grid (dispatch rounded up to whole groups) would
	// alias the flat index of another cell and overwrite its result.
	if (any(id >= gridSize)) return;

	int idx = id.x + id.y * strideY + id.z * strideZ;

	// Neighbour indices; on a boundary the neighbour collapses onto the
	// cell itself so no read ever leaves the grid.
	int idxL = (id.x > 0) ? idx - 1 : idx;
	int idxR = (id.x < gridSize.x - 1) ? idx + 1 : idx;

	int idxB = (id.y > 0) ? idx - strideY : idx;
	int idxT = (id.y < gridSize.y - 1) ? idx + strideY : idx;

	int idxD = (id.z > 0) ? idx - strideZ : idx;
	int idxU = (id.z < gridSize.z - 1) ? idx + strideZ : idx;

	// Only the velocity component along each neighbour's own axis is needed.
	float3 VelAddOne = float3(_Velocity[idxR].x, _Velocity[idxT].y, _Velocity[idxU].z);
	float3 VelSubOne = float3(_Velocity[idxL].x, _Velocity[idxB].y, _Velocity[idxD].z);

	// Find neighboring obstacles: x is the solid flag, the second component
	// is the obstacle velocity along the axis that neighbour lies on.
	float2 bL = _Obstacles[idxL].xy;
	float2 bR = _Obstacles[idxR].xy;
	float2 bB = _Obstacles[idxB].xz;
	float2 bT = _Obstacles[idxT].xz;
	float2 bD = _Obstacles[idxD].xw;
	float2 bU = _Obstacles[idxU].xw;

	// Use obstacle velocities for solid cells:
	if(bL.x > 0.0) VelSubOne.x = bL.y;
	if(bR.x > 0.0) VelAddOne.x = bR.y;

	if(bB.x > 0.0) VelSubOne.y = bB.y;
	if(bT.x > 0.0) VelAddOne.y = bT.y;

	if(bD.x > 0.0) VelSubOne.z = bD.y;
	if(bU.x > 0.0) VelAddOne.z = bU.y;

	// Central difference summed over the three axes.
	float divergence =  0.5f * dot(VelAddOne - VelSubOne, float3(1,1,1));

	_Write[idx] = divergence;

	_WriteP[idx] = -divergence / 6.0f;
}

ENDCG

