CGPROGRAM
#pragma compute CSMain
// Thread-group edge length; the kernel runs NUM_THREADS^3 threads per group.
#define NUM_THREADS 8

// Grid layout as used by CSMain: x, y, z are compared against the thread id
// as the per-axis cell counts; x is also the index stride of one step in y,
// and w is the index stride of one step in z.
float4 _Size;
// Not referenced by the kernel visible in this file.
float _TimeStep;
// Scale applied to the obstacle velocity term before it is added to the result.
float _RFactor;

		
// One scalar per cell; only read by CSMain.
RWStructuredBuffer<float> _Pressure;
// One float4 per cell; x > 0 is treated as "cell is an obstacle", and y, z, w
// are read as the obstacle's velocity contribution on the x, y, z axes.
RWStructuredBuffer<float4> _Obstacles;
// One float4 per cell; xyz is read and rewritten in place, w is written as 0.
RWStructuredBuffer<float4> _Velocity;

// Updates the velocity of one grid cell in place by subtracting the pressure
// difference between its opposing neighbours on each axis.
//
// id: dispatch-wide thread id, interpreted as the (x, y, z) cell coordinate.
//
// Behaviour:
//  - Threads whose id lies outside _Size.xyz do nothing.
//  - A cell that is itself an obstacle (_Obstacles[idx].x > 0) gets zero velocity.
//  - Neighbour lookups are clamped at the grid faces (the cell is its own
//    neighbour there).
//  - If a neighbour is an obstacle, its pressure is replaced by the centre
//    pressure, the axis is masked out of the fluid velocity, and the obstacle's
//    velocity component (scaled by _RFactor) is used for that axis instead.
[numthreads(NUM_THREADS,NUM_THREADS,NUM_THREADS)]
void CSMain (int3 id : SV_DispatchThreadID)
{
	// Work with integer extents/strides so the linear index stays exact;
	// float arithmetic stops representing every integer above 2^24.
	int3 dims = (int3)_Size.xyz;
	int strideY = (int)_Size.x;
	int strideZ = (int)_Size.w;

	// A thread past the grid edge would otherwise alias a different, valid
	// cell and race with the thread that owns it.
	if(any(id >= dims))
	{
		return;
	}

	int idx = id.x + id.y * strideY + id.z * strideZ;

	// Solid cells carry no fluid velocity.
	if(_Obstacles[idx].x > 0.0)
	{
		_Velocity[idx] = float4(0,0,0,0);
		return;
	}

	// Neighbour indices, clamped to the cell itself on the grid faces.
	int idxL = (id.x > 0)          ? idx - 1       : idx;
	int idxR = (id.x < dims.x - 1) ? idx + 1       : idx;

	int idxB = (id.y > 0)          ? idx - strideY : idx;
	int idxT = (id.y < dims.y - 1) ? idx + strideY : idx;

	int idxD = (id.z > 0)          ? idx - strideZ : idx;
	int idxU = (id.z < dims.z - 1) ? idx + strideZ : idx;

	// Pressure on the positive (+1) and negative (-1) side of each axis.
	float3 PressAddOne = float3(_Pressure[idxR], _Pressure[idxT], _Pressure[idxU]);
	float3 PressSubOne = float3(_Pressure[idxL], _Pressure[idxB], _Pressure[idxD]);

	float C = _Pressure[idx];

	// Find neighboring obstacles:
	float4 bL = _Obstacles[idxL];
	float4 bR = _Obstacles[idxR];
	float4 bB = _Obstacles[idxB];
	float4 bT = _Obstacles[idxT];
	float4 bD = _Obstacles[idxD];
	float4 bU = _Obstacles[idxU];

	float3 mask = float3(1,1,1);
	float3 obstV = float3(0.0, 0.0, 0.0);

	// Order matters within an axis: when both neighbours are obstacles the
	// positive-side one is applied last and therefore supplies obstV.
	if(bL.x > 0.0) { PressSubOne.x = C; obstV.x = bL.y; mask.x = 0; }
	if(bR.x > 0.0) { PressAddOne.x = C; obstV.x = bR.y; mask.x = 0; }

	if(bB.x > 0.0) { PressSubOne.y = C; obstV.y = bB.z; mask.y = 0; }
	if(bT.x > 0.0) { PressAddOne.y = C; obstV.y = bT.z; mask.y = 0; }

	if(bD.x > 0.0) { PressSubOne.z = C; obstV.z = bD.w; mask.z = 0; }
	if(bU.x > 0.0) { PressAddOne.z = C; obstV.z = bU.w; mask.z = 0; }

	// Subtract the per-axis pressure difference, then swap in the obstacle
	// velocity on any axis that touches a solid neighbour.
	float3 v = _Velocity[idx].xyz - PressAddOne + PressSubOne;
	float3 ret = v * mask + obstV * _RFactor;
	_Velocity[idx] =  float4(ret, 0.0);
}

ENDCG

