CGPROGRAM
// Thread-group edge length; kernels in this file use NUM_THREADS^3 threads per group.
#define NUM_THREADS 8

// Grid description. xyz is used as the grid extent in cells (sample clamping),
// x doubles as the row stride and w as the slice stride when flattening a 3D
// cell coordinate: index = x + y*_Size.x + z*_Size.w.
// NOTE(review): w is presumably _Size.x * _Size.y - confirm against the host code.
float4 _Size;
// _DeltaTime * _Forward scales the velocity used to back-trace the sample position;
// _Dissipate is multiplied into the advected result before it is written.
float _DeltaTime, _Dissipate, _Forward;

// Per-cell obstacle data; a cell whose .x component is > 0 is written as zero.
RWStructuredBuffer<float4> _Obstacles;

// Destination of the advection step (one float4 per cell, w written as 0).
RWStructuredBuffer<float4> _Write3f;
// Source field; its xyz is both the quantity being advected and the advecting velocity.
RWStructuredBuffer<float4> _Read3f;


// Interpolates the xyz channels of _Read3f at a fractional grid position,
// blending linearly along x, then z, then y between the 8 surrounding cells.
//
// uv   - sample position in cell units.
// size - grid extent in cells; uv is clamped to [0, size - 1] on every axis so
//        that positions outside the grid sample the nearest boundary cell.
//
// Returns the interpolated xyz value.
//
// _Read3f is addressed as x + y*_Size.x + z*_Size.w; the strides are taken
// from _Size because the slice stride is not derivable from the parameters here.
float3 SampleBilinear(float3 uv, float3 size)
{
	// Clamp before splitting into integer/fractional parts: an unclamped
	// position could yield negative or too-large base indices (reading a
	// wrapped or out-of-range cell), and frac() of a negative value does not
	// match the truncating int conversion.
	const float3 zero = float3(0.0f, 0.0f, 0.0f);
	const float3 maxCell = max(size - 1.0f, zero);
	uv = clamp(uv, zero, maxCell);

	// uv is non-negative here, so truncation is equivalent to floor.
	const int3 lo = int3(uv);
	const int3 hi = min(lo + 1, int3(maxCell));
	const float3 t = uv - float3(lo);

	const int rowStride = _Size.x;
	const int sliceStride = _Size.w;

	// Offsets of the four x-aligned edges of the surrounding cell cube.
	const int yLoZLo = lo.y * rowStride + lo.z * sliceStride;
	const int yLoZHi = lo.y * rowStride + hi.z * sliceStride;
	const int yHiZLo = hi.y * rowStride + lo.z * sliceStride;
	const int yHiZHi = hi.y * rowStride + hi.z * sliceStride;

	// Blend along x on each edge.
	const float3 x0 = lerp(_Read3f[lo.x + yLoZLo].xyz, _Read3f[hi.x + yLoZLo].xyz, t.x);
	const float3 x1 = lerp(_Read3f[lo.x + yLoZHi].xyz, _Read3f[hi.x + yLoZHi].xyz, t.x);
	const float3 x2 = lerp(_Read3f[lo.x + yHiZLo].xyz, _Read3f[hi.x + yHiZLo].xyz, t.x);
	const float3 x3 = lerp(_Read3f[lo.x + yHiZHi].xyz, _Read3f[hi.x + yHiZHi].xyz, t.x);

	// Blend along z, then along y.
	const float3 z0 = lerp(x0, x1, t.z);
	const float3 z1 = lerp(x2, x3, t.z);

	return lerp(z0, z1, t.y);
}

// Advection kernel: one thread per grid cell.
// For each non-obstacle cell the position is traced backwards along the value
// stored in _Read3f (scaled by _DeltaTime * _Forward), the field is sampled
// there with SampleBilinear, multiplied by _Dissipate and stored in _Write3f.
// Cells flagged in _Obstacles (.x > 0) are written as zero.
// NOTE(review): Unity compute shaders usually declare kernels with
// `#pragma kernel`; confirm the directive below is what the build expects.
#pragma compute AdvectVelocity
[numthreads(NUM_THREADS,NUM_THREADS,NUM_THREADS)]
void AdvectVelocity(int3 id : SV_DispatchThreadID)
{
	// Threads past the grid edge (grid extent not a multiple of NUM_THREADS)
	// must not run: their flattened index would alias a valid cell.
	const int3 gridSize = int3(_Size.xyz);
	if (any(id >= gridSize))
	{
		return;
	}

	// Integer flattening; a float dot() stops being exact above 2^24 cells.
	const int idx = id.x + id.y * int(_Size.x) + id.z * int(_Size.w);

	if (_Obstacles[idx].x > 0.0)
	{
		_Write3f[idx] = float4(0, 0, 0, 0);
		return;
	}

	// Back-trace from this cell along the stored vector.
	const float3 uv = float3(id) - _DeltaTime * _Forward * _Read3f[idx].xyz;

	const float3 ret = SampleBilinear(uv, _Size.xyz) * _Dissipate;
	_Write3f[idx] = float4(ret, 0.0);
}

ENDCG

