CGPROGRAM
// Thread-group edge length; the Advect kernel uses it for all three numthreads dimensions.
#define NUM_THREADS 8

// Grid description. xyz is passed to SampleBilinear as the grid dimensions;
// x and w are used as the y and z strides when flattening a 3D cell id.
// NOTE(review): w is presumably size.x * size.y (SampleBilinear derives its z stride that way) - confirm on the host side.
float4 _Size;
// _DeltaTime and _Forward scale the velocity used for the back-trace in Advect.
// _Dissipate multiplies the advected sample and _Decay is then subtracted from it.
float _DeltaTime, _Dissipate, _Decay, _Forward;

// Per-cell velocity; only xyz is read by Advect.
RWStructuredBuffer<float4> _Velocity;
// Per-cell obstacle data; Advect treats a cell as blocked when x > 0.1.
RWStructuredBuffer<float4> _Obstacles;

// Destination scalar field written by Advect.
RWStructuredBuffer<float> _Write1f;
// Source scalar field sampled by SampleBilinear.
RWStructuredBuffer<float> _Read1f;

// Samples the scalar field in _Read1f at a fractional grid position.
//
// Despite the name, the value is interpolated along all three axes (x, then z,
// then y), i.e. this is a trilinear fetch over the 8 surrounding cells.
//
// uv   : sample position in cell units (not normalised).
// size : grid dimensions in cells; the field is addressed as x + y*size.x + z*size.x*size.y.
//
// The position is clamped to [0, size-1] on every axis before it is split into
// a cell index and a fractional weight. Without the clamp, a position with a
// negative component truncates toward zero and yields a negative weight, and a
// position at or past the grid extent on x or y produces a flat index that
// lands in a neighbouring row/slice or outside the buffer.
//
// Returns the interpolated field value.
float SampleBilinear(float3 uv, float3 size)
{
	// Keep the whole 2x2x2 footprint inside the grid (also guards a zero-sized axis).
	float3 lastCell = max(size - 1.0f, float3(0.0f, 0.0f, 0.0f));
	uv = clamp(uv, float3(0.0f, 0.0f, 0.0f), lastCell);

	// Integer cell containing the sample (uv is non-negative here, so truncation == floor).
	int x = uv.x;
	int y = uv.y;
	int z = uv.z;

	// Strides of the flattened buffer.
	int X = size.x;
	int XY = size.x*size.y;

	// Fractional offset inside the cell, in [0, 1).
	float fx = uv.x-x;
	float fy = uv.y-y;
	float fz = uv.z-z;

	// Upper neighbours, clamped so the last cell blends with itself.
	int xp1 = min(size.x-1, x+1);
	int yp1 = min(size.y-1, y+1);
	int zp1 = min(size.z-1, z+1);

	// Blend along x on the four (y, z) edges of the cell.
	float x0 = _Read1f[x+y*X+z*XY] * (1.0f-fx) + _Read1f[xp1+y*X+z*XY] * fx;
	float x1 = _Read1f[x+y*X+zp1*XY] * (1.0f-fx) + _Read1f[xp1+y*X+zp1*XY] * fx;

	float x2 = _Read1f[x+yp1*X+z*XY] * (1.0f-fx) + _Read1f[xp1+yp1*X+z*XY] * fx;
	float x3 = _Read1f[x+yp1*X+zp1*XY] * (1.0f-fx) + _Read1f[xp1+yp1*X+zp1*XY] * fx;

	// Blend along z for the lower (y) and upper (yp1) rows.
	float z0 = x0 * (1.0f-fz) + x1 * fz;
	float z1 = x2 * (1.0f-fz) + x3 * fz;

	// Final blend along y.
	return z0 * (1.0f-fy) + z1 * fy;

}

#pragma compute Advect

// Advects the scalar field from _Read1f into _Write1f, one thread per grid cell.
//
// For each cell the kernel steps backwards along the cell's velocity
// (scaled by _DeltaTime * _Forward), samples the source field there, then
// applies _Dissipate (multiplicative) and _Decay (subtractive), never letting
// the result drop below zero. Cells flagged in _Obstacles are written as 0.
//
// id : dispatch thread id, used directly as the 3D cell coordinate.
[numthreads(NUM_THREADS,NUM_THREADS,NUM_THREADS)]
void Advect(int3 id : SV_DispatchThreadID)
{
	// Flatten the 3D coordinate: x stride 1, y stride _Size.x, z stride _Size.w.
	const float3 stride = float3(1, _Size.xw);
	const int cell = dot(id, stride);

	const bool blocked = _Obstacles[cell].x > 0.1;

	if (!blocked)
	{
		// Position this cell's content came from one step ago.
		const float3 velocity = _Velocity[cell].xyz;
		const float3 source = float3(id) - _DeltaTime * _Forward * velocity;

		const float sampled = SampleBilinear(source, _Size.xyz);

		_Write1f[cell] = max(0, sampled * _Dissipate - _Decay);
	}
	else
	{
		// Nothing is carried into an obstacle cell.
		_Write1f[cell] = 0;
	}
}

ENDCG

