/*
 * Copyright (c) Contributors to the Open 3D Engine Project.
 * For complete copyright and license terms please see the LICENSE at the root of this distribution.
 *
 * SPDX-License-Identifier: Apache-2.0 OR MIT
 *
 */

#include <scenesrg.srgi>

// Perform light culling on a compute shader

#include <Atom/RPI/Math.azsli>
#include <Atom/Features/LightCulling/LightCullingShared.azsli>

// Bit flags stored in QuadLight::m_flags.
enum QuadLightFlag // Copied from QuadLight.azsli. See ATOM-3731
{
    None = 0x00,                 // No flags set
    EmitsBothDirections  = 0x01, // 1 << 0, // Quad should emit light from both sides
    UseFastApproximation = 0x02, // 1 << 1, // Use a fast approximation instead of linearly transformed cosines.
};

// Bit flags tested against DiskLight::m_flags (see CullDiskLights).
enum DiskLightFlag
{
    UseConeAngle = 1, // When set, CullDiskLights culls the light as a cone using m_cosOuterConeAngle
};

// Per-pass resources for the light culling compute shader: the source light/decal buffers and their counts,
// the per-tile depth data read as input, and the light list / light count outputs.
ShaderResourceGroup PassSrg : SRG_PerPass
{
    // Figure out how to remove duplicate struct definitions.
    // These are also defined in View.srg
    // ATOM-3731
    
    struct SimplePointLight
    {
        float3 m_position;
        float m_invAttenuationRadiusSquared; // For a radius at which this light no longer has an effect, 1 / radius^2.
        float3 m_rgbIntensityCandelas;
        float m_padding; // explicit padding.
    };

    struct SimpleSpotLight
    {
        float3 m_position;
        float m_invAttenuationRadiusSquared; // For a radius at which this light no longer has an effect, 1 / radius^2.
        float3 m_direction;
        float m_cosInnerConeAngle; // cosine of the inner cone angle
        float3 m_rgbIntensityCandelas;
        float m_cosOuterConeAngle; // cosine of the outer cone angle (the one used for culling)
    };

    struct PointLight
    {
        float3 m_position;
        float m_invAttenuationRadiusSquared; // For a radius at which this light no longer has an effect, 1 / radius^2.
        float3 m_rgbIntensityCandelas;
        float m_bulbRadius;
        uint3 m_shadowIndices;
        uint m_padding;
    };

    struct DiskLight
    {
        float3 m_position;
        float m_invAttenuationRadiusSquared; // For a radius at which this light no longer has an effect, 1 / radius^2.
        float3 m_rgbIntensityCandelas;
        float m_diskRadius;
        float3 m_direction;
        uint m_flags; // See DiskLightFlag
        float m_cosInnerConeAngle;
        float m_cosOuterConeAngle;
        float m_bulbPositionOffset; // Culling moves the light position back along m_direction by this amount
        uint m_shadowIndex;
    };

    struct CapsuleLight
    {
        float3 m_startPoint;   // One of the end points of the capsule
        float m_radius;        // Radius of the capsule, ie distance from line segment to surface.
        float3 m_direction;    // normalized vector from m_startPoint towards the other end point.
        float m_length;        // length of the line segment making up the inside of the capsule. Doesn't include caps (0 length capsule == sphere)
        float3 m_rgbIntensityCandelas; // total rgb luminous intensity of the capsule in candela
        float m_invAttenuationRadiusSquared; // Inverse of the distance at which this light no longer has an effect, squared. Also used for falloff calculations.
    };
    
    struct QuadLight
    {
        float3 m_position;
        float m_invAttenuationRadiusSquared; // For a radius at which this light no longer has an effect, 1 / radius^2.
        float3 m_leftDir; // Direction from center of quad to the left edge
        float m_halfWidth; // Half the width of the quad. m_leftDir * m_halfWidth is a vector from the center to the left edge.
        float3 m_upDir; // Direction from center of quad to the top edge
        float m_halfHeight; // Half the height of the quad. m_upDir * m_halfHeight is a vector from the center to the top edge.
        float3 m_rgbIntensityNits;
        uint m_flags; // See QuadLightFlag
    };
    
    struct LightCullingConstants
    {
        float4x4        m_worldToView;   // Transforms world-space points/vectors into view space
        float4          m_screenUVToRay; // xy = scale, zw = offset applied to a screen UV to get a view ray (see ScreenUvToRay)
        float2          m_gridPixel;     // Multiplied by the tile id to get the tile's UV (see ComputeScreenRays)
        float2          m_gridHalfPixel; // Added to the tile's UV to get the tile center UV
        uint            m_gridWidth;     // Passed to GetLightListIndex when addressing m_lightList
        uint            m_padding0;
        uint            m_padding1;
        uint            m_padding2;
    };    
    LightCullingConstants m_constantData;

    // Source light data 
    StructuredBuffer<SimplePointLight> m_simplePointLights;
    StructuredBuffer<SimpleSpotLight> m_simpleSpotLights;
    StructuredBuffer<PointLight> m_pointLights;
    StructuredBuffer<DiskLight> m_diskLights;
    StructuredBuffer<CapsuleLight> m_capsuleLights;
    StructuredBuffer<QuadLight> m_quadLights;
    uint m_simplePointLightCount;
    uint m_simpleSpotLightCount;
    uint m_pointLightCount;
    uint m_diskLightCount;
    uint m_capsuleLightCount;
    uint m_quadLightCount;

    // Produced by the LightCullingTilePrepare pass. Contains depth min/max and mask data (a bit set for each location where opaque geo was found)
    Texture2D<uint4> m_tileLightData;
        
    // Destination light data
    RWStructuredBuffer<uint> m_lightList; // Packed (index + bin mask) entries, one end-of-group marker per culled type
    RWTexture2D<uint> m_lightCount;       // Per tile: the running entry count returned by the last WriteCullingDataToMainMemory call
    
    struct Decal
    {
        float3 m_position;
        float m_opacity;
        float4 m_quaternion;
        float3 m_halfSize;
        float m_angleAttenuation;
        uint m_sortKeyPacked; // The low 8 bits hold the sort key (see GetSortKey)
        uint m_textureArrayIndex;
        uint m_textureIndex;
        uint m_padding[1];
    };
    
    StructuredBuffer<Decal> m_decals;
    uint m_decalCount;  
}

// Number of entries the thread group has appended for the type currently being culled.
// It is incremented atomically for every visible object, so it can exceed the capacity of shared_lightIndices.
groupshared uint shared_lightCount;
// Packed (index + bin mask) entries for the type currently being culled.
groupshared uint shared_lightIndices[TILE_DIM_X * TILE_DIM_Y];

// Returns true when the z component of dir, after applying the RH_COORD_SYSTEM_REVERSE sign, is negative.
bool IsVectorPointingTowardsEye(const float3 dir)
{
    const float signedDepth = dir.z * RH_COORD_SYSTEM_REVERSE;
    return signedDepth < 0;
}

// Transforms a world-space position into view space (w = 1, so translation is applied).
float3 WorldToView_Point(float3 p)
{
    const float4 homogeneousPoint = float4(p, 1.0);
    return mul(PassSrg::m_constantData.m_worldToView, homogeneousPoint).xyz;
}

// Transforms a world-space direction into view space using only the upper 3x3 of the
// world-to-view matrix, so no translation is applied.
float3 WorldToView_Vector(float3 v)
{
    const float3x3 worldToViewRotation = (float3x3)PassSrg::m_constantData.m_worldToView;
    return mul(worldToViewRotation, v);
}

// Sphere vs AABB overlap test where the sphere is described by 1 / radius^2 instead of its radius.
// Returns true when the squared distance from the sphere center to the box is less than radius^2.
bool TestSphereVsAabbInvSqrt(float3 sphereCenter, float invSphereRadiusSq, float3 aabbCenter, float3 aabbHalfSize)
{
    // Per-axis distance from the sphere center to the box surface; zero on axes where the center lies within the box slab.
    const float3 centerOffset = abs(aabbCenter - sphereCenter);
    const float3 outsideDistance = max(float3(0.0, 0.0, 0.0), centerOffset - aabbHalfSize);
    const float distanceSq = dot(outsideDistance, outsideDistance);

    // distanceSq < radius^2, rewritten to avoid a division.
    return distanceSq * invSphereRadiusSq < 1.0f;
}

// Sphere vs AABB overlap test. sphereRadiusSq is the squared radius of the sphere.
// Returns true when the squared distance from the sphere center to the box is less than sphereRadiusSq.
bool TestSphereVsAabb(float3 sphereCenter, float sphereRadiusSq, float3 aabbCenter, float3 aabbHalfSize)
{
    // Per-axis distance from the sphere center to the box surface; zero on axes where the center lies within the box slab.
    const float3 centerOffset = abs(aabbCenter - sphereCenter);
    const float3 outsideDistance = max(float3(0.0, 0.0, 0.0), centerOffset - aabbHalfSize);

    return dot(outsideDistance, outsideDistance) < sphereRadiusSq;
}

// Conservative sphere vs cone test: it is simplified for speed and therefore reports false positives.
// Function origin and description: https://bartwronski.com/2017/04/13/cull-that-cone/
//   spherePos / sphereRadius : sphere to test
//   origin / forward         : apex and axis of the cone (forward is presumably normalized - confirm at call sites)
//   cosa                     : cosine of the cone angle
//   size                     : length of the cone along its axis
bool TestSphereVsCone(float3 spherePos, float sphereRadius, float3 origin, float3 forward, float cosa, float size)
{
    const float3 toSphere = spherePos - origin;
    const float axialDistance = dot(toSphere, forward);
    const float distanceSq = dot(toSphere, toSphere);

    // 1 / sin(angle), derived from the cosine.
    const float invSinAngle = rsqrt(1 - cosa * cosa);
    const float radialDistance = sqrt(max(0.0, distanceSq - axialDistance * axialDistance));
    const float distanceClosestPoint = invSinAngle * cosa * radialDistance - axialDistance;

    const bool withinAngle = distanceClosestPoint <= sphereRadius * invSinAngle;
    const bool notBehindApex = axialDistance >= -sphereRadius;
    const bool notBeyondEnd = axialDistance <= sphereRadius + size;

    return withinAngle && notBehindApex && notBeyondEnd;
}


// Converts a screen UV into the xy of a view ray by applying the scale (xy) and offset (zw)
// stored in m_screenUVToRay.
float2 ScreenUvToRay(float2 uv)
{
    const float2 rayScale = PassSrg::m_constantData.m_screenUVToRay.xy;
    const float2 rayOffset = PassSrg::m_constantData.m_screenUVToRay.zw;
    return uv * rayScale + rayOffset;
}

// Returns the screen rays bounding a tile:
//   xy holds the ray through the tile's top left corner
//   zw holds the ray through the tile's bottom right corner
// The rays are constructed assuming the distance to them along z is 1.0, so multiplying them by an
// actual depth value yields a position in the tile in view space.
//
// tileCenterUv additionally receives the 2D screen coordinates of the center of the tile.
float4 ComputeScreenRays(uint2 tileId, out float2 tileCenterUv)
{
    const float2 tileSizeUv = PassSrg::m_constantData.m_gridPixel;
    const float2 topLeftUv = float2(tileId) * tileSizeUv;
    const float2 bottomRightUv = topLeftUv + tileSizeUv;

    tileCenterUv = topLeftUv + PassSrg::m_constantData.m_gridHalfPixel;

    return float4(ScreenUvToRay(topLeftUv), ScreenUvToRay(bottomRightUv));
}

// Intended to round x up to a power of two (see the link below). SortDecals uses the result as the
// upper bound for the sub-array sizes of its bitonic sort.
// NOTE(review): for x <= 1 the argument passed to firstbithigh is 0 - confirm the value produced in
// that case is what callers expect.
uint NextPowerTwo(uint x)
{
    // https://wickedengine.net/2018/01/05/next-power-of-two-in-hlsl/
    return 2 << firstbithigh(max(1, x) - 1);
}

// Extracts the decal's sort key, which lives in the low 8 bits of m_sortKeyPacked.
uint GetSortKey(PassSrg::Decal decal)
{
    const uint sortKey = decal.m_sortKeyPacked & 0xFF;
    return sortKey;
}

// Comparison used by SortDecals. Both arguments are packed light-list entries whose index part refers
// to PassSrg::m_decals. Returns true when the left decal must be placed after the right one:
// decals are ordered by ascending sort key, and ties are broken by ascending decal index.
bool AreDecalsOutofOrder(uint packedIndexLeft, uint packedIndexRight)
{
    const uint lhsDecalIndex = Light_GetIndex(packedIndexLeft);
    const uint rhsDecalIndex = Light_GetIndex(packedIndexRight);

    const uint lhsSortKey = GetSortKey(PassSrg::m_decals[lhsDecalIndex]);
    const uint rhsSortKey = GetSortKey(PassSrg::m_decals[rhsDecalIndex]);

    if (lhsSortKey != rhsSortKey)
    {
        return lhsSortKey > rhsSortKey;
    }

    // Equal keys: fall back to the decal index so the ordering is total.
    return lhsDecalIndex > rhsDecalIndex;
}

// Sorts the entries currently held in shared_lightIndices in place, using AreDecalsOutofOrder as the
// comparison. Every thread of the group must call this: each pass of the inner loop ends with a
// group-wide barrier. groupIndex is the calling thread's flattened index within the group and selects
// which pair of entries the thread compares in each pass.
void SortDecals(uint groupIndex)
{
    // Note that shared_lightCount can exceed the array size if too many decals intersect the tile, so clamp it here.
    uint numArray = min(TILE_DIM_X * TILE_DIM_Y, shared_lightCount);
    uint numArrayPowerOfTwo = NextPowerTwo(numArray);
    
    // Bitonic sort code from AMD: https://github.com/GPUOpen-LibrariesAndSDKs/GPUParticles11
    // AMD / MIT License is contained in the same directory as this file
    // subArraySize = 2,4,8,16,etc...
    for (uint subArraySize = 2; subArraySize <= numArrayPowerOfTwo; subArraySize = subArraySize * 2)
    {
        // compareDist = (subArraySize / 2), (subArraySize / 4), ... 32, 16, 8, 4, 2, 1
        for (uint compareDist = subArraySize >> 1; compareDist > 0; compareDist = compareDist >> 1)
        {
            // This code from AMD very cleverly computes the locations of two different array indices to compare.
            // The pattern that is produced from this is identical to: https://en.wikipedia.org/wiki/Bitonic_sorter#Alternative_representation
            // Essentially creating larger monotonic sequences and then merging them together. (Two monotonic sequences together is a bitonic)
            uint index_low = groupIndex & (compareDist - 1);
            uint index_high = 2 * (groupIndex - index_low);
            uint index0 = index_high + index_low;

            // The first pass of each sub-array size mirrors the partner index; later passes use a fixed offset of compareDist.
            uint index1 = compareDist == subArraySize >> 1 ? index_high + (2 * compareDist - 1) - index_low : index_high + compareDist + index_low;

            // Pairs that reach past the valid entries are skipped.
            if (index0 < numArray && index1 < numArray)
            {                                      
                bool areDecalsOutOrder = AreDecalsOutofOrder(shared_lightIndices[index0], shared_lightIndices[index1]);            
                if (areDecalsOutOrder)
                {
                    uint uTemp = shared_lightIndices[index0];
                    shared_lightIndices[index0] = shared_lightIndices[index1];
                    shared_lightIndices[index1] = uTemp;
                }
            }
            // Make this pass's swaps visible to the whole group before the next pass reads them.
            GroupMemoryBarrierWithGroupSync();
        }
    }
}

// Appends a visible light/decal to the group's shared list.
//   lightIndex : index of the light/decal within its source buffer
//   inside     : bin mask, packed together with the index via PackLightIndexWithBinMask
// shared_lightCount is always incremented, so it keeps counting past the capacity of the list; once the
// list is full, further entries overwrite the last slot.
void MarkLightAsVisibleInSharedMemory(uint lightIndex, uint inside)
{
    uint sharedLightIndex;
    InterlockedAdd(shared_lightCount, 1, sharedLightIndex);

    // Clamp against the per-bin limit and against the declared size of shared_lightIndices
    // (TILE_DIM_X * TILE_DIM_Y), so the write stays in bounds even if the two constants differ.
    sharedLightIndex = min(sharedLightIndex, NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN - 1);
    sharedLightIndex = min(sharedLightIndex, TILE_DIM_X * TILE_DIM_Y - 1);

    shared_lightIndices[sharedLightIndex] = PackLightIndexWithBinMask(lightIndex, inside);
}
  
// Copies this thread's entry of the shared list into the tile's section of PassSrg::m_lightList.
//   lightCount : number of entries already written for this tile; used as the base offset
//   groupIndex : flattened thread index; each thread copies at most the entry with the same index
//   groupID    : id of the thread group, passed to GetLightListIndex to address the tile's section
void CopySharedLightsToMainMemory(uint lightCount, uint groupIndex, uint3 groupID)
{
    const bool hasEntryForThisThread = groupIndex < shared_lightCount;
    if (hasEntryForThisThread)
    {
        // Never write beyond the last slot reserved for the tile.
        const uint clampedOffset = min(lightCount + groupIndex, NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN - 1);
        const uint destination = GetLightListIndex(groupID, PassSrg::m_constantData.m_gridWidth, clampedOffset);
        PassSrg::m_lightList[destination] = shared_lightIndices[groupIndex];
    }
}

// Returns the view-space z range covered by a sphere of radius lightRadius centered at lightPosition.
// The radius is subtracted for x and added for y, after applying the RH_COORD_SYSTEM_REVERSE sign.
float2 ComputePointLightMinMaxZ(float lightRadius, float3 lightPosition)
{
    const float2 signedExtent = lightRadius * float2(-1, 1) * RH_COORD_SYSTEM_REVERSE;
    return lightPosition.z + signedExtent;
}

// Returns the view-space z range of a simple spot light, bounded by a sphere with the light's
// attenuation radius around lightPosition (the view-space position of the light).
float2 ComputeSimpleSpotLightMinMax(PassSrg::SimpleSpotLight light, float3 lightPosition)
{
    const float attenuationRadius = rsqrt(light.m_invAttenuationRadiusSquared);
    return ComputePointLightMinMaxZ(attenuationRadius, lightPosition);
}

// Returns the view-space z range of a double sided quad light, bounded by a sphere with the light's
// attenuation radius around lightPosition (the view-space position of the light).
float2 ComputeQuadLightMinMaxZ_DoubleSided(PassSrg::QuadLight light, float3 lightPosition)
{
    const float attenuationRadius = rsqrt(light.m_invAttenuationRadiusSquared);
    return ComputePointLightMinMaxZ(attenuationRadius, lightPosition);
}

// Returns the view-space z range of a single sided quad light.
// lightDirection is currently unused: the result is the same range as for a double sided quad.
float2 ComputeQuadLightMinMaxZ_SingleSided(PassSrg::QuadLight light, float3 lightPosition, float3 lightDirection)
{
    // [GFX TODO][ATOM-6170] We can compute a tighter bounds with single sided lights by bringing in one of bounds
    const float2 doubleSidedRange = ComputeQuadLightMinMaxZ_DoubleSided(light, lightPosition);
    return doubleSidedRange;
}

// Returns the view-space z range of a disk light, bounded by a sphere around lightPosition whose radius
// is the attenuation radius extended by the bulb position offset.
float2 ComputeDiskLightMinMax(PassSrg::DiskLight light, float3 lightPosition)
{
    const float boundingRadius = rsqrt(light.m_invAttenuationRadiusSquared) + light.m_bulbPositionOffset;
    return ComputePointLightMinMaxZ(boundingRadius, lightPosition);
}

// Returns the view-space z range (x = near, y = far) of a capsule light.
//   lightPosition      : view-space position of the middle of the capsule's line segment
//   lightFalloffRadius : attenuation radius of the light
// The half extent along z is the projection of half the segment onto the view z axis plus the falloff radius.
float2 ComputeCapsuleLightMinMax(PassSrg::CapsuleLight light, float3 lightPosition, float lightFalloffRadius)
{
    const float viewSpaceAxisZ = WorldToView_Vector(light.m_direction).z;
    const float halfExtentZ = abs(viewSpaceAxisZ * light.m_length * 0.5f) + lightFalloffRadius;
    const float signedExtentZ = halfExtentZ * RH_COORD_SYSTEM_REVERSE;

    return float2(lightPosition.z - signedExtentZ, lightPosition.z + signedExtentZ);
}

// Culls all decals against the tile and appends the visible ones to the shared list.
// The decals are distributed over the threads of the group: thread groupIndex handles decal groupIndex,
// groupIndex + TILE_DIM_X * TILE_DIM_Y, and so on.
//   aabb_center / aabb_extents : view-space bounds of the tile
//   tile_center_uv             : currently unused
void CullDecals(uint groupIndex, TileLightData tileLightData, float3 aabb_center, float3 aabb_extents, float2 tile_center_uv)
{
    const uint threadsPerGroup = TILE_DIM_X * TILE_DIM_Y;

    for (uint decalIndex = groupIndex; decalIndex < PassSrg::m_decalCount; decalIndex += threadsPerGroup)
    {
        const PassSrg::Decal decal = PassSrg::m_decals[decalIndex];
        const float3 viewPosition = WorldToView_Point(decal.m_position);

        // The decal box is approximated by its bounding sphere for a minor perf boost,
        // i.e. the squared sphere radius is x*x + y*y + z*z of the half size.
        // ATOM-4224 - try AABB-AABB
        const float boundingRadiusSq = dot(decal.m_halfSize, decal.m_halfSize);

        if (!TestSphereVsAabb(viewPosition, boundingRadiusSq, aabb_center, aabb_extents))
        {
            continue;
        }

        uint inside = 0;
        const float2 depthRange = ComputePointLightMinMaxZ(sqrt(boundingRadiusSq), viewPosition);
        if (IsObjectInsideTile(tileLightData, depthRange, inside))
        {
            MarkLightAsVisibleInSharedMemory(decalIndex, inside);
        }
    }
}

// Culls a single spherical light against the tile and appends it to the shared list when visible.
//   lightPosition  : world-space position of the light
//   invLightRadius : despite the name, callers pass 1 / radius^2 (m_invAttenuationRadiusSquared)
void CullPointLight(uint lightIndex, float3 lightPosition, float invLightRadius, TileLightData tileLightData, float3 aabb_center, float3 aabb_extents)
{
    const float3 viewPosition = WorldToView_Point(lightPosition);

    if (TestSphereVsAabbInvSqrt(viewPosition, invLightRadius, aabb_center, aabb_extents))
    {
        // Implement and profile fine-grained light culling testing
        // ATOM-3732

        uint inside = 0;
        const float lightRadius = rsqrt(invLightRadius);
        const float2 depthRange = ComputePointLightMinMaxZ(lightRadius, viewPosition);
        if (IsObjectInsideTile(tileLightData, depthRange, inside))
        {
            MarkLightAsVisibleInSharedMemory(lightIndex, inside);
        }
    }
}

// Culls all simple point lights against the tile. The lights are distributed over the threads of the
// group, each thread stepping through the buffer in strides of TILE_DIM_X * TILE_DIM_Y.
void CullSimplePointLights(uint groupIndex, TileLightData tileLightData, float3 aabb_center, float3 aabb_extents)
{
    const uint threadsPerGroup = TILE_DIM_X * TILE_DIM_Y;
    const uint totalLights = PassSrg::m_simplePointLightCount;

    for (uint lightIndex = groupIndex; lightIndex < totalLights; lightIndex += threadsPerGroup)
    {
        const PassSrg::SimplePointLight simplePoint = PassSrg::m_simplePointLights[lightIndex];
        CullPointLight(lightIndex, simplePoint.m_position, simplePoint.m_invAttenuationRadiusSquared, tileLightData, aabb_center, aabb_extents);
    }
}

// Culls all point lights against the tile. The lights are distributed over the threads of the group,
// each thread stepping through the buffer in strides of TILE_DIM_X * TILE_DIM_Y.
void CullPointLights(uint groupIndex, TileLightData tileLightData, float3 aabb_center, float3 aabb_extents)
{
    const uint threadsPerGroup = TILE_DIM_X * TILE_DIM_Y;
    const uint totalLights = PassSrg::m_pointLightCount;

    for (uint lightIndex = groupIndex; lightIndex < totalLights; lightIndex += threadsPerGroup)
    {
        const PassSrg::PointLight pointLight = PassSrg::m_pointLights[lightIndex];
        CullPointLight(lightIndex, pointLight.m_position, pointLight.m_invAttenuationRadiusSquared, tileLightData, aabb_center, aabb_extents);
    }
}

// Culls all simple spot lights against the tile. Each light is tested as a cone (outer cone angle,
// attenuation radius as length) against the bounding sphere of the tile AABB. The lights are distributed
// over the threads of the group in strides of TILE_DIM_X * TILE_DIM_Y.
void CullSimpleSpotLights(uint groupIndex, TileLightData tileLightData, float3 aabb_center, float3 aabb_extents)
{
    const uint threadsPerGroup = TILE_DIM_X * TILE_DIM_Y;

    for (uint lightIndex = groupIndex; lightIndex < PassSrg::m_simpleSpotLightCount; lightIndex += threadsPerGroup)
    {
        const PassSrg::SimpleSpotLight spot = PassSrg::m_simpleSpotLights[lightIndex];
        const float3 viewPosition = WorldToView_Point(spot.m_position);
        const float3 viewDirection = WorldToView_Vector(spot.m_direction);

        const float tileSphereRadius = length(aabb_extents);
        const float coneLength = rsqrt(spot.m_invAttenuationRadiusSquared);
        if (!TestSphereVsCone(aabb_center, tileSphereRadius, viewPosition, viewDirection, spot.m_cosOuterConeAngle, coneLength))
        {
            continue;
        }

        // Implement and profile fine-grained light culling testing
        // ATOM-3732

        uint inside = 0;
        const float2 depthRange = ComputeSimpleSpotLightMinMax(spot, viewPosition);
        if (IsObjectInsideTile(tileLightData, depthRange, inside))
        {
            MarkLightAsVisibleInSharedMemory(lightIndex, inside);
        }
    }
}

// Culls all disk lights against the tile and appends the visible ones to the shared list.
// The lights are distributed over the threads of the group in strides of TILE_DIM_X * TILE_DIM_Y.
//   - Lights with DiskLightFlag::UseConeAngle set are tested as a cone (outer cone angle) against the
//     bounding sphere of the tile AABB.
//   - All other lights are tested as a sphere (attenuation radius + disk radius) against the tile AABB
//     and are additionally rejected when the tile lies entirely behind the light's plane.
void CullDiskLights(uint groupIndex, TileLightData tileLightData, float3 aabb_center, float3 aabb_extents)
{
    // Radius of the sphere enclosing the tile AABB; it is the same for every light.
    const float aabbRadius = length(aabb_extents);

    for (uint lightIndex = groupIndex ; lightIndex < PassSrg::m_diskLightCount ; lightIndex += TILE_DIM_X * TILE_DIM_Y)
    {
        PassSrg::DiskLight light = PassSrg::m_diskLights[lightIndex];

        // The culling position is moved back along the light direction by the bulb position offset.
        float3 lightPosition = WorldToView_Point(light.m_position - light.m_bulbPositionOffset * light.m_direction);
        float3 lightDirection = WorldToView_Vector(light.m_direction);
        float attenuationRadius = rsqrt(light.m_invAttenuationRadiusSquared);

        bool potentiallyIntersects;

        // The parentheses are required: '>' / '!=' bind tighter than '&', so the unparenthesized form
        // would compare the enum value first and then mask m_flags with the boolean result.
        if ((light.m_flags & DiskLightFlag::UseConeAngle) != 0)
        {
            potentiallyIntersects = TestSphereVsCone(aabb_center, aabbRadius, lightPosition, lightDirection, light.m_cosOuterConeAngle, attenuationRadius + light.m_bulbPositionOffset);
        }
        else
        {
            float lightRadius = attenuationRadius + light.m_diskRadius;
            potentiallyIntersects = TestSphereVsAabb(lightPosition, lightRadius * lightRadius, aabb_center, aabb_extents);

            if (potentiallyIntersects)
            {
                // Only one side is visible, check that we are above the hemisphere
                float3 toAABBCenter = aabb_center - lightPosition;
                float distanceToLightPlane = dot(lightDirection, toAABBCenter);

                potentiallyIntersects = distanceToLightPlane >= -aabbRadius;
            }
        }

        if (potentiallyIntersects)
        {
            // Implement and profile fine-grained light culling testing
            // ATOM-3732

            uint inside = 0;
            float2 minmax = ComputeDiskLightMinMax(light, lightPosition);
            if (IsObjectInsideTile(tileLightData, minmax, inside))
            {
                MarkLightAsVisibleInSharedMemory(lightIndex, inside);
            }
        }
    }
}

// Culls all capsule lights against the tile. Each capsule is bounded conservatively by a sphere around
// the middle of its line segment with radius (attenuation radius + half the segment length).
// The lights are distributed over the threads of the group in strides of TILE_DIM_X * TILE_DIM_Y.
void CullCapsuleLights(uint groupIndex, TileLightData tileLightData, float3 aabb_center, float3 aabb_extents)
{
    const uint threadsPerGroup = TILE_DIM_X * TILE_DIM_Y;

    for (uint lightIndex = groupIndex; lightIndex < PassSrg::m_capsuleLightCount; lightIndex += threadsPerGroup)
    {
        const PassSrg::CapsuleLight capsule = PassSrg::m_capsuleLights[lightIndex];

        const float3 middleWorld = capsule.m_startPoint + capsule.m_direction * capsule.m_length * 0.5f;
        const float3 middleView = WorldToView_Point(middleWorld);

        const float falloffRadius = rsqrt(capsule.m_invAttenuationRadiusSquared);
        const float boundingRadius = falloffRadius + capsule.m_length * 0.5f;

        if (!TestSphereVsAabb(middleView, boundingRadius * boundingRadius, aabb_center, aabb_extents))
        {
            continue;
        }

        // Implement and profile fine-grained light culling testing
        // ATOM-3732

        uint inside = 0;
        const float2 depthRange = ComputeCapsuleLightMinMax(capsule, middleView, falloffRadius);
        if (IsObjectInsideTile(tileLightData, depthRange, inside))
        {
            MarkLightAsVisibleInSharedMemory(lightIndex, inside);
        }
    }
}

// Culls all quad lights against the tile. Each light is first tested as a sphere (attenuation radius
// around the quad center) against the tile AABB. Single sided quads are additionally rejected when the
// tile lies entirely behind the quad's plane.
// The lights are distributed over the threads of the group in strides of TILE_DIM_X * TILE_DIM_Y.
void CullQuadLights(uint groupIndex, TileLightData tileLightData, float3 aabb_center, float3 aabb_extents)
{
    // Implement and profile fine-grained light culling testing
    // ATOM-3732

    const uint threadsPerGroup = TILE_DIM_X * TILE_DIM_Y;

    for (uint lightIndex = groupIndex; lightIndex < PassSrg::m_quadLightCount; lightIndex += threadsPerGroup)
    {
        const PassSrg::QuadLight quad = PassSrg::m_quadLights[lightIndex];
        const float3 viewPosition = WorldToView_Point(quad.m_position);

        if (!TestSphereVsAabbInvSqrt(viewPosition, quad.m_invAttenuationRadiusSquared, aabb_center, aabb_extents))
        {
            continue;
        }

        float2 depthRange;

        const bool emitsBothDirections = (quad.m_flags & QuadLightFlag::EmitsBothDirections) != 0;
        if (emitsBothDirections)
        {
            depthRange = ComputeQuadLightMinMaxZ_DoubleSided(quad, viewPosition);
        }
        else
        {
            // Only one side is visible, check that we are above the hemisphere
            const float3 viewNormal = WorldToView_Vector(cross(quad.m_leftDir, quad.m_upDir));
            const float3 toTileCenter = aabb_center - viewPosition;
            const float distanceToLightPlane = dot(viewNormal, toTileCenter);
            const float tileSphereRadius = length(aabb_extents);

            if (distanceToLightPlane < -tileSphereRadius)
            {
                // The whole tile is behind the emitting side.
                continue;
            }

            depthRange = ComputeQuadLightMinMaxZ_SingleSided(quad, viewPosition, viewNormal);
        }

        uint inside = 0;
        if (IsObjectInsideTile(tileLightData, depthRange, inside))
        {
            MarkLightAsVisibleInSharedMemory(lightIndex, inside);
        }
    }
}

// Writes the end-of-group marker behind the entries of the type that was just culled and returns the
// new running entry count for the tile: lightCount + shared_lightCount + 1 (for the marker).
// The marker's slot is clamped to the last slot reserved for the tile; the returned count is not clamped.
uint WriteEndOfGroup(uint lightCount, uint3 groupID)
{
    const uint entriesBeforeMarker = lightCount + shared_lightCount;

    const uint clampedOffset = min(entriesBeforeMarker, NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN - 1);
    const uint markerSlot = GetLightListIndex(groupID, PassSrg::m_constantData.m_gridWidth, clampedOffset);
    PassSrg::m_lightList[markerSlot] = PackLightIndexWithBinMask(NVLC_END_OF_GROUP, NVLC_ALL_BIN_BITS);

    return entriesBeforeMarker + 1;
}

// Resets shared_lightCount to zero. Only the thread with groupIndex 0 performs the write;
// no barrier is issued here.
void ClearSharedLightCount(uint groupIndex)
{
    const bool isFirstThreadInGroup = (groupIndex == 0);
    if (isFirstThreadInGroup)
    {
        shared_lightCount = 0;
    }
}

// Resets shared_lightCount to zero between two group-wide barriers: the first one lets every thread
// finish reading the previous count, the second one publishes the reset before culling continues.
void ClearSharedLightCountWithDoubleBarrier(uint groupIndex)
{
    GroupMemoryBarrierWithGroupSync();

    // Only one thread performs the reset.
    if (groupIndex == 0)
    {
        shared_lightCount = 0;
    }

    GroupMemoryBarrierWithGroupSync();
}

// Reads the tile's entry from m_tileLightData and reinterprets its x and y channels as floats.
// Per the function name these are the close and far depth of the tile.
float2 ReadDepthCloseFar(uint3 groupID)
{
    const uint4 packedTile = PassSrg::m_tileLightData[groupID.xy];
    return asfloat(packedTile.xy);
}

// Reads the tile's packed entry from m_tileLightData and unpacks it with Tile_UnpackData.
TileLightData ReadTileLightData(uint3 groupID)
{
    return Tile_UnpackData(PassSrg::m_tileLightData[groupID.xy]);
}

// Flushes the shared list of the type that was just culled into m_lightList, followed by an
// end-of-group marker. Returns the updated running entry count for the tile.
// Must be called by every thread of the group because it starts with a group-wide barrier.
uint WriteCullingDataToMainMemory(uint lightCount, uint groupIndex, uint3 groupID)
{
    // Wait until all threads have finished appending to shared memory.
    GroupMemoryBarrierWithGroupSync();

    CopySharedLightsToMainMemory(lightCount, groupIndex, groupID);
    return WriteEndOfGroup(lightCount, groupID);
}

// This shader is invoked with one thread group per on-screen tile
// e.g. if the screen resolution is 1920x1080, with 16x16 tiles, there will be 120x68 tiles (and 120x68 thread groups)
// Each thread group is dedicated to culling all lights against that screen tile.
// It might be worth splitting this compute shader into several shaders, one per light type.    

// Each thread reads one light, determines whether it is visible, writes it to shared memory, then moves on to the next light until
// all lights are processed.
// Once visibility has been computed for all lights of a type, they are copied from shared memory back to GPU memory.

// This will write out the following:

// Point light index << 16 | bitmask contains which bits the light is present in
// Point light index << 16 | bitmask contains which bits the light is present in
// Point light index << 16 | bitmask contains which bits the light is present in
// ...
// End of Group
// Disk light index << 16 | bitmask contains which bits the light is present in
// Disk light index << 16 | bitmask contains which bits the light is present in
// Disk light index << 16 | bitmask contains which bits the light is present in
// ...
// End of Group
// i.e. for each 32-bit UINT, it contains the 16 bit light index + 16-bit binning information
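// (the same layout repeats for every culled type, in the order MainCS processes them:
// decals, simple point lights, simple spot lights, point lights, disk lights, capsule lights, quad lights)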
// Note! This isn't consumed by the forward shader. This light list will be further processed by the LightCullingRemap shader, producing a LightListRemapped buffer
// that is more optimal for consumption by the forward shader.

// Entry point: one thread group per tile. For each culled type in turn (decals, simple point, simple spot,
// point, disk, capsule, quad) the group culls into shared memory, flushes the result plus an end-of-group
// marker to m_lightList, and resets the shared counter. Finally the running entry count is stored in m_lightCount.
[numthreads(TILE_DIM_X, TILE_DIM_Y, 1)]
void MainCS(
    uint3 dispatchThreadID : SV_DispatchThreadID, 
    uint3 groupID : SV_GroupID, 
    uint groupIndex : SV_GroupIndex)
{
    ClearSharedLightCount(groupIndex);

    // Running number of entries (including end-of-group markers) written for this tile so far.
    uint lightCount = 0;

    TileLightData tileLightData = ReadTileLightData(groupID);
  
    // Build the tile's bounds from its screen rays and the tile data.
    float2 tileCenterUv;
    float4 tileRect = ComputeScreenRays(groupID.xy, tileCenterUv);   
    float3 aabb_center, aabb_extents;
    BuildAabb(tileRect, tileLightData, aabb_center, aabb_extents);                
    // Make the cleared shared_lightCount visible to the whole group before culling starts.
    GroupMemoryBarrierWithGroupSync();
    
    // Decals are the only type that is sorted before being written out.
    CullDecals(groupIndex, tileLightData, aabb_center, aabb_extents, tileCenterUv); 
    GroupMemoryBarrierWithGroupSync();    
    SortDecals(groupIndex);
    lightCount = WriteCullingDataToMainMemory(lightCount, groupIndex, groupID );
    
    ClearSharedLightCountWithDoubleBarrier(groupIndex);
            
    CullSimplePointLights(groupIndex, tileLightData, aabb_center, aabb_extents); 
    lightCount = WriteCullingDataToMainMemory(lightCount, groupIndex, groupID );
    
    ClearSharedLightCountWithDoubleBarrier(groupIndex);

    CullSimpleSpotLights(groupIndex, tileLightData, aabb_center, aabb_extents); 
    lightCount = WriteCullingDataToMainMemory(lightCount, groupIndex, groupID );
    
    ClearSharedLightCountWithDoubleBarrier(groupIndex);

    CullPointLights(groupIndex, tileLightData, aabb_center, aabb_extents); 
    lightCount = WriteCullingDataToMainMemory(lightCount, groupIndex, groupID );
    
    ClearSharedLightCountWithDoubleBarrier(groupIndex);

    CullDiskLights(groupIndex, tileLightData, aabb_center, aabb_extents);
    lightCount = WriteCullingDataToMainMemory(lightCount, groupIndex, groupID );
 
    ClearSharedLightCountWithDoubleBarrier(groupIndex);

    CullCapsuleLights(groupIndex, tileLightData, aabb_center, aabb_extents);
    lightCount = WriteCullingDataToMainMemory(lightCount, groupIndex, groupID );

    ClearSharedLightCountWithDoubleBarrier(groupIndex);

    CullQuadLights(groupIndex, tileLightData, aabb_center, aabb_extents);
    lightCount = WriteCullingDataToMainMemory(lightCount, groupIndex, groupID );


    // A single thread stores the tile's final running count.
    if (groupIndex == 0)
    {
        PassSrg::m_lightCount[groupID.xy] = lightCount;     
    }
}