// File: o3de/Gems/Atom/Feature/Common/Assets/Shaders/LightCulling/LightCullingRemap.azsl

/*
* Copyright (c) Contributors to the Open 3D Engine Project.
* For complete copyright and license terms please see the LICENSE at the root of this distribution.
*
* SPDX-License-Identifier: Apache-2.0 OR MIT
*
*/
// Compute shader that takes a LightList and remaps it into a format that is fast for the forward shader to read (LightListRemapped)
#include <Atom/Features/SrgSemantics.azsli>
#include <Atom/RPI/Math.azsli>
#include <Atom/Features/LightCulling/LightCullingShared.azsli>
// Number of threads per thread group (used by [numthreads] on MainCS and as the stride of the per-thread light loops).
// Make sure to change ComputeSharedBinLocationFromLightIndex() if NUM_THREADS changes.
// The thread mask built in WriteLightsToRemappedLightList() is derived from the thread's group index as well,
// so it has to be revisited together with this value.
#define NUM_THREADS 32
// NOTE(review): UNROLLING_STEPS is not referenced by any code visible in this file - confirm whether an include relies on it.
#define UNROLLING_STEPS 2
// Per-pass resources of the light culling remap pass.
ShaderResourceGroup PassSrg : SRG_PerPass
{
// Contains the number of lights (along with End of Group Markers) for each tile. This includes all types of lights (point, spot, disk, etc)
// MainCS loads one value per thread group, addressed by the group's xy coordinate.
Texture2D<uint> m_lightCount;
// The light list is of type R32_UINT, and contents are either light indices or End of Group markers
// 32-bits because we wrote directly from groupshared uints to device memory and also because they are packed with a bitmask containing which
// bins the light is present in
// We will compact it into 16-bit uints here
// Size is width * height * max_lights_per_tile
StructuredBuffer<uint> m_lightList;
// The output is an R16_UINT containing light indices, separated into bins that run along the Z axis
// Size is width * height * max_lights_per_tile * num_bins
// NOTE(review): the buffer is declared with 32-bit uint elements here, which does not match the R16_UINT wording above - confirm the actual format.
RWStructuredBuffer<uint> m_lightListRemapped;
// Per-tile data. This shader reads the existing value and replaces only the .w component with the light
// count of the fullest bin, with bit 31 set when the tile's light count had to be clamped.
RWTexture2D<uint4> m_tileLightData;
// Row stride used to linearise a tile coordinate (groupID.y * m_tileWidth + groupID.x).
// NOTE(review): presumably the number of tiles per row - confirm against the pass setup.
uint m_tileWidth;
}
// Maps a light's position in the tile's light list to the location of its "is present" bit in shared_bin.
//   lightIndex:  position of the light within the tile's light list
//   locationBin: receives the index of the 32-bit word that holds the bit (lightIndex / 32)
//   locationBit: receives a mask with only that light's bit set (1 << (lightIndex % 32))
// The constants 31 and 5 come from the 32 bits of a uint, which currently equals NUM_THREADS.
void ComputeSharedBinLocationFromLightIndex(uint lightIndex, out uint locationBin, out uint locationBit)
{
    // Unsigned literals keep the shift for bit 31 out of the sign bit of a signed int
    locationBit = 1u << (lightIndex & 31u); // lightIndex % NUM_THREADS
    locationBin = lightIndex >> 5u;         // lightIndex / NUM_THREADS
}
// What we are doing with shared_bin is storing an "is present" bit for each light in groupshared memory
// If we have a max possible 256 lights, then we need 8 buckets of uints to store all these bits since a uint has 32 bits
// The bucket count is rounded up so that a maximum which is not a multiple of 32 still gets a bucket for its trailing lights
#define NUM_LIGHT_BINS ((NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN + 31) / 32)
// One row of presence bits per bin, indexed as shared_bin[bin][bucket]
groupshared uint shared_bin[NVLC_MAX_BINS][NUM_LIGHT_BINS];
// Light indices unpacked from the light list, addressed by the light's position within the tile's list
groupshared uint shared_lights[NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN];
// Zeroes every presence word of shared_bin.
//   groupIndex: flattened index of the calling thread within its group
// The words are distributed over the threads with a stride of NUM_THREADS, so all of them are cleared
// even when NUM_LIGHT_BINS is larger than the number of threads in the group.
// The function contains no barrier; the caller has to synchronise before shared_bin is used.
void ClearSharedMemory(uint groupIndex)
{
    for (uint word = groupIndex; word < NUM_LIGHT_BINS; word += NUM_THREADS)
    {
        [unroll]
        for (uint bin = 0; bin < NVLC_MAX_BINS; bin++)
        {
            shared_bin[bin][word] = 0;
        }
    }
}
// Unpacks this tile's entries of m_lightList into groupshared memory.
//   groupIndex:  flattened index of the calling thread within its group
//   groupID:     thread group id, used to locate the tile's entries in m_lightList
//   totalLights: number of entries to process for this tile
// Each thread handles the entries groupIndex, groupIndex + NUM_THREADS, ... For every entry it stores the
// light index in shared_lights and sets the entry's presence bit in shared_bin for each bin the entry covers.
void AssignLightsToSharedMemoryBins(uint groupIndex, uint3 groupID, uint totalLights)
{
    uint entry = groupIndex;
    while (entry < totalLights)
    {
        const uint packed = PassSrg::m_lightList[GetLightListIndex(groupID, PassSrg::m_tileWidth, entry)];
        shared_lights[entry] = Light_GetIndex(packed);

        uint word;
        uint bitMask;
        ComputeSharedBinLocationFromLightIndex(entry, word, bitMask);

        [unroll]
        for (uint bin = 0; bin < NVLC_MAX_BINS; bin++)
        {
            if (Light_IsInsideBin(packed, bin))
            {
                // Atomic because the threads of the group set different bits of the same word
                InterlockedOr(shared_bin[bin][word], bitMask);
            }
        }

        entry += NUM_THREADS;
    }
}
// Returns the offset of the first entry in m_lightListRemapped that belongs to the tile of this thread group.
// Tiles are linearised row by row and each tile owns NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN * NVLC_MAX_BINS entries.
uint ComputeBaseBin(uint3 groupID)
{
    const uint tileIndex = groupID.y * PassSrg::m_tileWidth + groupID.x;
    return tileIndex * NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN * NVLC_MAX_BINS;
}
// Compacts the lights gathered in groupshared memory into the per-bin ranges of m_lightListRemapped.
//   groupIndex:   flattened index of the calling thread within its group
//   groupID:      thread group id (not used by this function)
//   totalLights:  number of lights stored in shared_lights for this tile
//   writeIndices: per-bin write cursor; on entry the start of each bin's range, advanced by the number of
//                 lights found in every presence word this thread visits
// Each thread owns one bit position of every presence word. A light that is present in a bin is written at
// the bin's cursor plus the number of present lights owned by lower-numbered threads in the same word.
// A thread only advances its cursors for the words it visits; thread 0 visits every word that any thread
// visits, so its cursors are the ones that end up pointing just past the last light of each bin.
void WriteLightsToRemappedLightList(uint groupIndex, uint3 groupID, uint totalLights, inout uint writeIndices[NVLC_MAX_BINS])
{
    // Mask of the bits owned by lower-numbered threads. Unsigned arithmetic so that thread 31 does not
    // depend on shifting into, and wrapping around, the sign bit of a signed int.
    const uint lowerThreadsMask = (1u << groupIndex) - 1u;

    for (uint lightIndex = groupIndex; lightIndex < totalLights; lightIndex += NUM_THREADS)
    {
        uint lightBin;
        uint lightBit;
        ComputeSharedBinLocationFromLightIndex(lightIndex, lightBin, lightBit);

        [unroll]
        for (uint tileBin = 0; tileBin < NVLC_MAX_BINS; tileBin++)
        {
            // shared_bin is only read in this function, so the word is fetched once and reused
            const uint presentMask = shared_bin[tileBin][lightBin];

            if ((presentMask & lightBit) != 0)
            {
                const uint slot = countbits(presentMask & lowerThreadsMask);
                PassSrg::m_lightListRemapped[writeIndices[tileBin] + slot] = shared_lights[lightIndex];
            }

            // Skip past all lights of this word, including the ones written by other threads
            writeIndices[tileBin] += countbits(presentMask);
        }
    }
}
// Returns the largest number of entries written to any single bin of this tile.
//   groupIndex:   flattened index of the calling thread within its group
//   groupID:      thread group id (not used by this function)
//   baseBin:      offset of the tile's first entry in the remapped light list
//   writeIndices: per-bin write cursors after the lights have been written
// Only thread 0 computes the value; every other thread gets 0.
uint CalculateNumLightsInWorstBin(uint groupIndex, uint3 groupID, uint baseBin, uint writeIndices[NVLC_MAX_BINS])
{
    if (groupIndex != 0)
    {
        return 0;
    }

    uint worst = 0;
    [unroll]
    for (uint bin = 0; bin < NVLC_MAX_BINS; bin++)
    {
        // Distance of the cursor from the start of the bin's range is the bin's light count
        const uint binStart = baseBin + bin * NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN;
        worst = max(worst, writeIndices[bin] - binStart);
    }
    return worst;
}
// Stores the tile's worst-bin light count, plus an overflow flag, in the .w component of m_tileLightData.
//   groupIndex:       flattened index of the calling thread within its group; only thread 0 writes
//   groupID:          thread group id; its xy selects the texel to update
//   lightsInWorstBin: light count of the fullest bin (used for the heatmap)
//   overflow:         true when the lights have overflowed; packed into bit 31
// The other components of the texel are read back and written unchanged.
void WriteTileLightData(const uint groupIndex, const uint3 groupID, const uint lightsInWorstBin, const bool overflow)
{
    if (groupIndex != 0)
    {
        return;
    }

    uint packedCount = lightsInWorstBin;
    if (overflow)
    {
        // pack a "lights have overflowed bit" into this uint
        packedCount |= (1u << 31);
    }

    uint4 tileLightData = PassSrg::m_tileLightData[groupID.xy];
    tileLightData.w = packedCount;
    PassSrg::m_tileLightData[groupID.xy] = tileLightData;
}
// Terminates every bin of the tile by writing NVLC_END_OF_LIST at the bin's current write cursor.
//   groupIndex:   flattened index of the calling thread within its group; only thread 0 writes
//   writeIndices: per-bin write cursors after the lights have been written
void WriteEndOfList(uint groupIndex, uint writeIndices[NVLC_MAX_BINS])
{
    if (groupIndex != 0)
    {
        return;
    }

    [unroll]
    for (uint bin = 0; bin < NVLC_MAX_BINS; bin++)
    {
        const uint endSlot = writeIndices[bin];
        PassSrg::m_lightListRemapped[endSlot] = NVLC_END_OF_LIST;
    }
}
// Points each bin's write cursor at the start of that bin's range in the remapped light list.
//   groupID:      thread group id (not used by this function)
//   baseBin:      offset of the tile's first entry in the remapped light list
//   writeIndices: receives one cursor per bin; consecutive bins are NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN entries apart
void InitWriteIndices(uint3 groupID, uint baseBin, out uint writeIndices[NVLC_MAX_BINS])
{
    const uint binStride = NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN;

    [unroll]
    for (uint bin = 0; bin < NVLC_MAX_BINS; bin++)
    {
        writeIndices[bin] = baseBin + bin * binStride;
    }
}
// This compute shader reads the results of the culling phase and packs them into a tight buffer that is used by the forward pass.
// The primary input is the Light List, which is of size width * height * MAX_LIGHTS_PER_BIN * sizeof(R32_UINT).
// Each R32_UINT contains two items: the light index and a list of bins that the light covers.
// The output is LightListRemapped, which is of size width * height * MAX_LIGHTS_PER_BIN * NUM_BINS * sizeof(R16_UINT).
// It contains only light (and eventually decal) indices along with END_OF_GROUP markers.
// The forward pass will look up which bin the pixel it wants to shade exists in and do a linear walk through the
// light indices until it hits an END_OF_X marker.
// One thread group processes one tile; the group id's xy selects the tile.
// Note that this code could probably be made faster with wave intrinsics
// ATOM-4104
[numthreads(NUM_THREADS, 1, 1)]
void MainCS(
    uint3 dispatchThreadID : SV_DispatchThreadID,
    uint3 groupID : SV_GroupID,
    uint groupIndex : SV_GroupIndex)
{
    // Start from empty presence bits and make the cleared state visible to the whole group
    ClearSharedMemory(groupIndex);
    GroupMemoryBarrierWithGroupSync();

    // The light count is clamped to one less than the per-bin capacity; WriteEndOfList() adds one more
    // entry per bin after the lights.
    // expect flickering if the max number of lights per tile is exceeded
    const uint lightCapacity = NVLC_MAX_POSSIBLE_LIGHTS_PER_BIN - 1;
    const uint reportedLights = PassSrg::m_lightCount.Load(uint3(groupID.xy, 0)).x;
    const bool overflow = reportedLights > lightCapacity;
    const uint totalLights = min(reportedLights, lightCapacity);

    // Gather the tile's lights in groupshared memory, then wait until every thread has contributed
    AssignLightsToSharedMemoryBins(groupIndex, groupID, totalLights);
    GroupMemoryBarrierWithGroupSync();

    // Emit the lights bin by bin into this tile's range of the remapped list
    const uint baseBin = ComputeBaseBin(groupID);
    uint writeIndices[NVLC_MAX_BINS];
    InitWriteIndices(groupID, baseBin, writeIndices);
    WriteLightsToRemappedLightList(groupIndex, groupID, totalLights, writeIndices);

    // The remaining steps only have an effect on thread 0
    const uint lightsInWorstBin = CalculateNumLightsInWorstBin(groupIndex, groupID, baseBin, writeIndices);
    WriteEndOfList(groupIndex, writeIndices);
    WriteTileLightData(groupIndex, groupID, lightsInWorstBin, overflow);
}