You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
331 lines
14 KiB
Plaintext
331 lines
14 KiB
Plaintext
/*
|
|
* Copyright (c) Contributors to the Open 3D Engine Project.
|
|
* For complete copyright and license terms please see the LICENSE at the root of this distribution.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0 OR MIT
|
|
*
|
|
*/
|
|
|
|
/*
|
|
This compute shader analyzes the depth buffer and for each tile, find the min / max depths.
|
|
It also produces a bitmask indicating which z subsections of the tile have geometry inside them
|
|
This shader supports non-MSAA, 2x, 4x and 8x MSAA depth buffers (selected through the o_msaaMode shader option) with reversed depth
|
|
|
|
*/
|
|
|
|
|
|
#include <Atom/RPI/Math.azsli>
|
|
|
|
// We are using the shader variant system to deal with different MSAA modes
|
|
// The shader variant system needs a fallback, using DefaultDrawSrg for this.
|
|
#include <Atom/RPI/ShaderResourceGroups/DefaultDrawSrg.azsli>
|
|
#include <Atom/Features/LightCulling/LightCullingShared.azsli>
|
|
|
|
|
|
// Shader option that selects how MainCS reads PassSrg::m_depthBufferMSAA.
// The shader variant system is used to deal with the different MSAA modes.
option enum class MsaaMode
{
    None,
    Msaa2x,
    Msaa4x,
    Msaa8x
} o_msaaMode;
|
|
|
|
// This compute shader thread-group runs across a screen tile (usually 16x16 pixels)
|
|
// Each pixel in the tile has a z depth value associated with it
|
|
// The minimum/maximum depth values in the tile form a range
|
|
// This range can be subdivided into 32 pieces
|
|
// Each time we find a depth value that falls into that subdivided range, we mark
|
|
// the corresponding bit in a 32-bit bitmask
|
|
// We use this mask to only apply lights to pixels that need it.
|
|
|
|
// It looks something like this:
|
|
// Camera .............|transparent geometry| |opaque geometry| --> Z values
|
|
// | minmaxZ |
|
|
// 32-bit mask |X|X|X|X|X|X|X|X|X|X|X| | | | | | | |X| |X| |X|X|X| |X
|
|
//
|
|
// X indicates areas where geometry was discovered
|
|
|
|
// Bitmask of the z subsections of the tile that contain opaque geometry.
// Every thread ORs its own bits into it (see CombineOpaqueMaskWithAllThreads).
groupshared uint shared_opaqueMask;

// Tile-wide depth range reduced across the thread-group. The floats are stored as their raw bit patterns
// (asuint) so they can be combined with DEPTH_InterlockedMin / DEPTH_InterlockedMax.
// MainCS stores the opaque range expanded by the transparent range here.
groupshared uint shared_depthNear;
groupshared uint shared_depthFar;

// Tile-wide depth range of the transparent geometry only, shared the same way
groupshared uint shared_depthTransparentNear;
groupshared uint shared_depthTransparentFar;
|
|
|
|
|
|
// Merges one thread's depth range into the tile-wide range held in group-shared memory.
// data.x is reduced into shared_depthNear and data.y into shared_depthFar.
void StoreMinMaxIntoSharedMemory(float2 data)
{
    const uint nearBits = asuint(data.x);
    const uint farBits = asuint(data.y);

    // Receives the value handed back by the atomics; it is not used afterwards
    uint previousValue;
    DEPTH_InterlockedMin(shared_depthNear, nearBits, previousValue);
    DEPTH_InterlockedMax(shared_depthFar, farBits, previousValue);
}
|
|
|
|
// Merges one thread's transparent depth range into the tile-wide transparent range held in group-shared memory.
// data.x is reduced into shared_depthTransparentNear and data.y into shared_depthTransparentFar.
void StoreTransparentMinMaxIntoSharedMemory(float2 data)
{
    const uint nearBits = asuint(data.x);
    const uint farBits = asuint(data.y);

    // Receives the value handed back by the atomics; it is not used afterwards
    uint previousValue;
    DEPTH_InterlockedMin(shared_depthTransparentNear, nearBits, previousValue);
    DEPTH_InterlockedMax(shared_depthTransparentFar, farBits, previousValue);
}
|
|
|
|
// Per-pass resources read and written by this compute shader
ShaderResourceGroup PassSrg : SRG_PerPass
{
    // Multisampled depth buffer the opaque depth samples are loaded from.
    // Only sample 0 is read when o_msaaMode is MsaaMode::None.
    Texture2DMS<float4> m_depthBufferMSAA;

    // Depth buffer where we rendered the nearest pixels of transparent objects
    Texture2D<float4> m_depthBufferTransparentMin;

    // Depth buffer where we rendered the furthest pixels of transparent objects
    Texture2D<float4> m_depthBufferTransparentMax;

    // Output, one texel per thread-group (see WriteTileLightDataToMainMemory):
    //   .x = bits of the tile's min Z
    //   .y = bits of the tile's max Z, with the NVLC_BINS_MASK bits replaced by the log2 bin count
    //   .z = combined opaque and transparent z bitmask
    //   .w = 0
    RWTexture2D<uint4> m_tileLightData;

    struct Constants
    {
        // Used to convert from raw depth buffer values to Z values
        float2 m_unprojectZ;

        // Depth buffer dimensions; dispatch thread IDs at or beyond them are treated as off-screen
        uint m_depthBufferWidth;
        uint m_depthBufferHeight;
    };
    Constants m_constantData;
}
|
|
|
|
// Resets the group-shared reduction state. Only the thread with groupIndex 0 performs the writes;
// the function itself issues no barrier, so the caller synchronizes afterwards.
void ClearSharedMemory(uint groupIndex)
{
    const bool isFirstThreadInGroup = (groupIndex == 0);
    if (isFirstThreadInGroup)
    {
        shared_opaqueMask = 0;

        // Near values start at DEPTH_FAR and far values at DEPTH_NEAR so that any stored depth replaces them
        shared_depthNear = asuint(DEPTH_FAR);
        shared_depthTransparentNear = asuint(DEPTH_FAR);
        shared_depthFar = asuint(DEPTH_NEAR);
        shared_depthTransparentFar = asuint(DEPTH_NEAR);
    }
}
|
|
|
|
// Loads MSAA samples 0 and 1 of the depth buffer at pixel uv.
// Returns sample 0 in .x and sample 1 in .y.
float2 ReadOpaqueDepthSamples2xMSAA(uint2 uv)
{
    const float sample0 = PassSrg::m_depthBufferMSAA.Load(uv, 0).x;
    const float sample1 = PassSrg::m_depthBufferMSAA.Load(uv, 1).x;
    return float2(sample0, sample1);
}
|
|
|
|
// Loads MSAA samples 0 through 3 of the depth buffer at pixel uv.
// Returns them in .x, .y, .z and .w respectively.
float4 ReadOpaqueDepthSamples4xMSAA(uint2 uv)
{
    const float sample0 = PassSrg::m_depthBufferMSAA.Load(uv, 0).x;
    const float sample1 = PassSrg::m_depthBufferMSAA.Load(uv, 1).x;
    const float sample2 = PassSrg::m_depthBufferMSAA.Load(uv, 2).x;
    const float sample3 = PassSrg::m_depthBufferMSAA.Load(uv, 3).x;
    return float4(sample0, sample1, sample2, sample3);
}
|
|
|
|
// ORs this thread's opaque bitmask into shared_opaqueMask, then waits at a group barrier so that
// shared_opaqueMask holds the contribution of every thread once this function returns.
// Because of the barrier, every thread in the group has to call this function.
void CombineOpaqueMaskWithAllThreads(uint opaqueMask)
{
    InterlockedOr(shared_opaqueMask, opaqueMask);

    GroupMemoryBarrierWithGroupSync();
}
|
|
|
|
// Given the locations of two bits in a uint (each in the range [0, 31]), return a bitmask that contains 1s
// from the lower location through the higher location (inclusive) and 0s elsewhere.
// The two locations may be passed in either order.
// e.g. CreateFilledBitmask(0,31) produces 0b11111111111111111111111111111111
// e.g. CreateFilledBitmask(1,30) produces 0b01111111111111111111111111111110
// e.g. CreateFilledBitmask(30,1) produces 0b01111111111111111111111111111110
uint CreateFilledBitmask(uint startBit, uint endBit)
{
    // Order the inputs so a reversed range fills the same bits instead of producing an empty mask
    const uint lowBit = (startBit < endBit) ? startBit : endBit;
    const uint highBit = (startBit < endBit) ? endBit : startBit;

    // Unsigned literals keep the shifts in unsigned arithmetic, including for a shift amount of 31
    const uint lowFlag = 1u << lowBit;
    const uint highFlag = 1u << highBit;

    // Every bit up to and including highBit, with every bit strictly below lowBit removed
    return ((highFlag - 1u) | highFlag) & ~(lowFlag - 1u);
}
|
|
|
|
// Produces a bitmask indicating which parts of the tile (along z) contain transparent geometry.
// Note that minmaxZ includes both the opaque and transparent parts.
uint ComputeTransparentBitMask(float2 minmaxZ)
{
    const float transparentNearDepth = asfloat(shared_depthTransparentNear);
    const float transparentFarDepth = asfloat(shared_depthTransparentFar);

    // The near value still being DEPTH_FAR (the value ClearSharedMemory resets it to) means
    // that no transparent pixels were drawn in this tile.
    if (transparentNearDepth == DEPTH_FAR)
    {
        return 0;
    }

    const float2 transparentDepthRange = float2(transparentNearDepth, transparentFarDepth);
    const float2 transparentZRange = DepthBufferToViewSpace(transparentDepthRange, PassSrg::m_constantData.m_unprojectZ);

    // Express the transparent range relative to the full range of the tile
    const float2 transparentUnitRange = RemapZToUnit(transparentZRange, minmaxZ);

    // Convert that 0 to 1 to 0.0 to 31.99999
    const float2 bitRange = Tile_UnitValueToBit(transparentUnitRange);

    // IMPORTANT: we have only opaque depth and closest pixels for transparency, so we don't know
    // what is going on in between. We have to fill this "unknown" region to avoid skipping lights.
    // This is not needed if an app does an additional pass to get bits from all transparent layers.
    // An additional permutation is required for this case.
    return CreateFilledBitmask(bitRange.x, bitRange.y);
}
|
|
|
|
// Returns the union of the opaque bitmask gathered in shared_opaqueMask and the
// transparent bitmask computed for the tile's z range minmaxZ.
uint ComputeOpaqueAndTransparentMask(float2 minmaxZ)
{
    const uint transparentMask = ComputeTransparentBitMask(minmaxZ);
    return shared_opaqueMask | transparentMask;
}
|
|
|
|
// Packs the tile's z range, bin count and z bitmask into a uint4 and writes it to
// PassSrg::m_tileLightData at the thread-group's coordinates. Only the thread with groupIndex 0 writes.
void WriteTileLightDataToMainMemory(uint groupIndex, uint3 groupID, float2 minmaxZ)
{
    if (groupIndex != 0)
    {
        return;
    }

    // Note that we could be in a RH coordinate system in which case minmaxZ.x is > minmaxZ.y, hence the abs() call
    const float tileDepthExtent = abs(minmaxZ.y - minmaxZ.x);

    // Note that Nvidia Light Culling framework has some additional code to better calculate this ratio
    // See cb_perFrame.fRangeThreshold in their framework. This code might be something we want to port over.
    // ATOM-5554
    const float ratio = tileDepthExtent / 1.0f;
    const float clampedLogBins = clamp(log2(ratio), 0.0, LOG_MAX_BINS);

    // Round to the nearest integer
    const uint roundedLogBins = uint(clampedLogBins + 0.5);

    // The bin count takes the place of the NVLC_BINS_MASK bits of the second Z value
    const uint packedFarZ = (asuint(minmaxZ.y) & ~NVLC_BINS_MASK) | roundedLogBins;
    const uint depthMask = ComputeOpaqueAndTransparentMask(minmaxZ);

    PassSrg::m_tileLightData[groupID.xy] = uint4(asuint(minmaxZ.x), packedFarZ, depthMask, 0);
}
|
|
|
|
// Reads the transparent depth range for pixel uv.
// Returns the value from m_depthBufferTransparentMin in .x and from m_depthBufferTransparentMax in .y.
float2 ReadTransparentMinMaxMSAA(uint2 uv)
{
    // Nvidia light culling sample does fewer samples for transparency with the note that it
    // should still be good enough with MSAA as sampling from MSAA textures can be expensive.
    // Doing 1 tap for min/max textures seems pretty precise in AtomSampleViewer test-scene.
    // More experimentation with more complex scenes would be ideal.

    // Texture2D::Load takes the mip level as the third coordinate; mip 0 is read
    const uint3 texelAndMip = uint3(uv, 0);

    const float nearestTransparent = PassSrg::m_depthBufferTransparentMin.Load(texelAndMip).x;
    const float furthestTransparent = PassSrg::m_depthBufferTransparentMax.Load(texelAndMip).x;

    return float2(nearestTransparent, furthestTransparent);
}
|
|
|
|
// We want to compute the minmax bounding values for a tile to get tight bounds for accurate culling.
|
|
// Each thread will call this function to share its values with every other thread, replacing them with the minmax for all threads
|
|
void UpdateMinMaxFromAllThreads(in out float2 minmaxDepth_both, in out float2 transparentMinMax, const bool isPixelOnScreen)
{
    // Only threads that map to a real pixel contribute their values to the thread-group
    if (isPixelOnScreen)
    {
        StoreMinMaxIntoSharedMemory(minmaxDepth_both);
        StoreTransparentMinMaxIntoSharedMemory(transparentMinMax);
    }

    // Every thread waits here, contributing or not, so that all stores are done before the read-back
    GroupMemoryBarrierWithGroupSync();

    // Replace this thread's values with the ones reduced across the whole thread-group
    minmaxDepth_both.x = asfloat(shared_depthNear);
    minmaxDepth_both.y = asfloat(shared_depthFar);
    transparentMinMax.x = asfloat(shared_depthTransparentNear);
    transparentMinMax.y = asfloat(shared_depthTransparentFar);
}
|
|
|
|
// When the first opaque sample equals DEPTH_FAR and transparentMinMax.y is not DEPTH_NEAR, returns
// transparentMinMax.y in both components. Otherwise returns the opaque samples unchanged.
// Only the first sample (.x) is tested.
float2 ReplaceSkyPixelsWithFurthestPixelsFromTransparentObjects2x(float2 opaqueDepthSamples, float2 transparentMinMax)
{
    const bool firstSampleIsFar = (opaqueDepthSamples.x == DEPTH_FAR);
    const bool transparentFarIsNotNear = (transparentMinMax.y != DEPTH_NEAR);

    if (firstSampleIsFar && transparentFarIsNotNear)
    {
        return transparentMinMax.yy;
    }
    return opaqueDepthSamples;
}
|
|
|
|
// When the first opaque sample equals DEPTH_FAR and transparentMinMax.y is not DEPTH_NEAR, returns
// transparentMinMax.y in all four components. Otherwise returns the opaque samples unchanged.
// Only the first sample (.x) is tested.
float4 ReplaceSkyPixelsWithFurthestPixelsFromTransparentObjects4x(float4 opaqueDepthSamples, float2 transparentMinMax)
{
    const bool firstSampleIsFar = (opaqueDepthSamples.x == DEPTH_FAR);
    const bool transparentFarIsNotNear = (transparentMinMax.y != DEPTH_NEAR);

    if (firstSampleIsFar && transparentFarIsNotNear)
    {
        return transparentMinMax.yyyy;
    }
    return opaqueDepthSamples;
}
|
|
|
|
// Returns the float whose bit pattern is the bit pattern of x plus one
float IncrementULP(float x)
{
    const uint bumpedBits = asuint(x) + 1;
    return asfloat(bumpedBits);
}
|
|
|
|
// Returns the range that encloses both input ranges:
// .x is DEPTH_MIN of the two .x values, .y is DEPTH_MAX of the two .y values.
float2 ExpandMinMax(float2 minmaxA, float2 minmaxB)
{
    return float2(
        DEPTH_MIN(minmaxA.x, minmaxB.x),
        DEPTH_MAX(minmaxA.y, minmaxB.y));
}
|
|
|
|
// Returns true when the thread's x/y dispatch ID lies inside the depth buffer dimensions
// given by PassSrg::m_constantData.
bool IsPixelOnScreen(uint3 dispatchThreadID)
{
    const bool insideWidth = dispatchThreadID.x < PassSrg::m_constantData.m_depthBufferWidth;
    const bool insideHeight = dispatchThreadID.y < PassSrg::m_constantData.m_depthBufferHeight;
    return insideWidth && insideHeight;
}
|
|
|
|
// Note that Nvidia light culling framework will dispatch different sized groups depending upon MSAA mode
|
|
// For instance, they dispatch 32 threads for 1x, 64 for 2x, 128 for 4x and 8x.
|
|
// This allows for some interesting optimizations, such as calling GatherRed instead of Load.
|
|
// I am dispatching one thread per screen pixel, which simplifies coding but we might be leaving some performance on the table
|
|
// Entry point. One thread-group covers one screen tile and one thread covers one pixel.
// The group reduces the depth range of its tile, builds a bitmask of the occupied z subsections,
// and the thread with groupIndex 0 writes the result to PassSrg::m_tileLightData[groupID.xy].
[numthreads(TILE_DIM_X, TILE_DIM_Y, 1)]
void MainCS(
    uint3 dispatchThreadID : SV_DispatchThreadID,
    uint3 groupID : SV_GroupID,
    uint groupIndex : SV_GroupIndex)
{
    // Threads of a partial tile at the screen edge can fall outside the depth buffer
    const bool isPixelOnScreen = IsPixelOnScreen(dispatchThreadID);

    ClearSharedMemory(groupIndex);
    GroupMemoryBarrierWithGroupSync();

    // Starts out holding depth buffer values and is converted to view space Z further down.
    // After the conversion:
    // .x contains the camera Z value of the pixel in the tile that is closest to the camera
    // .y contains the furthest
    float2 minmaxDepth_both = 0;

    // Bitmask where each bit indicates where opaque geometry was found. See the diagram at the top of this file.
    uint maskOpaque = 0;

    float2 minmaxDepth_transparent = ReadTransparentMinMaxMSAA(dispatchThreadID.xy);

    if (o_msaaMode == MsaaMode::None || o_msaaMode == MsaaMode::Msaa2x)
    {
        float2 opaqueDepthSamples;
        if (o_msaaMode == MsaaMode::None)
        {
            // Without MSAA there is a single sample; duplicate it so the 2-sample code path can be shared
            float depth = PassSrg::m_depthBufferMSAA.Load(dispatchThreadID.xy, 0).x;
            opaqueDepthSamples = float2(depth, depth);
        }
        else
        {
            // The enclosing condition leaves MsaaMode::Msaa2x as the only other possibility
            opaqueDepthSamples = ReadOpaqueDepthSamples2xMSAA(dispatchThreadID.xy);
        }

        // Transparent geometry can't be behind opaque geometry. Just pick the first MSAA sample since we are trying to reduce reading from MSAA buffer
        minmaxDepth_transparent = DEPTH_MIN(minmaxDepth_transparent, opaqueDepthSamples.x);
        opaqueDepthSamples = ReplaceSkyPixelsWithFurthestPixelsFromTransparentObjects2x(opaqueDepthSamples, minmaxDepth_transparent);
        float2 minmaxDepth_opaque = ComputeDepthMinMaxFrom2Samples(opaqueDepthSamples);
        minmaxDepth_both = ExpandMinMax(minmaxDepth_opaque, minmaxDepth_transparent);
        UpdateMinMaxFromAllThreads(minmaxDepth_both, minmaxDepth_transparent, isPixelOnScreen);
        minmaxDepth_both = DepthBufferToViewSpace(minmaxDepth_both, PassSrg::m_constantData.m_unprojectZ);

        // if zNear == zFar we want to map z == zNear to 0-bit, so we have to keep zNear without modifications
        minmaxDepth_both.y = IncrementULP(minmaxDepth_both.y);

        maskOpaque = DepthSamplesToBinMask2x(opaqueDepthSamples, minmaxDepth_both, PassSrg::m_constantData.m_unprojectZ);
    }
    // Note that Nvidia light culling sample uses the 4x code for 8x msaa, with a note that msaa is obviously very expensive and taking 4x samples should still be accurate enough
    else if (o_msaaMode == MsaaMode::Msaa4x || o_msaaMode == MsaaMode::Msaa8x)
    {
        float4 opaqueDepthSamples = ReadOpaqueDepthSamples4xMSAA(dispatchThreadID.xy);

        // Transparent geometry can't be behind opaque geometry. Just pick the first MSAA sample since we are trying to reduce reading from MSAA buffer
        minmaxDepth_transparent = DEPTH_MIN(minmaxDepth_transparent, opaqueDepthSamples.x);
        opaqueDepthSamples = ReplaceSkyPixelsWithFurthestPixelsFromTransparentObjects4x(opaqueDepthSamples, minmaxDepth_transparent);
        float2 minmaxDepth_opaque = ComputeDepthMinMaxFrom4Samples(opaqueDepthSamples);
        minmaxDepth_both = ExpandMinMax(minmaxDepth_opaque, minmaxDepth_transparent);
        UpdateMinMaxFromAllThreads(minmaxDepth_both, minmaxDepth_transparent, isPixelOnScreen);
        minmaxDepth_both = DepthBufferToViewSpace(minmaxDepth_both, PassSrg::m_constantData.m_unprojectZ);

        // if zNear == zFar we want to map z == zNear to 0-bit, so we have to keep zNear without modifications
        minmaxDepth_both.y = IncrementULP(minmaxDepth_both.y);

        maskOpaque = DepthSamplesToBinMask4x(opaqueDepthSamples, minmaxDepth_both, PassSrg::m_constantData.m_unprojectZ);
    }

    // Off-screen threads are left out of the min/max reduction, so leave them out of the bitmask as well:
    // their depth samples were loaded from outside the depth buffer and do not describe real geometry.
    // Every thread still makes the call because it contains a group barrier.
    CombineOpaqueMaskWithAllThreads(isPixelOnScreen ? maskOpaque : 0);
    WriteTileLightDataToMainMemory(groupIndex, groupID, minmaxDepth_both);
}
|