You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
o3de/Gems/Atom/Feature/Common/Assets/Shaders/PostProcessing/SsaoCompute.azsl

405 lines
15 KiB
Plaintext

/*
* Copyright (c) Contributors to the Open 3D Engine Project.
* For complete copyright and license terms please see the LICENSE at the root of this distribution.
*
* SPDX-License-Identifier: Apache-2.0 OR MIT
*
*/
#include <Atom/Features/SrgSemantics.azsli>
#include <scenesrg.srgi>
#include <viewsrg.srgi>
#include <Atom/RPI/Math.azsli>
// Thread group edge length: MainCS is dispatched with THREADS x THREADS x 1 threads per group
#define THREADS 16
// Total number of threads in a group
#define NUM_THREADS (THREADS * THREADS)
// Width in pixels of the border gathered around the group's tile so that
// Phase I can read depth values belonging to neighboring tiles
#define PADDING (THREADS / 2)
// Edge length in pixels of the square depth region cached in LDS
#define LDS_WIDTH (PADDING + THREADS + PADDING)
// LDS_SIZE = LDS_WIDTH * LDS_WIDTH = 32 * 32 = 1024
// The value is written as a literal; keep it in sync with LDS_WIDTH if THREADS or PADDING change
#define LDS_SIZE 1024
// LDS (local data store) will store depths from 32x32 region around the group in Phase I
// of SSAO and then the sampled positions for sharing with neighbor threads in the Phase II
groupshared float LDS[LDS_SIZE];
// Number of AO samples to calculate from local depth values stored in LDS in Phase I
// This value affects the size of the first loop which takes sample positions from
// localSamplesForLDS array defined below, and should therefore be set to a multiple of
// 4 and never exceed a value of 24, which is the size of the array.
#define LOCAL_SAMPLE_COUNT 12
// In Phase II we use additional samples from the depth input texture. Groups of 4x4 threads
// will fetch samples in an outer loop and share these samples with each other in an inner loop.
// This sharing is accomplished by writing the samples to LDS prior to the inner loop:
// INNER_SAMPLE_COUNT is the number of samples accumulated per outer iteration
// (the thread's own sample plus INNER_SAMPLE_COUNT - 1 samples read back from LDS)
#define INNER_SAMPLE_COUNT 4
// Number of outer iterations; each one fetches one new depth sample per thread
#define OUTER_SAMPLE_LOOPS 10
// Subgroups are groups of 4x4 threads. Because 16 consecutive threads execute simultaneously,
// this removes the need for memory barriers when these groups write and read from LDS
// NOTE(review): this relies on the hardware executing at least 16 consecutive threads in
// lockstep - confirm for every GPU/backend this shader is expected to run on
#define THREADS_PER_SUBGROUP 16
// Number of 4x4 subgroups in a thread group
#define SUBGROUP_COUNT (NUM_THREADS / THREADS_PER_SUBGROUP)
// Angle added to a subgroup's sampling angle after every outer iteration
// (TWO_PI is not defined in this file; presumably it comes from the included math header)
#define OUTER_SAMPLE_ANGLE (TWO_PI / OUTER_SAMPLE_LOOPS)
// Scale applied to each thread's sample factor to offset its angle within a subgroup
#define OUTER_SAMPLE_ANGLE_STEP (OUTER_SAMPLE_ANGLE / THREADS_PER_SUBGROUP)
// Pixel offsets, relative to the current pixel's LDS position, of the depth samples read in Phase I.
// Only the first LOCAL_SAMPLE_COUNT entries are consumed, in the order listed here.
// No offset exceeds 8 pixels on either axis, which matches the PADDING border stored in LDS.
static const int2 localSamplesForLDS[] =
{
    // 4 pixels away along the axes
    int2( 0, -4), int2(-4,  0), int2( 4,  0), int2( 0,  4),
    // 4 pixels away along the diagonals
    int2(-4, -4), int2( 4, -4), int2(-4,  4), int2( 4,  4),
    // 8 pixels away along the axes
    int2( 0, -8), int2(-8,  0), int2( 8,  0), int2( 0,  8),
    // 4 pixels horizontally, 8 pixels vertically
    int2(-4, -8), int2( 4, -8), int2(-4,  8), int2( 4,  8),
    // 8 pixels horizontally, 4 pixels vertically
    int2(-8, -4), int2(-8,  4), int2( 8, -4), int2( 8,  4),
    // 8 pixels away along the diagonals
    int2(-8, -8), int2( 8, -8), int2(-8,  8), int2( 8,  8),
};
// Per-pass resources for the SSAO compute shader: the depth input, the AO output,
// the constants supplied by the CPU and the sampler used to read depth.
ShaderResourceGroup PassSrg : SRG_PerPass
{
// Input depth; MainCS reads it with Gather (Phase I) and SampleLevel (Phase II)
// and feeds the values to ViewSrg::GetViewSpacePosition
Texture2D<float> m_linearDepth;
// Output; MainCS writes one value per pixel (1.0 minus the scaled AO, gamma compensated)
RWTexture2D<float> m_ssaoOutput;
// Must match the struct in SsaoPasses.cpp
// (member order and types are part of that contract - do not reorder)
struct SsaoConstants
{
// The texture dimensions of SSAO output
uint2 m_outputSize;
// The size of a pixel relative to screenspace UV
// Calculated by taking the inverse of the texture dimensions
float2 m_pixelSize;
// The size of half a pixel relative to screenspace UV
float2 m_halfPixelSize;
// The strength of the SSAO effect
// MainCS multiplies the computed AO by this value before subtracting it from 1.0
float m_strength;
// The sampling radius calculated in screen UV space
// MainCS multiplies it by the output height to obtain the Phase II radius in pixels
float m_samplingRadius;
};
SsaoConstants m_constants;
// Point filtering with clamped addressing, so depth values are never blended
// and lookups outside the texture return the nearest edge texel
Sampler PointSampler
{
MinFilter = Point;
MagFilter = Point;
MipFilter = Point;
AddressU = Clamp;
AddressV = Clamp;
AddressW = Clamp;
};
}
// Returns the SSAO output texture dimensions, converted from uint2 to float2
float2 GetOutputSize()
{
    return float2(PassSrg::m_constants.m_outputSize);
}
// Returns the size of one output pixel in screen space UV units
float2 GetPixelSize()
{
    const float2 pixelSize = PassSrg::m_constants.m_pixelSize;
    return pixelSize;
}
// Returns the size of half an output pixel in screen space UV units
float2 GetHalfPixelSize()
{
    const float2 halfPixelSize = PassSrg::m_constants.m_halfPixelSize;
    return halfPixelSize;
}
// --- LDS DEPTH ---
// In the first phase of SSAO, we store 32x32 depth samples in LDS.
// We use these depth values to reconstruct view space positions and
// normals from which we can then calculate SSAO contribution
// Converts a 2D position inside the LDS_WIDTH x LDS_WIDTH depth region
// into the row-major linear index used to address the LDS array
int GetLdsIndex(int2 index)
{
    const int rowStart = index.y * LDS_WIDTH;
    return rowStart + index.x;
}
// Stores the four depth values returned by a texture Gather into the 2x2 block of LDS
// whose upper left corner is ldsPosition:
//   w -> (x, y)       z -> (x + 1, y)
//   x -> (x, y + 1)   y -> (x + 1, y + 1)
void WriteDepthGatherToLDS(int2 ldsPosition, float4 depthGather)
{
    const int2 upperLeft  = ldsPosition;
    const int2 upperRight = ldsPosition + int2(1, 0);
    const int2 lowerRight = ldsPosition + int2(1, 1);
    const int2 lowerLeft  = ldsPosition + int2(0, 1);

    LDS[GetLdsIndex(upperLeft)]  = depthGather.w;
    LDS[GetLdsIndex(upperRight)] = depthGather.z;
    LDS[GetLdsIndex(lowerRight)] = depthGather.y;
    LDS[GetLdsIndex(lowerLeft)]  = depthGather.x;
}
// Reconstructs a position from the depth cached in LDS at ldsPosition.
//   ldsPosition - 2D position inside the depth region stored in LDS
//   ldsOffsetUV - screen space UV of the LDS region's upper left pixel
// The result is whatever ViewSrg::GetViewSpacePosition returns for that UV and depth.
float3 GetPositionFromDepthLDS(int2 ldsPosition, float2 ldsOffsetUV)
{
    // UV of this LDS entry: region origin plus the entry's offset in pixels
    const float2 uv = mad(float2(ldsPosition), GetPixelSize(), ldsOffsetUV);
    const float depth = LDS[GetLdsIndex(ldsPosition)];
    return ViewSrg::GetViewSpacePosition(uv, depth);
}
// Estimates the surface normal at ldsPosition from the depth values cached in LDS.
//   ldsPosition - 2D position of the pixel inside the depth region stored in LDS
//   ldsOffsetUV - screen space UV of the LDS region's upper left pixel
//   position    - already reconstructed position of the pixel at ldsPosition
// The normal is the normalized cross product of a horizontal and a vertical position
// derivative. For each axis both one-sided derivatives are evaluated and the one with
// the smaller depth change is used, which avoids differencing against a neighboring
// pixel that belongs to a separate object.
float3 CalculateNormalFromLDS(int2 ldsPosition, float2 ldsOffsetUV, float3 position)
{
    const int2 stepX = int2(1, 0);
    const int2 stepY = int2(0, 1);

    // Horizontal derivative: default to the right-hand side, switch to the left if it is flatter
    const float3 fromLeft = position - GetPositionFromDepthLDS(ldsPosition - stepX, ldsOffsetUV);
    const float3 toRight = GetPositionFromDepthLDS(ldsPosition + stepX, ldsOffsetUV) - position;
    float3 derivativeX = toRight;
    if (abs(fromLeft.z) < abs(toRight.z))
    {
        derivativeX = fromLeft;
    }

    // Vertical derivative: default to the lower side, switch to the upper if it is flatter
    const float3 fromUp = position - GetPositionFromDepthLDS(ldsPosition - stepY, ldsOffsetUV);
    const float3 toDown = GetPositionFromDepthLDS(ldsPosition + stepY, ldsOffsetUV) - position;
    float3 derivativeY = toDown;
    if (abs(fromUp.z) < abs(toDown.z))
    {
        derivativeY = fromUp;
    }

    return normalize(cross(derivativeX, derivativeY));
}
// --- LDS POSITION ---
// In the second phase of SSAO, we store reconstructed positions in LDS.
// Positions take up 3x more space, but we only need 16x16 (4x less than
// the number of depth samples). To write positions, we store the X component
// in the first quarter of LDS, the Y component in the second and Z in the third
// Dividing this way yields better performance as it avoids memory bank conflicts
// The fourth quarter of LDS is not used by the position helpers below.
// LDS_QUARTER_SIZE = LDS_SIZE / 4 = 256, i.e. one slot per thread of the 16x16 group
#define LDS_QUARTER_SIZE (LDS_SIZE / 4)
// Stores a position in LDS, one component per quarter of the array:
// X at index, Y one quarter further on and Z two quarters further on
void WritePositionToLDS(uint index, float3 position)
{
    const uint slotX = index;
    const uint slotY = slotX + LDS_QUARTER_SIZE;
    const uint slotZ = slotY + LDS_QUARTER_SIZE;

    LDS[slotX] = position.x;
    LDS[slotY] = position.y;
    LDS[slotZ] = position.z;
}
// Reads back a position stored by WritePositionToLDS:
// X from index, Y one quarter of LDS further on and Z two quarters further on
float3 GetPositionFromLDS(uint index)
{
    return float3(
        LDS[index],
        LDS[index + LDS_QUARTER_SIZE],
        LDS[index + (LDS_QUARTER_SIZE * 2)]);
}
// Advances linearIndex by increment inside its block of 16 consecutive indices,
// wrapping around at the end of the block. Bits 4-7 of linearIndex select the
// block and are preserved; only the low 4 bits (the slot in the block) change.
uint GetNextLinearIndex(uint linearIndex, uint increment)
{
    const uint blockBase = linearIndex & 0xF0u;
    const uint slotInBlock = (linearIndex + increment) & 0x0Fu;
    // The two values occupy disjoint bits, so combining them with OR equals adding them
    return blockBase | slotInBlock;
}
// --- Occlusion ---
// Adds occlusion from a potential occluder to the current thread with the provided surface position and normal
//   position         - reconstructed position of the pixel being shaded
//   normal           - surface normal at that pixel
//   occluderPosition - reconstructed position of the potential occluder
//   totalOcclusion   - [in/out] running sum of weighted occlusion
//   totalWeight      - [in/out] running sum of the weights, used later to normalize totalOcclusion
void AccumulateOcclusion(float3 position, float3 normal, float3 occluderPosition, inout float totalOcclusion, inout float totalWeight)
{
    // Get the vector from the pixel position to the occluder position
    float3 diff = (occluderPosition - position);
    float distanceSq = dot(diff, diff);

    // An occluder that coincides with the pixel (for example a sample that lands on the pixel's
    // own texel) has a distance of exactly zero. rsqrt(0) is infinite and 0 * infinity is NaN,
    // and how saturate() treats NaN is not the same on every backend. Use a zero inverse distance
    // in that case so the sample contributes no occlusion but still receives its regular weight.
    float invDistance = (distanceSq > 0.0f) ? rsqrt(distanceSq) : 0.0f;

    // Only consumed by the alternative weighting options listed below
    float invDistanceSq = invDistance * invDistance;

    // Calculate the cosine falloff based on occluder direction and surface normal
    float3 direction = diff * invDistance;
    float cosineFalloff = dot(direction, normal);

    float weight;

    // --- TODO ---
    // When blur, temporal reprojection and the SSAO AtomSampleViewer sample are finished, revisit and test
    // these various weighting strategies to see which produces the best results

    // Option 1:
    // weight = cosineFalloff >= -0.00001f ? invDistanceSq : 0.0f;

    // Option 2:
    // weight = invDistanceSq;

    // Option 3:
    // weight = saturate(1.0f + cosineFalloff);
    // weight *= weight;
    // weight *= invDistanceSq;

    // Option 4:
    // Inverse squared distance, biased by minDistanceSq so the weight stays bounded for very close occluders
    const float minDistanceSq = 0.07f;
    weight = rsqrt(distanceSq + minDistanceSq);
    weight *= weight;

    // Option 5: option 3 + 4
    // const float minDistanceSq = 0.12f;
    // invDistanceSq = rsqrt(distanceSq + minDistanceSq);
    // invDistanceSq *= invDistanceSq;
    // weight = saturate(1.0f + cosineFalloff);
    // weight *= weight;
    // weight *= invDistanceSq;

    // --- END TODO ---

    // Accumulate AO and weight
    // Occluders behind the surface (negative cosine) add weight but no occlusion
    totalOcclusion += saturate(cosineFalloff) * weight;
    totalWeight += weight;
}
// SSAO compute entry point. Every thread group handles one THREADS x THREADS tile of the output:
//   1. The group gathers the depth of the 32x32 region around its tile into LDS.
//   2. Each thread reconstructs the position and normal of its pixel from LDS.
//   3. Phase I accumulates occlusion from nearby depth samples held in LDS.
//   4. Phase II fetches further depth samples from the texture and shares the reconstructed
//      positions between the 16 threads of a 4x4 subgroup through LDS.
//   5. The normalized occlusion is scaled by the strength, inverted, gamma compensated and written out.
[numthreads(THREADS, THREADS, 1)]
void MainCS(uint3 thread_id : SV_GroupThreadID, uint3 group_id : SV_GroupID, uint3 dispatch_id: SV_DispatchThreadID, uint linear_id : SV_GroupIndex)
{
    // LDS covers 32x32 pixels. This is the on screen position of the upper left pixel in the 32x32 group.
    float2 ldsOffsetPixel = mad(float2(group_id.xy), THREADS, -PADDING);

    // The screen space UV of the above calculated pixel offset
    float2 ldsOffsetUV = mad(ldsOffsetPixel, GetPixelSize(), GetHalfPixelSize());

    // Write depth to LDS
    {
        // Each thread will gather 4 depth values, so space them apart
        int2 ldsPosition = int2(thread_id.xy) << 1;

        // Gather depth values
        float2 depthGatherUV = mad(float2(ldsPosition), GetPixelSize(), ldsOffsetUV);

        // Move from the pixel center to the corner shared by the 2x2 block being gathered.
        // Gather on some GPUs will fall onto same pixels in adjacent coordinates due to rounding errors
        depthGatherUV += GetHalfPixelSize();

        float4 depthGather = PassSrg::m_linearDepth.Gather(PassSrg::PointSampler, depthGatherUV);
        WriteDepthGatherToLDS(ldsPosition, depthGather);
    }

    // Sync after LDS
    GroupMemoryBarrierWithGroupSync();

    // Reshuffle thread indices so groups of 16 consecutive threads form 4x4 subgroups
    // Initially 16 consecutive threads form a row as such:
    //
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    //
    // After the reshuffle the threads will be in a 4x4 pixel group like so:
    //
    // 0 1 4 5 | 0 1 4 5 | 0 1 4 5 | 0 1 4 5
    // 2 3 6 7 | 2 3 6 7 | 2 3 6 7 | 2 3 6 7
    // 8 9 C D | 8 9 C D | 8 9 C D | 8 9 C D
    // A B E F | A B E F | A B E F | A B E F
    //
    uint2 subgroup = uint2((linear_id >> 4) & 3, (linear_id >> 6) & 3);
    uint2 threadSubID;
    threadSubID.x = (linear_id & 1) + ((linear_id >> 1) & 2);
    threadSubID.y = ((linear_id >> 1) & 1) + ((linear_id >> 2) & 2);
    uint2 newThreadID = mad(subgroup, 4, threadSubID);

    // Position + Normal
    int2 ldsPosition = int2(newThreadID.xy) + int2(PADDING, PADDING);
    float3 position = GetPositionFromDepthLDS(ldsPosition, ldsOffsetUV);
    float3 normal = CalculateNormalFromLDS(ldsPosition, ldsOffsetUV, position);

    float totalOcclusion = 0.0f;
    float totalWeight = 0.0f;

    // PHASE I: accumulates AO from local depth samples stored in LDS
    [unroll]
    for (uint i = 0; i < LOCAL_SAMPLE_COUNT; ++i)
    {
        float3 occluderPosition = GetPositionFromDepthLDS(ldsPosition + localSamplesForLDS[i], ldsOffsetUV);
        AccumulateOcclusion(position, normal, occluderPosition, totalOcclusion, totalWeight);
    }

    // Note: this Phase I result is currently overwritten after Phase II; it is kept
    // because the commented lines below (and their counterpart after Phase II) rely on it
    totalWeight = max(totalWeight, EPSILON);
    float finalAO = totalOcclusion / totalWeight;

    // TODO: these commented lines are tied to the options in AccumulateOcclusion
    //totalWeight = 0.0f;
    //totalOcclusion = 0.0f;

    // Note: we want our sample offset to be at the center of the 4x4 subgroup, that's why we add 2.0f to groupOffsetPixel
    float2 groupOffsetPixel = mad(float2(group_id.xy), THREADS, 2.0f);
    float2 subgroupOffsetPixel = mad(float2(subgroup), 4.0f, groupOffsetPixel);

    float subgroupAngleOffset = float((linear_id >> 4) & 0x0F) * float(OUTER_SAMPLE_ANGLE / SUBGROUP_COUNT);
    float sampleFactor = (float(linear_id & 0x0F) + 1.0f) / float(THREADS_PER_SUBGROUP);

    // The sample radius in pixels depends only on the thread, not on the loop iteration, so compute it once
    float sampleRadius = sqrt(sampleFactor) * (GetOutputSize().y * PassSrg::m_constants.m_samplingRadius);

    // Sync because otherwise position writes to LDS from PHASE II can mess up threads trying to read depth from LDS in Phase I
    GroupMemoryBarrierWithGroupSync();

    // PHASE II: sample depth buffer and share reconstructed positions between threads using LDS
    [unroll]
    for (uint i = 0; i < OUTER_SAMPLE_LOOPS; ++i)
    {
        // Get the sample angle for this thread
        float sampleAngle = mad(sampleFactor, OUTER_SAMPLE_ANGLE_STEP, subgroupAngleOffset);

        // Calculate which pixel to sample
        float2 samplePixel;
        sincos(sampleAngle, samplePixel.y, samplePixel.x);
        samplePixel = mad(samplePixel, sampleRadius, subgroupOffsetPixel);

        // We need the UV value to be at the exact pixel center for accurate position reconstruction from depth
        // We therefore floor the sample pixel location and then add half pixel size to sampleUV in the next line
        samplePixel = floor(samplePixel);
        float2 sampleUV = mad(samplePixel, GetPixelSize(), GetHalfPixelSize());

        // Calculate the sampled position and accumulate occlusion
        float occluderDepth = PassSrg::m_linearDepth.SampleLevel(PassSrg::PointSampler, sampleUV, 0);
        float3 occluderPosition = ViewSrg::GetViewSpacePosition(sampleUV, occluderDepth);
        AccumulateOcclusion(position, normal, occluderPosition, totalOcclusion, totalWeight);

        // Write the calculated occluder position to LDS
        uint ldsIndex = linear_id;
        WritePositionToLDS(ldsIndex, occluderPosition);

        // Note: No need for GroupMemoryBarrierWithGroupSync() here because we only read from the group
        // of 16 consecutive threads that we're in, and those 16 threads all execute at the same time

        // In this loop, groups of 16 consecutive threads will read positions from LDS written by other threads
        // in the group. This effectively amounts to 4x4 pixel groups sharing samples to avoid texture fetches
        [unroll]
        for (uint j = 1; j < INNER_SAMPLE_COUNT; ++j) // Start at 1 because we already have one sample
        {
            // Get the index for the next position in our group of 4x4 that we want to sample
            ldsIndex = GetNextLinearIndex(ldsIndex, (THREADS_PER_SUBGROUP / INNER_SAMPLE_COUNT));

            // Accumulate AO from the position stored in LDS
            occluderPosition = GetPositionFromLDS(ldsIndex);
            AccumulateOcclusion(position, normal, occluderPosition, totalOcclusion, totalWeight);
        }

        // Rotate for the next samples
        subgroupAngleOffset += OUTER_SAMPLE_ANGLE;
    }

    totalWeight = max(totalWeight, EPSILON);

    // TODO: this commented line is tied to the options in AccumulateOcclusion
    //finalAO = (finalAO * 0.5f) + (0.5f * totalOcclusion / totalWeight);
    finalAO = totalOcclusion / totalWeight;

    uint2 outPixel = mad(int2(group_id.xy), THREADS, newThreadID.xy);

    // This amounts to output = 1.0f - (finalAO * strength)
    // We do 1.0 - AO because greater AO values should result in darker pixels, not lighter
    float output = mad(-finalAO, PassSrg::m_constants.m_strength, 1.0f);

    // Gamma compensation
    // The strength is supplied by the CPU; whenever finalAO * strength exceeds 1.0 the value above
    // is negative and pow() of a negative base is NaN. Clamp to zero (fully occluded) first.
    output = pow(max(output, 0.0f), 2.2f);

    // Output
    PassSrg::m_ssaoOutput[outPixel] = output;
}