/*
 * Copyright (c) Contributors to the Open 3D Engine Project.
 * For complete copyright and license terms please see the LICENSE at the root of this distribution.
 *
 * SPDX-License-Identifier: Apache-2.0 OR MIT
 *
 */

// NOTE(review): the four include directives below have no target in this copy of the file.
// SRG_PerPass, ViewSrg, TWO_PI and EPSILON are used further down but are not defined in the
// visible part of this file, so they are presumably provided by these includes - restore the
// targets from source control before building.
#include
#include
#include
#include

// Thread group layout: THREADS x THREADS threads per group, with a PADDING-wide border of
// extra depth samples on every side of the group's own pixels.
#define THREADS 16
#define NUM_THREADS (THREADS * THREADS)
#define PADDING (THREADS / 2)
#define LDS_WIDTH (PADDING + THREADS + PADDING)

// LDS_SIZE = LDS_WIDTH * LDS_WIDTH = 32 * 32 = 1024
#define LDS_SIZE 1024

// LDS (local data store) will store depths from the 32x32 region around the group in Phase I
// of SSAO and then the sampled positions for sharing with neighbor threads in Phase II
groupshared float LDS[LDS_SIZE];

// Number of AO samples to calculate from local depth values stored in LDS in Phase I.
// This value affects the size of the first loop which takes sample positions from the
// localSamplesForLDS array defined below, and should therefore be set to a multiple of
// 4 and never exceed a value of 24, which is the size of the array.
#define LOCAL_SAMPLE_COUNT 12

// In Phase II we use additional samples from the depth input texture. Groups of 4x4 threads
// will fetch samples in an outer loop and share these samples with each other in an inner loop.
// This sharing is accomplished by writing the samples to LDS prior to the inner loop:
#define INNER_SAMPLE_COUNT 4
#define OUTER_SAMPLE_LOOPS 10

// Subgroups are groups of 4x4 threads.
// Because 16 consecutive threads execute simultaneously,
// this removes the need for memory barriers when these groups write and read from LDS
// NOTE(review): this relies on the hardware running at least 16 consecutive threads in
// lockstep - confirm this holds on every target platform.
#define THREADS_PER_SUBGROUP 16
#define SUBGROUP_COUNT (NUM_THREADS / THREADS_PER_SUBGROUP)

// Angle covered by one outer sample loop, and the share of that angle per thread of a subgroup
#define OUTER_SAMPLE_ANGLE (TWO_PI / OUTER_SAMPLE_LOOPS)
#define OUTER_SAMPLE_ANGLE_STEP (OUTER_SAMPLE_ANGLE / THREADS_PER_SUBGROUP)

// Array used for calculating sample positions in Phase I
// Offsets are in LDS texels. The largest offset is 8 (= PADDING) on either axis, so a thread in
// the central 16x16 region never reads outside the 32x32 LDS window.
// Only the first LOCAL_SAMPLE_COUNT entries are used.
static const int2 localSamplesForLDS[] =
{
    { 0, -4 },
    { -4, 0 },
    { 4, 0 },
    { 0, 4 },

    { -4, -4 },
    { 4, -4 },
    { -4, 4 },
    { 4, 4 },

    { 0, -8 },
    { -8, 0 },
    { 8, 0 },
    { 0, 8 },

    { -4, -8 },
    { 4, -8 },
    { -4, 8 },
    { 4, 8 },

    { -8, -4 },
    { -8, 4 },
    { 8, -4 },
    { 8, 4 },

    { -8, -8 },
    { 8, -8 },
    { -8, 8 },
    { 8, 8 },
};

// Per-pass resources: the linear depth input, the SSAO output and the pass constants
ShaderResourceGroup PassSrg : SRG_PerPass
{
    // NOTE(review): no element type is visible on these two declarations in this copy of the
    // file. MainCS reads a scalar depth from m_linearDepth and writes a scalar to m_ssaoOutput -
    // confirm the intended template arguments against source control.
    Texture2D m_linearDepth;
    RWTexture2D m_ssaoOutput;

    // Must match the struct in SsaoPasses.cpp
    struct SsaoConstants
    {
        // The texture dimensions of SSAO output
        uint2 m_outputSize;

        // The size of a pixel relative to screenspace UV
        // Calculated by taking the inverse of the texture dimensions
        float2 m_pixelSize;

        // The size of half a pixel relative to screenspace UV
        float2 m_halfPixelSize;

        // The strength of the SSAO effect
        float m_strength;

        // The sampling radius calculated in screen UV space
        float m_samplingRadius;
    };
    SsaoConstants m_constants;

    // Point sampler with clamped addressing, used for all depth reads in this shader
    Sampler PointSampler
    {
        MinFilter = Point;
        MagFilter = Point;
        MipFilter = Point;
        AddressU = Clamp;
        AddressV = Clamp;
        AddressW = Clamp;
    };
}

// Returns the SSAO output dimensions (the uint2 constant is implicitly converted to float2)
float2 GetOutputSize()
{
    return PassSrg::m_constants.m_outputSize;
}

// Returns the size of one pixel in screenspace UV
float2 GetPixelSize()
{
    return PassSrg::m_constants.m_pixelSize;
}

// Returns the size of half a pixel in screenspace UV
float2 GetHalfPixelSize()
{
    return PassSrg::m_constants.m_halfPixelSize;
}

// --- LDS DEPTH ---
// In the first phase of SSAO, we store 32x32 depth samples in LDS.
// We use these depth values to reconstruct view space positions and
// normals from which we can then calculate SSAO contribution

// Flattens a 2D LDS coordinate into an index into the LDS array (LDS_WIDTH entries per row)
int GetLdsIndex(int2 index)
{
    return index.y * LDS_WIDTH + index.x;
}

// Stores the four depth values of one Gather into the 2x2 LDS cell whose upper left
// entry is ldsPosition: w -> (0, 0), z -> (1, 0), y -> (1, 1), x -> (0, 1)
void WriteDepthGatherToLDS(int2 ldsPosition, float4 depthGather)
{
    const int2 upperLeft = ldsPosition;
    const int2 upperRight = upperLeft + int2(1, 0);
    const int2 lowerRight = upperLeft + int2(1, 1);
    const int2 lowerLeft = upperLeft + int2(0, 1);

    LDS[ GetLdsIndex(upperLeft) ] = depthGather.w;
    LDS[ GetLdsIndex(upperRight) ] = depthGather.z;
    LDS[ GetLdsIndex(lowerRight) ] = depthGather.y;
    LDS[ GetLdsIndex(lowerLeft) ] = depthGather.x;
}

// Reconstructs the view space position of the LDS entry at ldsPosition.
// ldsOffsetUV is the screen space UV that corresponds to LDS entry (0, 0).
float3 GetPositionFromDepthLDS(int2 ldsPosition, float2 ldsOffsetUV)
{
    const float2 uv = mad(float2(ldsPosition), GetPixelSize(), ldsOffsetUV);
    const float depth = LDS[ GetLdsIndex(ldsPosition) ];
    return ViewSrg::GetViewSpacePosition(uv, depth);
}

// Of two one-sided position derivatives, returns the one with the smaller change in depth (z).
// This avoids picking a derivative that crosses onto a neighboring pixel from a separate object.
float3 SelectSmallerDepthDelta(float3 backwardDelta, float3 forwardDelta)
{
    if (abs(backwardDelta.z) < abs(forwardDelta.z))
    {
        return backwardDelta;
    }
    return forwardDelta;
}

// Estimates the view space normal at ldsPosition, whose reconstructed position is passed in.
// The normal is the normalized cross product of the horizontal and vertical position derivatives.
float3 CalculateNormalFromLDS(int2 ldsPosition, float2 ldsOffsetUV, float3 position)
{
    // Horizontal derivative: compare the step from the left neighbor with the step to the right neighbor
    const float3 leftNeighbor = GetPositionFromDepthLDS(ldsPosition - int2(1, 0), ldsOffsetUV);
    const float3 rightNeighbor = GetPositionFromDepthLDS(ldsPosition + int2(1, 0), ldsOffsetUV);
    const float3 horizontalDelta = SelectSmallerDepthDelta(position - leftNeighbor, rightNeighbor - position);

    // Vertical derivative: same idea with the upper and lower neighbors
    const float3 upperNeighbor = GetPositionFromDepthLDS(ldsPosition - int2(0, 1), ldsOffsetUV);
    const float3 lowerNeighbor = GetPositionFromDepthLDS(ldsPosition + int2(0, 1), ldsOffsetUV);
    const float3 verticalDelta = SelectSmallerDepthDelta(position - upperNeighbor, lowerNeighbor - position);

    return normalize( cross(horizontalDelta, verticalDelta) );
}

// --- LDS POSITION ---
// In the second phase of SSAO, we store reconstructed positions in LDS.
// Positions take up 3x more space, but we only need 16x16 (4x less than
// the number of depth samples). To write positions, we store the X component
// in the first quarter of LDS, the Y component in the second and Z in the third
// Dividing this way yields better performance as it avoids memory bank conflicts
#define LDS_QUARTER_SIZE (LDS_SIZE / 4)

// Stores a position in LDS, one component per LDS quarter. index must be below LDS_QUARTER_SIZE.
void WritePositionToLDS(uint index, float3 position)
{
    LDS[index] = position.x;
    LDS[index + LDS_QUARTER_SIZE] = position.y;
    LDS[index + (LDS_QUARTER_SIZE * 2)] = position.z;
}

// Reads back a position previously stored with WritePositionToLDS at the same index
float3 GetPositionFromLDS(uint index)
{
    float3 position;
    position.x = LDS[index];
    position.y = LDS[index + LDS_QUARTER_SIZE];
    position.z = LDS[index + (LDS_QUARTER_SIZE * 2)];
    return position;
}

// Advances linearIndex by increment within its block of 16 consecutive indices:
// the low 4 bits wrap around while the upper bits (the subgroup) are kept.
uint GetNextLinearIndex(uint linearIndex, uint increment)
{
    // The next index in our group of 4x4
    uint nextSubgroupIndex = (linearIndex + increment) & 0x0F;
    return (linearIndex & 0xF0) + nextSubgroupIndex;
}

// --- Occlusion ---

// Adds occlusion from a potential occluder to the current thread with the provided surface position and normal
//   position         - view space position of the pixel being shaded
//   normal           - view space normal of the pixel being shaded
//   occluderPosition - view space position of the potential occluder
//   totalOcclusion   - running sum of weighted occlusion, updated in place
//   totalWeight      - running sum of weights, updated in place
void AccumulateOcclusion(float3 position, float3 normal, float3 occluderPosition, inout float totalOcclusion, inout float totalWeight)
{
    // Get the vector from the pixel position to the occluder position
    float3 diff = (occluderPosition - position);
    float distanceSq = dot(diff, diff);

    // rsqrt(0) is infinite, which would turn the direction below into NaN (0 * inf).
    // When the occluder coincides with the pixel position we skip the normalization instead,
    // so that sample contributes zero occlusion.
    float invDistance = (distanceSq > 0.0f) ? rsqrt(distanceSq) : 0.0f;

    // Only used by the alternative weighting options below
    float invDistanceSq = invDistance * invDistance;

    // Calculate the cosine falloff based on occluder direction and surface normal
    float3 direction = diff * invDistance;
    float cosineFalloff = dot(direction, normal);

    float weight;

    // --- TODO ---
    // When blur, temporal reprojection and the SSAO AtomSampleViewer sample are finished, revisit and test
    // these various weighting strategies to see which produces the best results

    // Option 1:
    // weight = cosineFalloff >= -0.00001f ? invDistanceSq : 0.0f;

    // Option 2:
    // weight = invDistanceSq;

    // Option 3:
    // weight = saturate(1.0f + cosineFalloff);
    // weight *= weight;
    // weight *= invDistanceSq;

    // Option 4:
    const float minDistanceSq = 0.07f;
    weight = rsqrt(distanceSq + minDistanceSq);
    weight *= weight;

    // Option 5: option 3 + 4
    // const float minDistanceSq = 0.12f;
    // invDistanceSq = rsqrt(distanceSq + minDistanceSq);
    // invDistanceSq *= invDistanceSq;
    // weight = saturate(1.0f + cosineFalloff);
    // weight *= weight;
    // weight *= invDistanceSq;

    // --- END TODO ---

    // Accumulate AO and weight
    totalOcclusion += saturate(cosineFalloff) * weight;
    totalWeight += weight;
}

// SSAO compute entry point. Each 16x16 thread group shades a 16x16 pixel tile of the output.
//   thread_id   - 2D index of the thread inside its group
//   group_id    - 2D index of the group
//   dispatch_id - global thread index (unused)
//   linear_id   - flattened index of the thread inside its group
[numthreads(THREADS, THREADS, 1)]
void MainCS(uint3 thread_id : SV_GroupThreadID, uint3 group_id : SV_GroupID, uint3 dispatch_id: SV_DispatchThreadID, uint linear_id : SV_GroupIndex)
{
    // LDS covers 32x32 pixels. This is the on screen position of the upper left pixel in the 32x32 group.
    float2 ldsOffsetPixel = mad(float2(group_id.xy), THREADS, -PADDING);

    // The screen space UV of the above calculated pixel offset
    float2 ldsOffsetUV = mad(ldsOffsetPixel, GetPixelSize(), GetHalfPixelSize());

    // Write depth to LDS
    {
        // Each thread will gather 4 depth values, so space them apart
        int2 ldsPosition = int2(thread_id.xy) << 1;

        // Gather depth values
        float2 depthGatherUV = mad(float2(ldsPosition), GetPixelSize(), ldsOffsetUV);

        // Gather on some GPUs will fall onto same pixels in adjacent coordinates due to rounding errors
        depthGatherUV += GetHalfPixelSize();

        float4 depthGather = PassSrg::m_linearDepth.Gather(PassSrg::PointSampler, depthGatherUV);
        WriteDepthGatherToLDS(ldsPosition, depthGather);
    }

    // Sync after LDS
    GroupMemoryBarrierWithGroupSync();

    // Reshuffle thread indices so groups of 16 consecutive threads form 4x4 subgroups
    // Initially 16 consecutive threads form a row as such:
    //
    //     0 1 2 3 4 5 6 7 8 9 A B C D E F
    //     0 1 2 3 4 5 6 7 8 9 A B C D E F
    //     0 1 2 3 4 5 6 7 8 9 A B C D E F
    //     0 1 2 3 4 5 6 7 8 9 A B C D E F
    //
    // After the reshuffle the threads will be in a 4x4 pixel group like so:
    //
    //     0 1 4 5 | 0 1 4 5 | 0 1 4 5 | 0 1 4 5
    //     2 3 6 7 | 2 3 6 7 | 2 3 6 7 | 2 3 6 7
    //     8 9 C D | 8 9 C D | 8 9 C D | 8 9 C D
    //     A B E F | A B E F | A B E F | A B E F
    //
    uint2 subgroup = uint2((linear_id >> 4) & 3, (linear_id >> 6) & 3);
    uint2 threadSubID;
    threadSubID.x = (linear_id & 1) + ((linear_id >> 1) & 2);
    threadSubID.y = ((linear_id >> 1) & 1) + ((linear_id >> 2) & 2);
    uint2 newThreadID = mad(subgroup, 4, threadSubID);

    // Position + Normal
    int2 ldsPosition = int2(newThreadID.xy) + int2(PADDING, PADDING);
    float3 position = GetPositionFromDepthLDS(ldsPosition, ldsOffsetUV);
    float3 normal = CalculateNormalFromLDS(ldsPosition, ldsOffsetUV, position);

    float totalOcclusion = 0.0f;
    float totalWeight = 0.0f;

    // PHASE I: accumulates AO from local depth samples stored in LDS
    [unroll]
    for (uint localSample = 0; localSample < LOCAL_SAMPLE_COUNT; ++localSample)
    {
        float3 occluderPosition = GetPositionFromDepthLDS(ldsPosition + localSamplesForLDS[localSample], ldsOffsetUV);
        AccumulateOcclusion(position, normal, occluderPosition, totalOcclusion, totalWeight);
    }

    totalWeight = max(totalWeight, EPSILON);
    float finalAO = totalOcclusion / totalWeight;

    // TODO: these commented lines are tied to the options in AccumulateOcclusion
    //totalWeight = 0.0f;
    //totalOcclusion = 0.0f;

    // Note: we want our sample offset to be at the center of the 4x4 subgroup, that's why we add 2.0f to groupOffsetPixel
    float2 groupOffsetPixel = mad(float2(group_id.xy), THREADS, 2.0f);
    float2 subgroupOffsetPixel = mad(float2(subgroup), 4.0f, groupOffsetPixel);

    float subgroupAngleOffset = float((linear_id >> 4) & 0x0F) * float(OUTER_SAMPLE_ANGLE / SUBGROUP_COUNT);
    float sampleFactor = (float(linear_id & 0x0F) + 1.0f) / float(THREADS_PER_SUBGROUP);

    // The sample radius (in pixels) does not change between outer loops, so it is computed once up front
    const float sampleRadius = sqrt(sampleFactor) * (GetOutputSize().y * PassSrg::m_constants.m_samplingRadius);

    // Sync because otherwise position writes to LDS from PHASE II can mess up threads trying to read depth from LDS in Phase I
    GroupMemoryBarrierWithGroupSync();

    // PHASE II: sample depth buffer and share reconstructed positions between threads using LDS
    [unroll]
    for (uint outerLoop = 0; outerLoop < OUTER_SAMPLE_LOOPS; ++outerLoop)
    {
        // Get the sample angle for this thread
        float sampleAngle = mad(sampleFactor, OUTER_SAMPLE_ANGLE_STEP, subgroupAngleOffset);

        // Calculate which pixel to sample
        float2 samplePixel;
        sincos(sampleAngle, samplePixel.y, samplePixel.x);
        samplePixel = mad(samplePixel, sampleRadius, subgroupOffsetPixel);

        // We need the UV value to be at the exact pixel center for accurate position reconstruction from depth
        // We therefore floor the sample pixel location and then add half pixel size to sampleUV in the next line
        samplePixel = floor(samplePixel);
        float2 sampleUV = mad(samplePixel, GetPixelSize(), GetHalfPixelSize());

        // Calculate the sampled position and accumulate occlusion
        float occluderDepth = PassSrg::m_linearDepth.SampleLevel(PassSrg::PointSampler, sampleUV, 0);
        float3 occluderPosition = ViewSrg::GetViewSpacePosition(sampleUV, occluderDepth);
        AccumulateOcclusion(position, normal, occluderPosition, totalOcclusion, totalWeight);

        // Write the calculated occluder position to LDS
        uint ldsIndex = linear_id;
        WritePositionToLDS(ldsIndex, occluderPosition);

        // Note: No need for GroupMemoryBarrierWithGroupSync() here because we only read from the group
        // of 16 consecutive threads that we're in, and those 16 threads all execute at the same time
        // NOTE(review): this assumes lockstep execution of at least 16 consecutive threads - confirm per platform.

        // In this loop, groups of 16 consecutive threads will read positions from LDS written by other threads
        // in the group. This effectively amounts to 4x4 pixel groups sharing samples to avoid texture fetches
        [unroll]
        for (uint sharedSample = 1; sharedSample < INNER_SAMPLE_COUNT; ++sharedSample) // Start at 1 because we already have one sample
        {
            // Get the index for the next position in our group of 4x4 that we want to sample
            ldsIndex = GetNextLinearIndex(ldsIndex, (THREADS_PER_SUBGROUP / INNER_SAMPLE_COUNT));

            // Accumulate AO from the position stored in LDS
            occluderPosition = GetPositionFromLDS(ldsIndex);
            AccumulateOcclusion(position, normal, occluderPosition, totalOcclusion, totalWeight);
        }

        // Rotate for the next samples
        subgroupAngleOffset += OUTER_SAMPLE_ANGLE;
    }

    totalWeight = max(totalWeight, EPSILON);

    // TODO: this commented line is tied to the options in AccumulateOcclusion
    //finalAO = (finalAO * 0.5f) + (0.5f * totalOcclusion / totalWeight);
    finalAO = totalOcclusion / totalWeight;

    uint2 outPixel = mad(int2(group_id.xy), THREADS, newThreadID.xy);

    // This amounts to output = 1.0f - (finalAO * strength)
    // We do 1.0 - AO because greater AO values should result in darker pixels, not lighter
    float output = mad(-finalAO, PassSrg::m_constants.m_strength, 1.0f);

    // Gamma compensation
    // output is negative whenever finalAO * strength exceeds 1, and pow() with a negative base
    // yields NaN, so clamp the base at zero (fully occluded) first
    output = pow(max(output, 0.0f), 2.2f);

    // Output
    PassSrg::m_ssaoOutput[outPixel] = output;
}