@ -22,131 +22,125 @@ namespace AZ
{
namespace RHI
{
/**
* Problem : High - level rendering code works in ' materials ' , ' shaders ' , and ' models ' , but the RHI works in
* ' pipeline states ' . Therefore , a translation process must exist to resolve a shader variation ( plus runtime
* state ) into a pipeline state suitable for consumption by the RHI . These resolve operations can number in the
* thousands per frame , and ( ideally ) are heavily jobified .
*
* Another problem is that pipeline state creation is not fast , as on some platforms it will involve synchronous
* byte - code compilation . This could take anywhere from < 1 ms to > 150 ms . If compilation is done synchronously and
* immediately , the cache will effectively stall the entire process if multiple threads request the same pending
* pipeline state .
*
* Therefore , PipelineStateCache adheres to the following requirements :
* 1. A cache miss does not serialize all threads on a pipeline state compilation event .
* 2. A cache hit results in zero contention .
*
* Justification : Most pipeline state compilation will occur in the first few frames , but can also occur when new
* ' permutations ' are hit while exploring . In the 90 % case , the cache is warm and each frame results in a 100 %
* cache hit rate . With zero locks , this scales extremely well across threads and removes a bottleneck from the
* render code . In the event that compilations are required , multiple threads are now able to participate in the
* compilation process without serializing each other .
*
* To accomplish this , the pipeline state cache uses three ' phases ' of caching .
* 1. A global , read - only cache - designed as the ' fast path ' for when the cache is warm .
* 2. A thread - local cache - reduces contention on the global pending cache for successive requests on the same thread .
* 3. A global , locked pending cache - de - duplicates pipeline state allocations .
*
* Each library has global and thread - local caches . Initially , the global cache is checked , if that fails , the
* thread - local cache is checked ( no locks taken ) . Finally , the pending cache is checked under a lock and if
* the entry still doesn ' t exist , it is allocated and added to the pending cache . A thread - local PipelineLibrary
* is used to compile the pipeline state , which eliminates all locking for compilation .
*
* Pipeline states can be acquired at any time and from any thread . The cache will take a reader lock . During
* AcquirePipelineState , the global read - only cache is not updated , but the thread - local cache and pending
* global cache may be . Furthermore , compilations are performed on the calling thread , which means that separate
* thread may return a pipeline state that is still compiling . It is required that all pending AcquirePipelineState
* calls complete prior to using the returned pipeline state pointers during command list recording .
*
* Example Scenarios :
*
* 1. Threads request the same un - cached pipeline state :
*
* Both the global read - only cache and thread - local caches miss , one thread wins the race to take a lock
* on the global pending cache . It allocates but does not compile the pipeline state . All other threads wait on the
* lock ( which should be quick ) and then find and return the uninitialized pipeline state . The compiling
* thread uses the thread - local PipelineLibrary instance to compile the pipeline state . Non - compiling threads
* will enter the uninitialized pipeline state into their thread - local cache ( as does the compiling thread once it
* completes ) . Note that the compiling thread is now busy , but all remaining threads are now unblocked to compile other
* pipeline states .
*
* 2. A thread requests a pipeline state being compiled on another thread :
*
* In this case , the global read - only cache won ' t have the pipeline state ( since it ' s being compiled during
* the current cycle , and the pending cache is only merged at the end of the cycle ) . It also won ' t have the
* entry in the thread - local cache . It then hits the global pending cache , which will return the live instance
* ( being compiled ) . It then caches the result in its thread - local cache , so that successive requests will no
* longer require a lock on the pending cache .
*
* 3. The cache is warm and all pipeline states are compiled :
*
* Each thread hits the same read - only cache ( which succeeds ) and returns the pipeline state immediately .
* This is the fast - path case where multiple threads are now able to resolve pipeline states with very
* little performance overhead .
*
* Example Usage :
* @ code { . cpp }
* // Create library instance.
* RHI : : PipelineLibraryHandle libraryHandle = pipelineStateCache - > CreateLibrary ( serializedData ) ; // Initial data loaded from disk.
*
* // In jobs. Lots and lots of requests.
* const RHI : : PipelineState * pipelineState = pipelineStateCache - > AcquirePipelineState ( libraryHandle , descriptor ) ;
*
* // Reset contents of library. Releases all pipeline state references. Library remains valid.
* pipelineStateCache - > ResetLibrary ( libraryHandle ) ;
*
* // Release library and all held references.
* pipelineStateCache - > ReleaseLibrary ( libraryHandle ) ;
* @ endcode
*/
//! Problem: High-level rendering code works in 'materials', 'shaders', and 'models', but the RHI works in
//! 'pipeline states'. Therefore, a translation process must exist to resolve a shader variation (plus runtime
//! state) into a pipeline state suitable for consumption by the RHI. These resolve operations can number in the
//! thousands per frame, and (ideally) are heavily jobified.
//!
//! Another problem is that pipeline state creation is not fast, as on some platforms it will involve synchronous
//! byte-code compilation. This could take anywhere from <1ms to >150ms. If compilation is done synchronously and
//! immediately, the cache will effectively stall the entire process if multiple threads request the same pending
//! pipeline state.
//!
//! Therefore, PipelineStateCache adheres to the following requirements:
//! 1. A cache miss does not serialize all threads on a pipeline state compilation event.
//! 2. A cache hit results in zero contention.
//!
//! Justification: Most pipeline state compilation will occur in the first few frames, but can also occur when new
//! 'permutations' are hit while exploring. In the 90% case, the cache is warm and each frame results in a 100%
//! cache hit rate. With zero locks, this scales extremely well across threads and removes a bottleneck from the
//! render code. In the event that compilations are required, multiple threads are now able to participate in the
//! compilation process without serializing each other.
//!
//! To accomplish this, the pipeline state cache uses three 'phases' of caching.
//! 1. A global, read-only cache - designed as the 'fast path' for when the cache is warm.
//! 2. A thread-local cache - reduces contention on the global pending cache for successive requests on the same thread.
//! 3. A global, locked pending cache - de-duplicates pipeline state allocations.
//!
//! Each library has global and thread-local caches. Initially, the global cache is checked, if that fails, the
//! thread-local cache is checked (no locks taken). Finally, the pending cache is checked under a lock and if
//! the entry still doesn't exist, it is allocated and added to the pending cache. A thread-local PipelineLibrary
//! is used to compile the pipeline state, which eliminates all locking for compilation.
//!
//! Pipeline states can be acquired at any time and from any thread. The cache will take a reader lock. During
//! AcquirePipelineState, the global read-only cache is not updated, but the thread-local cache and pending
//! global cache may be. Furthermore, compilations are performed on the calling thread, which means that separate
//! thread may return a pipeline state that is still compiling. It is required that all pending AcquirePipelineState
//! calls complete prior to using the returned pipeline state pointers during command list recording.
//!
//! Example Scenarios:
//!
//! 1. Threads request the same un-cached pipeline state:
//!
//! Both the global read-only cache and thread-local caches miss, one thread wins the race to take a lock
//! on the global pending cache. It allocates but does not compile the pipeline state. All other threads wait on the
//! lock (which should be quick) and then find and return the uninitialized pipeline state. The compiling
//! thread uses the thread-local PipelineLibrary instance to compile the pipeline state. Non-compiling threads
//! will enter the uninitialized pipeline state into their thread-local cache (as does the compiling thread once it
//! completes). Note that the compiling thread is now busy, but all remaining threads are now unblocked to compile other
//! pipeline states.
//!
//! 2. A thread requests a pipeline state being compiled on another thread:
//!
//! In this case, the global read-only cache won't have the pipeline state (since it's being compiled during
//! the current cycle, and the pending cache is only merged at the end of the cycle). It also won't have the
//! entry in the thread-local cache. It then hits the global pending cache, which will return the live instance
//! (being compiled). It then caches the result in its thread-local cache, so that successive requests will no
//! longer require a lock on the pending cache.
//!
//! 3. The cache is warm and all pipeline states are compiled:
//!
//! Each thread hits the same read-only cache (which succeeds) and returns the pipeline state immediately.
//! This is the fast-path case where multiple threads are now able to resolve pipeline states with very
//! little performance overhead.
//!
//! Example Usage:
//! @code{.cpp}
//! // Create library instance.
//! RHI::PipelineLibraryHandle libraryHandle = pipelineStateCache->CreateLibrary(serializedData); // Initial data loaded from disk.
//!
//! // In jobs. Lots and lots of requests.
//! const RHI::PipelineState* pipelineState = pipelineStateCache->AcquirePipelineState(libraryHandle, descriptor);
//!
//! // Reset contents of library. Releases all pipeline state references. Library remains valid.
//! pipelineStateCache->ResetLibrary(libraryHandle);
//!
//! // Release library and all held references.
//! pipelineStateCache->ReleaseLibrary(libraryHandle);
//! @endcode
//!
class PipelineStateCache final
: public AZStd : : intrusive_base
{
public :
AZ_CLASS_ALLOCATOR ( PipelineStateCache , SystemAllocator , 0 ) ;
/**
* The maximum number of libraries is configurable at compile time . A fixed number is used
* to avoid having to lazily resize thread - local arrays when traversing them , and also to
* avoid a pointer indirection on access .
*/
//! The maximum number of libraries is configurable at compile time. A fixed number is used
//! to avoid having to lazily resize thread-local arrays when traversing them, and also to
//! avoid a pointer indirection on access.
static const size_t LibraryCountMax = 256 ;
static Ptr < PipelineStateCache > Create ( Device & device ) ;
// / Resets the caches of all pipeline libraries back to empty. All internal references to pipeline states are released.
// ! Resets the caches of all pipeline libraries back to empty. All internal references to pipeline states are released.
void Reset ( ) ;
// / Creates an internal pipeline library instance and returns its handle.
PipelineLibraryHandle CreateLibrary ( const PipelineLibraryData * serializedData );
// ! Creates an internal pipeline library instance and returns its handle.
PipelineLibraryHandle CreateLibrary ( const PipelineLibraryData * serializedData , const AZStd : : string & filePath = " " );
// / Releases the pipeline library and purges it from the cache. Releases all held references to pipeline states for the library.
// ! Releases the pipeline library and purges it from the cache. Releases all held references to pipeline states for the library.
void ReleaseLibrary ( PipelineLibraryHandle handle ) ;
// / Resets cache contents in the library. Releases all held references to pipeline states for the library.
// ! Resets cache contents in the library. Releases all held references to pipeline states for the library.
void ResetLibrary ( PipelineLibraryHandle handle ) ;
/// Returns the serialized data for the library, which can be used to re-initialize it.
ConstPtr < PipelineLibraryData > GetLibrarySerializedData ( PipelineLibraryHandle handle ) const ;
/**
* Acquires a pipeline state ( either draw or dispatch variants ) from the cache . Pipeline states are associated
* to a specific library handle . Successive calls with the same pipeline state descriptor hash will return the same
* pipeline state , even across threads . If the library handle is invalid or the acquire operation fails , a null pointer
* is returned . Otherwise , a valid pipeline state pointer is returned ( regardless of whether pipeline state compilation succeeds ) .
*
* It is permitted to take a strong reference to the returned pointer , but is not necessary as long as the reference
* is discarded on a library reset / release event . The cache will store a reference internally . If a strong reference
* is held externally , the instance will remain valid even after the cache is reset / destroyed .
*/
//! Returns the resulting merged library from all the threadLibraries related to the passed in handle.
//! The merged library can be used to write out the serialized data.
Ptr < PipelineLibrary > GetMergedLibrary ( PipelineLibraryHandle handle ) const ;
//! Acquires a pipeline state (either draw or dispatch variants) from the cache. Pipeline states are associated
//! to a specific library handle. Successive calls with the same pipeline state descriptor hash will return the same
//! pipeline state, even across threads. If the library handle is invalid or the acquire operation fails, a null pointer
//! is returned. Otherwise, a valid pipeline state pointer is returned (regardless of whether pipeline state compilation succeeds).
//!
//! It is permitted to take a strong reference to the returned pointer, but is not necessary as long as the reference
//! is discarded on a library reset / release event. The cache will store a reference internally. If a strong reference
//! is held externally, the instance will remain valid even after the cache is reset / destroyed.
const PipelineState * AcquirePipelineState ( PipelineLibraryHandle library , const PipelineStateDescriptor & descriptor ) ;
/**
* This method merges the global pending cache into the global read - only cache and clears all thread - local caches .
* This reduces the total memory footprint of the caches and optimizes subsequent fetches . This method should be called
* once per frame .
*/
//! This method merges the global pending cache into the global read-only cache and clears all thread-local caches.
//! This reduces the total memory footprint of the caches and optimizes subsequent fetches. This method should be called
//! once per frame.
void Compact ( ) ;
private :
@ -198,8 +192,9 @@ namespace AZ
// Tracks the number of pipeline states actively being compiled across all threads.
AZStd : : atomic_uint32_t m_pendingCompileCount = { 0 } ;
// Used to prime the thread libraries.
ConstPtr < PipelineLibraryData > m_serializedData ;
// Contains the initial serialized data (Used to prime the thread libraries)
// or the file name that contains the serialized data
PipelineLibraryDescriptor m_pipelineLibraryDescriptor ;
} ;
using GlobalLibrarySet = AZStd : : fixed_vector < GlobalLibraryEntry , LibraryCountMax > ;
@ -209,36 +204,32 @@ namespace AZ
// A thread-local cache used to reduce contention on the global pending cache.
PipelineStateSet m_threadLocalCache ;
/**
* Each thread has its own pipeline library . This allows threads to cache disjoint
* pipeline states without locking . The libraries are coalesced into a single library
* during GetLibrarySerializedData . The library is lazily initialized on the thread
* and uses the initial serialized data passed in at creation time .
*/
//! Each thread has its own pipeline library. This allows threads to cache disjoint
//! pipeline states without locking. The libraries are coalesced into a single library
//! during GetMergedLibrary. The library is lazily initialized on the thread
//! and uses the initial serialized data passed in at creation time.
Ptr < PipelineLibrary > m_library ;
} ;
/**
* Each thread has its own list of pipeline library entries . The index maps 1 - to - 1 with GlobalLibrarySet .
* GlobalLibrarySet contains the total size of the array ; whereas the ThreadLibrarySet is just an array .
* The size of the global set should be used when traversing the thread library entries .
*/
//! Each thread has its own list of pipeline library entries. The index maps 1-to-1 with GlobalLibrarySet.
//! GlobalLibrarySet contains the total size of the array; whereas the ThreadLibrarySet is just an array.
//! The size of the global set should be used when traversing the thread library entries.
using ThreadLibrarySet = AZStd : : array < ThreadLibraryEntry , LibraryCountMax > ;
// / Helper function which binary searches a pipeline state set looking for an entry which matches the requested descriptor.
// ! Helper function which binary searches a pipeline state set looking for an entry which matches the requested descriptor.
static const PipelineState * FindPipelineState ( const PipelineStateSet & pipelineStateSet , const PipelineStateDescriptor & descriptor ) ;
// / Helper function which inserts an entry into the set. Returns true if the entry was inserted, or false is a duplicate entry existed.
// ! Helper function which inserts an entry into the set. Returns true if the entry was inserted, or false is a duplicate entry existed.
static bool InsertPipelineState ( PipelineStateSet & pipelineStateSet , PipelineStateEntry pipelineStateEntry ) ;
// / Performs a pipeline state compilation on the global cache using the thread-local pipeline library.
// ! Performs a pipeline state compilation on the global cache using the thread-local pipeline library.
ConstPtr < PipelineState > CompilePipelineState (
GlobalLibraryEntry & globalLibraryEntry ,
ThreadLibraryEntry & threadLibraryEntry ,
const PipelineStateDescriptor & pipelineStateDescriptor ,
PipelineStateHash pipelineStateHash ) ;
// / Resets the library without validating the handle or taking a lock.
// ! Resets the library without validating the handle or taking a lock.
void ResetLibraryImpl ( PipelineLibraryHandle handle ) ;
Ptr < Device > m_device ;