/*
* Copyright (c) Contributors to the Open 3D Engine Project
*
* SPDX-License-Identifier: Apache-2.0 OR MIT
*
*/
// Description : Misc memory access and prefetch helpers
#pragma once
#include <platform.h>
// Section dictionary
#if defined(AZ_RESTRICTED_PLATFORM)
#undef AZ_RESTRICTED_SECTION
#define MEMORYACCESS_H_SECTION_TRAITS 1
#define MEMORYACCESS_H_SECTION_CRYPREFETCH 2
#endif
// Traits
#if defined(AZ_RESTRICTED_PLATFORM)
#define AZ_RESTRICTED_SECTION MEMORYACCESS_H_SECTION_TRAITS
#include AZ_RESTRICTED_FILE(MemoryAccess_h)
#else
#define MEMORYACCESS_H_TRAIT_USE_LEGACY_PREFETCHLINE 1
#endif
#if MEMORYACCESS_H_TRAIT_USE_LEGACY_PREFETCHLINE
#define PrefetchLine(ptr, off) cryPrefetchT0SSE((void*)((UINT_PTR)ptr + off))
#else
#define PrefetchLine(ptr, off) (void)(0)
#endif
#define ResetLine128(ptr, off) (void)(0)
#define FlushLine128(ptr, off) (void)(0)
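// Usage sketch (illustrative, not part of the original file): prefetch the
// next element's cache line while the current one is processed, so the memory
// latency overlaps useful work. 'items', 'count' and 'Process' are
// hypothetical names.
//
//     for (int i = 0; i + 1 < count; ++i)
//     {
//         PrefetchLine(&items[i + 1], 0); // request the next element's line
//         Process(items[i]);              // work hides the prefetch latency
//     }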
//========================================================================================
// cryMemcpy flags
#define MC_CPU_TO_GPU 0x10
#define MC_GPU_TO_CPU 0x20
#define MC_CPU_TO_CPU 0x40
extern int g_CpuFlags;
//
#define CPUF_SSE 0x01
#define CPUF_SSE2 0x02
#define CPUF_3DNOW 0x04
#define CPUF_MMX 0x08
#define CPUF_SSE3 0x10
#define CPUF_F16C 0x20
#define CPUF_SSE41 0x40
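// Example (illustrative): g_CpuFlags is a bitmask, so runtime feature tests
// are bitwise ANDs against the CPUF_* values above.
//
//     if (g_CpuFlags & CPUF_SSE2)
//     {
//         // SSE2-capable path
//     }
//     else
//     {
//         // conservative fallback
//     }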
#ifdef _CPU_SSE
#ifdef _CPU_X86
#include <xmmintrin.h>
#endif
#define _MM_PREFETCH(MemPtr, Hint) _mm_prefetch((MemPtr), (Hint));
#define _MM_PREFETCH_LOOP(nCount, MemPtr, Hint) { for (int p = 0; p < (nCount); p += 64) { _mm_prefetch((const char*)(MemPtr) + p, (Hint)); } }
#else //_CPU_SSE
#define _MM_PREFETCH(MemPtr, Hint)
#define _MM_PREFETCH_LOOP(nCount, MemPtr, Hint)
#endif //_CPU_SSE
void cryMemcpy(void* Dst, const void* Src, int Count);
#if defined(LINUX) || defined(APPLE)
// Define this for Mac and Linux since it is used with the pthread sources
#define mymemcpy16 memcpy
#endif
//==========================================================================================
// 3DNow! optimizations
#if defined _CPU_X86 && !defined(LINUX) && !defined(APPLE)
// ***************************************************************************
inline void cryPrecacheSSE(const void* src, int nbytes)
{
    _asm
    {
        mov esi, src
        mov ecx, nbytes
        // 64 bytes per pass
        shr ecx, 6
        jz endLabel
loopMemToL1:
        prefetchnta 64[ESI]     // prefetch next loop, non-temporal
        prefetchnta 96[ESI]
        movq mm1, 0[ESI]        // read in source data
        movq mm2, 8[ESI]
        movq mm3, 16[ESI]
        movq mm4, 24[ESI]
        movq mm5, 32[ESI]
        movq mm6, 40[ESI]
        movq mm7, 48[ESI]
        movq mm0, 56[ESI]
        add esi, 64
        dec ecx
        jnz loopMemToL1
        emms
endLabel:
    }
}
#endif
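// A portable sketch of the same precache idea, assuming SSE intrinsics are
// available: issue one prefetch request per 64-byte cache line instead of MMX
// loads. 'precachePortable' is a hypothetical helper, not part of this file.
//
//     inline void precachePortable(const void* src, int nbytes)
//     {
//         const char* p = (const char*)src;
//         for (int i = 0; i < nbytes; i += 64)
//         {
//             _mm_prefetch(p + i, _MM_HINT_NTA); // non-temporal: avoid polluting the cache
//         }
//     }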
ILINE void cryPrefetchT0SSE(const void* src)
{
#if defined(WIN32) && !defined(WIN64)
    _asm
    {
        mov esi, src
        prefetcht0 [ESI]    // prefetch into all cache levels (T0 hint)
    }
#else
    _MM_PREFETCH((const char*)src, _MM_HINT_T0);
#endif
}
//=================================================================================
// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!
#define TINY_BLOCK_COPY 64 // Upper limit for movsd type copy.
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".
#define IN_CACHE_COPY 64 * 1024 // Upper limit for movq/movq copy w/SW prefetch.
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.
#define UNCACHED_COPY 197 * 1024 // Upper limit for movq/movntq w/SW prefetch.
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE".
#define BLOCK_PREFETCH_COPY infinity // No limit for movq/movntq w/block prefetch (documentation only; never referenced).
#define CACHEBLOCK 80h // Number of 64-byte blocks (cache lines) per block prefetch; MASM hex literal (128), usable only inside __asm.
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
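// Putting the thresholds together, the routine below amounts to this dispatch
// (a sketch of the control flow, not code from this file):
//
//     if      (Count < TINY_BLOCK_COPY) { /* unrolled movsd + rep movsb   */ }
//     else if (Count < IN_CACHE_COPY)   { /* MMX movq copy w/ prefetchnta */ }
//     else if (Count < UNCACHED_COPY)   { /* movq reads + movntq stores   */ }
//     else                              { /* block prefetch, then movntq  */ }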
#if defined _CPU_X86 && !defined(LINUX) && !defined(APPLE)
// Inline assembly syntax for use with Visual C++
inline void cryMemcpy(void* Dst, const void* Src, int Count)
{
    if (g_CpuFlags & CPUF_SSE)
    {
        __asm
        {
            mov ecx, [Count]    ; number of bytes to copy
            mov edi, [Dst]      ; destination
            mov esi, [Src]      ; source
            mov ebx, ecx        ; keep a copy of count

            cld
            cmp ecx, TINY_BLOCK_COPY
            jb $memcpy_ic_3     ; tiny? skip mmx copy

            cmp ecx, 32 * 1024      ; don't align between 32k-64k because
            jbe $memcpy_do_align    ;   it appears to be slower
            cmp ecx, 64 * 1024
            jbe $memcpy_align_done

$memcpy_do_align:
            mov ecx, 8          ; a trick that's faster than rep movsb...
            sub ecx, edi        ; align destination to qword
            and ecx, 111b       ; get the low bits
            sub ebx, ecx        ; update copy count
            neg ecx             ; set up to jump into the array
            add ecx, offset $memcpy_align_done
            jmp ecx             ; jump to array of movsb's

            align 4
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb

$memcpy_align_done:             ; destination is dword aligned
            mov ecx, ebx        ; number of bytes left to copy
            shr ecx, 6          ; get 64-byte block count
            jz $memcpy_ic_2     ; finish the last few bytes

            cmp ecx, IN_CACHE_COPY / 64     ; too big 4 cache? use uncached copy
            jae $memcpy_uc_test

            // This is small block copy that uses the MMX registers to copy 8 bytes
            // at a time. It uses the "unrolled loop" optimization, and also uses
            // the software prefetch instruction to get the data into the cache.
            align 16
$memcpy_ic_1:                   ; 64-byte block copies, in-cache copy
            prefetchnta [esi + (200 * 64 / 34 + 192)]   ; start reading ahead

            movq mm0, [esi + 0]     ; read 64 bits
            movq mm1, [esi + 8]
            movq [edi + 0], mm0     ; write 64 bits
            movq [edi + 8], mm1     ;   note: the normal movq writes the
            movq mm2, [esi + 16]    ;   data to cache; a cache line will be
            movq mm3, [esi + 24]    ;   allocated as needed, to store the data
            movq [edi + 16], mm2
            movq [edi + 24], mm3
            movq mm0, [esi + 32]
            movq mm1, [esi + 40]
            movq [edi + 32], mm0
            movq [edi + 40], mm1
            movq mm2, [esi + 48]
            movq mm3, [esi + 56]
            movq [edi + 48], mm2
            movq [edi + 56], mm3

            add esi, 64         ; update source pointer
            add edi, 64         ; update destination pointer
            dec ecx             ; count down
            jnz $memcpy_ic_1    ; last 64-byte block?

$memcpy_ic_2:
            mov ecx, ebx        ; has valid low 6 bits of the byte count
$memcpy_ic_3:
            shr ecx, 2          ; dword count
            and ecx, 1111b      ; only look at the "remainder" bits
            neg ecx             ; set up to jump into the array
            add ecx, offset $memcpy_last_few
            jmp ecx             ; jump to array of movsd's

$memcpy_uc_test:
            cmp ecx, UNCACHED_COPY / 64     ; big enough? use block prefetch copy
            jae $memcpy_bp_1
$memcpy_64_test:
            or ecx, ecx         ; tail end of block prefetch will jump here
            jz $memcpy_ic_2     ; no more 64-byte blocks left

            // For larger blocks, which will spill beyond the cache, it's faster to
            // use the Streaming Store instruction MOVNTQ. This write instruction
            // bypasses the cache and writes straight to main memory. This code also
            // uses the software prefetch instruction to pre-read the data.
            align 16
$memcpy_uc_1:                   ; 64-byte blocks, uncached copy
            prefetchnta [esi + (200 * 64 / 34 + 192)]   ; start reading ahead

            movq mm0, [esi + 0]     ; read 64 bits
            add edi, 64             ; update destination pointer
            movq mm1, [esi + 8]
            add esi, 64             ; update source pointer
            movq mm2, [esi - 48]
            movntq [edi - 64], mm0  ; write 64 bits, bypassing the cache
            movq mm0, [esi - 40]    ;   note: movntq also prevents the CPU
            movntq [edi - 56], mm1  ;   from READING the destination address
            movq mm1, [esi - 32]    ;   into the cache, only to be over-written
            movntq [edi - 48], mm2  ;   so that also helps performance
            movq mm2, [esi - 24]
            movntq [edi - 40], mm0
            movq mm0, [esi - 16]
            movntq [edi - 32], mm1
            movq mm1, [esi - 8]
            movntq [edi - 24], mm2
            movntq [edi - 16], mm0
            dec ecx
            movntq [edi - 8], mm1
            jnz $memcpy_uc_1    ; last 64-byte block?

            jmp $memcpy_ic_2    ; almost done

            // For the largest size blocks, a special technique called Block Prefetch
            // can be used to accelerate the read operations. Block Prefetch reads
            // one address per cache line, for a series of cache lines, in a short loop.
            // This is faster than using software prefetch. The technique is great for
            // getting maximum read bandwidth, especially in DDR memory systems.
$memcpy_bp_1:                   ; large blocks, block prefetch copy
            cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
            jl $memcpy_64_test  ; no, back to regular uncached copy

            mov eax, CACHEBLOCK / 2     ; block prefetch loop, unrolled 2X
            add esi, CACHEBLOCK * 64    ; move to the top of the block
            align 16
$memcpy_bp_2:
            mov edx, [esi - 64]     ; grab one address per cache line
            mov edx, [esi - 128]    ; grab one address per cache line
            sub esi, 128            ; go reverse order to suppress HW prefetcher
            dec eax                 ; count down the cache lines
            jnz $memcpy_bp_2        ; keep grabbing more lines into cache

            mov eax, CACHEBLOCK     ; now that it's in cache, do the copy
            align 16
$memcpy_bp_3:
            movq mm0, [esi]         ; read 64 bits
            movq mm1, [esi + 8]
            movq mm2, [esi + 16]
            movq mm3, [esi + 24]
            movq mm4, [esi + 32]
            movq mm5, [esi + 40]
            movq mm6, [esi + 48]
            movq mm7, [esi + 56]
            add esi, 64             ; update source pointer
            movntq [edi], mm0       ; write 64 bits, bypassing cache
            movntq [edi + 8], mm1   ;   note: movntq also prevents the CPU
            movntq [edi + 16], mm2  ;   from READING the destination address
            movntq [edi + 24], mm3  ;   into the cache, only to be over-written,
            movntq [edi + 32], mm4  ;   so that also helps performance
            movntq [edi + 40], mm5
            movntq [edi + 48], mm6
            movntq [edi + 56], mm7
            add edi, 64             ; update dest pointer
            dec eax                 ; count down
            jnz $memcpy_bp_3        ; keep copying
            sub ecx, CACHEBLOCK     ; update the 64-byte block count
            jmp $memcpy_bp_1        ; keep processing chunks

            // The smallest copy uses the X86 "movsd" instruction, in an optimized
            // form which is an "unrolled loop". Then it handles the last few bytes.
            align 4
            movsd
            movsd               ; perform last 1-15 dword copies
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd               ; perform last 1-7 dword copies
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd

$memcpy_last_few:               ; dword aligned from before movsd's
            mov ecx, ebx        ; has valid low 2 bits of the byte count
            and ecx, 11b        ; the last few cows must come home
            jz $memcpy_final    ; no more, let's leave
            rep movsb           ; the last 1, 2, or 3 bytes

$memcpy_final:
            emms                ; clean up the MMX state
            sfence              ; flush the write buffer
            // mov eax, [dest]  ; ret value = destination pointer
        }
    }
    else
    {
        memcpy(Dst, Src, Count);
    }
}
inline void cryPrefetch(const void* Src, int nCount)
{
    nCount >>= 6;   // one iteration per 64-byte cache line
    if (nCount > 0)
    {
        // Positive count: touch one dword per cache line, walking forward.
        _asm
        {
            mov esi, Src
            mov ecx, nCount
mPr0:
            align 16
            dec ecx
            mov eax, [esi]          ; touch the line to pull it into cache
            mov eax, 0
            lea esi, [esi + 40h]    ; advance one cache line
            jne mPr0
        }
    }
    else if (nCount < 0)
    {
        // Negative count: walk backward one cache line at a time
        // (a count of zero does nothing).
        _asm
        {
            mov esi, Src
            mov ecx, nCount
mPr1:
            align 16
            inc ecx
            mov eax, [esi]          ; touch the line to pull it into cache
            mov eax, 0
            lea esi, [esi - 40h]    ; step back one cache line
            jne mPr1
        }
    }
}
inline void cryMemcpy(void* inDst, const void* inSrc, int nCount, [[maybe_unused]] int nFlags)
{
    // The MC_* flags are accepted for interface parity and ignored here.
    cryMemcpy(inDst, inSrc, nCount);
}
//==========================================================================================
// SSE optimizations
#else
const int PREFNTA_BLOCK = 0x4000;
ILINE void cryMemcpy(void* Dst, const void* Src, int n)
{
    char* dst = (char*)Dst;
    const char* src = (const char*)Src;
    // Copy in PREFNTA_BLOCK-sized chunks, issuing a non-temporal prefetch over
    // each chunk just before memcpy touches it.
    while (n > PREFNTA_BLOCK)
    {
        _MM_PREFETCH_LOOP(PREFNTA_BLOCK, src, _MM_HINT_NTA);
        memcpy(dst, src, PREFNTA_BLOCK);
        src += PREFNTA_BLOCK;
        dst += PREFNTA_BLOCK;
        n -= PREFNTA_BLOCK;
    }
    _MM_PREFETCH_LOOP(n, src, _MM_HINT_NTA);
    memcpy(dst, src, n);
}
ILINE void cryMemcpy(void* Dst, const void* Src, int n, [[maybe_unused]] int nFlags)
{
    // Same as above; the flags are ignored on this path.
    cryMemcpy(Dst, Src, n);
}
#endif
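// Usage note (illustrative): on the paths visible in this file the nFlags
// argument is ignored, so the two calls below behave identically; restricted-
// platform overrides may consume flags such as MC_CPU_TO_GPU.
//
//     cryMemcpy(dst, src, size);
//     cryMemcpy(dst, src, size, MC_CPU_TO_CPU);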
#if defined(AZ_RESTRICTED_PLATFORM)
#define AZ_RESTRICTED_SECTION MEMORYACCESS_H_SECTION_CRYPREFETCH
#include AZ_RESTRICTED_FILE(MemoryAccess_h)
#endif
#if defined(AZ_RESTRICTED_SECTION_IMPLEMENTED)
#undef AZ_RESTRICTED_SECTION_IMPLEMENTED
#else
// Default implementation: bring one memory location into the L1 data cache.
ILINE void CryPrefetch(const void* const cpSrc)
{
cryPrefetchT0SSE(cpSrc);
}
#endif
#define CryPrefetchInl CryPrefetch
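// Usage sketch (illustrative): issue the prefetch well before the data is
// needed, then do unrelated work so the cache fill overlaps it. 'pNode',
// 'DoOtherWork' and 'Use' are hypothetical names.
//
//     CryPrefetch(pNode);     // request the line holding *pNode
//     DoOtherWork();          // independent work hides the miss
//     Use(pNode->value);      // likely a cache hit by now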