/*
 * Copyright (c) Contributors to the Open 3D Engine Project
 *
 * SPDX-License-Identifier: Apache-2.0 OR MIT
 *
 */

// Description : Misc mathematical functions

#pragma once

#include <platform.h>

// Section dictionary
#if defined(AZ_RESTRICTED_PLATFORM)
#undef AZ_RESTRICTED_SECTION
#define MEMORYACCESS_H_SECTION_TRAITS 1
#define MEMORYACCESS_H_SECTION_CRYPREFETCH 2
#endif

// Traits
#if defined(AZ_RESTRICTED_PLATFORM)
#define AZ_RESTRICTED_SECTION MEMORYACCESS_H_SECTION_TRAITS
#include AZ_RESTRICTED_FILE(MemoryAccess_h)
#else
#define MEMORYACCESS_H_TRAIT_USE_LEGACY_PREFETCHLINE 1
#endif

#if MEMORYACCESS_H_TRAIT_USE_LEGACY_PREFETCHLINE
#define PrefetchLine(ptr, off) cryPrefetchT0SSE((void*)((UINT_PTR)ptr + off))
#else
#define PrefetchLine(ptr, off) (void)(0)
#endif
#define ResetLine128(ptr, off) (void)(0)
#define FlushLine128(ptr, off) (void)(0)
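
// Example (illustrative, not part of the original header): the usual pattern is
// to prefetch the next element's cache line while working on the current one.
// The loop, items, and Process() below are hypothetical; PrefetchLine expands
// to a no-op on platforms without the legacy prefetch trait.
//
//     for (int i = 0; i < count - 1; ++i)
//     {
//         PrefetchLine(&items[i + 1], 0); // warm the next element's line
//         Process(items[i]);              // work on the current element
//     }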

//========================================================================================

// cryMemcpy flags
#define MC_CPU_TO_GPU 0x10
#define MC_GPU_TO_CPU 0x20
#define MC_CPU_TO_CPU 0x40
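
// Example (illustrative): the four-argument cryMemcpy overload declared later
// in this header takes one of these values to describe the transfer direction.
// On the code paths in this file the flag is informational only and the copy
// behaves like the plain overload. The buffer names are hypothetical.
//
//     cryMemcpy(gpuStagingBuffer, cpuData, dataSize, MC_CPU_TO_GPU);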

extern int g_CpuFlags;

// CPU feature flag bits for g_CpuFlags
#define CPUF_SSE 0x01
#define CPUF_SSE2 0x02
#define CPUF_3DNOW 0x04
#define CPUF_MMX 0x08
#define CPUF_SSE3 0x10
#define CPUF_F16C 0x20
#define CPUF_SSE41 0x40
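
// Example (illustrative): code that wants an optimized path tests the detected
// feature bits first, exactly as cryMemcpy below gates its asm path on CPUF_SSE.
//
//     if (g_CpuFlags & CPUF_SSE2)
//     {
//         // take an SSE2-optimized path
//     }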

#ifdef _CPU_SSE

#ifdef _CPU_X86
#include <xmmintrin.h>
#endif

#define _MM_PREFETCH(MemPtr, Hint) _mm_prefetch((MemPtr), (Hint));
#define _MM_PREFETCH_LOOP(nCount, MemPtr, Hint) { for (int p = 0; p < nCount; p += 64) { _mm_prefetch((const char*)(MemPtr) + p, Hint); } }
#else //_CPU_SSE
#define _MM_PREFETCH(MemPtr, Hint)
#define _MM_PREFETCH_LOOP(nCount, MemPtr, Hint)
#endif //_CPU_SSE
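
// Example (illustrative): _MM_PREFETCH_LOOP issues one prefetch per 64-byte
// cache line across a range, and compiles away entirely when _CPU_SSE is not
// defined. The buffer below is hypothetical; this mirrors how the portable
// cryMemcpy at the bottom of this header uses the macro.
//
//     _MM_PREFETCH_LOOP(bufferSize, bufferPtr, _MM_HINT_NTA);
//     memcpy(dst, bufferPtr, bufferSize);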

void cryMemcpy(void* Dst, const void* Src, int Count);
#if defined(LINUX) || defined(APPLE)
// Define this for Mac and Linux since it is used with the pthread sources
#define mymemcpy16 memcpy
#endif

//==========================================================================================
// 3DNow! optimizations

#if defined _CPU_X86 && !defined(LINUX) && !defined(APPLE)
// ***************************************************************************
// Walks the source buffer 64 bytes per pass, prefetching ahead and reading
// each 8-byte chunk into the MMX registers to pull the data into cache.
inline void cryPrecacheSSE(const void* src, int nbytes)
{
    _asm
    {
        mov esi, src
        mov ecx, nbytes
        // 64 bytes per pass
        shr ecx, 6
        jz endLabel

loopMemToL1:
        prefetchnta 64[ESI]     // Prefetch next loop, non-temporal
        prefetchnta 96[ESI]

        movq mm1, 0[ESI]        // Read in source data
        movq mm2, 8[ESI]
        movq mm3, 16[ESI]
        movq mm4, 24[ESI]
        movq mm5, 32[ESI]
        movq mm6, 40[ESI]
        movq mm7, 48[ESI]
        movq mm0, 56[ESI]

        add esi, 64
        dec ecx
        jnz loopMemToL1

        emms

endLabel:
    }
}
#endif

ILINE void cryPrefetchT0SSE(const void* src)
{
#if defined(WIN32) && !defined(WIN64)
    _asm
    {
        mov esi, src
        prefetcht0 [ESI]        // Prefetch into all cache levels
    }
#else
    _MM_PREFETCH((char*)src, _MM_HINT_T0);
#endif
}

//=================================================================================

// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY 64 // Upper limit for movsd type copy.
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 64 * 1024 // Upper limit for movq/movq copy w/SW prefetch.
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024 // Upper limit for movq/movntq w/SW prefetch.
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE".

#define BLOCK_PREFETCH_COPY infinity // No limit for movq/movntq w/block prefetch (documentation only; would not compile if expanded).
#define CACHEBLOCK 80h // Number of 64-byte blocks (cache lines) for block prefetch (MASM hex for 128; valid only inside __asm blocks).
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
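
// Sketch (illustrative, not compiled here): the size-based dispatch the
// assembly below implements, expressed in C. Names such as CACHEBLOCK_LINES
// are hypothetical; the real routine also aligns the destination to a qword
// boundary and handles sub-dword tails with rep movsb.
//
//     if (count < TINY_BLOCK_COPY)        { /* unrolled movsd tail copy     */ }
//     else if (count < IN_CACHE_COPY)     { /* movq loop + prefetchnta      */ }
//     else if (count < UNCACHED_COPY)     { /* movq reads, movntq writes    */ }
//     else // block prefetch
//     {
//         // Touch one address per cache line, in reverse order, CACHEBLOCK
//         // lines at a time, then stream the warmed block out with movntq:
//         for (int line = CACHEBLOCK_LINES - 1; line >= 0; --line)
//         {
//             (void)*(volatile const char*)(src + line * 64);
//         }
//         /* ...then copy the warmed lines with movq/movntq... */
//     }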
#if defined _CPU_X86 && !defined(LINUX) && !defined(APPLE)
// Inline assembly syntax for use with Visual C++
inline void cryMemcpy(void* Dst, const void* Src, int Count)
{
    if (g_CpuFlags & CPUF_SSE)
    {
        __asm
        {
            mov ecx, [Count]            // number of bytes to copy
            mov edi, [Dst]              // destination
            mov esi, [Src]              // source
            mov ebx, ecx                // keep a copy of count

            cld
            cmp ecx, TINY_BLOCK_COPY
            jb $memcpy_ic_3             // tiny? skip mmx copy

            cmp ecx, 32 * 1024          // don't align between 32k - 64k because
            jbe $memcpy_do_align        //   it appears to be slower
            cmp ecx, 64 * 1024
            jbe $memcpy_align_done
$memcpy_do_align:
            mov ecx, 8                  // a trick that's faster than rep movsb...
            sub ecx, edi                // align destination to qword
            and ecx, 111b               // get the low bits
            sub ebx, ecx                // update copy count
            neg ecx                     // set up to jump into the array
            add ecx, offset $memcpy_align_done
            jmp ecx                     // jump to array of movsb's

            align 4
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb

$memcpy_align_done:                     // destination is dword aligned
            mov ecx, ebx                // number of bytes left to copy
            shr ecx, 6                  // get 64-byte block count
            jz $memcpy_ic_2             // finish the last few bytes

            cmp ecx, IN_CACHE_COPY / 64 // too big for cache? use uncached copy
            jae $memcpy_uc_test

            // This is a small block copy that uses the MMX registers to copy 8 bytes
            // at a time. It uses the "unrolled loop" optimization, and also uses
            // the software prefetch instruction to get the data into the cache.
            align 16
$memcpy_ic_1:                           // 64-byte block copies, in-cache copy

            prefetchnta [esi + (200 * 64 / 34 + 192)] // start reading ahead

            movq mm0, [esi + 0]         // read 64 bits
            movq mm1, [esi + 8]
            movq [edi + 0], mm0         // write 64 bits
            movq [edi + 8], mm1         //   note: the normal movq writes the
            movq mm2, [esi + 16]        //   data to cache; a cache line will be
            movq mm3, [esi + 24]        //   allocated as needed, to store the data
            movq [edi + 16], mm2
            movq [edi + 24], mm3
            movq mm0, [esi + 32]
            movq mm1, [esi + 40]
            movq [edi + 32], mm0
            movq [edi + 40], mm1
            movq mm2, [esi + 48]
            movq mm3, [esi + 56]
            movq [edi + 48], mm2
            movq [edi + 56], mm3

            add esi, 64                 // update source pointer
            add edi, 64                 // update destination pointer
            dec ecx                     // count down
            jnz $memcpy_ic_1            // last 64-byte block?

$memcpy_ic_2:
            mov ecx, ebx                // has valid low 6 bits of the byte count
$memcpy_ic_3:
            shr ecx, 2                  // dword count
            and ecx, 1111b              // only look at the "remainder" bits
            neg ecx                     // set up to jump into the array
            add ecx, offset $memcpy_last_few
            jmp ecx                     // jump to array of movsd's

$memcpy_uc_test:
            cmp ecx, UNCACHED_COPY / 64 // big enough? use block prefetch copy
            jae $memcpy_bp_1

$memcpy_64_test:
            or ecx, ecx                 // tail end of block prefetch will jump here
            jz $memcpy_ic_2             // no more 64-byte blocks left

            // For larger blocks, which will spill beyond the cache, it's faster to
            // use the Streaming Store instruction MOVNTQ. This write instruction
            // bypasses the cache and writes straight to main memory. This code also
            // uses the software prefetch instruction to pre-read the data.
            align 16
$memcpy_uc_1:                           // 64-byte blocks, uncached copy

            prefetchnta [esi + (200 * 64 / 34 + 192)] // start reading ahead

            movq mm0, [esi + 0]         // read 64 bits
            add edi, 64                 // update destination pointer
            movq mm1, [esi + 8]
            add esi, 64                 // update source pointer
            movq mm2, [esi - 48]
            movntq [edi - 64], mm0      // write 64 bits, bypassing the cache
            movq mm0, [esi - 40]        //   note: movntq also prevents the CPU
            movntq [edi - 56], mm1      //   from READING the destination address
            movq mm1, [esi - 32]        //   into the cache, only to be over-written
            movntq [edi - 48], mm2      //   so that also helps performance
            movq mm2, [esi - 24]
            movntq [edi - 40], mm0
            movq mm0, [esi - 16]
            movntq [edi - 32], mm1
            movq mm1, [esi - 8]
            movntq [edi - 24], mm2
            movntq [edi - 16], mm0
            dec ecx
            movntq [edi - 8], mm1
            jnz $memcpy_uc_1            // last 64-byte block?

            jmp $memcpy_ic_2            // almost done

            // For the largest size blocks, a special technique called Block Prefetch
            // can be used to accelerate the read operations. Block Prefetch reads
            // one address per cache line, for a series of cache lines, in a short loop.
            // This is faster than using software prefetch. The technique is great for
            // getting maximum read bandwidth, especially in DDR memory systems.
$memcpy_bp_1:                           // large blocks, block prefetch copy

            cmp ecx, CACHEBLOCK         // big enough to run another prefetch loop?
            jl $memcpy_64_test          // no, back to regular uncached copy

            mov eax, CACHEBLOCK / 2     // block prefetch loop, unrolled 2X
            add esi, CACHEBLOCK * 64    // move to the top of the block
            align 16
$memcpy_bp_2:
            mov edx, [esi - 64]         // grab one address per cache line
            mov edx, [esi - 128]        // grab one address per cache line
            sub esi, 128                // go reverse order to suppress HW prefetcher
            dec eax                     // count down the cache lines
            jnz $memcpy_bp_2            // keep grabbing more lines into cache

            mov eax, CACHEBLOCK         // now that it's in cache, do the copy
            align 16
$memcpy_bp_3:
            movq mm0, [esi]             // read 64 bits
            movq mm1, [esi + 8]
            movq mm2, [esi + 16]
            movq mm3, [esi + 24]
            movq mm4, [esi + 32]
            movq mm5, [esi + 40]
            movq mm6, [esi + 48]
            movq mm7, [esi + 56]
            add esi, 64                 // update source pointer
            movntq [edi], mm0           // write 64 bits, bypassing cache
            movntq [edi + 8], mm1       //   note: movntq also prevents the CPU
            movntq [edi + 16], mm2      //   from READING the destination address
            movntq [edi + 24], mm3      //   into the cache, only to be over-written,
            movntq [edi + 32], mm4      //   so that also helps performance
            movntq [edi + 40], mm5
            movntq [edi + 48], mm6
            movntq [edi + 56], mm7
            add edi, 64                 // update dest pointer

            dec eax                     // count down

            jnz $memcpy_bp_3            // keep copying
            sub ecx, CACHEBLOCK         // update the 64-byte block count
            jmp $memcpy_bp_1            // keep processing chunks

            // The smallest copy uses the X86 "movsd" instruction, in an optimized
            // form which is an "unrolled loop". Then it handles the last few bytes.
            align 4
            movsd
            movsd                       // perform last 1 - 15 dword copies
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd                       // perform last 1 - 7 dword copies
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd

$memcpy_last_few:                       // dword aligned from before movsd's
            mov ecx, ebx                // has valid low 2 bits of the byte count
            and ecx, 11b                // the last few cows must come home
            jz $memcpy_final            // no more, let's leave
            rep movsb                   // the last 1, 2, or 3 bytes

$memcpy_final:
            emms                        // clean up the MMX state
            sfence                      // flush the write buffer
            // mov eax, [dest]          // ret value = destination pointer
        }
    }
    else
    {
        memcpy(Dst, Src, Count);
    }
}

inline void cryPrefetch(const void* Src, int nCount)
{
    nCount >>= 6;               // number of 64-byte cache lines
    if (nCount == 0)
    {
        return;                 // guard: a zero count would underflow the loop counters below
    }
    if (nCount > 0)
    {
        _asm
        {
            mov esi, Src
            mov ecx, nCount
mPr0:
            align 16
            dec ecx
            mov eax, [esi]      // touch one address per cache line
            mov eax, 0
            lea esi, [esi + 40h]
            jne mPr0
        }
    }
    else                        // negative count: walk the lines backwards
    {
        _asm
        {
            mov esi, Src
            mov ecx, nCount
mPr1:
            align 16
            inc ecx
            mov eax, [esi]      // touch one address per cache line
            mov eax, 0
            lea esi, [esi - 40h]
            jne mPr1
        }
    }
}

inline void cryMemcpy(void* inDst, const void* inSrc, int nCount, [[maybe_unused]] int nFlags)
{
    cryMemcpy(inDst, inSrc, nCount);
}

//==========================================================================================
// SSE optimizations

#else

const int PREFNTA_BLOCK = 0x4000;
ILINE void cryMemcpy(void* Dst, const void* Src, int n)
{
    char* dst = (char*)Dst;
    const char* src = (const char*)Src;
    while (n > PREFNTA_BLOCK)
    {
        _MM_PREFETCH_LOOP(PREFNTA_BLOCK, src, _MM_HINT_NTA);

        memcpy(dst, src, PREFNTA_BLOCK);
        src += PREFNTA_BLOCK;
        dst += PREFNTA_BLOCK;
        n -= PREFNTA_BLOCK;
    }
    _MM_PREFETCH_LOOP(n, src, _MM_HINT_NTA);
    memcpy(dst, src, n);
}

ILINE void cryMemcpy(void* Dst, const void* Src, int n, [[maybe_unused]] int nFlags)
{
    char* dst = (char*)Dst;
    const char* src = (const char*)Src;
    while (n > PREFNTA_BLOCK)
    {
        _MM_PREFETCH_LOOP(PREFNTA_BLOCK, src, _MM_HINT_NTA);
        memcpy(dst, src, PREFNTA_BLOCK);
        src += PREFNTA_BLOCK;
        dst += PREFNTA_BLOCK;
        n -= PREFNTA_BLOCK;
    }
    _MM_PREFETCH_LOOP(n, src, _MM_HINT_NTA);
    memcpy(dst, src, n);
}

#endif

#if defined(AZ_RESTRICTED_PLATFORM)
#define AZ_RESTRICTED_SECTION MEMORYACCESS_H_SECTION_CRYPREFETCH
#include AZ_RESTRICTED_FILE(MemoryAccess_h)
#endif
#if defined(AZ_RESTRICTED_SECTION_IMPLEMENTED)
#undef AZ_RESTRICTED_SECTION_IMPLEMENTED
#else
// Default implementation: bring one memory location into the L1 data cache.
ILINE void CryPrefetch(const void* const cpSrc)
{
    cryPrefetchT0SSE(cpSrc);
}
#endif

#define CryPrefetchInl CryPrefetch
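
// Example (illustrative): CryPrefetch is the portable entry point; issue it a
// little ahead of the access you are about to make so the load overlaps useful
// work. The node list and Consume() below are hypothetical.
//
//     while (node)
//     {
//         CryPrefetch(node->next);    // warm the next node's cache line
//         Consume(node);              // work on the current node meanwhile
//         node = node->next;
//     }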