You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
o3de/Code/CryEngine/Cry3DEngine/VMath_SSE.hpp

331 lines
8.7 KiB
C++

/*
* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
* its licensors.
*
* For complete copyright and license terms please see the LICENSE at the root of this
* distribution (the "License"). All use of this software is governed by the License,
* or, if provided, by the license below or the license accompanying this file. Do not
* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
*/
// Original file Copyright Crytek GMBH or its affiliates, used under license.
// Description : unified vector math lib
#ifndef __D_VMATH_SSE__
#define __D_VMATH_SSE__
// NOTE(review): presumably suppresses the interlocked-intrinsic declarations of the MSVC <memory> header — confirm.
#define _DO_NOT_DECLARE_INTERLOCKED_INTRINSICS_IN_MEMORY
// The SSE4.1 intrinsics header is not included from here (the include below is disabled);
// __m128 and the _mm_* intrinsics have to be visible to the including translation unit already.
//#include <smmintrin.h>
// vec4: the 4 x 32-bit SSE register type every function in this file operates on.
typedef __m128 vec4;
// NOTE(review): presumably declares the functions below up front, since some of them are used
// before their definition (e.g. int32Tofloat inside ExtractByteToFloat) — confirm.
#include "VMath_Prototypes.hpp"
// Generator macros for the ESwizzleMask enumerators.
//
// SWIZZLEMASK1 expands (through SWIZZLEMASK2 -> SWIZZLEMASK3 -> SWIZZLEMASK4) into all
// 4^4 = 256 four-letter names over {x, y, z, w}: xxxx, xxxy, ..., wwww. Each level appends one
// letter to the name prefix N by token pasting and forwards the matching component index.
//
// SWIZZLEMASK4 finally emits four "name = _MM_SHUFFLE(...)," enumerators. The i-th letter of a
// name is the source component that ends up in result lane i, e.g. xyzw == _MM_SHUFFLE(3, 2, 1, 0).
// Every emitted enumerator ends with a comma, so the expansion is only valid inside an enum body.
#define SWIZZLEMASK4(N, X, Y, Z) N##x = _MM_SHUFFLE(0, Z, Y, X), \
N##y = _MM_SHUFFLE(1, Z, Y, X), \
N##z = _MM_SHUFFLE(2, Z, Y, X), \
N##w = _MM_SHUFFLE(3, Z, Y, X),
#define SWIZZLEMASK3(N, X, Y) SWIZZLEMASK4(N##x, X, Y, 0) \
SWIZZLEMASK4(N##y, X, Y, 1) \
SWIZZLEMASK4(N##z, X, Y, 2) \
SWIZZLEMASK4(N##w, X, Y, 3)
#define SWIZZLEMASK2(N, X) SWIZZLEMASK3(N##x, X, 0) \
SWIZZLEMASK3(N##y, X, 1) \
SWIZZLEMASK3(N##z, X, 2) \
SWIZZLEMASK3(N##w, X, 3)
#define SWIZZLEMASK1 SWIZZLEMASK2(x, 0) \
SWIZZLEMASK2(y, 1) \
SWIZZLEMASK2(z, 2) \
SWIZZLEMASK2(w, 3)
// Compile-time shuffle masks used as the template argument of Shuffle<> / Swizzle<>.
// The enumerators (xxxx ... wwww) are produced by the SWIZZLEMASK1 macro; each value is the
// _MM_SHUFFLE immediate for the component order spelled out by its name.
enum ESwizzleMask
{
SWIZZLEMASK1
};
// Cache-level selector passed as the template argument of Prefetch<>.
// Each enumerator aliases one of the SSE prefetch hint constants.
enum ECacheLvl
{
ECL_LVL1 = _MM_HINT_T0, // maps to the T0 prefetch hint
ECL_LVL2 = _MM_HINT_T1, // maps to the T1 prefetch hint
ECL_LVL3 = _MM_HINT_T2, // maps to the T2 prefetch hint
};
// Single-bit flags, one per vector component: x = bit 0, y = bit 1, z = bit 2, w = bit 3.
// NOTE(review): this layout matches the per-lane bits returned by SignMask() and consumed by
// SelectStatic<>; presumably the flags are meant to be combined/tested against those — confirm.
#define BitX 1
#define BitY 2
#define BitZ 4
#define BitW 8
// Broadcasts one scalar into all four lanes of a vec4.
ILINE vec4 Vec4(float x)
{
    const vec4 broadcast = _mm_set1_ps(x);
    return broadcast;
}
// Builds a vec4 from four scalars: x goes to lane 0, y to lane 1, z to lane 2, w to lane 3.
ILINE vec4 Vec4(float x, float y, float z, float w)
{
    // _mm_setr_ps takes its arguments in lane order (lowest lane first).
    const vec4 packed = _mm_setr_ps(x, y, z, w);
    return packed;
}
// Builds a vec4 whose lanes carry the raw bit patterns of four 32-bit integers
// (x in lane 0, y in lane 1, z in lane 2, w in lane 3). No integer-to-float value
// conversion takes place; the bits are copied verbatim.
//
// The integers are packed in the integer domain and bit-cast to vec4 afterwards. Reading a
// uint32 object through a float* (as a pointer cast would) violates the strict-aliasing rule
// and passes arbitrary bit patterns (e.g. all-ones masks, which are NaNs) through scalar float
// values; the integer path avoids both.
ILINE vec4 Vec4(uint32 x, uint32 y, uint32 z, uint32 w)
{
    // _mm_set_epi32 takes the highest lane first, hence the reversed argument order.
    const __m128i packedBits = _mm_set_epi32(static_cast<int>(w),
                                             static_cast<int>(z),
                                             static_cast<int>(y),
                                             static_cast<int>(x));
    return _mm_castsi128_ps(packedBits);
}
// Returns lane Idx of V as a float.
// Idx is used directly as an array index into four elements and is not range-checked,
// so callers must pass a value in 0..3.
ILINE float Vec4float(vec4 V, uint32 Idx)
{
    float lanes[4];
    _mm_storeu_ps(lanes, V); // spill the register so a run-time index can address it
    return lanes[Idx];
}
// Returns lane Idx (compile-time constant) of V as a float.
// With VEC4_SSE4 defined the lane is read with _MM_EXTRACT_FLOAT; otherwise the call is
// forwarded to the run-time indexed overload Vec4float(V, Idx).
template<int Idx>
ILINE float Vec4float(vec4 V)
{
#if !defined(VEC4_SSE4)
    return Vec4float(V, Idx);
#else
    float lane;
    _MM_EXTRACT_FLOAT(lane, V, Idx);
    return lane;
#endif
}
// Returns the raw 32 bits of lane Idx (compile-time constant) of V as an int32.
// With VEC4_SSE4 defined the lane is read with _mm_extract_ps; otherwise the call is
// forwarded to the run-time indexed overload Vec4int32(V, Idx).
template<int Idx>
ILINE int32 Vec4int32(vec4 V)
{
#if !defined(VEC4_SSE4)
    return Vec4int32(V, Idx);
#else
    return _mm_extract_ps(V, Idx);
#endif
}
// Returns the raw 32 bits of lane Idx of V, reinterpreted as an int32 (no float-to-int
// value conversion). Idx is used directly as an array index into four elements and is not
// range-checked, so callers must pass a value in 0..3.
ILINE int32 Vec4int32(vec4 V, uint32 Idx)
{
    int32 lanes[4];
    // Spill the register bit-for-bit so a run-time index can address it.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes), _mm_castps_si128(V));
    return lanes[Idx];
}
// Returns a vec4 with all four lanes set to 0.0f (all bits clear).
ILINE vec4 Vec4Zero()
{
    const vec4 zero = _mm_setzero_ps();
    return zero;
}
// Returns a vec4 with all four lanes set to 1.0f.
ILINE vec4 Vec4One()
{
    const vec4 ones = _mm_set1_ps(1.f);
    return ones;
}
// Returns a vec4 with all four lanes set to 4.0f.
ILINE vec4 Vec4Four()
{
    const vec4 fours = _mm_set1_ps(4.f);
    return fours;
}
// Returns the vec4 (0, 1, 2, 3): lane i holds the value float(i).
ILINE vec4 Vec4ZeroOneTwoThree()
{
    // _mm_setr_ps takes its arguments in lane order (lowest lane first).
    const vec4 laneIndices = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
    return laneIndices;
}
// Returns a vec4 with every bit set (0xFFFFFFFF in each lane).
ILINE vec4 Vec4FFFFFFFF()
{
    // Comparing zero with itself for equality is true in every lane, and a true
    // comparison result is an all-ones lane.
    const vec4 zero = _mm_setzero_ps();
    const vec4 allBitsSet = _mm_cmpeq_ps(zero, zero);
    return allBitsSet;
}
// Returns a vec4 with all four lanes set to FLT_EPSILON.
ILINE vec4 Vec4Epsilon()
{
    const vec4 epsilons = _mm_set1_ps(FLT_EPSILON);
    return epsilons;
}
// Issues a prefetch for the memory at pData using the hint selected by L
// (see ECacheLvl). The hint is cast to _mm_hint on non-clang LINUX builds and to int
// everywhere else.
template<ECacheLvl L>
ILINE void Prefetch(const void* pData)
{
    const char* const pAddress = reinterpret_cast<const char*>(pData);
#if !defined(LINUX) || defined(__clang__)
    _mm_prefetch(pAddress, static_cast<int>(L));
#else
    _mm_prefetch(pAddress, static_cast<_mm_hint>(L));
#endif
}
// Combines two vectors with the compile-time shuffle mask M.
// Following _mm_shuffle_ps, result lanes 0 and 1 are picked from V0 and result lanes 2 and 3
// from V1; which component is picked for each lane is spelled out by the mask name.
template<ESwizzleMask M>
ILINE vec4 Shuffle(vec4 V0, vec4 V1)
{
    const vec4 shuffled = _mm_shuffle_ps(V0, V1, M);
    return shuffled;
}
// Reorders the components of a single vector according to the compile-time mask M,
// i.e. a shuffle in which both sources are V.
template<ESwizzleMask M>
ILINE vec4 Swizzle(vec4 V)
{
    const vec4 reordered = _mm_shuffle_ps(V, V, M);
    return reordered;
}
// Expands the 16 bytes stored in VIn into 16 floats.
// Every byte is treated as a signed 8-bit integer and converted to float:
//   rVOut0 <- bytes 0..3, rVOut1 <- bytes 4..7, rVOut2 <- bytes 8..11, rVOut3 <- bytes 12..15.
ILINE void ExtractByteToFloat(vec4& rVOut0, vec4& rVOut1, vec4& rVOut2, vec4& rVOut3, vec4 VIn)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i packedBytes = _mm_castps_si128(VIn);

    // Step 1: interleave with zero bytes, widening each byte to a 16-bit lane.
    const __m128i low8 = _mm_unpacklo_epi8(packedBytes, zero);
    const __m128i high8 = _mm_unpackhi_epi8(packedBytes, zero);

    // Step 2: interleave with zero bytes again. The odd bytes of the inputs are already zero,
    // so this leaves one source byte in the low byte of each 32-bit lane.
    __m128i quad0 = _mm_unpacklo_epi8(low8, zero);
    __m128i quad1 = _mm_unpackhi_epi8(low8, zero);
    __m128i quad2 = _mm_unpacklo_epi8(high8, zero);
    __m128i quad3 = _mm_unpackhi_epi8(high8, zero);

    // Step 3: sign-extend the low byte of every lane (shift up, arithmetic shift back down).
    quad0 = _mm_srai_epi32(_mm_slli_epi32(quad0, 24), 24);
    quad1 = _mm_srai_epi32(_mm_slli_epi32(quad1, 24), 24);
    quad2 = _mm_srai_epi32(_mm_slli_epi32(quad2, 24), 24);
    quad3 = _mm_srai_epi32(_mm_slli_epi32(quad3, 24), 24);

    // Step 4: integer -> float value conversion.
    rVOut0 = _mm_cvtepi32_ps(quad0);
    rVOut1 = _mm_cvtepi32_ps(quad1);
    rVOut2 = _mm_cvtepi32_ps(quad2);
    rVOut3 = _mm_cvtepi32_ps(quad3);
}
// Lane-wise sum: result[i] = V0[i] + V1[i].
ILINE vec4 Add(vec4 V0, vec4 V1)
{
    const vec4 sum = _mm_add_ps(V0, V1);
    return sum;
}
// Lane-wise difference: result[i] = V0[i] - V1[i].
ILINE vec4 Sub(vec4 V0, vec4 V1)
{
    const vec4 difference = _mm_sub_ps(V0, V1);
    return difference;
}
// Lane-wise product: result[i] = V0[i] * V1[i].
ILINE vec4 Mul(vec4 V0, vec4 V1)
{
    const vec4 product = _mm_mul_ps(V0, V1);
    return product;
}
// Lane-wise quotient at full precision: result[i] = V0[i] / V1[i].
ILINE vec4 Div(vec4 V0, vec4 V1)
{
    const vec4 quotient = _mm_div_ps(V0, V1);
    return quotient;
}
// Lane-wise approximate reciprocal (1 / V[i]) using the hardware estimate instruction.
// Trades accuracy for speed; use Rcp() when an exact division is required.
ILINE vec4 RcpFAST(vec4 V)
{
    const vec4 estimate = _mm_rcp_ps(V);
    return estimate;
}
// Lane-wise approximate quotient: V0 multiplied by the reciprocal estimate of V1.
// Inherits the reduced accuracy of the reciprocal estimate.
ILINE vec4 DivFAST(vec4 V0, vec4 V1)
{
    const vec4 reciprocalEstimate = _mm_rcp_ps(V1);
    return _mm_mul_ps(V0, reciprocalEstimate);
}
// Lane-wise reciprocal computed with a real division: result[i] = 1.0f / V[i].
ILINE vec4 Rcp(vec4 V)
{
    const vec4 ones = _mm_set_ps(1.f, 1.f, 1.f, 1.f);
    return _mm_div_ps(ones, V);
}
// Lane-wise multiply-add: result[i] = V0[i] * V1[i] + V2[i].
// Implemented as a separate multiply followed by an add (not a fused operation).
ILINE vec4 Madd(vec4 V0, vec4 V1, vec4 V2)
{
    const vec4 product = _mm_mul_ps(V0, V1);
    return _mm_add_ps(V2, product);
}
// Lane-wise multiply-subtract: result[i] = V0[i] * V1[i] - V2[i].
// Implemented as a separate multiply followed by a subtract (not a fused operation).
ILINE vec4 Msub(vec4 V0, vec4 V1, vec4 V2)
{
    const vec4 product = _mm_mul_ps(V0, V1);
    return _mm_sub_ps(product, V2);
}
// Lane-wise minimum of V0 and V1.
ILINE vec4 Min(vec4 V0, vec4 V1)
{
    const vec4 smaller = _mm_min_ps(V0, V1);
    return smaller;
}
// Lane-wise maximum of V0 and V1.
ILINE vec4 Max(vec4 V0, vec4 V1)
{
    const vec4 larger = _mm_max_ps(V0, V1);
    return larger;
}
// Converts each float lane of V to a 32-bit signed integer, truncating toward zero.
// The integer results are returned bit-for-bit inside a vec4 (lanes hold integers, not floats).
ILINE vec4 floatToint32(vec4 V)
{
    const __m128i truncated = _mm_cvttps_epi32(V);
    return _mm_castsi128_ps(truncated);
}
// Interprets each lane of V as a 32-bit signed integer and converts it to the float
// of the same value.
ILINE vec4 int32Tofloat(vec4 V)
{
    const __m128i asIntegers = _mm_castps_si128(V);
    return _mm_cvtepi32_ps(asIntegers);
}
// Lane-wise "less than or equal" comparison.
// A lane of the result is all ones where V0[i] <= V1[i] and all zeros otherwise.
ILINE vec4 CmpLE(vec4 V0, vec4 V1)
{
    const vec4 laneMask = _mm_cmple_ps(V0, V1);
    return laneMask;
}
// Lane-wise equality comparison.
// A lane of the result is all ones where V0[i] == V1[i] and all zeros otherwise.
ILINE vec4 CmpEq(vec4 V0, vec4 V1)
{
    const vec4 laneMask = _mm_cmpeq_ps(V0, V1);
    return laneMask;
}
// Collects the sign bit of every lane into the low four bits of the result:
// bit i is set when the most significant bit of lane i is set.
ILINE uint32 SignMask(vec4 V)
{
    const int signBits = _mm_movemask_ps(V);
    return signBits;
}
// Bitwise AND of the 128 bits of V0 and V1.
ILINE vec4 And(vec4 V0, vec4 V1)
{
    const __m128i lhsBits = _mm_castps_si128(V0);
    const __m128i rhsBits = _mm_castps_si128(V1);
    return _mm_castsi128_ps(_mm_and_si128(lhsBits, rhsBits));
}
// Bitwise AND-NOT: (~V0) & V1. Note that it is the FIRST operand that gets inverted.
ILINE vec4 AndNot(vec4 V0, vec4 V1)
{
    const __m128i invertedOperand = _mm_castps_si128(V0);
    const __m128i keptOperand = _mm_castps_si128(V1);
    return _mm_castsi128_ps(_mm_andnot_si128(invertedOperand, keptOperand));
}
// Bitwise OR of the 128 bits of V0 and V1.
ILINE vec4 Or(vec4 V0, vec4 V1)
{
    const __m128i lhsBits = _mm_castps_si128(V0);
    const __m128i rhsBits = _mm_castps_si128(V1);
    return _mm_castsi128_ps(_mm_or_si128(lhsBits, rhsBits));
}
// Bitwise exclusive OR of the 128 bits of V0 and V1.
ILINE vec4 Xor(vec4 V0, vec4 V1)
{
    const __m128i lhsBits = _mm_castps_si128(V0);
    const __m128i rhsBits = _mm_castps_si128(V1);
    return _mm_castsi128_ps(_mm_xor_si128(lhsBits, rhsBits));
}
// Arithmetic (sign-preserving) right shift of every lane, treating each lane as a 32-bit
// signed integer. ShiftAR(V, 31) therefore fills each lane with copies of its sign bit.
ILINE vec4 ShiftAR(vec4 V, uint32 Count)
{
    const __m128i laneBits = _mm_castps_si128(V);
    const __m128i shifted = _mm_srai_epi32(laneBits, Count);
    return _mm_castsi128_ps(shifted);
}
// Generic fallback of Splat<INDEX>: only reached for an INDEX without an explicit
// specialization (0..3 are specialized below). It raises an assertion and hands back an
// all-bits-set vector; V is not used.
template<int INDEX>
ILINE vec4 Splat(vec4 V)
{
    CRY_ASSERT_MESSAGE(0, "Should not be reached!");
    const vec4 fallback = Vec4FFFFFFFF();
    return fallback;
}
// Splat<0>: replicates component x (lane 0) of V into all four lanes.
template<>
ILINE vec4 Splat<0>(vec4 V)
{
    return Swizzle<xxxx>(V);
}
// Splat<1>: replicates component y (lane 1) of V into all four lanes.
template<>
ILINE vec4 Splat<1>(vec4 V)
{
    return Swizzle<yyyy>(V);
}
// Splat<2>: replicates component z (lane 2) of V into all four lanes.
template<>
ILINE vec4 Splat<2>(vec4 V)
{
    return Swizzle<zzzz>(V);
}
// Splat<3>: replicates component w (lane 3) of V into all four lanes.
template<>
ILINE vec4 Splat<3>(vec4 V)
{
    return Swizzle<wwww>(V);
}
// Mask-driven select between V0 and V1.
//  - Without VEC4_SSE4: true per-bit select, (~M & V0) | (M & V1).
//  - With VEC4_SSE4: _mm_blendv_ps, which picks whole lanes based on the sign bit of each
//    mask lane (V1 where the sign bit is set, V0 otherwise).
// Both variants agree when every mask lane is either all zeros or all ones.
ILINE vec4 SelectBits(vec4 V0, vec4 V1, vec4 M)
{
#if !defined(VEC4_SSE4)
    const vec4 keptFromV0 = AndNot(M, V0);
    const vec4 takenFromV1 = And(M, V1);
    return Or(keptFromV0, takenFromV1);
#else
    return _mm_blendv_ps(V0, V1, M);
#endif
}
// Per-lane select driven by the sign bit of each mask lane: lanes whose mask sign bit is set
// come from V1, all other lanes from V0.
// Without VEC4_SSE4 the sign bit is first smeared over the whole lane (arithmetic shift by 31)
// so that the bitwise SelectBits() picks complete lanes; with VEC4_SSE4 the mask is passed on
// unchanged.
ILINE vec4 Select(vec4 V0, vec4 V1, vec4 M)
{
#if defined(VEC4_SSE4)
    return SelectBits(V0, V1, M);
#else
    const vec4 fullLaneMask = ShiftAR(M, 31);
    return SelectBits(V0, V1, fullLaneMask);
#endif
}
// Per-lane select driven by the sign of each lane of M: lanes where M has its sign bit set
// come from V1, all other lanes from V0.
//
// Select() already reduces the mask to its sign bit in every build configuration (it applies
// ShiftAR(M, 31) itself when VEC4_SSE4 is not defined), and an arithmetic shift by 31 is
// idempotent, so shifting here as well would only add a redundant instruction. The call is
// therefore forwarded unchanged; results are bit-identical to shifting first.
ILINE vec4 SelectSign(vec4 V0, vec4 V1, vec4 M)
{
    return Select(V0, V1, M);
}
// Per-lane select with a compile-time mask: bit i of M (bit 0 = lane x ... bit 3 = lane w)
// set means lane i is taken from V1, otherwise from V0.
// With VEC4_SSE4 this is a single _mm_blend_ps; otherwise a lane mask (all ones / all zeros
// per lane) is materialized and handed to Select().
template <int M>
ILINE vec4 SelectStatic(vec4 V0, vec4 V1)
{
#if !defined(VEC4_SSE4)
    const vec4 laneMask = Vec4((M & 0x1) ? ~0x0u : 0x0u,
                               (M & 0x2) ? ~0x0u : 0x0u,
                               (M & 0x4) ? ~0x0u : 0x0u,
                               (M & 0x8) ? ~0x0u : 0x0u);
    return Select(V0, V1, laneMask);
#else
    return _mm_blend_ps(V0, V1, M);
#endif
}
#endif