remove of unicode files from Cry
Signed-off-by: Esteban Papp <81431996+amznestebanpapp@users.noreply.github.com>monroegm-disable-blank-issue-2
parent
56eda61828
commit
f21fc79dc4
@ -1,946 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) Contributors to the Open 3D Engine Project.
|
||||
* For complete copyright and license terms please see the LICENSE at the root of this distribution.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0 OR MIT
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
// Note: The utilities in this file should typically not be used directly,
|
||||
// consider including UnicodeFunctions.h or UnicodeIterator.h instead.
|
||||
//
|
||||
// (At least) the following string types can be bound with these helper functions:
|
||||
// Types Input Output Null-Terminator
|
||||
// std::basic_string<T>, std::string, std::wstring: yes yes implied by type
|
||||
// QString: yes yes implied by type
|
||||
// std::vector<T>, std::list<T>, std::deque<T>: yes yes not present
|
||||
// T[] (fixed-length buffer): yes yes guaranteed to be emitted on output, accepted on input
|
||||
// T * and size_t (user-specified-size buffer): no yes guaranteed to be emitted on output
|
||||
// const T * (null-terminated string): yes no expected
|
||||
// const T[] (literal): yes no implied as the last item in the array
|
||||
// pair of iterators over T: yes no should not be included in the range
|
||||
// uint32 (single UCS code-point): yes no not present
|
||||
// If some other string type is not listed, you can still use it for input easily by passing begin/end iterators.
|
||||
// Note: For all types, T can be any 8-bit, 16-bit or 32-bit integral or character type.
|
||||
// Further T types may be processed by explicitly passing InputEncoding and OutputEncoding.
|
||||
// We never actively tested such scenarios, so there are no guarantees for floating-point or user-defined types as code-units.
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef assert
|
||||
// Some tools use CRT's assert, most engine and game modules use CryAssert.h (via platform.h maybe).
|
||||
// We don't want to force a choice upon all code that uses Unicode utilities, so we just assume assert is defined.
|
||||
#error This header uses assert macro, please provide an applicable definition before including UnicodeXXX.h
|
||||
#endif
|
||||
|
||||
#include "UnicodeEncoding.h"
|
||||
#include <string.h> // For str(n)len and memcpy.
|
||||
#include <wchar.h> // For wcs(n)len.
|
||||
#include <stddef.h> // For size_t and ptrdiff_t.
|
||||
#include <iterator> // For std::iterator_traits.
|
||||
#include <string> // For std::basic_string.
|
||||
#include <vector> // For std::vector.
|
||||
#include <list> // For std::list.
|
||||
#include <deque> // For std::deque.
|
||||
#include <type_traits> // ... standard type-traits (as of C++11).
|
||||
|
||||
#if defined(AZ_RESTRICTED_PLATFORM)
|
||||
#undef AZ_RESTRICTED_SECTION
|
||||
#define UNICODEBINDING_H_SECTION_1 1
|
||||
#define UNICODEBINDING_H_SECTION_2 2
|
||||
#endif
|
||||
|
||||
// Forward declare the supported types.
|
||||
// Before actually instantiating a binding however, you need to have the full definition included.
|
||||
// Also, this allows us to work with QChar/QString as declared names without a dependency on Qt.
|
||||
namespace AZStd
|
||||
{
|
||||
template<class Element, size_t MaxElementCount, class Traits>
|
||||
class basic_fixed_string;
|
||||
}
|
||||
class QChar;
|
||||
class QString;
|
||||
|
||||
namespace Unicode
|
||||
{
|
||||
namespace Detail
|
||||
{
|
||||
// Import standard type traits.
|
||||
// This requires C++11 compiler support.
|
||||
using std::add_const;
|
||||
using std::conditional;
|
||||
using std::extent;
|
||||
using std::integral_constant;
|
||||
using std::is_arithmetic;
|
||||
using std::is_array;
|
||||
using std::is_base_of;
|
||||
using std::is_const;
|
||||
using std::is_convertible;
|
||||
using std::is_integral;
|
||||
using std::is_pointer;
|
||||
using std::is_same;
|
||||
using std::make_unsigned;
|
||||
using std::remove_cv;
|
||||
using std::remove_extent;
|
||||
using std::remove_pointer;
|
||||
|
||||
// SVoid<T>:
// Maps every well-formed type T to void.
// Note: Naming SVoid<typename X::member>::type only compiles when X::member exists,
// which makes this usable as a compile-time probe for member types.
template<typename T>
struct SVoid
{
    using type = void;
};
|
||||
|
||||
// SValidChar<T, InferEncoding, Input>:
|
||||
// Determine if T is a valid character type in the given compile-time context.
|
||||
// The InferEncoding flag is set if the encoding has to be detected automatically.
|
||||
// The Input flag is set if the type is used for input (and not set if the type is used for output).
|
||||
template<typename T, bool InferEncoding, bool Input>
|
||||
struct SValidChar
|
||||
{
|
||||
typedef typename remove_cv<T>::type BaseType;
|
||||
static const bool isArithmeticType = is_arithmetic<BaseType>::value;
|
||||
static const bool isQChar = is_same<BaseType, QChar>::value;
|
||||
static const bool isUsable = isArithmeticType || isQChar;
|
||||
static const bool isValidQualified = !is_const<T>::value || Input;
|
||||
static const bool isKnownSize = sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4;
|
||||
static const bool isValidInferred = isKnownSize || !InferEncoding;
|
||||
static const bool value = isUsable && isValidQualified && isValidInferred;
|
||||
};
|
||||
|
||||
// SPackedIterators<T>:
// Holds the two ends of an iterator range in one object.
// Note: Bundling the pair lets a range travel through the same single-argument
// interfaces that every other bindable type uses.
template<typename T>
struct SPackedIterators
{
    const T begin; // First position of the range.
    const T end;   // Position one past the last element of the range.

    SPackedIterators(const T& first, const T& last)
        : begin(first)
        , end(last)
    {
    }
};
|
||||
|
||||
// SPackedBuffer<T>:
// Holds a buffer pointer together with the buffer's size.
// Note: Bundling both lets a user-specified buffer travel through the same
// single-argument interfaces that every other bindable type uses.
template<typename T>
struct SPackedBuffer
{
    T buffer;    // The buffer (T is the pointer type itself).
    size_t size; // Size of the buffer as given by the caller.

    SPackedBuffer(T bufferPointer, size_t bufferSize)
        : buffer(bufferPointer)
        , size(bufferSize)
    {
    }
};
|
||||
|
||||
// SDependentType<T, X>:
// Yields T, but as a name that depends on the (otherwise meaningless) value X.
// Note: A dependent name is only resolved at instantiation (two-phase lookup), so T does not
// have to be defined where the template is written. This keeps standards-compliant compilers
// (Clang, GCC) from demanding the definition of forward-declared types such as Qt's
// QString and QChar, which are never available outside Editor.
template<typename T, int X>
struct SDependentType
{
    using type = T;
};
|
||||
|
||||
// EBind:
// The ways in which a type can be bound for input and/or output.
// Note: Used as tag-dispatch selector by the binding functions; private to the implementation.
enum EBind
{
    // Can't bind this type (neither input nor output).
    eBind_Impossible = 0,

    // Bind by using begin() and end() member functions (input and output).
    eBind_Iterators = 1,

    // Bind by using data() and size() member functions (input and output).
    eBind_Data = 2,

    // Bind a fixed size buffer with const elements, aka string literal (input only).
    eBind_Literal = 3,

    // Bind a fixed size buffer with non-const elements that may be null-terminated.
    // Selected for input by SBindObject and for output by SBindOutput.
    eBind_Buffer = 4,

    // Bind a user-specified size buffer with non-const elements (output).
    eBind_PackedBuffer = 5,

    // Bind a null-terminated buffer of unknown length, i.e. a C string (input only).
    eBind_NullTerminated = 6,

    // Bind a single code-point value (input only).
    eBind_CodePoint = 7,
};
|
||||
|
||||
// SBindIterator<T, InferEncoding>:
|
||||
// Find the EBind for input from iterator pair of type T at compile-time.
|
||||
// If the type is not supported, the resulting value will be eBind_Impossible
|
||||
template<typename T, bool InferEncoding, typename HasValueType = void, typename HasIteratorCategory = void>
|
||||
struct SBindIterator
|
||||
{
|
||||
typedef const void CharType;
|
||||
static const EBind value = eBind_Impossible;
|
||||
};
|
||||
template<typename T, bool InferEncoding, typename HasValueType, typename HasIteratorCategory>
|
||||
struct SBindIterator<T*, InferEncoding, HasValueType, HasIteratorCategory>
|
||||
{
|
||||
typedef typename add_const<T>::type CharType;
|
||||
static const bool isValid = SValidChar<CharType, InferEncoding, true>::value;
|
||||
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
|
||||
};
|
||||
template<typename T, bool InferEncoding>
|
||||
struct SBindIterator<T, InferEncoding,
|
||||
typename SVoid<typename T::value_type>::type,
|
||||
typename SVoid<typename T::iterator_category>::type
|
||||
>
|
||||
{
|
||||
typedef typename add_const<typename T::value_type>::type CharType;
|
||||
typedef typename T::iterator_category IteratorCategory;
|
||||
static const bool isInputIterator = is_base_of<std::input_iterator_tag, IteratorCategory>::value;
|
||||
static const bool isValid = SValidChar<CharType, InferEncoding, true>::value;
|
||||
static const EBind value = isValid && isInputIterator ? eBind_Iterators : eBind_Impossible;
|
||||
};
|
||||
|
||||
// SBindObject<T, InferEncoding>:
|
||||
// Find the EBind for input from object of type T at compile-time.
|
||||
// If the type is not supported, the resulting value will be eBind_Impossible.
|
||||
template<typename T, bool InferEncoding>
|
||||
struct SBindObject
|
||||
{
|
||||
typedef typename add_const<
|
||||
typename conditional<
|
||||
is_array<T>::value,
|
||||
typename remove_extent<T>::type,
|
||||
typename remove_pointer<T>::type
|
||||
>::type
|
||||
>::type CharType;
|
||||
static const size_t FixedSize = extent<T>::value;
|
||||
COMPILE_TIME_ASSERT(!is_array<T>::value || FixedSize > 0);
|
||||
static const bool isConstArray = is_array<T>::value && is_const<typename remove_extent<T>::type>::value;
|
||||
static const bool isBufferArray = is_array<T>::value && !isConstArray;
|
||||
static const bool isPointer = is_pointer<T>::value;
|
||||
static const bool isCodePoint = is_integral<T>::value;
|
||||
static const bool isValidChar = SValidChar<CharType, InferEncoding, true>::value;
|
||||
static const EBind value =
|
||||
!isValidChar ? eBind_Impossible :
|
||||
isConstArray ? eBind_Literal :
|
||||
isBufferArray ? eBind_Buffer :
|
||||
isPointer ? eBind_NullTerminated :
|
||||
isCodePoint ? eBind_CodePoint :
|
||||
eBind_Impossible;
|
||||
};
|
||||
template<typename CharT, typename Traits, typename Allocator, bool InferEncoding>
|
||||
struct SBindObject<std::basic_string<CharT, Traits, Allocator>, InferEncoding>
|
||||
{
|
||||
typedef typename add_const<CharT>::type CharType;
|
||||
static const bool isValid = SValidChar<CharT, InferEncoding, true>::value;
|
||||
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
|
||||
};
|
||||
template<typename T, typename Allocator, bool InferEncoding>
|
||||
struct SBindObject<std::vector<T, Allocator>, InferEncoding>
|
||||
{
|
||||
typedef typename add_const<T>::type CharType;
|
||||
static const bool isValid = SValidChar<T, InferEncoding, true>::value;
|
||||
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
|
||||
};
|
||||
template<typename T, typename Allocator, bool InferEncoding>
|
||||
struct SBindObject<std::list<T, Allocator>, InferEncoding>
|
||||
{
|
||||
typedef typename add_const<T>::type CharType;
|
||||
static const bool isValid = SValidChar<T, InferEncoding, true>::value;
|
||||
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
|
||||
};
|
||||
template<typename T, typename Allocator, bool InferEncoding>
|
||||
struct SBindObject<std::deque<T, Allocator>, InferEncoding>
|
||||
{
|
||||
typedef typename add_const<T>::type CharType;
|
||||
static const bool isValid = SValidChar<T, InferEncoding, true>::value;
|
||||
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
|
||||
};
|
||||
template<typename T, size_t S, bool InferEncoding>
|
||||
struct SBindObject<AZStd::basic_fixed_string<T, S>, InferEncoding>
|
||||
{
|
||||
typedef typename add_const<T>::type CharType;
|
||||
static const bool isValid = SValidChar<T, InferEncoding, true>::value;
|
||||
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
|
||||
};
|
||||
template<size_t S, bool InferEncoding>
|
||||
struct SBindObject<AZStd::fixed_wstring<S>, InferEncoding>
|
||||
{
|
||||
typedef wchar_t CharType;
|
||||
static const bool isValid = SValidChar<CharType, InferEncoding, true>::value;
|
||||
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
|
||||
};
|
||||
template<bool InferEncoding>
|
||||
struct SBindObject<QString, InferEncoding>
|
||||
{
|
||||
typedef const QChar CharType;
|
||||
static const EBind value = eBind_Data;
|
||||
};
|
||||
template<typename T, bool InferEncoding>
|
||||
struct SBindObject<SPackedIterators<T>, InferEncoding>
|
||||
{
|
||||
typedef typename SBindIterator<T, InferEncoding>::CharType CharType;
|
||||
static const EBind value = eBind_Iterators;
|
||||
};
|
||||
|
||||
// SBindOutput<T, InferEncoding>:
|
||||
// Find the EBind for output to object of type T at compile-time.
|
||||
// If the type is not supported, the resulting value will be eBind_Impossible.
|
||||
template<typename T, bool InferEncoding>
|
||||
struct SBindOutput
|
||||
{
|
||||
typedef typename remove_extent<T>::type CharType;
|
||||
static const size_t FixedSize = extent<T>::value;
|
||||
static const bool isArray = is_array<T>::value;
|
||||
static const bool isValid = SValidChar<typename remove_extent<T>::type, InferEncoding, false>::value;
|
||||
static const EBind value = isArray && isValid ? eBind_Buffer : eBind_Impossible;
|
||||
};
|
||||
template<typename OutputCharType, bool InferEncoding>
|
||||
struct SBindOutput<SPackedBuffer<OutputCharType*>, InferEncoding>
|
||||
{
|
||||
typedef OutputCharType CharType;
|
||||
static const bool isValid = SValidChar<CharType, InferEncoding, false>::value;
|
||||
static const EBind value = isValid ? eBind_PackedBuffer : eBind_Impossible;
|
||||
};
|
||||
template<typename CharT, typename Traits, typename Allocator, bool InferEncoding>
|
||||
struct SBindOutput<std::basic_string<CharT, Traits, Allocator>, InferEncoding>
|
||||
{
|
||||
typedef CharT CharType;
|
||||
static const bool isValid = SValidChar<CharT, InferEncoding, false>::value;
|
||||
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
|
||||
};
|
||||
template<typename T, typename Allocator, bool InferEncoding>
|
||||
struct SBindOutput<std::vector<T, Allocator>, InferEncoding>
|
||||
{
|
||||
typedef T CharType;
|
||||
static const bool isValid = SValidChar<T, InferEncoding, false>::value;
|
||||
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
|
||||
};
|
||||
template<typename T, typename Allocator, bool InferEncoding>
|
||||
struct SBindOutput<std::list<T, Allocator>, InferEncoding>
|
||||
{
|
||||
typedef T CharType;
|
||||
static const bool isValid = SValidChar<T, InferEncoding, false>::value;
|
||||
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
|
||||
};
|
||||
template<typename T, typename Allocator, bool InferEncoding>
|
||||
struct SBindOutput<std::deque<T, Allocator>, InferEncoding>
|
||||
{
|
||||
typedef T CharType;
|
||||
static const bool isValid = SValidChar<T, InferEncoding, false>::value;
|
||||
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
|
||||
};
|
||||
template<typename T, size_t S, bool InferEncoding>
|
||||
struct SBindOutput<AZStd::basic_fixed_string<T, S>, InferEncoding>
|
||||
{
|
||||
typedef T CharType;
|
||||
static const bool isValid = SValidChar<T, InferEncoding, false>::value;
|
||||
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
|
||||
};
|
||||
template<bool InferEncoding>
|
||||
struct SBindOutput<QString, InferEncoding>
|
||||
{
|
||||
typedef QChar CharType;
|
||||
static const EBind value = eBind_Data;
|
||||
};
|
||||
|
||||
// SInferEncoding<T>:
|
||||
// Infers the encoding of the given character type.
|
||||
// Note: This will always pick an UTF encoding type based on the size of the element type.
|
||||
template<typename T, bool Input>
|
||||
struct SInferEncoding
|
||||
{
|
||||
typedef SBindObject<T, true> ObjectType;
|
||||
typedef SBindIterator<T, true> IteratorType;
|
||||
typedef typename conditional<
|
||||
IteratorType::value != eBind_Impossible,
|
||||
typename IteratorType::CharType,
|
||||
typename ObjectType::CharType
|
||||
>::type CharType;
|
||||
static const EEncoding value =
|
||||
sizeof(CharType) == 1 ? eEncoding_UTF8 :
|
||||
sizeof(CharType) == 2 ? eEncoding_UTF16 :
|
||||
eEncoding_UTF32;
|
||||
COMPILE_TIME_ASSERT(value != eEncoding_UTF32 || sizeof(CharType) == 4);
|
||||
};
|
||||
|
||||
// SBindCharacter<T, Input>:
|
||||
// Pick the base character type to use during input or output with this element type.
|
||||
template<typename T, bool Input, bool Integral = is_integral<T>::value, bool IsQChar = is_same<QChar, typename remove_cv<T>::type>::value>
|
||||
struct SBindCharacter
|
||||
{
|
||||
typedef typename make_unsigned<T>::type BaseType; // The standard doesn't define if a character type is signed or unsigned.
|
||||
typedef typename remove_cv<BaseType>::type UnqualifiedType;
|
||||
typedef typename conditional<Input, const UnqualifiedType, UnqualifiedType>::type type;
|
||||
};
|
||||
template<typename T, bool Input>
|
||||
struct SBindCharacter<T, Input, false, false>
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_arithmetic<T>::value);
|
||||
typedef typename remove_cv<T>::type UnqualifiedType;
|
||||
typedef typename conditional<Input, const UnqualifiedType, UnqualifiedType>::type type;
|
||||
};
|
||||
template<typename T, bool Input>
|
||||
struct SBindCharacter<T, Input, false, true>
|
||||
{
|
||||
typedef typename conditional<Input, const uint16, uint16>::type type;
|
||||
typedef typename SDependentType<QChar, Input>::type ActuallyQChar; // Force two-phase name lookup on QChar.
|
||||
COMPILE_TIME_ASSERT(sizeof(ActuallyQChar) == sizeof(type)); // In case Qt ever changes QChar.
|
||||
};
|
||||
|
||||
// SBindPointer<T, Input>:
|
||||
// Pick the pointer type to use during input or output with buffers (potentially inside string types).
|
||||
template<typename T, bool Input>
|
||||
struct SBindPointer
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_pointer<T>::value || is_array<T>::value);
|
||||
typedef typename conditional<
|
||||
is_pointer<T>::value,
|
||||
typename remove_pointer<T>::type,
|
||||
typename remove_extent<T>::type
|
||||
>::type UnboundCharType;
|
||||
typedef typename SBindCharacter<UnboundCharType, Input>::type BoundCharType;
|
||||
typedef BoundCharType* type;
|
||||
};
|
||||
|
||||
// SAutomaticallyDeduced:
// Placeholder that is declared but never defined; SRequire uses it as its default result type.
struct SAutomaticallyDeduced;

// SRequire<Expr, T>:
// SFINAE helper in the spirit of C++11's std::enable_if.
// When the condition is true, the nested 'type' is T. When it is false there is no
// nested 'type', which removes the overload that named it from overload resolution.
template<bool SFINAE, typename T = SAutomaticallyDeduced>
struct SRequire
{
    using type = T;
};

// Condition not met: deliberately empty.
template<typename T>
struct SRequire<false, T>
{
};
|
||||
|
||||
// SafeCast<T, SourceChar>:
|
||||
// Cast a pointer to type T, but only allowing safe casts.
|
||||
// This guards against bad code in other functions since it prevents unintended casts.
|
||||
template<typename T, typename SourceChar>
|
||||
inline T SafeCast(SourceChar* ptr, typename SRequire<is_integral<SourceChar>::value>::type* = 0)
|
||||
{
|
||||
// Allow casts from pointer-to-integral to unrelated pointer-to-integral, provided they are of the same size.
|
||||
typedef typename remove_pointer<T>::type TargetChar;
|
||||
COMPILE_TIME_ASSERT(is_integral<SourceChar>::value && is_integral<TargetChar>::value);
|
||||
COMPILE_TIME_ASSERT(sizeof(SourceChar) == sizeof(TargetChar));
|
||||
return reinterpret_cast<T>(ptr);
|
||||
}
|
||||
template<typename T, typename SourceChar>
|
||||
inline T SafeCast(SourceChar* ptr, typename SRequire<is_same<typename remove_cv<SourceChar>::type, QChar>::value>::type* = 0)
|
||||
{
|
||||
// Allow casts from pointer-to-QChar to unrelated pointer-to-integral, provided they are of the same size.
|
||||
typedef typename remove_pointer<T>::type TargetChar;
|
||||
COMPILE_TIME_ASSERT(is_integral<TargetChar>::value);
|
||||
COMPILE_TIME_ASSERT(sizeof(SourceChar) == sizeof(TargetChar));
|
||||
return reinterpret_cast<T>(ptr);
|
||||
}
|
||||
template<typename T, typename SourceChar>
|
||||
inline T SafeCast(SourceChar* ptr, typename SRequire<!is_integral<SourceChar>::value&& !is_same<typename remove_cv<SourceChar>::type, QChar>::value>::type* = 0)
|
||||
{
|
||||
// Any other casts that are allowed by C++.
|
||||
return static_cast<T>(ptr);
|
||||
}
|
||||
|
||||
// SCharacterTrait<T>:
// Exposes basic string-length helpers for code-units of type T.
// Note: Specializations map to (hopefully optimized) CRT functions where possible; this primary
// template is the portable fall-back. Size is sizeof(T) for integral T and 0 otherwise.
template<typename T, size_t Size = sizeof(T) * std::is_integral<T>::value>
struct SCharacterTrait
{
    // Fall-back strlen.
    // Returns the number of elements in front of the first zero element.
    // nts must point to a zero-terminated sequence.
    static size_t StrLen(const T* nts)
    {
        size_t result = 0;
        while (nts[result] != 0)
        {
            ++result;
        }
        return result;
    }

    // Fall-back strnlen.
    // Returns the number of elements in front of the first zero element, but never more than len.
    // At most len elements are read, so the buffer does not need to be zero-terminated.
    static size_t StrNLen(const T* ptr, size_t len)
    {
        size_t result = 0;
        // The bound has to be tested first: once result == len, ptr[result] lies outside the buffer.
        while (result != len && ptr[result] != 0)
        {
            ++result;
        }
        return result;
    }
};
|
||||
template<typename T>
|
||||
struct SCharacterTrait<T, sizeof(char)>
|
||||
{
|
||||
static size_t StrLen(const T* nts) // Narrow CRT strlen.
|
||||
{
|
||||
return ::strlen(SafeCast<const char*>(nts));
|
||||
}
|
||||
static size_t StrNLen(const T* ptr, size_t len) // Narrow CRT strnlen.
|
||||
{
|
||||
return ::strnlen(SafeCast<const char*>(ptr), len);
|
||||
}
|
||||
};
|
||||
template<typename T>
|
||||
struct SCharacterTrait<T, sizeof(wchar_t)>
|
||||
{
|
||||
static size_t StrLen(const T* nts) // Wide CRT strlen.
|
||||
{
|
||||
return ::wcslen(SafeCast<const wchar_t*>(nts));
|
||||
}
|
||||
static size_t StrNLen(const T* ptr, size_t len) // Wide CRT strnlen.
|
||||
{
|
||||
#if defined(AZ_RESTRICTED_PLATFORM)
|
||||
#define AZ_RESTRICTED_SECTION UNICODEBINDING_H_SECTION_1
|
||||
#include AZ_RESTRICTED_FILE(UnicodeBinding_h)
|
||||
#endif
|
||||
return ::wcsnlen(SafeCast<const wchar_t*>(ptr), len);
|
||||
#if defined(AZ_RESTRICTED_PLATFORM)
|
||||
#define AZ_RESTRICTED_SECTION UNICODEBINDING_H_SECTION_2
|
||||
#include AZ_RESTRICTED_FILE(UnicodeBinding_h)
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// void Feed(const SPackedIterators<InputIteratorType> &its, Sink &out, tag):
|
||||
// Feeds the provided sink from provided packed iterator-range.
|
||||
template<typename InputIteratorType, typename Sink>
|
||||
inline void Feed(const SPackedIterators<InputIteratorType>& its, Sink& out, integral_constant<EBind, eBind_Iterators>)
|
||||
{
|
||||
typedef typename std::iterator_traits<InputIteratorType>::value_type UnboundCharType;
|
||||
typedef typename SBindCharacter<UnboundCharType, true>::type BoundCharType;
|
||||
for (InputIteratorType it = its.begin; it != its.end; ++it)
|
||||
{
|
||||
const UnboundCharType unbound = *it;
|
||||
const BoundCharType bound = static_cast<BoundCharType>(unbound);
|
||||
const uint32 item = static_cast<uint32>(bound);
|
||||
out(item);
|
||||
}
|
||||
}
|
||||
|
||||
// void Feed(const SPackedIterators<const InputCharType *> &its, Sink &out, tag):
|
||||
// Feeds the provided sink from provided packed pointer-range.
|
||||
// This is slightly better code-generation than using generic iterators.
|
||||
template<typename InputCharType, typename Sink>
|
||||
inline void Feed(const SPackedIterators<const InputCharType*>& its, Sink& out, integral_constant<EBind, eBind_Iterators>)
|
||||
{
|
||||
typedef typename SBindPointer<const InputCharType*, true>::type PointerType;
|
||||
assert(reinterpret_cast<size_t>(its.begin) <= reinterpret_cast<size_t>(its.end) && "Invalid range specified");
|
||||
const size_t length = its.end - its.begin;
|
||||
PointerType ptr = SafeCast<PointerType>(its.begin);
|
||||
assert((ptr || !length) && "Passed a non-empty range containing a null-pointer");
|
||||
for (size_t i = 0; i < length; ++i, ++ptr)
|
||||
{
|
||||
const uint32 item = static_cast<uint32>(*ptr);
|
||||
out(item);
|
||||
}
|
||||
}
|
||||
|
||||
// void Feed(const InputStringType &in, Sink &out, tag):
|
||||
// Feeds the provided sink from a container, using it's iterators.
|
||||
// Note: Dispatches to one of the packed-range overloads.
|
||||
template<typename InputStringType, typename Sink>
|
||||
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_Iterators> tag)
|
||||
{
|
||||
typedef typename InputStringType::const_iterator IteratorType;
|
||||
Detail::SPackedIterators<IteratorType> its(in.begin(), in.end());
|
||||
Feed(its, out, tag);
|
||||
}
|
||||
|
||||
// void Feed(const InputStringType &in, Sink &out, tag):
|
||||
// Feeds the provided sink from a string-object's buffer.
|
||||
template<typename InputStringType, typename Sink>
|
||||
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_Data>)
|
||||
{
|
||||
typedef typename InputStringType::size_type SizeType;
|
||||
typedef typename InputStringType::value_type ValueType;
|
||||
typedef typename SBindPointer<const ValueType*, true>::type PointerType;
|
||||
const SizeType length = in.size();
|
||||
if (length)
|
||||
{
|
||||
PointerType ptr = SafeCast<PointerType>(in.data());
|
||||
for (SizeType i = 0; i < length; ++i, ++ptr)
|
||||
{
|
||||
const uint32 item = static_cast<uint32>(*ptr);
|
||||
out(item);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// void Feed(const InputStringType &in, Sink &out, tag):
|
||||
// Feeds the provided sink from a string-literal.
|
||||
// Note: The literal is assumed to be null-terminated.
|
||||
// It's possible that a const-element fixed-size-buffer is mistaken as a literal.
|
||||
// However, we expect no-one uses such buffers that are not null-terminated already.
|
||||
// If somehow this use-case is desired, either terminate the buffer, or remove const from the buffer, or pass iterators.
|
||||
template<typename InputStringType, typename Sink>
|
||||
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_Literal>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
|
||||
typedef typename SBindPointer<InputStringType, true>::type PointerType;
|
||||
const size_t length = extent<InputStringType>::value - 1;
|
||||
PointerType ptr = SafeCast<PointerType>(in);
|
||||
assert(ptr[length] == 0 && "Literal is not null-terminated");
|
||||
for (size_t i = 0; i < length; ++i, ++ptr)
|
||||
{
|
||||
const uint32 item = static_cast<uint32>(*ptr);
|
||||
out(item);
|
||||
}
|
||||
}
|
||||
|
||||
// void Feed(const InputStringType &in, Sink &out, tag):
|
||||
// Feeds the provided sink from a non-const-element fixed-size buffer.
|
||||
// Note: The buffer is allowed to be null-terminated, but it's not required.
|
||||
template<typename InputStringType, typename Sink>
|
||||
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_Buffer>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
|
||||
typedef typename SBindPointer<InputStringType, true>::type PointerType;
|
||||
typedef typename SBindPointer<InputStringType, true>::BoundCharType CharType;
|
||||
const size_t length = extent<InputStringType>::value;
|
||||
PointerType ptr = SafeCast<PointerType>(in);
|
||||
for (size_t i = 0; i < length; ++i, ++ptr)
|
||||
{
|
||||
const CharType unbound = *ptr;
|
||||
if (unbound == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
const uint32 item = static_cast<uint32>(unbound);
|
||||
out(item);
|
||||
}
|
||||
}
|
||||
|
||||
// void Feed(const InputStringType &in, Sink &out, tag):
|
||||
// Feeds the provided sink from a null-terminated C-style string.
|
||||
template<typename InputStringType, typename Sink>
|
||||
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_NullTerminated>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_pointer<InputStringType>::value);
|
||||
typedef typename SBindPointer<InputStringType, true>::type PointerType;
|
||||
typedef typename SBindPointer<InputStringType, true>::BoundCharType CharType;
|
||||
PointerType ptr = SafeCast<PointerType>(in);
|
||||
if (ptr)
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
const CharType unbound = *ptr;
|
||||
++ptr;
|
||||
if (unbound == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
const uint32 item = static_cast<uint32>(unbound);
|
||||
out(item);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// void Feed(const InputCharType &in, Sink &out, tag):
|
||||
// Feeds the provided sink from a single value (interpreted as an UCS code-point).
|
||||
template<typename InputCharType, typename Sink>
|
||||
inline void Feed(const InputCharType& in, Sink& out, integral_constant<EBind, eBind_CodePoint>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_arithmetic<InputCharType>::value);
|
||||
const uint32 item = static_cast<uint32>(in);
|
||||
out(item);
|
||||
}
|
||||
|
||||
// size_t EncodedLength(const SPackedIterators<InputIteratorType> &its, tag):
|
||||
// Determines the length of the input sequence in a range of iterators.
|
||||
template<typename InputIteratorType>
|
||||
inline size_t EncodedLength(const SPackedIterators<InputIteratorType>& its, integral_constant<EBind, eBind_Iterators>)
|
||||
{
|
||||
return std::distance(its.begin, its.end); // std::distance will pick optimal implementation depending on iterator category.
|
||||
}
|
||||
|
||||
// size_t EncodedLength(const InputStringType &in, tag):
|
||||
// Determines the length of an input container, which would otherwise be enumerated with iterators.
|
||||
template<typename InputStringType>
|
||||
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_Iterators>)
|
||||
{
|
||||
return in.size(); // Can there be a container without size()? At the very least, not in the supported types.
|
||||
}
|
||||
|
||||
// size_t EncodedLength(const InputStringType &in, tag):
|
||||
// Determines the length of the input container. The container uses contiguous element layout.
|
||||
template<typename InputStringType>
|
||||
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_Data>)
|
||||
{
|
||||
return in.size();
|
||||
}
|
||||
|
||||
// size_t EncodedLength(const InputStringType &in, tag):
|
||||
// Determines the length of the input string-literal. This is a compile-time constant.
|
||||
template<typename InputStringType>
|
||||
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_Literal>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
|
||||
return extent<InputStringType>::value - 1;
|
||||
}
|
||||
|
||||
// size_t EncodedLength(const InputStringType &in, tag):
|
||||
// Determines the length of the input fixed-size-buffer. We look for an (optional) null-terminator in the buffer.
|
||||
template<typename InputStringType>
|
||||
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_Buffer>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
|
||||
typedef typename remove_extent<InputStringType>::type CharType;
|
||||
return SCharacterTrait<CharType>::StrNLen(in, extent<InputStringType>::value);
|
||||
}
|
||||
|
||||
// size_t EncodedLength(const InputStringType &in, tag):
|
||||
// Determines the length of the input used-specified buffer. We look for an (optional) null-terminator in the buffer.
|
||||
template<typename InputCharType>
|
||||
inline size_t EncodedLength(const SPackedBuffer<InputCharType*>& in, integral_constant<EBind, eBind_PackedBuffer>)
|
||||
{
|
||||
return in.buffer ? SCharacterTrait<InputCharType>::StrNLen(in.buffer, in.size) : 0;
|
||||
}
|
||||
|
||||
// size_t EncodedLength(const InputStringType &in, tag):
|
||||
// Determines the length of the input null-terminated c-style string. We just use strlen() if available.
|
||||
template<typename InputStringType>
|
||||
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_NullTerminated>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_pointer<InputStringType>::value);
|
||||
typedef typename remove_pointer<InputStringType>::type CharType;
|
||||
return in ? SCharacterTrait<CharType>::StrLen(in) : 0;
|
||||
}
|
||||
|
||||
// size_t EncodedLength(const InputCharType &in, tag):
|
||||
// Determines the length of a single UCS code-point. This is always 1.
|
||||
template<typename InputCharType>
|
||||
inline size_t EncodedLength([[maybe_unused]] const InputCharType& in, integral_constant<EBind, eBind_CodePoint>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_arithmetic<InputCharType>::value);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// const void *EncodedPointer(const SPackedIterators<const InputCharType *> &its, tag):
|
||||
// Get a pointer to contiguous storage for an iterator range.
|
||||
// Note: This can only work if the iterators are pointers, or the storage won't be guaranteed contiguous.
|
||||
template<typename InputCharType>
|
||||
inline const void* EncodedPointer(const SPackedIterators<const InputCharType*>& its, integral_constant<EBind, eBind_Iterators>)
|
||||
{
|
||||
return its.begin;
|
||||
}
|
||||
|
||||
// const void *EncodedPointer(const InputStringType &in, tag):
|
||||
// Get a pointer to contiguous storage for string/vector object.
|
||||
// Note: This can only work for containers that actually use contiguous storage, which is determined by the SBindXXX helpers.
|
||||
template<typename InputStringType>
|
||||
inline const void* EncodedPointer(const InputStringType& in, integral_constant<EBind, eBind_Data>)
|
||||
{
|
||||
return in.data();
|
||||
}
|
||||
|
||||
// const void *EncodedPointer(const InputStringType &in, tag):
|
||||
// Get a pointer to contiguous storage for a string-literal.
|
||||
template<typename InputStringType>
|
||||
inline const void* EncodedPointer(const InputStringType& in, integral_constant<EBind, eBind_Literal>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
|
||||
return in; // We can just let the array type decay to a pointer.
|
||||
}
|
||||
|
||||
// const void *EncodedPointer(const InputStringType &in, tag):
|
||||
// Get a pointer to contiguous storage for a fixed-size-buffer.
|
||||
template<typename InputStringType>
|
||||
inline const void* EncodedPointer(const InputStringType& in, integral_constant<EBind, eBind_Buffer>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
|
||||
return in; // We can just let the array type decay to a pointer.
|
||||
}
|
||||
|
||||
// const void *EncodedPointer(const InputStringType &in, tag):
|
||||
// Get a pointer to contiguous storage for a null-terminated c-style-string.
|
||||
template<typename InputStringType>
|
||||
inline const void* EncodedPointer(const InputStringType& in, integral_constant<EBind, eBind_NullTerminated>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_pointer<InputStringType>::value);
|
||||
return in; // Implied
|
||||
}
|
||||
|
||||
// const void *EncodedPointer(const InputCharType &in, tag):
|
||||
// Get a pointer to contiguous storage for a single UCS code-point.
|
||||
template<typename InputCharType>
|
||||
inline const void* EncodedPointer(const InputCharType& in, integral_constant<EBind, eBind_CodePoint>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(is_arithmetic<InputCharType>::value);
|
||||
return ∈ // Take the address of the parameter (which is kept on the stack of the caller).
|
||||
}
|
||||
|
||||
// SWriteSink<T, Append, BindMethod>:
|
||||
// A helper that performs writing to the type T and can be passed as Sink type to a trans-coder helper.
|
||||
template<typename T, bool Append, EBind>
|
||||
struct SWriteSink;
|
||||
template<typename T, bool Append>
|
||||
struct SWriteSink<T, Append, eBind_Iterators>
|
||||
{
|
||||
typedef typename T::value_type OutputCharType;
|
||||
T& out;
|
||||
SWriteSink(T& _out, size_t)
|
||||
: out(_out)
|
||||
{
|
||||
if (!Append)
|
||||
{
|
||||
// If not appending, clear the object beforehand.
|
||||
out.clear();
|
||||
}
|
||||
}
|
||||
void operator()(uint32 item)
|
||||
{
|
||||
const OutputCharType bound = static_cast<OutputCharType>(item);
|
||||
out.push_back(bound); // We assume this can't fail and STL container takes care of memory.
|
||||
}
|
||||
void operator()(const void*, size_t); // Not implemented.
|
||||
void HintSequence(uint32 length) {} // Don't care about sequences.
|
||||
bool CanWrite() const { return true; } // Always writable
|
||||
};
|
||||
template<typename T, bool Append>
|
||||
struct SWriteSink<T, Append, eBind_Data>
|
||||
{
|
||||
typedef SBindPointer<typename T::value_type*, false> BindHelper;
|
||||
typedef typename BindHelper::UnboundCharType CharType;
|
||||
CharType* ptr;
|
||||
SWriteSink(T& out, size_t length)
|
||||
{
|
||||
const size_t offset = Append ? out.size() : 0;
|
||||
length += offset;
|
||||
out.resize(length); // resize() can't fail without exceptions, so assert instead.
|
||||
assert((out.size() == length) && "Buffer resize failed (out-of-memory?)");
|
||||
const CharType* base = length ? out.data() : 0;
|
||||
ptr = const_cast<CharType*>(base + offset);
|
||||
}
|
||||
void operator()(uint32 item)
|
||||
{
|
||||
*SafeCast<typename BindHelper::type>(ptr) = static_cast<typename BindHelper::BoundCharType>(item);
|
||||
++ptr;
|
||||
}
|
||||
void operator()(const void* src, size_t length)
|
||||
{
|
||||
::memcpy(ptr, src, length * sizeof(CharType));
|
||||
ptr += length;
|
||||
}
|
||||
void HintSequence([[maybe_unused]] uint32 length) {} // Don't care about sequences.
|
||||
bool CanWrite() const { return true; } // Always writable
|
||||
};
|
||||
template<typename P, bool Append>
|
||||
struct SWriteSink<SPackedBuffer<P>, Append, eBind_PackedBuffer>
|
||||
{
|
||||
typedef typename remove_pointer<P>::type ElementType;
|
||||
typedef SBindPointer<ElementType*, false> BindHelper;
|
||||
typedef typename BindHelper::UnboundCharType CharType;
|
||||
CharType* ptr;
|
||||
CharType* const terminator;
|
||||
SWriteSink(CharType* _terminator)
|
||||
: terminator(_terminator) {}
|
||||
SWriteSink(SPackedBuffer<P>& out, size_t)
|
||||
: terminator(out.size && out.buffer ? out.buffer + out.size - 1 : 0)
|
||||
{
|
||||
const size_t offset = Append
|
||||
? EncodedLength(out, integral_constant<EBind, eBind_PackedBuffer>())
|
||||
: 0;
|
||||
const size_t fixedOffset = Append && offset >= out.size
|
||||
? out.size - 1 // In case the buffer is already full and not terminated.
|
||||
: offset;
|
||||
CharType* base = static_cast<CharType*>(out.buffer);
|
||||
ptr = terminator ? base + fixedOffset : 0;
|
||||
}
|
||||
~SWriteSink()
|
||||
{
|
||||
if (ptr)
|
||||
{
|
||||
*ptr = 0; // Guarantees that the output is null-terminated.
|
||||
}
|
||||
}
|
||||
void operator()(uint32 item)
|
||||
{
|
||||
if (ptr != terminator) // Guarantees we don't overflow the buffer.
|
||||
{
|
||||
*SafeCast<typename BindHelper::type>(ptr) = static_cast<typename BindHelper::BoundCharType>(item);
|
||||
++ptr;
|
||||
}
|
||||
}
|
||||
void operator()(const void* src, size_t length)
|
||||
{
|
||||
const size_t maxLength = terminator - ptr;
|
||||
if (length > maxLength)
|
||||
{
|
||||
length = maxLength;
|
||||
}
|
||||
::memcpy(ptr, src, length * sizeof(CharType));
|
||||
ptr += length;
|
||||
}
|
||||
void HintSequence(uint32 length)
|
||||
{
|
||||
if (terminator && (ptr + length >= terminator))
|
||||
{
|
||||
// This sequence will overflow the buffer.
|
||||
// In this case, we prefer to not generate any part of the sequence.
|
||||
// Terminate at the current position and flag as full.
|
||||
*ptr = 0;
|
||||
ptr = terminator;
|
||||
}
|
||||
}
|
||||
bool CanWrite() const
|
||||
{
|
||||
return terminator != ptr;
|
||||
}
|
||||
};
|
||||
template<typename T, bool Append>
|
||||
struct SWriteSink<T, Append, eBind_Buffer> // Uses above implementation with specialized constructor
|
||||
: SWriteSink<SPackedBuffer<typename remove_extent<T>::type*>, Append, eBind_PackedBuffer>
|
||||
{
|
||||
typedef typename remove_extent<T>::type ElementType;
|
||||
typedef SWriteSink<SPackedBuffer<ElementType*>, Append, eBind_PackedBuffer> Super;
|
||||
typedef SBindPointer<ElementType*, false> BindHelper;
|
||||
typedef typename BindHelper::UnboundCharType CharType;
|
||||
SWriteSink(T& out, size_t)
|
||||
: Super(out + extent<T>::value - 1)
|
||||
{
|
||||
const size_t offset = Append
|
||||
? EncodedLength(out, integral_constant<EBind, eBind_Buffer>())
|
||||
: 0;
|
||||
const size_t fixedOffset = Append && offset >= extent<T>::value
|
||||
? extent<T>::value - 1 // In case the buffer is already full and not terminated.
|
||||
: offset;
|
||||
Super::ptr = out + fixedOffset; // Qualification for Super required for two-phase lookup.
|
||||
}
|
||||
};
|
||||
|
||||
// SIsBlockCopyable<InputType, OutputType>:
|
||||
// Check if block-copy optimization is possible for these types.
|
||||
// InputType should be an instantiation of SBindObject or SBindIterator.
|
||||
// OutputType should be an instantiation of SBindOutput.
|
||||
// Note: This doesn't take into account safe/unsafe conversions, just if the underlying storage types are compatible.
|
||||
template<typename InputType, typename OutputType>
|
||||
struct SIsBlockCopyable
|
||||
{
|
||||
template<EBind M>
|
||||
struct SIsContiguous
|
||||
{
|
||||
static const bool value =
|
||||
M == eBind_Data ||
|
||||
M == eBind_Literal ||
|
||||
M == eBind_Buffer ||
|
||||
M == eBind_PackedBuffer ||
|
||||
M == eBind_NullTerminated ||
|
||||
M == eBind_CodePoint;
|
||||
};
|
||||
template<typename T>
|
||||
struct SIsPointers
|
||||
{
|
||||
static const bool value = false;
|
||||
};
|
||||
template<typename T>
|
||||
struct SIsPointers<SPackedIterators<T*> >
|
||||
{
|
||||
static const bool value = true;
|
||||
};
|
||||
typedef typename SBindCharacter<typename InputType::CharType, true>::type InputCharType;
|
||||
typedef typename SBindCharacter<typename OutputType::CharType, false>::type OutputCharType;
|
||||
static const bool isIntegral = is_integral<InputCharType>::value && is_integral<OutputCharType>::value;
|
||||
static const bool isSameSize = sizeof(InputCharType) == sizeof(OutputCharType);
|
||||
static const bool isInputContiguous = (SIsContiguous<InputType::value>::value || SIsPointers<InputType>::value);
|
||||
static const bool isOutputContiguous = (SIsContiguous<OutputType::value>::value || SIsPointers<OutputType>::value);
|
||||
static const bool value = isIntegral && isSameSize && isInputContiguous && isOutputContiguous;
|
||||
};
|
||||
}
|
||||
}
|
||||
@ -1,767 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) Contributors to the Open 3D Engine Project.
|
||||
* For complete copyright and license terms please see the LICENSE at the root of this distribution.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0 OR MIT
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
// Description : Generic Unicode encoding helpers.
|
||||
//
|
||||
// Defines encoding and decoding functions used by the higher-level functions.
|
||||
// These are used by the various conversion functions in UnicodeFunctions.h and UnicodeIterator.h.
|
||||
// Note: You can use these functions manually for low-level functionality, but we don't recommend that.
|
||||
// In that case, you probably want to check inside the nested Detail namespace for the elementary bits.
|
||||
|
||||
|
||||
#pragma once
|
||||
#include "BaseTypes.h" // For uint8, uint16, uint32
|
||||
#include "CompileTimeAssert.h" // For COMPILE_TIME_ASSERT macro
|
||||
namespace Unicode
|
||||
{
|
||||
// The encodings that can be converted from and/or to.
enum EEncoding
{
    // UTF-8, see http://www.unicode.org/resources/utf8.html.
    // Usable as input and as output.
    // Covers the entire UCS; a code-point takes between 1 and 4 code-units of 8 bits.
    // ASCII text is also valid UTF-8 (Latin1/ISO-8859-1 text beyond ASCII is not).
    eEncoding_UTF8,

    // UTF-16, see http://tools.ietf.org/html/rfc2781.
    // Usable as input and as output.
    // Covers the entire UCS; a code-point takes 1 or 2 code-units of 16 bits.
    eEncoding_UTF16,

    // UTF-32, see http://www.unicode.org/reports/tr17/.
    // Usable as input and as output.
    // Covers the entire UCS; every code-point is one code-unit of 32 bits.
    eEncoding_UTF32,

    // ASCII, see http://en.wikipedia.org/wiki/ASCII.
    // Usable as input and as output (UCS values that cannot be represented are written as question mark).
    // Covers [U+0000, U+007F] only.
    eEncoding_ASCII,

    // Latin1, also known as ISO-8859-1, see http://en.wikipedia.org/wiki/ISO/IEC_8859-1.
    // Usable as input only.
    // A strict super-set of ASCII that adds [U+00A0, U+00FF].
    eEncoding_Latin1,

    // Windows ANSI codepage 1252, see http://en.wikipedia.org/wiki/Windows-1252.
    // Usable as input only.
    // A strict super-set of ASCII and Latin1/ISO-8859-1 that also maps some of the code-units in [0x80, 0x9F].
    eEncoding_Win1252,
};
|
||||
|
||||
// The ways in which a decoder can react to an invalidly encoded sequence.
enum EErrorRecovery
{
    // The input is trusted to be valid and is not checked.
    // For invalid input the output is undefined (debug builds trigger an assert in that case).
    eErrorRecovery_None,

    // An invalid sequence is dropped and leaves no trace in the output.
    // Intended for logic/hashing purposes where the input is almost certainly valid.
    eErrorRecovery_Discard,

    // An invalid sequence is substituted by the replacement-character (U+FFFD).
    // Intended for output that ends up being displayed in UI.
    eErrorRecovery_Replace,

    // An invalid sequence is re-interpreted as eEncoding_Latin1.
    // If that fails as well, the sequence is dropped.
    // Intended for reading generic text files with 1-byte code-units.
    // Note: Only available when decoding UTF-8.
    eErrorRecovery_FallbackLatin1ThenDiscard,

    // An invalid sequence is re-interpreted as eEncoding_Win1252.
    // If that fails as well, the sequence is dropped.
    // Intended for reading text files with 1-byte code-units that were generated on Windows.
    // Note: Only available when decoding UTF-8.
    eErrorRecovery_FallbackWin1252ThenDiscard,

    // An invalid sequence is re-interpreted as eEncoding_Latin1.
    // If that fails as well, the replacement-character (U+FFFD) is substituted.
    // Intended for reading generic text files with 1-byte code-units.
    // Note: Only available when decoding UTF-8.
    eErrorRecovery_FallbackLatin1ThenReplace,

    // An invalid sequence is re-interpreted as eEncoding_Win1252.
    // If that fails as well, the replacement-character (U+FFFD) is substituted.
    // Intended for reading text files with 1-byte code-units that were generated on Windows.
    // Note: Only available when decoding UTF-8.
    eErrorRecovery_FallbackWin1252ThenReplace,
};
|
||||
|
||||
namespace Detail
|
||||
{
|
||||
// Decode<Encoding, Safe>(state, unit): Decodes a single code-unit of an encoding into an UCS code-point.
|
||||
// When Safe flag is set, encoding errors are detected so a fall-back encoding or other recovery method can be used.
|
||||
// Interpret return value as follows:
|
||||
// < 0x001FFFFF: Decoded codepoint (== return value), call again with next code-unit and clear state.
|
||||
// < 0x80000000: Intermediate state returned, call again with next code-unit and the returned state.
|
||||
// >= 0x80000000: Bad encoding detected, up to 16 bits (UTF-16) or 24 bits (UTF-8, last in lower bits)
|
||||
// contain previous consumed values (does not happen if Safe == false).
|
||||
template<EEncoding InputEncoding, bool Safe>
|
||||
inline uint32 Decode(uint32 state, uint32 unit);
|
||||
|
||||
// Constants shared by the encoding and decoding routines.
enum
{
    // Layout of the decoder state.
    cDecodeShiftRemaining = 26,                         // Bit position of the remaining-units counter.
    cDecodeOneRemaining = 1 << cDecodeShiftRemaining,   // The counter value for "one unit remaining".
    cDecodeMaskRemaining = 3 << cDecodeShiftRemaining,  // Mask covering every bit of the counter.
    cDecodeLeadBit = 1 << 22,                           // Bits up to and including this one are reserved.

    // Flags reported in a decoder result.
    cDecodeErrorBit = 1 << 31,                          // Decoding failed.
    cDecodeOverlongBit = 1 << 30,                       // An overlong sequence was used.
    cDecodeSurrogateBit = 1 << 29,                      // A surrogate code-point was decoded in UTF-8.
    cDecodeInvalidBit = 1 << 28,                        // An invalid code-point was decoded (U+FFFE/FFFF).
    cDecodeSuccess = 0,                                 // Placeholder meaning that no error occurred.

    // Notable UCS values.
    cCodepointMax = 0x10FFFF,                           // Largest UCS code-point.
    cLeadSurrogateFirst = 0xD800,                       // First UTF-16 lead-surrogate.
    cLeadSurrogateLast = 0xDBFF,                        // Last UTF-16 lead-surrogate.
    cTrailSurrogateFirst = 0xDC00,                      // First UTF-16 trail-surrogate.
    cTrailSurrogateLast = 0xDFFF,                       // Last UTF-16 trail-surrogate.
    cReplacementCharacter = 0xFFFD,                     // Default replacement character.
};
|
||||
|
||||
// Validate the UTF-8 state of a multi-byte sequence.
|
||||
// The safe decoder of UTF-8 will call this function when a full potential code-point has been decoded.
|
||||
// This function is (at most) called for 50% of the decoded UTF-8 code-units, but likely at much lower frequency.
|
||||
inline uint32 DecodeValidate8(uint32 state)
|
||||
{
|
||||
uint32 errorbits = (state >> 8) | cDecodeErrorBit;
|
||||
state ^= (state & 0x400000) >> 1; // For 3-byte sequences, bit 5 of the lead byte needs to be cleared.
|
||||
const uint32 cp =
|
||||
(state & 0x3F) |
|
||||
((state & 0x3F00) >> 2) |
|
||||
((state & 0x3F0000) >> 4) |
|
||||
((state & 0x07000000) >> 6);
|
||||
if (cp <= cCodepointMax)
|
||||
{
|
||||
if (cp >= cLeadSurrogateFirst && cp <= cTrailSurrogateLast)
|
||||
{
|
||||
errorbits += cDecodeSurrogateBit; // CESU-8 encoding might have been used.
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32 minval = 0x80;
|
||||
minval += (0x00400000 & state) ? 0x800 - 0x80 : 0;
|
||||
minval += (0x40000000 & state) ? 0x10000 - 0x80 : 0;
|
||||
if (cp >= minval)
|
||||
{
|
||||
if ((cp & 0xFFFFFFFEU) != 0xFFFEU)
|
||||
{
|
||||
return cp; // Valid code-point.
|
||||
}
|
||||
errorbits += cDecodeInvalidBit; // Invalid character used.
|
||||
}
|
||||
errorbits += cDecodeOverlongBit; // Overlong encoding used.
|
||||
}
|
||||
}
|
||||
return errorbits;
|
||||
}
|
||||
|
||||
// Decode UTF-8, unsafe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_UTF8, false>(uint32 state, uint32 unit)
|
||||
{
|
||||
if (state == 0) // First byte.
|
||||
{
|
||||
unit = unit & 0xFF;
|
||||
if (unit < 0xC0)
|
||||
{
|
||||
return unit; // Single-unit (ASCII).
|
||||
}
|
||||
uint32 remaining = (unit >> 4) - 0xC;
|
||||
remaining += (remaining == 0);
|
||||
return (unit & 0x1F) + (remaining << cDecodeShiftRemaining); // Lead byte of multi-byte.
|
||||
}
|
||||
state = (state << 6) + (unit & 0x3F) + (state & cDecodeMaskRemaining) - cDecodeOneRemaining; // Apply c-byte.
|
||||
return state & ~cDecodeLeadBit; // Mask off the lead bits of a 4-byte sequence.
|
||||
}
|
||||
|
||||
// Decode UTF-8, safe
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_UTF8, true>(uint32 state, uint32 unit)
|
||||
{
|
||||
if (unit <= 0xF4) // Discard out-of-range values immediately.
|
||||
{
|
||||
if (state == 0) // First byte.
|
||||
{
|
||||
if (unit < 0x80)
|
||||
{
|
||||
return unit; // Single-byte.
|
||||
}
|
||||
if (unit < 0xC2)
|
||||
{
|
||||
return cDecodeErrorBit; // Invalid continuation byte (or illegal 0xC0/0xC1).
|
||||
}
|
||||
uint32 remaining = (unit >> 4) - 0xC;
|
||||
remaining += (remaining == 0);
|
||||
return unit + (remaining << cDecodeShiftRemaining); // Multi-byte.
|
||||
}
|
||||
if ((unit & 0xC0) == 0x80)
|
||||
{
|
||||
const uint32 remaining = (state & cDecodeMaskRemaining) - cDecodeOneRemaining;
|
||||
state = (state << 8) + unit;
|
||||
if (remaining != 0)
|
||||
{
|
||||
return state | remaining; // Intermediate byte of a multi-byte sequence.
|
||||
}
|
||||
return DecodeValidate8(state); // Final byte of a multi-byte sequence.
|
||||
}
|
||||
}
|
||||
return cDecodeErrorBit | state;
|
||||
}
|
||||
|
||||
// Decode UTF-16, unsafe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_UTF16, false>(uint32 state, uint32 unit)
|
||||
{
|
||||
const bool bLead = (unit >= cLeadSurrogateFirst) && (unit <= cLeadSurrogateLast);
|
||||
const uint32 initial = unit + (bLead << cDecodeShiftRemaining);
|
||||
const uint32 pair = 0x10000 + ((state & 0x3FF) << 10) + (unit & 0x3FF);
|
||||
return state == 0 ? initial : pair;
|
||||
}
|
||||
|
||||
// Decode UTF-16, safe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_UTF16, true>(uint32 state, uint32 unit)
|
||||
{
|
||||
const bool bTrail = (unit >= cTrailSurrogateFirst) && (unit <= cTrailSurrogateLast);
|
||||
if (state != 0 && !bTrail)
|
||||
{
|
||||
return cDecodeErrorBit + (state & 0xFFFF); // Lead surrogate without trail surrogate
|
||||
}
|
||||
uint32 result = Decode<eEncoding_UTF16, false>(state, unit);
|
||||
bool bValid = (result & 0xFFFFFFFEU) != 0xFFFEU;
|
||||
return bValid ? result : result + cDecodeErrorBit + cDecodeInvalidBit;
|
||||
}
|
||||
|
||||
// Decode UTF-32, unsafe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_UTF32, false>([[maybe_unused]] uint32 state, uint32 unit)
|
||||
{
|
||||
return unit;
|
||||
}
|
||||
|
||||
// Decode UTF-32, safe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_UTF32, true>([[maybe_unused]] uint32 state, uint32 unit)
|
||||
{
|
||||
if (unit > cCodepointMax)
|
||||
{
|
||||
return cDecodeErrorBit;
|
||||
}
|
||||
if (unit >= cLeadSurrogateFirst && unit <= cTrailSurrogateLast)
|
||||
{
|
||||
return cDecodeErrorBit | cDecodeSurrogateBit;
|
||||
}
|
||||
if ((unit & 0xFFFEU) == 0xFFFEU)
|
||||
{
|
||||
return cDecodeErrorBit | cDecodeInvalidBit;
|
||||
}
|
||||
return unit;
|
||||
}
|
||||
|
||||
// Decode ASCII, unsafe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_ASCII, false>([[maybe_unused]] uint32 state, uint32 unit)
|
||||
{
|
||||
return unit;
|
||||
}
|
||||
|
||||
// Decode ASCII, safe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_ASCII, true>([[maybe_unused]] uint32 state, uint32 unit)
|
||||
{
|
||||
if (unit > 0x7F)
|
||||
{
|
||||
return cDecodeErrorBit;
|
||||
}
|
||||
return unit;
|
||||
}
|
||||
|
||||
// Decode Latin1, unsafe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_Latin1, false>([[maybe_unused]] uint32 state, uint32 unit)
|
||||
{
|
||||
return unit;
|
||||
}
|
||||
|
||||
// Decode Latin1, safe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_Latin1, true>([[maybe_unused]] uint32 state, uint32 unit)
|
||||
{
|
||||
if ((unit >= 0x80 && unit <= 0x9F) || (unit > 0xFF))
|
||||
{
|
||||
return cDecodeErrorBit;
|
||||
}
|
||||
return unit;
|
||||
}
|
||||
|
||||
// Decode Windows CP-1252, unsafe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_Win1252, false>([[maybe_unused]] uint32 state, uint32 unit)
|
||||
{
|
||||
static const uint16 cp1252[] =
|
||||
{
|
||||
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
|
||||
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
|
||||
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
|
||||
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
|
||||
};
|
||||
return (unit < 0x80 || unit > 0x9F) ? unit : cp1252[unit - 0x80];
|
||||
}
|
||||
|
||||
// Decode Windows CP-1252, safe.
|
||||
template<>
|
||||
inline uint32 Decode<eEncoding_Win1252, true>(uint32 state, uint32 unit)
|
||||
{
|
||||
if (unit > 0xFF)
|
||||
{
|
||||
return cDecodeErrorBit;
|
||||
}
|
||||
uint32 result = Decode<eEncoding_Win1252, false>(state, unit);
|
||||
if (!(unit < 0x80 || unit > 0x9F) && (result == unit))
|
||||
{
|
||||
return cDecodeErrorBit; // Not defined in codepage 1252.
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// SBase<T>:
// Wrapper that stores a T by deriving from it, so that an empty T takes no space
// (empty-base-optimization). The Tag parameter yields distinct types for the same T.
// The specialization below handles reference types, which cannot be derived from.
template<typename T, int Tag = 0>
struct SBase
    : T
{
    // Copies 'base' into the base sub-object.
    SBase(T base)
        : T(base)
    {
    }

    // Access to the wrapped object (this object, viewed as a T).
    T& GetBase()
    {
        return *this;
    }
    const T& GetBase() const
    {
        return *this;
    }
};

// SBase<T&>:
// Variant for reference types: the reference is kept as a member instead.
template<typename T, int Tag>
struct SBase<T&, Tag>
{
    T& base; // The referenced object (not owned).

    SBase(T& b)
        : base(b)
    {
    }

    // Access to the referenced object.
    T& GetBase()
    {
        return base;
    }
    const T& GetBase() const
    {
        return base;
    }
};
|
||||
|
||||
// SDecoder<Encoding, Sink, Recovery>:
|
||||
// Functor to decode UCS code-points from an input range.
|
||||
// Recovery functor will be invoked as a fall-back if decoding fails.
|
||||
// This allows ensuring all the output is valid (even if the input isn't).
|
||||
// Note: The destructor will automatically flush any remaining (erroneous) state, you can also call Finalize().
|
||||
template<EEncoding InputEncoding, typename Sink, typename Recovery = void>
|
||||
struct SDecoder
|
||||
: SBase<Sink, 1>
|
||||
, SBase<Recovery, 2>
|
||||
{
|
||||
uint32 state;
|
||||
SDecoder(Sink sink, Recovery recovery = Recovery())
|
||||
: SBase<Sink, 1>(sink)
|
||||
, SBase<Recovery, 2>(recovery)
|
||||
, state(0) {}
|
||||
SDecoder() { Finalize(); }
|
||||
Recovery& recovery() { return SBase<Recovery, 2>::GetBase(); }
|
||||
Sink& sink() { return SBase<Sink, 1>::GetBase(); }
|
||||
void operator()(uint32 unit)
|
||||
{
|
||||
state = Detail::Decode<InputEncoding, true>(state, unit);
|
||||
if (state <= 0x1FFFFF)
|
||||
{
|
||||
sink()(state);
|
||||
state = 0;
|
||||
}
|
||||
else if (state & Detail::cDecodeErrorBit)
|
||||
{
|
||||
recovery()(sink(), state, unit);
|
||||
state = 0;
|
||||
}
|
||||
}
|
||||
void Finalize()
|
||||
{
|
||||
if (state)
|
||||
{
|
||||
recovery()(sink(), state, 0);
|
||||
state = 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// SDecoder<Encoding, Sink>:
|
||||
// Functor to decode to UCS code-points from an input range.
|
||||
// No attempt to discover or recover from encoding errors is made, can only safely be used with known-valid input.
|
||||
template<EEncoding InputEncoding, typename Sink>
|
||||
struct SDecoder<InputEncoding, Sink, void>
|
||||
: SBase<Sink>
|
||||
{
|
||||
uint32 state;
|
||||
SDecoder(Sink sink)
|
||||
: SBase<Sink>(sink)
|
||||
, state(0) {}
|
||||
Sink& sink() { return SBase<Sink>::GetBase(); }
|
||||
void operator()(uint32 unit)
|
||||
{
|
||||
state = Detail::Decode<InputEncoding, false>(state, unit);
|
||||
if (state <= 0x1FFFFF)
|
||||
{
|
||||
sink()(state);
|
||||
state = 0;
|
||||
}
|
||||
}
|
||||
void Finalize() {}
|
||||
};
|
||||
|
||||
// SEncoder<Encoding, Sink>:
|
||||
// Generic Unicode encoder functor.
|
||||
// Encoding must be one an encoding type for which output is supported.
|
||||
// The Sink type must have HintSequence member for UTF-8 and UTF-16 (although it may be a no-op).
|
||||
// In general, you feed operator() with UCS code-points and it will emit code-units.
|
||||
template<EEncoding OutputEncoding, typename Sink>
|
||||
struct SEncoder
|
||||
{
|
||||
static const bool value = false;
|
||||
};
|
||||
|
||||
// SEncoder<Encoding, Sink>:
|
||||
// Specialization of ASCII encoder functor.
|
||||
// Note: Any out-of-range character is mapped to question mark.
|
||||
template<typename Sink>
|
||||
struct SEncoder<eEncoding_ASCII, Sink>
|
||||
: SBase<Sink>
|
||||
{
|
||||
static const bool value = true;
|
||||
typedef uint8 value_type;
|
||||
SEncoder(Sink sink)
|
||||
: SBase<Sink>(sink) {}
|
||||
void operator()(uint32 cp)
|
||||
{
|
||||
cp = cp < 0x80 ? cp : (uint32)'?';
|
||||
SBase<Sink>::GetBase()(value_type(cp));
|
||||
}
|
||||
};
|
||||
|
||||
// SEncoder<Encoding, Sink>:
|
||||
// Specialization of UTF-8 encoder functor.
|
||||
template<typename Sink>
|
||||
struct SEncoder<eEncoding_UTF8, Sink>
|
||||
: SBase<Sink>
|
||||
{
|
||||
static const bool value = true;
|
||||
typedef uint8 value_type;
|
||||
SEncoder(Sink sink)
|
||||
: SBase<Sink>(sink) {}
|
||||
Sink& sink() { return SBase<Sink>::GetBase(); }
|
||||
void operator()(uint32 cp)
|
||||
{
|
||||
if (cp < 0x80)
|
||||
{
|
||||
// Single byte sequence.
|
||||
sink()(value_type(cp));
|
||||
}
|
||||
else
|
||||
{
|
||||
// Expand 21-bit value to 32-bit.
|
||||
uint32 bits =
|
||||
(cp & 0x00003F) +
|
||||
((cp & 0x000FC0) << 2) +
|
||||
((cp & 0x03F000) << 4) +
|
||||
((cp & 0x1C0000) << 6);
|
||||
|
||||
// Type of sequence.
|
||||
const bool bSeq4 = (cp >= 0x10000);
|
||||
const bool bSeq3 = (cp >= 0x800);
|
||||
|
||||
// Mask lead-bytes and continuation-bytes.
|
||||
uint32 mask = 0xEFE0C080;
|
||||
mask ^= (bSeq3 << 14);
|
||||
mask += (bSeq4 ? 0xA00000 : 0);
|
||||
bits |= mask;
|
||||
|
||||
// Length of the sequence.
|
||||
const uint32 length = (uint32)bSeq4 + (uint32)bSeq3 + 1;
|
||||
sink().HintSequence(length);
|
||||
|
||||
// Sink the multi-byte sequence.
|
||||
if (bSeq4)
|
||||
{
|
||||
sink()(value_type(bits >> 24));
|
||||
}
|
||||
if (bSeq3)
|
||||
{
|
||||
sink()(value_type(bits >> 16));
|
||||
}
|
||||
sink()(value_type(bits >> 8));
|
||||
sink()(value_type(bits));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// SEncoder<Encoding, Sink>:
|
||||
// Specialization of UTF-16 encoder functor.
|
||||
template<typename Sink>
|
||||
struct SEncoder<eEncoding_UTF16, Sink>
|
||||
: SBase<Sink>
|
||||
{
|
||||
static const bool value = true;
|
||||
typedef uint16 value_type;
|
||||
SEncoder(Sink sink)
|
||||
: SBase<Sink>(sink) {}
|
||||
Sink& sink() { return SBase<Sink>::GetBase(); }
|
||||
void operator()(uint32 cp)
|
||||
{
|
||||
if (cp < 0x10000)
|
||||
{
|
||||
// Single unit
|
||||
sink()(value_type(cp));
|
||||
}
|
||||
else
|
||||
{
|
||||
// We will generate two-element sequence
|
||||
sink().HintSequence(2);
|
||||
|
||||
// Surrogate pair
|
||||
cp -= 0x10000;
|
||||
uint32 lead = ((cp >> 10) & 0x3FF) + Detail::cLeadSurrogateFirst;
|
||||
uint32 trail = (cp & 0x3FF) + Detail::cTrailSurrogateFirst;
|
||||
sink()(value_type(lead));
|
||||
sink()(value_type(trail));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// SEncoder<Encoding, Sink>:
|
||||
// Specialization of UTF-32 encoder functor.
|
||||
// Note: This is a no-op, but we want to be able to express UTF-32 just like the other encodings.
|
||||
template<typename Sink>
|
||||
struct SEncoder<eEncoding_UTF32, Sink>
|
||||
: SBase<Sink>
|
||||
{
|
||||
static const bool value = true;
|
||||
typedef uint32 value_type;
|
||||
SEncoder(Sink sink)
|
||||
: SBase<Sink>(sink) {}
|
||||
void operator()(uint32 cp)
|
||||
{
|
||||
SBase<Sink>::GetBase()(value_type(cp));
|
||||
}
|
||||
};
|
||||
|
||||
// SDecoder<Encoding, SEncoder<Encoding>, void>:
|
||||
// Specialization for unsafe no-op trans-coding.
|
||||
// Since the conversion is a no-op, no need to keep any state or do any computation.
|
||||
// Note: For a decoding with a fallback, this is not possible since we can't guarantee the input is valid.
|
||||
template<EEncoding SameEncoding, typename Sink>
|
||||
struct SDecoder<SameEncoding, SEncoder<SameEncoding, Sink>, void>
|
||||
{
|
||||
Sink sink;
|
||||
SDecoder(Sink s)
|
||||
: sink(s) {}
|
||||
void operator()(uint32 unit)
|
||||
{
|
||||
sink(unit);
|
||||
}
|
||||
void Finalize() {}
|
||||
};
|
||||
|
||||
// SRecoveryDiscard<Sink>:
|
||||
// Recovery handler that, on encoding error, discards the offending sequence.
|
||||
template<typename Sink>
|
||||
struct SRecoveryDiscard
|
||||
{
|
||||
SRecoveryDiscard() {}
|
||||
void operator()([[maybe_unused]] Sink& sink, [[maybe_unused]] uint32 error, [[maybe_unused]] uint32 unit) {}
|
||||
};
|
||||
|
||||
// SRecoveryReplace<Sink>:
|
||||
// Recovery handler that, on encoding error, replaces the sequence with replacement-character (U+FFFD).
|
||||
// Note: This implementation matches a whole invalid sequence, it could be changed to emit for every code-unit.
|
||||
template<typename Sink>
|
||||
struct SRecoveryReplace
|
||||
{
|
||||
SRecoveryReplace() {}
|
||||
void operator()(Sink& sink, uint32 error, uint32 unit) { sink(cReplacementCharacter); }
|
||||
};
|
||||
|
||||
// SRecoveryFallback<Sink>:
|
||||
// Recovery handler that, on encoding error, falls back to another encoding.
|
||||
// The fallback encoding must be stateless (ie: ASCII, Latin1 or Win1252).
|
||||
// This type assumes an 8-bit primary encoding since the only viable fallback encodings are 8-bit.
|
||||
template<typename Sink, EEncoding FallbackEncoding, typename NextFallback>
|
||||
struct SRecoveryFallback
|
||||
: NextFallback
|
||||
{
|
||||
SRecoveryFallback()
|
||||
: NextFallback() {}
|
||||
void operator()(Sink& sink, uint32 error, uint32 unit)
|
||||
{
|
||||
SDecoder<FallbackEncoding, Sink&, NextFallback&> fallback(sink, *static_cast<NextFallback*>(this));
|
||||
uint8 byte1(error >> 16);
|
||||
uint8 byte2(error >> 8);
|
||||
uint8 byte3(error);
|
||||
uint8 byte4(unit);
|
||||
if (byte1)
|
||||
{
|
||||
fallback(byte1);
|
||||
}
|
||||
if (byte1 | byte2)
|
||||
{
|
||||
fallback(byte2);
|
||||
}
|
||||
if (byte1 | byte2 | byte3)
|
||||
{
|
||||
fallback(byte3);
|
||||
}
|
||||
fallback(byte4);
|
||||
}
|
||||
};
|
||||
|
||||
// SRecoveryFallbackHelper<Sink, RecoveryMethod>:
|
||||
// Helper to pick a SRecoveryFallback instantiation based on RecoveryMethod.
|
||||
template<EEncoding OutputEncoding, typename Sink, EErrorRecovery RecoveryMethod>
|
||||
struct SRecoveryFallbackHelper
|
||||
{
|
||||
// A compilation error here means RecoveryMethod value was unexpected here
|
||||
COMPILE_TIME_ASSERT(
|
||||
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard ||
|
||||
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenReplace ||
|
||||
RecoveryMethod == eErrorRecovery_FallbackWin1252ThenDiscard ||
|
||||
RecoveryMethod == eErrorRecovery_FallbackWin1252ThenReplace);
|
||||
typedef SEncoder<OutputEncoding, Sink> SinkType;
|
||||
static const EEncoding FallbackEncoding =
|
||||
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard ||
|
||||
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenReplace
|
||||
? eEncoding_Latin1 : eEncoding_Win1252;
|
||||
template<typename Dummy, bool WithDiscard>
|
||||
struct Pick
|
||||
{
|
||||
typedef SRecoveryDiscard<SinkType> type;
|
||||
};
|
||||
template<typename Dummy>
|
||||
struct Pick<Dummy, false>
|
||||
{
|
||||
typedef SRecoveryReplace<SinkType> type;
|
||||
};
|
||||
typedef typename Pick<Sink,
|
||||
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard ||
|
||||
RecoveryMethod == eErrorRecovery_FallbackWin1252ThenDiscard>::type NextFallback;
|
||||
typedef SRecoveryFallback<SinkType, FallbackEncoding, NextFallback> RecoveryType;
|
||||
typedef SDecoder<eEncoding_UTF8, SinkType, RecoveryType> FullType;
|
||||
};
|
||||
|
||||
// STranscoderSelect<InputEncoding, OutputEncoding, Sink, RecoveryMethod>:
|
||||
// Derives a chained decoder/encoder pair that performs code-unit -> code-unit transform.
|
||||
// The RecoveryMethod template parameter determines the behavior during encoding.
|
||||
// This is the basic way to perform trans-coding, and is the type instantiated by the higher-level functions.
|
||||
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink, EErrorRecovery RecoveryMethod>
|
||||
struct STranscoderSelect;
|
||||
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink>
|
||||
struct STranscoderSelect<InputEncoding, OutputEncoding, Sink, eErrorRecovery_None>
|
||||
: SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, void>
|
||||
{
|
||||
typedef SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, void> TranscoderType;
|
||||
STranscoderSelect(Sink sink)
|
||||
: TranscoderType(sink) {}
|
||||
};
|
||||
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink>
|
||||
struct STranscoderSelect<InputEncoding, OutputEncoding, Sink, eErrorRecovery_Discard>
|
||||
: SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, SRecoveryDiscard<SEncoder<OutputEncoding, Sink> > >
|
||||
{
|
||||
typedef SRecoveryDiscard<SEncoder<OutputEncoding, Sink> > RecoveryType;
|
||||
typedef SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, RecoveryType> TranscoderType;
|
||||
STranscoderSelect(Sink sink)
|
||||
: TranscoderType(sink) {}
|
||||
};
|
||||
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink>
|
||||
struct STranscoderSelect<InputEncoding, OutputEncoding, Sink, eErrorRecovery_Replace>
|
||||
: SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, SRecoveryReplace<SEncoder<OutputEncoding, Sink> > >
|
||||
{
|
||||
typedef SRecoveryReplace<SEncoder<OutputEncoding, Sink> > RecoveryType;
|
||||
typedef SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, RecoveryType> TranscoderType;
|
||||
STranscoderSelect(Sink sink)
|
||||
: TranscoderType(sink) {}
|
||||
};
|
||||
template<EEncoding OutputEncoding, typename Sink>
|
||||
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenDiscard>
|
||||
: SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenDiscard>::FullType
|
||||
{
|
||||
static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackLatin1ThenDiscard;
|
||||
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType RecoveryType;
|
||||
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType TranscoderType;
|
||||
STranscoderSelect(Sink sink)
|
||||
: TranscoderType(sink) {}
|
||||
};
|
||||
template<EEncoding OutputEncoding, typename Sink>
|
||||
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenReplace>
|
||||
: SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenReplace>::FullType
|
||||
{
|
||||
static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackLatin1ThenReplace;
|
||||
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType RecoveryType;
|
||||
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType TranscoderType;
|
||||
STranscoderSelect(Sink sink)
|
||||
: TranscoderType(sink) {}
|
||||
};
|
||||
template<EEncoding OutputEncoding, typename Sink>
|
||||
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenDiscard>
|
||||
: SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenDiscard>::FullType
|
||||
{
|
||||
static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackWin1252ThenDiscard;
|
||||
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType RecoveryType;
|
||||
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType TranscoderType;
|
||||
STranscoderSelect(Sink sink)
|
||||
: TranscoderType(sink) {}
|
||||
};
|
||||
template<EEncoding OutputEncoding, typename Sink>
|
||||
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenReplace>
|
||||
: SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenReplace>::FullType
|
||||
{
|
||||
static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackWin1252ThenReplace;
|
||||
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType RecoveryType;
|
||||
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType TranscoderType;
|
||||
STranscoderSelect(Sink sink)
|
||||
: TranscoderType(sink) {}
|
||||
};
|
||||
|
||||
// SIsSafeEncoding<R>:
|
||||
// Check if the given recovery mode is safe.
|
||||
// This is used for SFINAE checks in higher-level functions.
|
||||
template<EErrorRecovery R>
|
||||
struct SIsSafeEncoding
|
||||
{
|
||||
static const bool value =
|
||||
R == eErrorRecovery_Discard ||
|
||||
R == eErrorRecovery_Replace ||
|
||||
R == eErrorRecovery_FallbackLatin1ThenDiscard ||
|
||||
R == eErrorRecovery_FallbackLatin1ThenReplace ||
|
||||
R == eErrorRecovery_FallbackWin1252ThenDiscard ||
|
||||
R == eErrorRecovery_FallbackWin1252ThenReplace;
|
||||
};
|
||||
|
||||
// SIsCopyableEncoding<I, O>:
|
||||
// Check if data in one encoding can be copied directly to another encoding.
|
||||
// This is the basis for block-copy and string-assign optimizations in un-safe conversion functions.
|
||||
// Note: There are more valid combinations, they are left out since those can't occur with the output encodings supported.
|
||||
// Note: Only used for un-safe functions since it doesn't account for potential invalid sequences (they would be copied over).
|
||||
template<EEncoding InputEncoding, EEncoding OutputEncoding>
|
||||
struct SIsCopyableEncoding
|
||||
{
|
||||
static const bool value =
|
||||
InputEncoding == eEncoding_ASCII || // ASCII and Latin1 values don't change in any encoding.
|
||||
(InputEncoding == eEncoding_Latin1 && OutputEncoding != eEncoding_ASCII); // Except Latin1 -> ASCII is lossy.
|
||||
};
|
||||
template<EEncoding SameEncoding>
|
||||
struct SIsCopyableEncoding<SameEncoding, SameEncoding>
|
||||
{
|
||||
static const bool value = true; // If the input and output encodings are the same, then it's copyable.
|
||||
};
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,615 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) Contributors to the Open 3D Engine Project.
|
||||
* For complete copyright and license terms please see the LICENSE at the root of this distribution.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0 OR MIT
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
// Description : Encoded Unicode sequence iteration.
|
||||
//
|
||||
// For lower level accessing of encoded text, an STL compatible iterator wrapper is provided.
|
||||
// This iterator will decode the underlying sequence, abstracting it to a sequence of UCS code-points.
|
||||
// Using the iterator wrapper, you can find where in an encoded string code-points (or encoding errors) are located.
|
||||
// Note: The iterator is an input-only iterator, you cannot write to the underlying sequence.
|
||||
|
||||
|
||||
#pragma once
|
||||
#include "UnicodeBinding.h"
|
||||
namespace Unicode
|
||||
{
|
||||
namespace Detail
|
||||
{
|
||||
// MoveNext(it, checker, tag):
|
||||
// Moves the iterator to the next UCS code-point in the encoded sequence.
|
||||
// Non-specialized version (for 1:1 code-unit to code-point).
|
||||
template<typename BaseIterator, typename BoundsChecker, EEncoding Encoding>
|
||||
inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, const integral_constant<EEncoding, Encoding>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(
|
||||
Encoding == eEncoding_ASCII ||
|
||||
Encoding == eEncoding_UTF32 ||
|
||||
Encoding == eEncoding_Latin1 ||
|
||||
Encoding == eEncoding_Win1252);
|
||||
assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");
|
||||
|
||||
// All of these encodings use a single code-unit for each code-point.
|
||||
++it;
|
||||
}
|
||||
|
||||
// MoveNext(it, checker, tag):
|
||||
// Moves the iterator to the next UCS code-point in the encoded sequence.
|
||||
// Specialized for UTF-8.
|
||||
template<typename BaseIterator, typename BoundsChecker>
|
||||
inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF8>)
|
||||
{
|
||||
assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");
|
||||
|
||||
// UTF-8: just need to skip up to 3 continuation bytes.
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
++it;
|
||||
if (checker.IsEnd(it)) // :WARN: always returns false if "safe" bool is false!
|
||||
{
|
||||
break;
|
||||
}
|
||||
uint32 val = static_cast<uint32>(*it);
|
||||
if ((val & 0xC0) != 0x80)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MoveNext(it, checker, tag):
|
||||
// Moves the iterator to the next UCS code-point in the encoded sequence.
|
||||
// Specialized for UTF-16.
|
||||
template<typename BaseIterator, typename BoundsChecker>
|
||||
inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF16>)
|
||||
{
|
||||
assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");
|
||||
|
||||
// UTF-16: just need to skip one lead surrogate.
|
||||
++it;
|
||||
uint32 val = static_cast<uint32>(*it);
|
||||
if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast)
|
||||
{
|
||||
if (!checker.IsEnd(it))
|
||||
{
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MovePrev(it, checker, tag):
|
||||
// Moves the iterator to the previous UCS code-point in the encoded sequence.
|
||||
// Non-specialized version (for 1:1 code-unit to code-point).
|
||||
template<typename BaseIterator, typename BoundsChecker, EEncoding Encoding>
|
||||
inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, const integral_constant<EEncoding, Encoding>)
|
||||
{
|
||||
COMPILE_TIME_ASSERT(
|
||||
Encoding == eEncoding_ASCII ||
|
||||
Encoding == eEncoding_UTF32 ||
|
||||
Encoding == eEncoding_Latin1 ||
|
||||
Encoding == eEncoding_Win1252);
|
||||
assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");
|
||||
|
||||
// All of these encodings use a single code-unit for each code-point.
|
||||
--it;
|
||||
}
|
||||
|
||||
// MovePrev(it, checker, tag):
|
||||
// Moves the iterator to the previous UCS code-point in the encoded sequence.
|
||||
// Specialized for UTF-8.
|
||||
template<typename BaseIterator, typename BoundsChecker>
|
||||
inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF8>)
|
||||
{
|
||||
assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");
|
||||
|
||||
// UTF-8: just need to skip up to 3 continuation bytes.
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
--it;
|
||||
if (checker.IsBegin(it))
|
||||
{
|
||||
break;
|
||||
}
|
||||
uint32 val = static_cast<uint32>(*it);
|
||||
if ((val & 0xC0) != 0x80)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MovePrev(it, checker, tag):
|
||||
// Moves the iterator to the previous UCS code-point in the encoded sequence.
|
||||
// Specialized for UTF-16.
|
||||
template<typename BaseIterator, typename BoundsChecker>
|
||||
inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF16>)
|
||||
{
|
||||
assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");
|
||||
|
||||
// UTF-16: just need to skip one lead surrogate.
|
||||
--it;
|
||||
uint32 val = static_cast<uint32>(*it);
|
||||
if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast)
|
||||
{
|
||||
if (!checker.IsBegin(it))
|
||||
{
|
||||
--it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SBaseIterators<BaseIterator, BoundsChecked>:
// Utility to access base iterators properties from CIterator.
// This is the bounds-checked version: the [begin, end) range is stored next to the current position
// so that iteration can defend against malformed sequences.
// Copying and assignment use the compiler-generated member-wise operations (Rule of Zero); the
// previously hand-written versions did exactly the same but made the type non-trivially-copyable.
template<typename BaseIterator, bool BoundsChecked>
struct SBaseIterators
{
    typedef BaseIterator type;
    type begin, end; // The range this object was constructed for.
    type it;         // The current position; starts at the beginning of the range.

    SBaseIterators(const BaseIterator& _begin, const BaseIterator& _end)
        : begin(_begin)
        , end(_end)
        , it(_begin) {}

    // Returns true if the given position is the beginning of the range.
    bool IsBegin(const BaseIterator& _it) const
    {
        return begin == _it;
    }

    // Returns true if the given position is the (past-the-)end of the range.
    bool IsEnd(const BaseIterator& _it) const
    {
        return end == _it;
    }

    // Returns true if both objects have the same position and the same range.
    bool IsEqual(const SBaseIterators& other) const
    {
        return it == other.it
               && begin == other.begin
               && end == other.end;
    }

    // Note: Only called inside assert.
    // O(N) version; works with any forward-iterator (or better).
    // The end position itself is not considered to be in range.
    bool IsInRange(const BaseIterator& _it, std::forward_iterator_tag) const
    {
        for (BaseIterator i = begin; i != end; ++i)
        {
            if (_it == i)
            {
                return true;
            }
        }
        return false;
    }

    // Note: Only called inside assert.
    // O(1) version; requires random-access-iterator.
    // The end position itself is not considered to be in range.
    bool IsInRange(const BaseIterator& _it, std::random_access_iterator_tag) const
    {
        return (begin <= _it && _it < end);
    }

    // Note: Only called inside assert.
    // Dispatches to the O(1) version if a random-access iterator is used (common case).
    bool IsInRange(const BaseIterator& _it) const
    {
        return IsInRange(_it, typename std::iterator_traits<BaseIterator>::iterator_category());
    }
};
|
||||
|
||||
// SBaseIterators<BaseIterator, BoundsChecked>:
|
||||
// Utility to access base iterators properties from CIterator.
|
||||
// This is the un-checked specialization for known-safe sequences.
|
||||
template<typename BaseIterator>
|
||||
struct SBaseIterators<BaseIterator, false>
|
||||
{
|
||||
typedef BaseIterator type;
|
||||
type it;
|
||||
|
||||
explicit SBaseIterators(const BaseIterator& begin)
|
||||
: it(begin) {}
|
||||
|
||||
SBaseIterators(const BaseIterator& begin, const BaseIterator& end)
|
||||
: it(begin) {}
|
||||
|
||||
SBaseIterators(const SBaseIterators& other)
|
||||
: it(other.it) {}
|
||||
|
||||
SBaseIterators& operator =(const SBaseIterators& other)
|
||||
{
|
||||
it = other.it;
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool IsBegin(const BaseIterator&) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool IsEnd(const BaseIterator&) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool IsEqual(const SBaseIterators& other) const
|
||||
{
|
||||
return it == other.it;
|
||||
}
|
||||
|
||||
bool IsInRange(const BaseIterator&) const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
// SIteratorSink<Safe>:
|
||||
// Helper to store the last code-point and error bit that was decoded.
|
||||
// This is the safe specialization for potentially malformed sequences.
|
||||
template<bool Safe>
|
||||
struct SIteratorSink
|
||||
{
|
||||
static const uint32 cEmpty = 0xFFFFFFFFU;
|
||||
uint32 value;
|
||||
bool error;
|
||||
|
||||
void Clear()
|
||||
{
|
||||
value = cEmpty;
|
||||
error = false;
|
||||
}
|
||||
|
||||
bool IsEmpty() const
|
||||
{
|
||||
return value == cEmpty;
|
||||
}
|
||||
|
||||
bool IsError() const
|
||||
{
|
||||
return error;
|
||||
}
|
||||
|
||||
const uint32& GetValue() const
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
void MarkDecodingError()
|
||||
{
|
||||
value = cReplacementCharacter;
|
||||
error = true;
|
||||
}
|
||||
|
||||
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
|
||||
void Decode(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding>)
|
||||
{
|
||||
typedef SDecoder<Encoding, SIteratorSink&, SIteratorSink&> DecoderType;
|
||||
DecoderType decoder(*this, *this);
|
||||
Clear();
|
||||
for (BaseIterator it = its.it; IsEmpty(); ++it)
|
||||
{
|
||||
uint32 val = static_cast<uint32>(*it);
|
||||
decoder(val);
|
||||
if (its.IsEnd(it))
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (IsEmpty())
|
||||
{
|
||||
// If we still have neither a new value or an error flag, just treat as error.
|
||||
// This can happen if we reached the end of the sequence, but it ends in an incomplete code-sequence.
|
||||
MarkDecodingError();
|
||||
}
|
||||
}
|
||||
|
||||
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
|
||||
void DecodeIfEmpty(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding> tag)
|
||||
{
|
||||
if (IsEmpty())
|
||||
{
|
||||
Decode(its, tag);
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(uint32 unit)
|
||||
{
|
||||
value = unit;
|
||||
}
|
||||
|
||||
void operator()(SIteratorSink&, uint32, uint32)
|
||||
{
|
||||
MarkDecodingError();
|
||||
}
|
||||
};
|
||||
|
||||
// SIteratorSink<Safe>:
|
||||
// Helper to store the last code-point that was decoded.
|
||||
// This is the un-safe specialization for known-valid sequences.
|
||||
// Note: No error-state is tracked since we won't handle that regardless for un-safe CIterator.
|
||||
template<>
|
||||
struct SIteratorSink<false>
|
||||
{
|
||||
static const uint32 cEmpty = 0xFFFFFFFFU;
|
||||
uint32 value;
|
||||
|
||||
void Clear()
|
||||
{
|
||||
value = cEmpty;
|
||||
}
|
||||
|
||||
bool IsEmpty() const
|
||||
{
|
||||
return value == cEmpty;
|
||||
}
|
||||
|
||||
bool IsError() const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
const uint32& GetValue() const
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
|
||||
void Decode(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding>)
|
||||
{
|
||||
typedef SDecoder<Encoding, SIteratorSink&, void> DecoderType;
|
||||
DecoderType decoder(*this);
|
||||
for (BaseIterator it = its.it; IsEmpty(); ++it)
|
||||
{
|
||||
uint32 val = static_cast<uint32>(*it);
|
||||
decoder(val);
|
||||
}
|
||||
}
|
||||
|
||||
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
|
||||
void DecodeIfEmpty(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding> tag)
|
||||
{
|
||||
if (IsEmpty())
|
||||
{
|
||||
Decode(its, tag);
|
||||
}
|
||||
}
|
||||
|
||||
void operator()(uint32 unit)
|
||||
{
|
||||
value = unit;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// CIterator<BaseIterator [, Safe, Encoding]>:
|
||||
// Helper class that can iterate over an encoded text sequence and read the underlying UCS code-points.
|
||||
// If the Safe flag is set, bounds checking is performed inside multi-unit sequences to guard against decoding errors.
|
||||
// This requires the user to know where the sequence ends (use the constructor taking two parameters).
|
||||
// Note: The BaseIterator must be forward-iterator or better when Safe flag is set.
|
||||
// If the Safe flag is not set, you must guarantee the sequence is validly encoded, and allows the use of the single argument constructor.
|
||||
// In the case of unsafe iterator used for C-style string pointer, look for a U+0000 dereferenced value to end the iteration.
|
||||
// Regardless of the Safe flag, the user must ensure that the iterator is never moved past the beginning or end of the range (just like any other STL iterator).
|
||||
// Example of typical usage:
|
||||
// string utf8 = "foo"; // UTF-8
|
||||
// for (Unicode::CIterator<string::const_iterator> it(utf8.begin(), utf8.end()); it != utf8.end(); ++it)
|
||||
// {
|
||||
// uint32 codepoint = *it; // 32-bit UCS code-point
|
||||
// }
|
||||
// Example unsafe usage: (for known-valid encoded C-style strings):
|
||||
// const char *pValid = "foo"; // UTF-8
|
||||
// for (Unicode::CIterator<const char *, false> it = pValid; *it != 0; ++it)
|
||||
// {
|
||||
// uint32 codepoint = *it; // 32-bit UCS code-point
|
||||
// }
|
||||
template<typename BaseIterator, bool Safe = true, EEncoding Encoding = Detail::SInferEncoding<BaseIterator, true>::value>
|
||||
class CIterator
|
||||
{
|
||||
// The iterator value in the encoded sequence.
|
||||
// Optionally provides bounds-checking.
|
||||
Detail::SBaseIterators<BaseIterator, Safe> its;
|
||||
|
||||
// The cached UCS code-point at the current position.
|
||||
// Mutable because dereferencing is conceptually const, but does cache some state in this case.
|
||||
mutable Detail::SIteratorSink<Safe> sink;
|
||||
|
||||
public:
|
||||
// Types for compatibility with STL bidirectional iterator requirements.
|
||||
typedef const uint32 value_type;
|
||||
typedef const uint32& reference;
|
||||
typedef const uint32* pointer;
|
||||
typedef const ptrdiff_t difference_type;
|
||||
typedef std::bidirectional_iterator_tag iterator_category;
|
||||
|
||||
// Construct an iterator for the given range.
|
||||
// The initial position of the iterator as at the beginning of the range.
|
||||
CIterator(const BaseIterator& begin, const BaseIterator& end)
|
||||
: its(begin, end)
|
||||
{
|
||||
sink.Clear();
|
||||
}
|
||||
|
||||
// Construct an iterator from a single iterator (typically C-style string pointer).
|
||||
// This can only be used for unsafe iterators.
|
||||
template<typename IteratorType>
|
||||
CIterator(const IteratorType& it, typename Detail::SRequire<!Safe&& Detail::is_convertible<IteratorType, BaseIterator>::value, IteratorType>::type* = 0)
|
||||
: its(static_cast<const BaseIterator&>(it))
|
||||
{
|
||||
sink.Clear();
|
||||
}
|
||||
|
||||
// Copy-construct an iterator.
|
||||
CIterator(const CIterator& other)
|
||||
: its(other.its)
|
||||
, sink(other.sink) {}
|
||||
|
||||
// Copy-assign an iterator.
|
||||
CIterator& operator =(const CIterator& other)
|
||||
{
|
||||
its = other.its;
|
||||
sink = other.sink;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Test if the iterator points at an encoding error in the underlying encoded sequence.
|
||||
// If so, the function returns false.
|
||||
// When using an un-safe iterator, this function always returns true, if a sequence can contain encoding errors, you must use the safe variant.
|
||||
// Note: This requires the underlying iterator to be dereferenced, so you cannot use it only while the iterator is inside the valid range.
|
||||
bool IsAtValidCodepoint() const
|
||||
{
|
||||
assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator");
|
||||
Detail::integral_constant<EEncoding, Encoding> tag;
|
||||
sink.DecodeIfEmpty(its, tag);
|
||||
return !sink.IsError();
|
||||
}
|
||||
|
||||
// Gets the current position in the underlying encoded sequence.
|
||||
// If the iterator points to an invalidly encoded sequence (ie, IsError() returns true), the direction of iteration is significant.
|
||||
// In that case the returned position is approximated; to work around this: move all iterators of which the position is compared in the same direction.
|
||||
const BaseIterator& GetPosition() const
|
||||
{
|
||||
return its.it;
|
||||
}
|
||||
|
||||
// Sets the current position in the underlying encoded sequence.
|
||||
// You may not set the position outside the range for which this iterator was constructed.
|
||||
void SetPosition(const BaseIterator& it)
|
||||
{
|
||||
assert(its.IsInRange(it) && "Attempt to set the underlying iterator outside of the supported range");
|
||||
its.it = it;
|
||||
}
|
||||
|
||||
// Test if this iterator is equal to another iterator instance.
|
||||
// Note: In the presence of an invalidly encoded sequence (ie, IsError() returns true), the direction of iteration is significant.
|
||||
// To work around this, you can either:
|
||||
// 1) Move all iterators that will be compared in the same direction; or
|
||||
// 2) Compare the dereferenced iterator value(s) instead (if applicable).
|
||||
bool operator ==(const CIterator& other) const
|
||||
{
|
||||
return its.IsEqual(other.its);
|
||||
}
|
||||
|
||||
// Test if this iterator is equal to another base iterator.
|
||||
// Note: If the provided iterator does not point to the the first code-unit of an UCS code-point, the behavior is undefined.
|
||||
bool operator ==(const BaseIterator& other) const
|
||||
{
|
||||
return its.it == other;
|
||||
}
|
||||
|
||||
// Test if this iterator is equal to another iterator instance.
|
||||
// Note: In the presence of an invalidly encoded sequence (ie, IsError() returns true), the direction of iteration is significant.
|
||||
// To work around this, you can either:
|
||||
// 1) Move all iterators that will be compared in the same direction; or
|
||||
// 2) Compare the dereferenced iterator value(s) instead (if applicable).
|
||||
bool operator !=(const CIterator& other) const
|
||||
{
|
||||
return !its.IsEqual(other.its);
|
||||
}
|
||||
|
||||
// Test if this iterator is equal to another base iterator.
|
||||
// Note: If the provided iterator does not point to the the first code-unit of an UCS code-point, the behavior is undefined.
|
||||
bool operator !=(const BaseIterator& other) const
|
||||
{
|
||||
return its.it != other;
|
||||
}
|
||||
|
||||
// Get the decoded UCS code-point at the current position in the sequence.
|
||||
// If the iterator points to an invalidly encoded sequence (ie, IsError() returns true) the function returns U+FFFD (replacement character).
|
||||
reference operator *() const
|
||||
{
|
||||
assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator");
|
||||
Detail::integral_constant<EEncoding, Encoding> tag;
|
||||
sink.DecodeIfEmpty(its, tag);
|
||||
return sink.GetValue();
|
||||
}
|
||||
|
||||
// Advance the iterator to the next UCS code-point.
|
||||
// Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode.
|
||||
// However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors.
|
||||
CIterator& operator ++()
|
||||
{
|
||||
Detail::integral_constant<EEncoding, Encoding> tag;
|
||||
Detail::MoveNext(its.it, its, tag);
|
||||
sink.Clear();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Go back to the previous UCS code-point.
|
||||
// Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode.
|
||||
// However, in Safe mode, the iterators will never move past the beginning of the sequence in the presence of encoding errors.
|
||||
CIterator& operator --()
|
||||
{
|
||||
Detail::integral_constant<EEncoding, Encoding> tag;
|
||||
Detail::MovePrev(its.it, its, tag);
|
||||
sink.Clear();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Advance the iterator to the next UCS code-point, return a copy of the iterator position before advancing.
|
||||
// Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode.
|
||||
// However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors.
|
||||
CIterator operator ++(int)
{
    // Snapshot the current state, advance via the prefix form, and return the snapshot.
    CIterator beforeAdvance(*this);
    this->operator ++();
    return beforeAdvance;
}
|
||||
|
||||
// Go back to the previous UCS code-point, return a copy of the iterator position before going back.
|
||||
// Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode.
|
||||
// However, in Safe mode, the iterators will never move past the beginning of the sequence in the presence of encoding errors.
|
||||
CIterator operator --(int)
{
    // Snapshot the current state, step back via the prefix form, and return the snapshot.
    CIterator beforeRetreat(*this);
    this->operator --();
    return beforeRetreat;
}
|
||||
};
|
||||
|
||||
namespace Detail
|
||||
{
|
||||
// SIteratorSpecializer<T>:
|
||||
// Specializes the CIterator template to use for a given string type.
|
||||
// Note: The reason we use this is because MSVC doesn't want to deduce this on the MakeIterator declaration.
|
||||
template<typename StringType>
|
||||
struct SIteratorSpecializer
|
||||
{
|
||||
typedef CIterator<typename StringType::const_iterator> type;
|
||||
};
|
||||
}
|
||||
|
||||
// MakeIterator(const StringType &str):
|
||||
// Helper function to make a UCS code-point iterator given a Unicode string.
|
||||
// Example usage:
|
||||
// string utf8 = "foo"; // UTF-8
|
||||
// auto it = Unicode::MakeIterator(utf8);
|
||||
// while (it != utf8.end())
|
||||
// {
|
||||
// uint32 codepoint = *it; // 32-bit UCS code-point
// ++it; // advance to the next code-point, otherwise the loop never terminates
|
||||
// }
|
||||
// Or, in a for-loop:
|
||||
// for (auto it = Unicode::MakeIterator(utf8); it != utf8.end(); ++it) {}
|
||||
template<typename StringType>
|
||||
inline typename Detail::SIteratorSpecializer<StringType>::type MakeIterator(const StringType& str)
|
||||
{
|
||||
return typename Detail::SIteratorSpecializer<StringType>::type(str.begin(), str.end());
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue