remove of unicode files from Cry

Signed-off-by: Esteban Papp <81431996+amznestebanpapp@users.noreply.github.com>
monroegm-disable-blank-issue-2
Esteban Papp 5 years ago
parent 56eda61828
commit f21fc79dc4

@ -1,946 +0,0 @@
/*
* Copyright (c) Contributors to the Open 3D Engine Project.
* For complete copyright and license terms please see the LICENSE at the root of this distribution.
*
* SPDX-License-Identifier: Apache-2.0 OR MIT
*
*/
// Note: The utilities in this file should typically not be used directly,
// consider including UnicodeFunctions.h or UnicodeIterator.h instead.
//
// (At least) the following string types can be bound with these helper functions:
// Types Input Output Null-Terminator
// std::basic_string<T>, std::string, std::wstring: yes yes implied by type
// QString: yes yes implied by type
// std::vector<T>, std::list<T>, std::deque<T>: yes yes not present
// T[] (fixed-length buffer): yes yes guaranteed to be emitted on output, accepted on input
// T * and size_t (user-specified-size buffer): no yes guaranteed to be emitted on output
// const T * (null-terminated string): yes no expected
// const T[] (literal): yes no implied as the last item in the array
// pair of iterators over T: yes no should not be included in the range
// uint32 (single UCS code-point): yes no not present
// If some other string type is not listed, you can still use it for input easily by passing begin/end iterators.
// Note: For all types, T can be any 8-bit, 16-bit or 32-bit integral or character type.
// Further T types may be processed by explicitly passing InputEncoding and OutputEncoding.
// We never actively tested such scenarios, so there are no guarantees for floating-point and user-defined types as code-units.
#pragma once
#ifndef assert
// Some tools use CRT's assert, most engine and game modules use CryAssert.h (via platform.h maybe).
// We don't want to force a choice upon all code that uses Unicode utilities, so we just assume assert is defined.
#error This header uses assert macro, please provide an applicable definition before including UnicodeXXX.h
#endif
#include "UnicodeEncoding.h"
#include <string.h> // For str(n)len and memcpy.
#include <wchar.h> // For wcs(n)len.
#include <stddef.h> // For size_t and ptrdiff_t.
#include <iterator> // For std::iterator_traits.
#include <string> // For std::basic_string.
#include <vector> // For std::vector.
#include <list> // For std::list.
#include <deque> // For std::deque.
#include <type_traits> // ... standard type-traits (as of C++11).
#if defined(AZ_RESTRICTED_PLATFORM)
#undef AZ_RESTRICTED_SECTION
#define UNICODEBINDING_H_SECTION_1 1
#define UNICODEBINDING_H_SECTION_2 2
#endif
// Forward declare the supported types.
// Only the names are needed to declare the bindings below; before actually instantiating a binding
// for one of these types, the full definition of that type has to be included by the user.
// Also, this allows us to work with QChar/QString as declared names without a dependency on Qt.
namespace AZStd
{
// String template parameterized on element type, maximum element count and traits.
// Note: No default template arguments are declared here.
template<class Element, size_t MaxElementCount, class Traits>
class basic_fixed_string;
}
class QChar; // Declaration only, see note above.
class QString; // Declaration only, see note above.
namespace Unicode
{
namespace Detail
{
// Import standard type traits.
// This requires C++11 compiler support.
using std::add_const;
using std::conditional;
using std::extent;
using std::integral_constant;
using std::is_arithmetic;
using std::is_array;
using std::is_base_of;
using std::is_const;
using std::is_convertible;
using std::is_integral;
using std::is_pointer;
using std::is_same;
using std::make_unsigned;
using std::remove_cv;
using std::remove_extent;
using std::remove_pointer;
// SVoid<T>:
// Maps every well-formed type T to void.
// Note: Naming SVoid<typename X::member>::type only compiles when X::member exists,
// which makes this a compile-time detector for member types.
template<typename T>
struct SVoid
{
    using type = void;
};
// SValidChar<T, InferEncoding, Input>:
// Determine if T is a valid character type in the given compile-time context.
// The InferEncoding flag is set if the encoding has to be detected automatically.
// The Input flag is set if the type is used for input (and not set if the type is used for output).
template<typename T, bool InferEncoding, bool Input>
struct SValidChar
{
typedef typename remove_cv<T>::type BaseType;
static const bool isArithmeticType = is_arithmetic<BaseType>::value;
static const bool isQChar = is_same<BaseType, QChar>::value;
static const bool isUsable = isArithmeticType || isQChar;
static const bool isValidQualified = !is_const<T>::value || Input;
static const bool isKnownSize = sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4;
static const bool isValidInferred = isKnownSize || !InferEncoding;
static const bool value = isUsable && isValidQualified && isValidInferred;
};
// SPackedIterators<T>:
// Holds the two iterators that delimit a range.
// Note: Bundling both iterators in one object lets a range travel as a single argument, like every other bound type.
template<typename T>
struct SPackedIterators
{
    const T begin;
    const T end;

    SPackedIterators(const T& first, const T& last)
        : begin(first)
        , end(last)
    {
    }
};
// SPackedBuffer<T>:
// Holds a buffer pointer together with the size of that buffer.
// Note: Bundling both values in one object lets a buffer travel as a single argument, like every other bound type.
template<typename T>
struct SPackedBuffer
{
    T buffer;
    size_t size;

    SPackedBuffer(T bufferValue, size_t sizeValue)
        : buffer(bufferValue)
        , size(sizeValue)
    {
    }
};
// SDependentType<T, X>:
// Yields T, but spelled through a template so that the name depends on X (X itself carries no meaning).
// Note: A dependent name is only looked up at instantiation, so the definition of T is not needed earlier.
// This keeps standards-compliant compilers (Clang, GCC) from requiring the definition of forward-declared types,
// specifically Qt's QString and QChar, whose definitions are never available outside Editor.
template<typename T, int X>
struct SDependentType
{
    using type = T;
};
// EBind:
// The ways in which a type can be bound for input and/or output.
// Note: These values are used for tag-dispatch by the binding functions, and are private to the implementation.
enum EBind
{
    // Input: no, Output: no. The type can't be bound.
    eBind_Impossible,
    // Input: yes, Output: yes. Bound through the iterators of a container (or a packed iterator range).
    eBind_Iterators,
    // Input: yes, Output: yes. Bound through the data() and size() member functions.
    eBind_Data,
    // Input: yes, Output: no. Fixed size buffer with const elements (aka string literal).
    eBind_Literal,
    // Input: yes, Output: yes. Fixed size buffer with non-const elements, which may be null-terminated.
    eBind_Buffer,
    // Input: no, Output: yes. User-specified size buffer with non-const elements.
    eBind_PackedBuffer,
    // Input: yes, Output: no. Null-terminated buffer of unknown length (C string).
    eBind_NullTerminated,
    // Input: yes, Output: no. A single code-point value.
    eBind_CodePoint,
};
// SBindIterator<T, InferEncoding>:
// Determines at compile-time how a pair of iterators of type T is bound for input.
// This primary template is the fall-back for unsupported types and reports eBind_Impossible.
// Note: The two trailing parameters exist only so specializations can detect member types of T.
template<typename T, bool InferEncoding, typename HasValueType = void, typename HasIteratorCategory = void>
struct SBindIterator
{
    using CharType = const void;
    static constexpr EBind value = eBind_Impossible;
};
template<typename T, bool InferEncoding, typename HasValueType, typename HasIteratorCategory>
struct SBindIterator<T*, InferEncoding, HasValueType, HasIteratorCategory>
{
typedef typename add_const<T>::type CharType;
static const bool isValid = SValidChar<CharType, InferEncoding, true>::value;
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
};
template<typename T, bool InferEncoding>
struct SBindIterator<T, InferEncoding,
typename SVoid<typename T::value_type>::type,
typename SVoid<typename T::iterator_category>::type
>
{
typedef typename add_const<typename T::value_type>::type CharType;
typedef typename T::iterator_category IteratorCategory;
static const bool isInputIterator = is_base_of<std::input_iterator_tag, IteratorCategory>::value;
static const bool isValid = SValidChar<CharType, InferEncoding, true>::value;
static const EBind value = isValid && isInputIterator ? eBind_Iterators : eBind_Impossible;
};
// SBindObject<T, InferEncoding>:
// Find the EBind for input from object of type T at compile-time.
// If the type is not supported, the resulting value will be eBind_Impossible.
template<typename T, bool InferEncoding>
struct SBindObject
{
typedef typename add_const<
typename conditional<
is_array<T>::value,
typename remove_extent<T>::type,
typename remove_pointer<T>::type
>::type
>::type CharType;
static const size_t FixedSize = extent<T>::value;
COMPILE_TIME_ASSERT(!is_array<T>::value || FixedSize > 0);
static const bool isConstArray = is_array<T>::value && is_const<typename remove_extent<T>::type>::value;
static const bool isBufferArray = is_array<T>::value && !isConstArray;
static const bool isPointer = is_pointer<T>::value;
static const bool isCodePoint = is_integral<T>::value;
static const bool isValidChar = SValidChar<CharType, InferEncoding, true>::value;
static const EBind value =
!isValidChar ? eBind_Impossible :
isConstArray ? eBind_Literal :
isBufferArray ? eBind_Buffer :
isPointer ? eBind_NullTerminated :
isCodePoint ? eBind_CodePoint :
eBind_Impossible;
};
template<typename CharT, typename Traits, typename Allocator, bool InferEncoding>
struct SBindObject<std::basic_string<CharT, Traits, Allocator>, InferEncoding>
{
typedef typename add_const<CharT>::type CharType;
static const bool isValid = SValidChar<CharT, InferEncoding, true>::value;
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
};
template<typename T, typename Allocator, bool InferEncoding>
struct SBindObject<std::vector<T, Allocator>, InferEncoding>
{
typedef typename add_const<T>::type CharType;
static const bool isValid = SValidChar<T, InferEncoding, true>::value;
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
};
template<typename T, typename Allocator, bool InferEncoding>
struct SBindObject<std::list<T, Allocator>, InferEncoding>
{
typedef typename add_const<T>::type CharType;
static const bool isValid = SValidChar<T, InferEncoding, true>::value;
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
};
template<typename T, typename Allocator, bool InferEncoding>
struct SBindObject<std::deque<T, Allocator>, InferEncoding>
{
typedef typename add_const<T>::type CharType;
static const bool isValid = SValidChar<T, InferEncoding, true>::value;
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
};
template<typename T, size_t S, bool InferEncoding>
struct SBindObject<AZStd::basic_fixed_string<T, S>, InferEncoding>
{
typedef typename add_const<T>::type CharType;
static const bool isValid = SValidChar<T, InferEncoding, true>::value;
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
};
template<size_t S, bool InferEncoding>
struct SBindObject<AZStd::fixed_wstring<S>, InferEncoding>
{
typedef wchar_t CharType;
static const bool isValid = SValidChar<CharType, InferEncoding, true>::value;
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
};
template<bool InferEncoding>
struct SBindObject<QString, InferEncoding>
{
typedef const QChar CharType;
static const EBind value = eBind_Data;
};
// SBindObject for a packed iterator range: always bound as iterators.
// The character type is taken from the SBindIterator binding of the iterator type.
template<typename T, bool InferEncoding>
struct SBindObject<SPackedIterators<T>, InferEncoding>
{
    using CharType = typename SBindIterator<T, InferEncoding>::CharType;
    static constexpr EBind value = eBind_Iterators;
};
// SBindOutput<T, InferEncoding>:
// Find the EBind for output to object of type T at compile-time.
// If the type is not supported, the resulting value will be eBind_Impossible.
template<typename T, bool InferEncoding>
struct SBindOutput
{
typedef typename remove_extent<T>::type CharType;
static const size_t FixedSize = extent<T>::value;
static const bool isArray = is_array<T>::value;
static const bool isValid = SValidChar<typename remove_extent<T>::type, InferEncoding, false>::value;
static const EBind value = isArray && isValid ? eBind_Buffer : eBind_Impossible;
};
// SBindOutput for a packed pointer/size buffer: bound as eBind_PackedBuffer when the pointee is a valid output character.
template<typename OutputCharType, bool InferEncoding>
struct SBindOutput<SPackedBuffer<OutputCharType*>, InferEncoding>
{
    using CharType = OutputCharType;
    static constexpr bool isValid = SValidChar<CharType, InferEncoding, false>::value;
    static constexpr EBind value = isValid ? eBind_PackedBuffer : eBind_Impossible;
};
template<typename CharT, typename Traits, typename Allocator, bool InferEncoding>
struct SBindOutput<std::basic_string<CharT, Traits, Allocator>, InferEncoding>
{
typedef CharT CharType;
static const bool isValid = SValidChar<CharT, InferEncoding, false>::value;
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
};
template<typename T, typename Allocator, bool InferEncoding>
struct SBindOutput<std::vector<T, Allocator>, InferEncoding>
{
typedef T CharType;
static const bool isValid = SValidChar<T, InferEncoding, false>::value;
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
};
template<typename T, typename Allocator, bool InferEncoding>
struct SBindOutput<std::list<T, Allocator>, InferEncoding>
{
typedef T CharType;
static const bool isValid = SValidChar<T, InferEncoding, false>::value;
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
};
template<typename T, typename Allocator, bool InferEncoding>
struct SBindOutput<std::deque<T, Allocator>, InferEncoding>
{
typedef T CharType;
static const bool isValid = SValidChar<T, InferEncoding, false>::value;
static const EBind value = isValid ? eBind_Iterators : eBind_Impossible;
};
template<typename T, size_t S, bool InferEncoding>
struct SBindOutput<AZStd::basic_fixed_string<T, S>, InferEncoding>
{
typedef T CharType;
static const bool isValid = SValidChar<T, InferEncoding, false>::value;
static const EBind value = isValid ? eBind_Data : eBind_Impossible;
};
// SBindOutput for QString: always bound as eBind_Data, with QChar as character type.
template<bool InferEncoding>
struct SBindOutput<QString, InferEncoding>
{
    using CharType = QChar;
    static constexpr EBind value = eBind_Data;
};
// SInferEncoding<T, Input>:
// Infers the encoding that belongs to the character type of T.
// Note: The result is always an UTF encoding, chosen purely by the size of the element type:
// 1 byte maps to UTF-8, 2 bytes to UTF-16 and anything else is taken as UTF-32 (which is then asserted to be 4 bytes).
template<typename T, bool Input>
struct SInferEncoding
{
    using ObjectType = SBindObject<T, true>;
    using IteratorType = SBindIterator<T, true>;

    // Prefer the iterator binding's character type when T can be bound as an iterator.
    using CharType = typename conditional<
        IteratorType::value != eBind_Impossible,
        typename IteratorType::CharType,
        typename ObjectType::CharType
    >::type;

    static constexpr EEncoding value =
        sizeof(CharType) == 1 ? eEncoding_UTF8
        : sizeof(CharType) == 2 ? eEncoding_UTF16
        : eEncoding_UTF32;
    COMPILE_TIME_ASSERT(value != eEncoding_UTF32 || sizeof(CharType) == 4);
};
// SBindCharacter<T, Input>:
// Selects the base character type that is used while reading or writing elements of type T.
// This primary template maps T to its unsigned counterpart, stripped of cv-qualifiers;
// for input the resulting type is const.
template<typename T, bool Input, bool Integral = is_integral<T>::value, bool IsQChar = is_same<QChar, typename remove_cv<T>::type>::value>
struct SBindCharacter
{
    // The standard leaves the signedness of character types open, so always work with the unsigned variant.
    using BaseType = typename make_unsigned<T>::type;
    using UnqualifiedType = typename remove_cv<BaseType>::type;
    using type = typename conditional<Input, const UnqualifiedType, UnqualifiedType>::type;
};
// SBindCharacter for types that are neither integral nor QChar:
// T must still be arithmetic; it is used as-is apart from cv-qualifiers (const for input).
template<typename T, bool Input>
struct SBindCharacter<T, Input, false, false>
{
    COMPILE_TIME_ASSERT(is_arithmetic<T>::value);
    using UnqualifiedType = typename remove_cv<T>::type;
    using type = typename conditional<Input, const UnqualifiedType, UnqualifiedType>::type;
};
// SBindCharacter for QChar: elements are processed as uint16 (const for input).
template<typename T, bool Input>
struct SBindCharacter<T, Input, false, true>
{
    using type = typename conditional<Input, const uint16, uint16>::type;
    // Naming QChar through a dependent type defers the lookup (two-phase name lookup) until instantiation.
    using ActuallyQChar = typename SDependentType<QChar, Input>::type;
    // Guards against Qt ever changing the size of QChar.
    COMPILE_TIME_ASSERT(sizeof(ActuallyQChar) == sizeof(type));
};
// SBindPointer<T, Input>:
// Selects the pointer type that is used while reading from or writing to buffers
// (including the buffers that live inside string types).
// T has to be a pointer or an array; its element type is mapped through SBindCharacter.
template<typename T, bool Input>
struct SBindPointer
{
    COMPILE_TIME_ASSERT(is_pointer<T>::value || is_array<T>::value);

    // Element type as declared by the user: pointee for pointers, element for arrays.
    using UnboundCharType = typename conditional<
        is_pointer<T>::value,
        typename remove_pointer<T>::type,
        typename remove_extent<T>::type
    >::type;
    // Element type that is actually used for processing.
    using BoundCharType = typename SBindCharacter<UnboundCharType, Input>::type;
    using type = BoundCharType*;
};
// SAutomaticallyDeduced:
// Placeholder type that is declared but never defined; it is the default result type of SRequire.
struct SAutomaticallyDeduced;
// SRequire<Expr, T>:
// Helper for SFINAE overloading, comparable to C++11's std::enable_if.
// When the condition holds, the nested type is T.
template<bool SFINAE, typename T = SAutomaticallyDeduced>
struct SRequire
{
    using type = T;
};
// When the condition does not hold there is no nested type, which removes the overload from consideration.
template<typename T>
struct SRequire<false, T>
{
};
// SafeCast<T, SourceChar>:
// Converts a pointer to pointer type T, while only permitting conversions that are considered safe.
// This protects against mistakes in other functions, since unintended casts fail to compile.
// This overload is selected for pointers to integral types: the target must also point to an integral type
// of exactly the same size.
template<typename T, typename SourceChar>
inline T SafeCast(SourceChar* ptr, typename SRequire<is_integral<SourceChar>::value>::type* = 0)
{
    using TargetChar = typename remove_pointer<T>::type;
    COMPILE_TIME_ASSERT(is_integral<SourceChar>::value && is_integral<TargetChar>::value);
    COMPILE_TIME_ASSERT(sizeof(SourceChar) == sizeof(TargetChar));
    T const converted = reinterpret_cast<T>(ptr);
    return converted;
}
// SafeCast overload for pointers to (cv-qualified) QChar:
// the target must point to an integral type with the same size as the source element.
template<typename T, typename SourceChar>
inline T SafeCast(SourceChar* ptr, typename SRequire<is_same<typename remove_cv<SourceChar>::type, QChar>::value>::type* = 0)
{
    using TargetChar = typename remove_pointer<T>::type;
    COMPILE_TIME_ASSERT(is_integral<TargetChar>::value);
    COMPILE_TIME_ASSERT(sizeof(SourceChar) == sizeof(TargetChar));
    T const converted = reinterpret_cast<T>(ptr);
    return converted;
}
// SafeCast overload for every other source element type (neither integral nor QChar):
// only conversions that static_cast accepts are possible.
template<typename T, typename SourceChar>
inline T SafeCast(SourceChar* ptr, typename SRequire<!is_integral<SourceChar>::value&& !is_same<typename remove_cv<SourceChar>::type, QChar>::value>::type* = 0)
{
    T const converted = static_cast<T>(ptr);
    return converted;
}
// SCharacterTrait<T>:
// Exposes some basic traits for a given character.
// Note: Map to (hopefully optimized) CRT functions where possible.
// The Size parameter is sizeof(T) for integral types and 0 otherwise; specializations select on it.
// This primary template is the generic fall-back that scans element by element.
template<typename T, size_t Size = sizeof(T)* is_integral<T>::value>
struct SCharacterTrait
{
    // Fall-back strlen.
    // Returns the number of elements that precede the first zero element.
    // nts must point to a zero-terminated sequence.
    static size_t StrLen(const T* nts)
    {
        size_t result = 0;
        while (nts[result] != 0)
        {
            ++result;
        }
        return result;
    }
    // Fall-back strnlen.
    // Returns the number of elements that precede the first zero element, looking at no more than len elements.
    // The bound is tested before an element is read, so ptr[len] is never accessed
    // (and ptr is not dereferenced at all when len is 0).
    static size_t StrNLen(const T* ptr, size_t len)
    {
        size_t result = 0;
        while (result != len && ptr[result] != 0)
        {
            ++result;
        }
        return result;
    }
};
// SCharacterTrait for elements with the size of char: forwards to the narrow CRT functions.
template<typename T>
struct SCharacterTrait<T, sizeof(char)>
{
    // Narrow CRT strlen.
    static size_t StrLen(const T* nts)
    {
        const char* const narrow = SafeCast<const char*>(nts);
        return ::strlen(narrow);
    }
    // Narrow CRT strnlen.
    static size_t StrNLen(const T* ptr, size_t len)
    {
        const char* const narrow = SafeCast<const char*>(ptr);
        return ::strnlen(narrow, len);
    }
};
// SCharacterTrait for elements with the size of wchar_t: forwards to the wide CRT functions.
template<typename T>
struct SCharacterTrait<T, sizeof(wchar_t)>
{
// Returns the length of the null-terminated sequence, as reported by ::wcslen.
static size_t StrLen(const T* nts) // Wide CRT strlen.
{
return ::wcslen(SafeCast<const wchar_t*>(nts));
}
// Returns the length of the sequence as reported by ::wcsnlen, bounded by len.
static size_t StrNLen(const T* ptr, size_t len) // Wide CRT strnlen.
{
// NOTE(review): on restricted platforms the two included sections surround the default implementation;
// presumably they can supply a platform-specific replacement - confirm against the restricted platform file.
#if defined(AZ_RESTRICTED_PLATFORM)
#define AZ_RESTRICTED_SECTION UNICODEBINDING_H_SECTION_1
#include AZ_RESTRICTED_FILE(UnicodeBinding_h)
#endif
return ::wcsnlen(SafeCast<const wchar_t*>(ptr), len);
#if defined(AZ_RESTRICTED_PLATFORM)
#define AZ_RESTRICTED_SECTION UNICODEBINDING_H_SECTION_2
#include AZ_RESTRICTED_FILE(UnicodeBinding_h)
#endif
}
};
// void Feed(const SPackedIterators<InputIteratorType> &its, Sink &out, tag):
// Feeds the provided sink from provided packed iterator-range.
template<typename InputIteratorType, typename Sink>
inline void Feed(const SPackedIterators<InputIteratorType>& its, Sink& out, integral_constant<EBind, eBind_Iterators>)
{
typedef typename std::iterator_traits<InputIteratorType>::value_type UnboundCharType;
typedef typename SBindCharacter<UnboundCharType, true>::type BoundCharType;
for (InputIteratorType it = its.begin; it != its.end; ++it)
{
const UnboundCharType unbound = *it;
const BoundCharType bound = static_cast<BoundCharType>(unbound);
const uint32 item = static_cast<uint32>(bound);
out(item);
}
}
// void Feed(const SPackedIterators<const InputCharType *> &its, Sink &out, tag):
// Feeds the provided sink from provided packed pointer-range.
// This is slightly better code-generation than using generic iterators.
template<typename InputCharType, typename Sink>
inline void Feed(const SPackedIterators<const InputCharType*>& its, Sink& out, integral_constant<EBind, eBind_Iterators>)
{
typedef typename SBindPointer<const InputCharType*, true>::type PointerType;
assert(reinterpret_cast<size_t>(its.begin) <= reinterpret_cast<size_t>(its.end) && "Invalid range specified");
const size_t length = its.end - its.begin;
PointerType ptr = SafeCast<PointerType>(its.begin);
assert((ptr || !length) && "Passed a non-empty range containing a null-pointer");
for (size_t i = 0; i < length; ++i, ++ptr)
{
const uint32 item = static_cast<uint32>(*ptr);
out(item);
}
}
// void Feed(const InputStringType &in, Sink &out, tag):
// Feeds the provided sink from a container, using it's iterators.
// Note: Dispatches to one of the packed-range overloads.
template<typename InputStringType, typename Sink>
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_Iterators> tag)
{
typedef typename InputStringType::const_iterator IteratorType;
Detail::SPackedIterators<IteratorType> its(in.begin(), in.end());
Feed(its, out, tag);
}
// void Feed(const InputStringType &in, Sink &out, tag):
// Feeds the provided sink from a string-object's buffer.
template<typename InputStringType, typename Sink>
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_Data>)
{
typedef typename InputStringType::size_type SizeType;
typedef typename InputStringType::value_type ValueType;
typedef typename SBindPointer<const ValueType*, true>::type PointerType;
const SizeType length = in.size();
if (length)
{
PointerType ptr = SafeCast<PointerType>(in.data());
for (SizeType i = 0; i < length; ++i, ++ptr)
{
const uint32 item = static_cast<uint32>(*ptr);
out(item);
}
}
}
// void Feed(const InputStringType &in, Sink &out, tag):
// Feeds the provided sink from a string-literal.
// Note: The literal is assumed to be null-terminated.
// It's possible that a const-element fixed-size-buffer is mistaken as a literal.
// However, we expect no-one uses such buffers that are not null-terminated already.
// If somehow this use-case is desired, either terminate the buffer, or remove const from the buffer, or pass iterators.
template<typename InputStringType, typename Sink>
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_Literal>)
{
COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
typedef typename SBindPointer<InputStringType, true>::type PointerType;
const size_t length = extent<InputStringType>::value - 1;
PointerType ptr = SafeCast<PointerType>(in);
assert(ptr[length] == 0 && "Literal is not null-terminated");
for (size_t i = 0; i < length; ++i, ++ptr)
{
const uint32 item = static_cast<uint32>(*ptr);
out(item);
}
}
// void Feed(const InputStringType &in, Sink &out, tag):
// Feeds the provided sink from a non-const-element fixed-size buffer.
// Note: The buffer is allowed to be null-terminated, but it's not required.
template<typename InputStringType, typename Sink>
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_Buffer>)
{
COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
typedef typename SBindPointer<InputStringType, true>::type PointerType;
typedef typename SBindPointer<InputStringType, true>::BoundCharType CharType;
const size_t length = extent<InputStringType>::value;
PointerType ptr = SafeCast<PointerType>(in);
for (size_t i = 0; i < length; ++i, ++ptr)
{
const CharType unbound = *ptr;
if (unbound == 0)
{
break;
}
const uint32 item = static_cast<uint32>(unbound);
out(item);
}
}
// void Feed(const InputStringType &in, Sink &out, tag):
// Feeds the provided sink from a null-terminated C-style string.
template<typename InputStringType, typename Sink>
inline void Feed(const InputStringType& in, Sink& out, integral_constant<EBind, eBind_NullTerminated>)
{
COMPILE_TIME_ASSERT(is_pointer<InputStringType>::value);
typedef typename SBindPointer<InputStringType, true>::type PointerType;
typedef typename SBindPointer<InputStringType, true>::BoundCharType CharType;
PointerType ptr = SafeCast<PointerType>(in);
if (ptr)
{
while (true)
{
const CharType unbound = *ptr;
++ptr;
if (unbound == 0)
{
break;
}
const uint32 item = static_cast<uint32>(unbound);
out(item);
}
}
}
// void Feed(const InputCharType &in, Sink &out, tag):
// Feeds the provided sink from a single value (interpreted as an UCS code-point).
template<typename InputCharType, typename Sink>
inline void Feed(const InputCharType& in, Sink& out, integral_constant<EBind, eBind_CodePoint>)
{
COMPILE_TIME_ASSERT(is_arithmetic<InputCharType>::value);
const uint32 item = static_cast<uint32>(in);
out(item);
}
// size_t EncodedLength(const SPackedIterators<InputIteratorType> &its, tag):
// Returns the number of elements in the packed iterator range.
// Note: std::distance selects the best implementation for the iterator category.
template<typename InputIteratorType>
inline size_t EncodedLength(const SPackedIterators<InputIteratorType>& its, integral_constant<EBind, eBind_Iterators>)
{
    const auto count = std::distance(its.begin, its.end);
    return count;
}
// size_t EncodedLength(const InputStringType &in, tag):
// Returns the number of elements of a container that is otherwise enumerated through iterators.
// Note: This relies on the container having a size() member function.
template<typename InputStringType>
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_Iterators>)
{
    const auto count = in.size();
    return count;
}
// size_t EncodedLength(const InputStringType &in, tag):
// Returns the number of elements of a container that exposes its storage through data()/size().
template<typename InputStringType>
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_Data>)
{
    const auto count = in.size();
    return count;
}
// size_t EncodedLength(const InputStringType &in, tag):
// Returns the length of a string-literal: the array extent minus one for the null-terminator.
// Note: This is a compile-time constant, the argument itself is not inspected.
template<typename InputStringType>
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_Literal>)
{
    COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
    const size_t withoutTerminator = extent<InputStringType>::value - 1;
    return withoutTerminator;
}
// size_t EncodedLength(const InputStringType &in, tag):
// Returns the length of the content of a fixed-size-buffer.
// Note: The buffer is searched for an (optional) null-terminator, bounded by the array extent.
template<typename InputStringType>
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_Buffer>)
{
    COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
    using CharType = typename remove_extent<InputStringType>::type;
    const size_t capacity = extent<InputStringType>::value;
    return SCharacterTrait<CharType>::StrNLen(in, capacity);
}
// size_t EncodedLength(const SPackedBuffer<InputCharType *> &in, tag):
// Returns the length of the content of a user-specified buffer.
// Note: The buffer is searched for an (optional) null-terminator, bounded by the packed size.
// A null buffer pointer yields a length of zero.
template<typename InputCharType>
inline size_t EncodedLength(const SPackedBuffer<InputCharType*>& in, integral_constant<EBind, eBind_PackedBuffer>)
{
    if (in.buffer)
    {
        return SCharacterTrait<InputCharType>::StrNLen(in.buffer, in.size);
    }
    return 0;
}
// size_t EncodedLength(const InputStringType &in, tag):
// Returns the length of a null-terminated c-style string, through SCharacterTrait's StrLen.
// Note: A null-pointer yields a length of zero.
template<typename InputStringType>
inline size_t EncodedLength(const InputStringType& in, integral_constant<EBind, eBind_NullTerminated>)
{
    COMPILE_TIME_ASSERT(is_pointer<InputStringType>::value);
    using CharType = typename remove_pointer<InputStringType>::type;
    if (in)
    {
        return SCharacterTrait<CharType>::StrLen(in);
    }
    return 0;
}
// size_t EncodedLength(const InputCharType &in, tag):
// Returns the length of a single UCS code-point, which is 1 regardless of the value.
template<typename InputCharType>
inline size_t EncodedLength([[maybe_unused]] const InputCharType& in, integral_constant<EBind, eBind_CodePoint>)
{
    COMPILE_TIME_ASSERT(is_arithmetic<InputCharType>::value);
    const size_t singleItem = 1;
    return singleItem;
}
// const void *EncodedPointer(const SPackedIterators<const InputCharType *> &its, tag):
// Returns the start of the contiguous storage that backs an iterator range.
// Note: Only provided for ranges of raw pointers, other iterators give no guarantee of contiguous storage.
template<typename InputCharType>
inline const void* EncodedPointer(const SPackedIterators<const InputCharType*>& its, integral_constant<EBind, eBind_Iterators>)
{
    const void* const storage = its.begin;
    return storage;
}
// const void *EncodedPointer(const InputStringType &in, tag):
// Returns the start of the contiguous storage of a string/vector object, as reported by data().
// Note: Whether a container qualifies for this binding is decided by the SBindXXX helpers.
template<typename InputStringType>
inline const void* EncodedPointer(const InputStringType& in, integral_constant<EBind, eBind_Data>)
{
    const void* const storage = in.data();
    return storage;
}
// const void *EncodedPointer(const InputStringType &in, tag):
// Returns the start of the storage of a string-literal.
template<typename InputStringType>
inline const void* EncodedPointer(const InputStringType& in, integral_constant<EBind, eBind_Literal>)
{
    COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
    // The array decays to a pointer to its first element.
    const void* const storage = in;
    return storage;
}
// const void *EncodedPointer(const InputStringType &in, tag):
// Returns the start of the storage of a fixed-size-buffer.
template<typename InputStringType>
inline const void* EncodedPointer(const InputStringType& in, integral_constant<EBind, eBind_Buffer>)
{
    COMPILE_TIME_ASSERT(is_array<InputStringType>::value && extent<InputStringType>::value > 0);
    // The array decays to a pointer to its first element.
    const void* const storage = in;
    return storage;
}
// const void *EncodedPointer(const InputStringType &in, tag):
// Returns the start of the storage of a null-terminated c-style-string, which is the pointer itself.
template<typename InputStringType>
inline const void* EncodedPointer(const InputStringType& in, integral_constant<EBind, eBind_NullTerminated>)
{
    COMPILE_TIME_ASSERT(is_pointer<InputStringType>::value);
    const void* const storage = in;
    return storage;
}
// const void *EncodedPointer(const InputCharType &in, tag):
// Returns the storage of a single UCS code-point.
// Note: The value is received by reference, so the result is the address of the caller's object.
template<typename InputCharType>
inline const void* EncodedPointer(const InputCharType& in, integral_constant<EBind, eBind_CodePoint>)
{
    COMPILE_TIME_ASSERT(is_arithmetic<InputCharType>::value);
    const void* const storage = &in;
    return storage;
}
// SWriteSink<T, Append, BindMethod>:
// A helper that performs writing to the type T and can be passed as Sink type to a trans-coder helper.
// Append selects whether the existing content of the target is kept.
// Note: Only declared here; each supported bind method provides its own partial specialization.
template<typename T, bool Append, EBind>
struct SWriteSink;
// SWriteSink for containers that are written one element at a time (eBind_Iterators).
// The container has to provide value_type, clear() and push_back().
template<typename T, bool Append>
struct SWriteSink<T, Append, eBind_Iterators>
{
    typedef typename T::value_type OutputCharType;
    T& out;
    // Binds the sink to the container; the expected length (second argument) is not used.
    SWriteSink(T& _out, size_t)
        : out(_out)
    {
        if (!Append)
        {
            // If not appending, clear the object beforehand.
            out.clear();
        }
    }
    // Appends one item, converted to the element type of the container.
    void operator()(uint32 item)
    {
        const OutputCharType bound = static_cast<OutputCharType>(item);
        out.push_back(bound); // We assume this can't fail and STL container takes care of memory.
    }
    void operator()(const void*, size_t); // Not implemented.
    // The parameter is intentionally unused; it is marked as such to stay warning-free,
    // consistent with the other sinks in this file.
    void HintSequence([[maybe_unused]] uint32 length) {} // Don't care about sequences.
    bool CanWrite() const { return true; } // Always writable
};
// SWriteSink for string-like objects that are written through their buffer (eBind_Data).
// The object has to provide value_type, size(), resize() and data().
template<typename T, bool Append>
struct SWriteSink<T, Append, eBind_Data>
{
    typedef SBindPointer<typename T::value_type*, false> BindHelper;
    typedef typename BindHelper::UnboundCharType CharType;
    CharType* ptr; // Next element to write; null when the resized object is empty.
    // Resizes the object so it can hold length new elements (in addition to the existing ones when appending),
    // and positions the write pointer at the first new element.
    SWriteSink(T& out, size_t length)
    {
        const size_t offset = Append ? out.size() : 0;
        length += offset;
        out.resize(length); // resize() can't fail without exceptions, so assert instead.
        assert((out.size() == length) && "Buffer resize failed (out-of-memory?)");
        const CharType* base = length ? out.data() : nullptr;
        ptr = const_cast<CharType*>(base + offset);
    }
    // Writes one item, converted to the bound character type.
    void operator()(uint32 item)
    {
        *SafeCast<typename BindHelper::type>(ptr) = static_cast<typename BindHelper::BoundCharType>(item);
        ++ptr;
    }
    // Copies length elements from src in one go.
    void operator()(const void* src, size_t length)
    {
        // Passing a null pointer to memcpy is undefined behavior, even for a size of zero.
        // The write pointer is null when the object is empty, so skip the call for empty blocks.
        if (length != 0)
        {
            ::memcpy(ptr, src, length * sizeof(CharType));
            ptr += length;
        }
    }
    void HintSequence([[maybe_unused]] uint32 length) {} // Don't care about sequences.
    bool CanWrite() const { return true; } // Always writable
};
// SWriteSink<SPackedBuffer<P>, Append, eBind_PackedBuffer>:
// Sink that writes into a user-specified buffer (pointer + size) without ever writing past its last element.
// The last element of the buffer is reserved for the null-terminator, which is written by the destructor.
// If the buffer is null or has size zero, nothing is ever written.
template<typename P, bool Append>
struct SWriteSink<SPackedBuffer<P>, Append, eBind_PackedBuffer>
{
    typedef typename remove_pointer<P>::type ElementType;
    typedef SBindPointer<ElementType*, false> BindHelper;
    typedef typename BindHelper::UnboundCharType CharType;
    CharType* ptr;              // Next position to write to (null if there is no buffer).
    CharType* const terminator; // Last element of the buffer (null if there is no buffer).

    // Used by derived sinks that assign 'ptr' themselves.
    // 'ptr' starts out as null so the destructor never reads an indeterminate value.
    SWriteSink(CharType* _terminator)
        : ptr(0)
        , terminator(_terminator) {}

    // Binds to a packed buffer; the second argument (expected length) is ignored.
    SWriteSink(SPackedBuffer<P>& out, size_t)
        : terminator(out.size && out.buffer ? out.buffer + out.size - 1 : 0)
    {
        const size_t offset = Append
            ? EncodedLength(out, integral_constant<EBind, eBind_PackedBuffer>())
            : 0;
        const size_t fixedOffset = Append && offset >= out.size
            ? out.size - 1 // In case the buffer is already full and not terminated.
            : offset;
        CharType* base = static_cast<CharType*>(out.buffer);
        ptr = terminator ? base + fixedOffset : 0;
    }

    ~SWriteSink()
    {
        if (ptr)
        {
            *ptr = 0; // Guarantees that the output is null-terminated.
        }
    }

    // Writes one code-unit, silently dropping it when the buffer is full.
    void operator()(uint32 item)
    {
        if (ptr != terminator) // Guarantees we don't overflow the buffer.
        {
            *SafeCast<typename BindHelper::type>(ptr) = static_cast<typename BindHelper::BoundCharType>(item);
            ++ptr;
        }
    }

    // Block-copies up to 'length' elements, truncated to the remaining space.
    void operator()(const void* src, size_t length)
    {
        const size_t maxLength = terminator - ptr;
        if (length > maxLength)
        {
            length = maxLength;
        }
        if (length != 0) // Never hand a null destination (no buffer) to memcpy, not even for zero bytes.
        {
            ::memcpy(ptr, src, length * sizeof(CharType));
            ptr += length;
        }
    }

    // Announces a multi-unit sequence; if it would not fit, the buffer is terminated and flagged as full.
    // NOTE(review): with '>=' a hint equal to the remaining space is rejected. The UTF-8 encoder in
    // UnicodeEncoding.h passes (number of units - 1) while the UTF-16 encoder passes the full count,
    // so the comparison must not be changed without also fixing those call sites.
    void HintSequence(uint32 length)
    {
        if (terminator && (ptr + length >= terminator))
        {
            // This sequence will overflow the buffer.
            // In this case, we prefer to not generate any part of the sequence.
            // Terminate at the current position and flag as full.
            *ptr = 0;
            ptr = terminator;
        }
    }

    // False when the buffer is full (or when there is no buffer at all).
    bool CanWrite() const
    {
        return terminator != ptr;
    }
};
// SWriteSink<T, Append, eBind_Buffer>:
// Sink for fixed-length arrays (T is an array type).
// All writing logic is inherited from the packed-buffer sink; only the constructor differs.
template<typename T, bool Append>
struct SWriteSink<T, Append, eBind_Buffer>
    : SWriteSink<SPackedBuffer<typename remove_extent<T>::type*>, Append, eBind_PackedBuffer>
{
    typedef typename remove_extent<T>::type ElementType;
    typedef SWriteSink<SPackedBuffer<ElementType*>, Append, eBind_PackedBuffer> Super;
    typedef SBindPointer<ElementType*, false> BindHelper;
    typedef typename BindHelper::UnboundCharType CharType;

    // The terminator is the last element of the array; the second argument is ignored.
    SWriteSink(T& out, size_t)
        : Super(out + extent<T>::value - 1)
    {
        size_t start = 0;
        if (Append)
        {
            start = EncodedLength(out, integral_constant<EBind, eBind_Buffer>());
            if (start >= extent<T>::value)
            {
                // In case the buffer is already full and not terminated.
                start = extent<T>::value - 1;
            }
        }
        Super::ptr = out + start; // Qualification for Super required for two-phase lookup.
    }
};
// SIsBlockCopyable<InputType, OutputType>:
// Compile-time check whether the block-copy optimization is possible for these types.
// InputType should be an instantiation of SBindObject or SBindIterator.
// OutputType should be an instantiation of SBindOutput.
// Note: Only compatibility of the underlying storage is considered, not whether the conversion is safe/unsafe.
template<typename InputType, typename OutputType>
struct SIsBlockCopyable
{
    // Binding methods treated as contiguous storage.
    template<EBind M>
    struct SIsContiguous
    {
        static const bool value =
            M == eBind_CodePoint ||
            M == eBind_NullTerminated ||
            M == eBind_PackedBuffer ||
            M == eBind_Buffer ||
            M == eBind_Literal ||
            M == eBind_Data;
    };

    // Detects a packed iterator pair made of raw pointers.
    template<typename T>
    struct SIsPointers
    {
        static const bool value = false;
    };
    template<typename T>
    struct SIsPointers<SPackedIterators<T*> >
    {
        static const bool value = true;
    };

    using InputCharType = typename SBindCharacter<typename InputType::CharType, true>::type;
    using OutputCharType = typename SBindCharacter<typename OutputType::CharType, false>::type;

    static const bool isIntegral = is_integral<InputCharType>::value && is_integral<OutputCharType>::value;
    static const bool isSameSize = sizeof(InputCharType) == sizeof(OutputCharType);
    static const bool isInputContiguous = (SIsPointers<InputType>::value || SIsContiguous<InputType::value>::value);
    static const bool isOutputContiguous = (SIsPointers<OutputType>::value || SIsContiguous<OutputType::value>::value);

    // All four conditions must hold for a block-copy.
    static const bool value = isIntegral && isSameSize && isInputContiguous && isOutputContiguous;
};
}
}

@ -1,767 +0,0 @@
/*
* Copyright (c) Contributors to the Open 3D Engine Project.
* For complete copyright and license terms please see the LICENSE at the root of this distribution.
*
* SPDX-License-Identifier: Apache-2.0 OR MIT
*
*/
// Description : Generic Unicode encoding helpers.
//
// Defines encoding and decoding functions used by the higher-level functions.
// These are used by the various conversion functions in UnicodeFunctions.h and UnicodeIterator.h.
// Note: You can use these functions manually for low-level functionality, but we don't recommend that.
// In that case, you probably want to check inside the nested Detail namespace for the elementary bits.
#pragma once
#include "BaseTypes.h" // For uint8, uint16, uint32
#include "CompileTimeAssert.h" // For COMPILE_TIME_ASSERT macro
namespace Unicode
{
// Supported encoding/conversion types.
// Each entry states whether it can be used as input encoding, output encoding, or both.
enum EEncoding
{
// UTF-8 encoding, see http://www.unicode.org/resources/utf8.html.
// Input and output are supported.
// Note: This format maps the entire UCS, where each code-point can take [1, 4] 8-bit code-units.
// Note: This is a strict super-set of ASCII; Latin1/ISO-8859-1 values above U+007F take two code-units.
eEncoding_UTF8,
// UTF-16 encoding, see http://tools.ietf.org/html/rfc2781.
// Input and output are supported.
// Note: This format maps the entire UCS, where each code-point can take [1, 2] 16-bit code-units.
eEncoding_UTF16,
// UTF-32 encoding, see http://www.unicode.org/reports/tr17/.
// Input and output are supported.
// Note: This format maps the entire UCS, each code-point is stored in a single 32-bit code-unit.
eEncoding_UTF32,
// ASCII encoding, see http://en.wikipedia.org/wiki/ASCII.
// Input and output are supported (any output UCS values out of supported range are mapped to question mark).
// Note: Only values [U+0000, U+007F] can be mapped.
eEncoding_ASCII,
// Latin1, aka ISO-8859-1 encoding, see http://en.wikipedia.org/wiki/ISO/IEC_8859-1.
// Only input is supported.
// Note: This is a strict super-set of ASCII, it additionally maps [U+00A0, U+00FF].
eEncoding_Latin1,
// Windows ANSI codepage 1252, see http://en.wikipedia.org/wiki/Windows-1252.
// Only input is supported.
// Note: This is a strict super-set of ASCII and Latin1/ISO-8859-1, it maps some code-units from [0x80, 0x9F].
eEncoding_Win1252,
};
// Methods of recovery from invalid encoded sequences.
// The four Fallback* methods first retry the offending code-units in an 8-bit fallback encoding,
// and only apply the second step (discard or replace) if that fails as well.
enum EErrorRecovery
{
// No attempt to detect invalid encoding is performed, the input is assumed to be valid.
// If the input is not valid, the output is undefined (in debug, this condition will cause an assert to trigger).
eErrorRecovery_None,
// When an invalidly encoded sequence is detected, the sequence is discarded (will not be part of the output).
// Typically used for logic/hashing purposes when the input is almost certainly valid.
eErrorRecovery_Discard,
// When an invalidly encoded sequence is detected, the sequence is replaced with the replacement-character (U+FFFD).
// Typically used when the output sequence is used for UI display purposes.
eErrorRecovery_Replace,
// When an invalidly encoded sequence is detected, the sequence is replaced with the eEncoding_Latin1 equivalent.
// If the sequence is also not valid Latin1 encoded, the sequence is discarded.
// Typically used when reading generic text files with 1-byte code-units.
// Note: This recovery method can only be used when decoding UTF-8.
eErrorRecovery_FallbackLatin1ThenDiscard,
// When an invalidly encoded sequence is detected, the sequence is replaced with the eEncoding_Win1252 equivalent.
// If the sequence is also not valid codepage 1252 encoded, the sequence is discarded.
// Typically used when reading text files generated on Windows with 1-byte code-units.
// Note: This recovery method can only be used when decoding UTF-8.
eErrorRecovery_FallbackWin1252ThenDiscard,
// When an invalidly encoded sequence is detected, the sequence is replaced with the eEncoding_Latin1 equivalent.
// If the sequence is also not valid Latin1 encoded, it is replaced with the replacement-character (U+FFFD).
// Typically used when reading generic text files with 1-byte code-units.
// Note: This recovery method can only be used when decoding UTF-8.
eErrorRecovery_FallbackLatin1ThenReplace,
// When an invalidly encoded sequence is detected, the sequence is replaced with the eEncoding_Win1252 equivalent.
// If the sequence is also not valid codepage 1252 encoded, it is replaced with the replacement-character (U+FFFD).
// Typically used when reading text files generated on Windows with 1-byte code-units.
// Note: This recovery method can only be used when decoding UTF-8.
eErrorRecovery_FallbackWin1252ThenReplace,
};
namespace Detail
{
// Decode<Encoding, Safe>(state, unit): Decodes a single code-unit of an encoding into an UCS code-point.
// 'state' is zero for the first code-unit of a sequence, otherwise the value returned by the previous call.
// When Safe flag is set, encoding errors are detected so a fall-back encoding or other recovery method can be used.
// Interpret return value as follows:
// <= 0x001FFFFF: Decoded codepoint (== return value), call again with next code-unit and clear state.
// < 0x80000000: Intermediate state returned, call again with next code-unit and the returned state.
// >= 0x80000000: Bad encoding detected, up to 16 bits (UTF-16) or 24 bits (UTF-8, last in lower bits)
// contain previous consumed values (does not happen if Safe == false).
// Only the explicit specializations of this template are defined.
template<EEncoding InputEncoding, bool Safe>
inline uint32 Decode(uint32 state, uint32 unit);
// Some constant values used when encoding/decoding.
// The cDecode* values describe bit positions inside the 32-bit decoder state / return value of Decode().
// The remaining values are well-known UCS code-point boundaries.
// NOTE(review): 'cDecodeErrorBit' is written as '1 << 31' on a plain int; the decoders rely on it
// converting to 0x80000000 when combined with uint32 values - confirm this on all supported compilers.
enum
{
cDecodeShiftRemaining = 26, // Where to store the remaining count in the state.
cDecodeOneRemaining = 1 << cDecodeShiftRemaining, // Remaining value of one.
cDecodeMaskRemaining = 3 << cDecodeShiftRemaining, // All possible remaining bits that can be used.
cDecodeLeadBit = 1 << 22, // All bits up to and including this one are reserved.
cDecodeErrorBit = 1 << 31, // Set if an error occurs during decoding.
cDecodeOverlongBit = 1 << 30, // Set if overlong sequence was used.
cDecodeSurrogateBit = 1 << 29, // Set if surrogate code-point decoded in UTF-8.
cDecodeInvalidBit = 1 << 28, // Set if invalid code-point decoded (U+FFFE/FFFF).
cDecodeSuccess = 0, // Placeholder to indicate no error occurred.
cCodepointMax = 0x10FFFF, // The maximum value of an UCS code-point.
cLeadSurrogateFirst = 0xD800, // The first valid UTF-16 lead-surrogate value.
cLeadSurrogateLast = 0xDBFF, // The last valid UTF-16 lead-surrogate value.
cTrailSurrogateFirst = 0xDC00, // The first valid UTF-16 trail-surrogate value.
cTrailSurrogateLast = 0xDFFF, // The last valid UTF-16 trail-surrogate value.
cReplacementCharacter = 0xFFFD, // The default replacement character.
};
// Validate the UTF-8 state of a multi-byte sequence.
// The safe decoder of UTF-8 will call this function when a full potential code-point has been decoded.
// This function is (at most) called for 50% of the decoded UTF-8 code-units, but likely at much lower frequency.
// 'state' holds the raw bytes of the sequence, one per 8 bits, with the final byte in the lowest 8 bits.
// Returns the decoded code-point if the sequence is valid. Otherwise returns an error value that has
// cDecodeErrorBit set, at most one of the surrogate/invalid/overlong flags set, and all bytes of the
// sequence except the last one in the lower bits.
inline uint32 DecodeValidate8(uint32 state)
{
    // Captured before 'state' is modified below, so the error value carries the original bytes.
    uint32 errorbits = (state >> 8) | cDecodeErrorBit;

    state ^= (state & 0x400000) >> 1; // For 3-byte sequences, bit 5 of the lead byte needs to be cleared.

    // Gather the payload bits of each byte into a single value.
    const uint32 cp =
        (state & 0x3F) |
        ((state & 0x3F00) >> 2) |
        ((state & 0x3F0000) >> 4) |
        ((state & 0x07000000) >> 6);

    if (cp <= cCodepointMax)
    {
        if (cp >= cLeadSurrogateFirst && cp <= cTrailSurrogateLast)
        {
            errorbits |= cDecodeSurrogateBit; // CESU-8 encoding might have been used.
        }
        else
        {
            // Smallest code-point that legitimately needs a sequence of this length.
            uint32 minval = 0x80;
            minval += (0x00400000 & state) ? 0x800 - 0x80 : 0;
            minval += (0x40000000 & state) ? 0x10000 - 0x80 : 0;

            if (cp < minval)
            {
                errorbits |= cDecodeOverlongBit; // Overlong encoding used.
            }
            else if ((cp & 0xFFFFFFFEU) == 0xFFFEU)
            {
                // Invalid character used (U+FFFE/FFFF).
                // This is not an overlong sequence, so the overlong flag must stay clear.
                errorbits |= cDecodeInvalidBit;
            }
            else
            {
                return cp; // Valid code-point.
            }
        }
    }
    return errorbits;
}
// Decode UTF-8, unsafe.
// No validation is performed: any first byte below 0xC0 is returned as-is, and continuation bytes
// are merged into the state without checking that they actually are continuation bytes.
template<>
inline uint32 Decode<eEncoding_UTF8, false>(uint32 state, uint32 unit)
{
    if (state != 0)
    {
        // Continuation byte: shift in 6 payload bits and decrement the remaining count.
        // (The old count is shifted out of the 32-bit value by the '<< 6', so it is re-added here.)
        const uint32 remaining = (state & cDecodeMaskRemaining) - cDecodeOneRemaining;
        const uint32 merged = (state << 6) + (unit & 0x3F) + remaining;
        return merged & ~cDecodeLeadBit; // Mask off the lead bits of a 4-byte sequence.
    }

    // First byte.
    const uint32 lead = unit & 0xFF;
    if (lead < 0xC0)
    {
        return lead; // Single-unit (ASCII).
    }
    uint32 remaining = (lead >> 4) - 0xC;
    if (remaining == 0)
    {
        remaining = 1;
    }
    return (lead & 0x1F) + (remaining << cDecodeShiftRemaining); // Lead byte of multi-byte.
}
// Decode UTF-8, safe.
// Keeps the raw bytes of the sequence in the state (8 bits each) so they can be reported on error,
// and hands the complete sequence to DecodeValidate8 once its last byte has arrived.
template<>
inline uint32 Decode<eEncoding_UTF8, true>(uint32 state, uint32 unit)
{
    const uint32 failure = cDecodeErrorBit | state;

    if (unit > 0xF4)
    {
        return failure; // Discard out-of-range values immediately.
    }

    if (state == 0) // First byte.
    {
        if (unit < 0x80)
        {
            return unit; // Single-byte.
        }
        if (unit < 0xC2)
        {
            return cDecodeErrorBit; // Invalid continuation byte (or illegal 0xC0/0xC1).
        }
        uint32 remaining = (unit >> 4) - 0xC;
        if (remaining == 0)
        {
            remaining = 1;
        }
        return unit + (remaining << cDecodeShiftRemaining); // Multi-byte.
    }

    if ((unit & 0xC0) != 0x80)
    {
        return failure; // A continuation byte was expected.
    }

    const uint32 remaining = (state & cDecodeMaskRemaining) - cDecodeOneRemaining;
    const uint32 bytes = (state << 8) + unit;
    if (remaining == 0)
    {
        return DecodeValidate8(bytes); // Final byte of a multi-byte sequence.
    }
    return bytes | remaining; // Intermediate byte of a multi-byte sequence.
}
// Decode UTF-16, unsafe.
// With no pending state, a lead surrogate is returned with a remaining count of one (intermediate state)
// and any other unit is returned as-is. With pending state, the unit is combined with the stored lead
// surrogate without checking that it is a trail surrogate.
template<>
inline uint32 Decode<eEncoding_UTF16, false>(uint32 state, uint32 unit)
{
    if (state == 0)
    {
        const bool bLead = (unit >= cLeadSurrogateFirst) && (unit <= cLeadSurrogateLast);
        return unit + (bLead << cDecodeShiftRemaining);
    }
    return 0x10000 + ((state & 0x3FF) << 10) + (unit & 0x3FF);
}
// Decode UTF-16, safe.
// Detects the following encoding errors:
// - A lead surrogate that is not followed by a trail surrogate (the lead is reported in the low 16 bits).
// - A trail surrogate that is not preceded by a lead surrogate (flagged with cDecodeSurrogateBit).
// - The invalid values U+FFFE/U+FFFF (flagged with cDecodeInvalidBit).
template<>
inline uint32 Decode<eEncoding_UTF16, true>(uint32 state, uint32 unit)
{
    const bool bTrail = (unit >= cTrailSurrogateFirst) && (unit <= cTrailSurrogateLast);
    if (state != 0)
    {
        if (!bTrail)
        {
            return cDecodeErrorBit + (state & 0xFFFF); // Lead surrogate without trail surrogate
        }
    }
    else if (bTrail)
    {
        // Trail surrogate without lead surrogate.
        // Without this check the unpaired surrogate would be passed on as if it were a valid code-point,
        // which the safe UTF-8 and UTF-32 decoders both reject.
        return cDecodeErrorBit | cDecodeSurrogateBit;
    }
    const uint32 result = Decode<eEncoding_UTF16, false>(state, unit);
    const bool bValid = (result & 0xFFFFFFFEU) != 0xFFFEU;
    return bValid ? result : result + cDecodeErrorBit + cDecodeInvalidBit;
}
// Decode UTF-32, unsafe.
// Every code-unit already is a code-point, so the unit is returned unchanged and no state is used.
template<>
inline uint32 Decode<eEncoding_UTF32, false>(uint32 /*state*/, uint32 unit)
{
    return unit;
}
// Decode UTF-32, safe.
// Rejects values above cCodepointMax, surrogate values, and values whose low 16 bits are 0xFFFE/0xFFFF.
// NOTE(review): the last check masks with 0xFFFE and therefore also rejects e.g. U+1FFFE, whereas the
// UTF-8 and UTF-16 decoders only reject U+FFFE/U+FFFF - confirm which of the two is intended.
template<>
inline uint32 Decode<eEncoding_UTF32, true>(uint32 /*state*/, uint32 unit)
{
    if (unit > cCodepointMax)
    {
        return cDecodeErrorBit;
    }
    const bool bSurrogate = (unit >= cLeadSurrogateFirst) && (unit <= cTrailSurrogateLast);
    const bool bInvalid = (unit & 0xFFFEU) == 0xFFFEU;
    if (bSurrogate)
    {
        return cDecodeErrorBit | cDecodeSurrogateBit;
    }
    if (bInvalid)
    {
        return cDecodeErrorBit | cDecodeInvalidBit;
    }
    return unit;
}
// Decode ASCII, unsafe.
// The unit is returned unchanged (no range check is performed) and no state is used.
template<>
inline uint32 Decode<eEncoding_ASCII, false>(uint32 /*state*/, uint32 unit)
{
    return unit;
}
// Decode ASCII, safe.
// Units above 0x7F are reported as an error, everything else is returned unchanged.
template<>
inline uint32 Decode<eEncoding_ASCII, true>(uint32 /*state*/, uint32 unit)
{
    const bool bInRange = (unit <= 0x7F);
    if (bInRange)
    {
        return unit;
    }
    return cDecodeErrorBit;
}
// Decode Latin1, unsafe.
// The unit is returned unchanged (no range check is performed) and no state is used.
template<>
inline uint32 Decode<eEncoding_Latin1, false>(uint32 /*state*/, uint32 unit)
{
    return unit;
}
// Decode Latin1, safe.
// Units in [0x80, 0x9F] and units above 0xFF are reported as an error, everything else is returned unchanged.
template<>
inline uint32 Decode<eEncoding_Latin1, true>(uint32 /*state*/, uint32 unit)
{
    if (unit > 0xFF)
    {
        return cDecodeErrorBit;
    }
    if (unit >= 0x80 && unit <= 0x9F)
    {
        return cDecodeErrorBit;
    }
    return unit;
}
// Decode Windows CP-1252, unsafe.
// Units in [0x80, 0x9F] are mapped through a lookup table, all other units are returned unchanged.
// Table entries equal to their own index value (0x81, 0x8D, 0x8F, 0x90, 0x9D) have no distinct mapping.
template<>
inline uint32 Decode<eEncoding_Win1252, false>(uint32 /*state*/, uint32 unit)
{
    static const uint16 cp1252[] =
    {
        0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
        0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
    };
    if (unit >= 0x80 && unit <= 0x9F)
    {
        return cp1252[unit - 0x80];
    }
    return unit;
}
// Decode Windows CP-1252, safe.
// Units above 0xFF are reported as an error. Units in [0x80, 0x9F] that the unsafe decoder
// maps onto themselves are not defined in the codepage and are reported as an error as well.
template<>
inline uint32 Decode<eEncoding_Win1252, true>(uint32 state, uint32 unit)
{
    if (unit > 0xFF)
    {
        return cDecodeErrorBit;
    }
    const uint32 mapped = Decode<eEncoding_Win1252, false>(state, unit);
    const bool bInTableRange = (unit >= 0x80) && (unit <= 0x9F);
    if (bInTableRange && (mapped == unit))
    {
        return cDecodeErrorBit; // Not defined in codepage 1252.
    }
    return mapped;
}
// SBase<T, Tag>:
// Holds a T and exposes it through GetBase().
// For non-reference T the object derives from T (so an empty T can take no space);
// for reference types T& the reference is stored as a member instead.
// The Tag parameter allows the same T to be used as a base more than once.
template<typename T, int Tag = 0>
struct SBase
    : T
{
    // Copies 'base' into the T sub-object.
    SBase(T base)
        : T(base) {}

    T& GetBase() { return static_cast<T&>(*this); }
    const T& GetBase() const { return static_cast<const T&>(*this); }
};

// Specialization for reference types: refers to the original object, no copy is made.
template<typename T, int Tag>
struct SBase<T&, Tag>
{
    T& base;

    SBase(T& b)
        : base(b) {}

    T& GetBase() { return base; }
    const T& GetBase() const { return base; }
};
// SDecoder<Encoding, Sink, Recovery>:
// Functor to decode UCS code-points from an input range.
// Feed it one code-unit at a time through operator(); decoded code-points are passed to the Sink.
// Recovery functor will be invoked as a fall-back if decoding fails.
// This allows ensuring all the output is valid (even if the input isn't).
// Note: The destructor will automatically flush any remaining (erroneous) state, you can also call Finalize().
template<EEncoding InputEncoding, typename Sink, typename Recovery = void>
struct SDecoder
    : SBase<Sink, 1>
    , SBase<Recovery, 2>
{
    uint32 state; // Zero between sequences, otherwise the intermediate state returned by Detail::Decode.

    SDecoder(Sink sink, Recovery recovery = Recovery())
        : SBase<Sink, 1>(sink)
        , SBase<Recovery, 2>(recovery)
        , state(0) {}

    // Flushes an incomplete sequence that is still pending at the end of the input.
    // (This was previously declared as a constructor 'SDecoder()', so the flush never ran.)
    ~SDecoder() { Finalize(); }

    Recovery& recovery() { return SBase<Recovery, 2>::GetBase(); }
    Sink& sink() { return SBase<Sink, 1>::GetBase(); }

    // Consumes one code-unit.
    void operator()(uint32 unit)
    {
        state = Detail::Decode<InputEncoding, true>(state, unit);
        if (state <= 0x1FFFFF)
        {
            // A complete code-point was decoded.
            sink()(state);
            state = 0;
        }
        else if (state & Detail::cDecodeErrorBit)
        {
            // Bad encoding; let the recovery functor decide what to emit.
            recovery()(sink(), state, unit);
            state = 0;
        }
        // Otherwise: intermediate state, wait for the next code-unit.
    }

    // Reports a pending (incomplete) sequence to the recovery functor and resets the state.
    void Finalize()
    {
        if (state)
        {
            recovery()(sink(), state, 0);
            state = 0;
        }
    }
};
// SDecoder<Encoding, Sink>:
// Functor to decode to UCS code-points from an input range, using the unsafe decoders.
// No attempt to discover or recover from encoding errors is made, can only safely be used with known-valid input.
template<EEncoding InputEncoding, typename Sink>
struct SDecoder<InputEncoding, Sink, void>
    : SBase<Sink>
{
    uint32 state; // Zero between sequences, otherwise the intermediate decoder state.

    SDecoder(Sink sink)
        : SBase<Sink>(sink)
        , state(0) {}

    Sink& sink() { return SBase<Sink>::GetBase(); }

    // Consumes one code-unit; emits a code-point to the sink once a sequence is complete.
    void operator()(uint32 unit)
    {
        state = Detail::Decode<InputEncoding, false>(state, unit);
        if (state > 0x1FFFFF)
        {
            return; // Intermediate state, more code-units are needed.
        }
        sink()(state);
        state = 0;
    }

    // Nothing to flush, present for interface parity with the recovering decoder.
    void Finalize() {}
};
// SEncoder<Encoding, Sink>:
// Generic Unicode encoder functor.
// Encoding must be an encoding type for which output is supported.
// The Sink type must have HintSequence member for UTF-8 and UTF-16 (although it may be a no-op).
// In general, you feed operator() with UCS code-points and it will emit code-units.
// This primary template has no operator(); its 'value' of false marks the encoding as not supported
// for output, while the specializations for supported encodings set 'value' to true.
template<EEncoding OutputEncoding, typename Sink>
struct SEncoder
{
static const bool value = false;
};
// SEncoder<eEncoding_ASCII, Sink>:
// Specialization of ASCII encoder functor.
// Emits exactly one 8-bit code-unit per code-point.
// Note: Any out-of-range character is mapped to question mark.
template<typename Sink>
struct SEncoder<eEncoding_ASCII, Sink>
    : SBase<Sink>
{
    static const bool value = true;
    typedef uint8 value_type;

    SEncoder(Sink sink)
        : SBase<Sink>(sink) {}

    void operator()(uint32 cp)
    {
        if (cp >= 0x80)
        {
            cp = (uint32)'?';
        }
        SBase<Sink>::GetBase()(value_type(cp));
    }
};
// SEncoder<eEncoding_UTF8, Sink>:
// Specialization of UTF-8 encoder functor.
// Code-points below 0x80 are emitted as a single code-unit; everything else is emitted as a
// 2, 3 or 4 code-unit sequence that is announced to the sink through HintSequence() first.
template<typename Sink>
struct SEncoder<eEncoding_UTF8, Sink>
    : SBase<Sink>
{
    static const bool value = true;
    typedef uint8 value_type;

    SEncoder(Sink sink)
        : SBase<Sink>(sink) {}

    Sink& sink() { return SBase<Sink>::GetBase(); }

    void operator()(uint32 cp)
    {
        if (cp < 0x80)
        {
            // Single byte sequence.
            sink()(value_type(cp));
            return;
        }

        // Type of sequence.
        const bool bSeq3 = (cp >= 0x800);
        const bool bSeq4 = (cp >= 0x10000);

        // Lead-byte and continuation-byte markers, one byte per 8 bits.
        // The initial value fits a 2-byte sequence and is adjusted for 3 and 4 byte sequences.
        uint32 mask = 0xEFE0C080;
        mask ^= (bSeq3 << 14);
        mask += (bSeq4 ? 0xA00000 : 0);

        // Expand 21-bit value to 32-bit (6 payload bits per byte), then add the markers.
        const uint32 payload =
            (cp & 0x00003F) +
            ((cp & 0x000FC0) << 2) +
            ((cp & 0x03F000) << 4) +
            ((cp & 0x1C0000) << 6);
        const uint32 bits = payload | mask;

        // NOTE(review): this evaluates to 1, 2 or 3 for sequences of 2, 3 or 4 code-units,
        // i.e. one less than the number of code-units emitted below - confirm the intended
        // contract of HintSequence() before changing either side.
        const uint32 length = (uint32)bSeq4 + (uint32)bSeq3 + 1;
        sink().HintSequence(length);

        // Sink the multi-byte sequence, most significant byte first.
        if (bSeq4)
        {
            sink()(value_type(bits >> 24));
        }
        if (bSeq3)
        {
            sink()(value_type(bits >> 16));
        }
        sink()(value_type(bits >> 8));
        sink()(value_type(bits));
    }
};
// SEncoder<eEncoding_UTF16, Sink>:
// Specialization of UTF-16 encoder functor.
// Code-points below 0x10000 are emitted as a single code-unit, all others as a surrogate pair
// that is announced to the sink through HintSequence(2) first.
template<typename Sink>
struct SEncoder<eEncoding_UTF16, Sink>
    : SBase<Sink>
{
    static const bool value = true;
    typedef uint16 value_type;

    SEncoder(Sink sink)
        : SBase<Sink>(sink) {}

    Sink& sink() { return SBase<Sink>::GetBase(); }

    void operator()(uint32 cp)
    {
        if (cp < 0x10000)
        {
            // Single unit
            sink()(value_type(cp));
            return;
        }

        // We will generate two-element sequence
        sink().HintSequence(2);

        // Surrogate pair: 10 bits in the lead surrogate, 10 bits in the trail surrogate.
        const uint32 offset = cp - 0x10000;
        const uint32 lead = Detail::cLeadSurrogateFirst + ((offset >> 10) & 0x3FF);
        const uint32 trail = Detail::cTrailSurrogateFirst + (offset & 0x3FF);
        sink()(value_type(lead));
        sink()(value_type(trail));
    }
};
// SEncoder<eEncoding_UTF32, Sink>:
// Specialization of UTF-32 encoder functor.
// Note: This is a no-op (the code-point is forwarded as a single 32-bit code-unit),
// but we want to be able to express UTF-32 just like the other encodings.
template<typename Sink>
struct SEncoder<eEncoding_UTF32, Sink>
    : SBase<Sink>
{
    static const bool value = true;
    typedef uint32 value_type;

    SEncoder(Sink sink)
        : SBase<Sink>(sink) {}

    void operator()(uint32 cp)
    {
        Sink& target = SBase<Sink>::GetBase();
        target(value_type(cp));
    }
};
// SDecoder<Encoding, SEncoder<Encoding>, void>:
// Specialization for unsafe trans-coding where input and output encoding are the same.
// Every code-unit is forwarded to the sink unchanged, so no state or computation is needed.
// Note: For a decoding with a fallback, this is not possible since we can't guarantee the input is valid.
template<EEncoding SameEncoding, typename Sink>
struct SDecoder<SameEncoding, SEncoder<SameEncoding, Sink>, void>
{
    Sink sink; // Receives the code-units directly (the encoder is bypassed).

    SDecoder(Sink s)
        : sink(s)
    {
    }

    // Forwards one code-unit.
    void operator()(uint32 unit)
    {
        sink(unit);
    }

    // Nothing to flush.
    void Finalize()
    {
    }
};
// SRecoveryDiscard<Sink>:
// Recovery handler that, on encoding error, discards the offending sequence.
template<typename Sink>
struct SRecoveryDiscard
{
SRecoveryDiscard() {}
void operator()([[maybe_unused]] Sink& sink, [[maybe_unused]] uint32 error, [[maybe_unused]] uint32 unit) {}
};
// SRecoveryReplace<Sink>:
// Recovery handler that, on encoding error, replaces the sequence with replacement-character (U+FFFD).
// Note: This implementation matches a whole invalid sequence, it could be changed to emit for every code-unit.
template<typename Sink>
struct SRecoveryReplace
{
SRecoveryReplace() {}
void operator()(Sink& sink, uint32 error, uint32 unit) { sink(cReplacementCharacter); }
};
// SRecoveryFallback<Sink, FallbackEncoding, NextFallback>:
// Recovery handler that, on encoding error, falls back to another encoding.
// The bytes of the failed sequence are decoded again, one by one, using FallbackEncoding;
// bytes that are not valid in the fallback encoding either are handed to NextFallback.
// The fallback encoding must be stateless (ie: ASCII, Latin1 or Win1252).
// This type assumes an 8-bit primary encoding since the only viable fallback encodings are 8-bit.
template<typename Sink, EEncoding FallbackEncoding, typename NextFallback>
struct SRecoveryFallback
    : NextFallback
{
    SRecoveryFallback()
        : NextFallback() {}

    // 'error' carries up to three previously consumed bytes in its low 24 bits (oldest byte highest),
    // 'unit' is the byte that triggered the error.
    void operator()(Sink& sink, uint32 error, uint32 unit)
    {
        SDecoder<FallbackEncoding, Sink&, NextFallback&> fallback(sink, *static_cast<NextFallback*>(this));

        const uint8 consumed[3] = { uint8(error >> 16), uint8(error >> 8), uint8(error) };

        // Skip leading zero bytes; once a non-zero byte was seen, all following bytes are replayed.
        bool bStarted = false;
        for (int i = 0; i < 3; ++i)
        {
            bStarted = bStarted || (consumed[i] != 0);
            if (bStarted)
            {
                fallback(consumed[i]);
            }
        }

        // The offending unit is always replayed.
        fallback(uint8(unit));
    }
};
// SRecoveryFallbackHelper<Sink, RecoveryMethod>:
// Helper to pick a SRecoveryFallback instantiation based on RecoveryMethod.
template<EEncoding OutputEncoding, typename Sink, EErrorRecovery RecoveryMethod>
struct SRecoveryFallbackHelper
{
// A compilation error here means RecoveryMethod value was unexpected here
COMPILE_TIME_ASSERT(
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard ||
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenReplace ||
RecoveryMethod == eErrorRecovery_FallbackWin1252ThenDiscard ||
RecoveryMethod == eErrorRecovery_FallbackWin1252ThenReplace);
typedef SEncoder<OutputEncoding, Sink> SinkType;
static const EEncoding FallbackEncoding =
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard ||
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenReplace
? eEncoding_Latin1 : eEncoding_Win1252;
template<typename Dummy, bool WithDiscard>
struct Pick
{
typedef SRecoveryDiscard<SinkType> type;
};
template<typename Dummy>
struct Pick<Dummy, false>
{
typedef SRecoveryReplace<SinkType> type;
};
typedef typename Pick<Sink,
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard ||
RecoveryMethod == eErrorRecovery_FallbackWin1252ThenDiscard>::type NextFallback;
typedef SRecoveryFallback<SinkType, FallbackEncoding, NextFallback> RecoveryType;
typedef SDecoder<eEncoding_UTF8, SinkType, RecoveryType> FullType;
};
// STranscoderSelect<InputEncoding, OutputEncoding, Sink, RecoveryMethod>:
// Derives a chained decoder/encoder pair that performs code-unit -> code-unit transform.
// The RecoveryMethod template parameter determines what happens when invalid input is encountered while decoding.
// This is the basic way to perform trans-coding, and is the type instantiated by the higher-level functions.
// Only the specializations on RecoveryMethod are defined; the Fallback* methods are only specialized
// for eEncoding_UTF8 input.
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink, EErrorRecovery RecoveryMethod>
struct STranscoderSelect;
// STranscoderSelect<..., eErrorRecovery_None>:
// Trans-coder without error detection; uses the unsafe decoder.
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink>
struct STranscoderSelect<InputEncoding, OutputEncoding, Sink, eErrorRecovery_None>
    : SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, void>
{
    using TranscoderType = SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, void>;

    STranscoderSelect(Sink sink)
        : TranscoderType(sink)
    {
    }
};
// STranscoderSelect<..., eErrorRecovery_Discard>:
// Trans-coder that drops invalidly encoded sequences from the output.
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink>
struct STranscoderSelect<InputEncoding, OutputEncoding, Sink, eErrorRecovery_Discard>
    : SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, SRecoveryDiscard<SEncoder<OutputEncoding, Sink> > >
{
    using RecoveryType = SRecoveryDiscard<SEncoder<OutputEncoding, Sink> >;
    using TranscoderType = SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, RecoveryType>;

    STranscoderSelect(Sink sink)
        : TranscoderType(sink)
    {
    }
};
// STranscoderSelect<..., eErrorRecovery_Replace>:
// Trans-coder that emits the replacement-character (U+FFFD) for invalidly encoded sequences.
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink>
struct STranscoderSelect<InputEncoding, OutputEncoding, Sink, eErrorRecovery_Replace>
    : SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, SRecoveryReplace<SEncoder<OutputEncoding, Sink> > >
{
    using RecoveryType = SRecoveryReplace<SEncoder<OutputEncoding, Sink> >;
    using TranscoderType = SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, RecoveryType>;

    STranscoderSelect(Sink sink)
        : TranscoderType(sink)
    {
    }
};
// STranscoderSelect<eEncoding_UTF8, ..., eErrorRecovery_FallbackLatin1ThenDiscard>:
// UTF-8 trans-coder that retries invalid bytes as Latin1 and discards what is not valid Latin1 either.
template<EEncoding OutputEncoding, typename Sink>
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenDiscard>
    : SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenDiscard>::FullType
{
    static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackLatin1ThenDiscard;
    using RecoveryType = typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType;
    using TranscoderType = typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType;

    STranscoderSelect(Sink sink)
        : TranscoderType(sink)
    {
    }
};
// STranscoderSelect<eEncoding_UTF8, ..., eErrorRecovery_FallbackLatin1ThenReplace>:
// UTF-8 trans-coder that retries invalid bytes as Latin1 and emits U+FFFD for what is not valid Latin1 either.
template<EEncoding OutputEncoding, typename Sink>
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenReplace>
    : SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenReplace>::FullType
{
    static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackLatin1ThenReplace;
    using RecoveryType = typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType;
    using TranscoderType = typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType;

    STranscoderSelect(Sink sink)
        : TranscoderType(sink)
    {
    }
};
// STranscoderSelect<eEncoding_UTF8, ..., eErrorRecovery_FallbackWin1252ThenDiscard>:
// UTF-8 trans-coder that retries invalid bytes as codepage 1252 and discards what is not valid there either.
template<EEncoding OutputEncoding, typename Sink>
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenDiscard>
    : SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenDiscard>::FullType
{
    static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackWin1252ThenDiscard;
    using RecoveryType = typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType;
    using TranscoderType = typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType;

    STranscoderSelect(Sink sink)
        : TranscoderType(sink)
    {
    }
};
// STranscoderSelect<eEncoding_UTF8, ..., eErrorRecovery_FallbackWin1252ThenReplace>:
// UTF-8 trans-coder that retries invalid bytes as codepage 1252 and emits U+FFFD for what is not valid there either.
template<EEncoding OutputEncoding, typename Sink>
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenReplace>
    : SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenReplace>::FullType
{
    static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackWin1252ThenReplace;
    using RecoveryType = typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType;
    using TranscoderType = typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType;

    STranscoderSelect(Sink sink)
        : TranscoderType(sink)
    {
    }
};
// SIsSafeEncoding<R>:
// Check if the given recovery mode is safe, i.e. whether it is one of the modes that detect invalid input.
// This is used for SFINAE checks in higher-level functions.
template<EErrorRecovery R>
struct SIsSafeEncoding
{
    static const bool value =
        // Plain recovery methods.
        (R == eErrorRecovery_Discard || R == eErrorRecovery_Replace) ||
        // Fallback to Latin1 first.
        (R == eErrorRecovery_FallbackLatin1ThenDiscard || R == eErrorRecovery_FallbackLatin1ThenReplace) ||
        // Fallback to codepage 1252 first.
        (R == eErrorRecovery_FallbackWin1252ThenDiscard || R == eErrorRecovery_FallbackWin1252ThenReplace);
};
// SIsCopyableEncoding<I, O>:
// Check if data in one encoding can be copied directly to another encoding.
// This is the basis for block-copy and string-assign optimizations in un-safe conversion functions.
// Note: There are more valid combinations, they are left out since those can't occur with the output encodings supported.
// Note: Only used for un-safe functions since it doesn't account for potential invalid sequences (they would be copied over).
template<EEncoding InputEncoding, EEncoding OutputEncoding>
struct SIsCopyableEncoding
{
static const bool value =
InputEncoding == eEncoding_ASCII || // ASCII and Latin1 values don't change in any encoding.
(InputEncoding == eEncoding_Latin1 && OutputEncoding != eEncoding_ASCII); // Except Latin1 -> ASCII is lossy.
};
// SIsCopyableEncoding<SameEncoding, SameEncoding>:
// Specialization for identical input and output encoding.
template<EEncoding SameEncoding>
struct SIsCopyableEncoding<SameEncoding, SameEncoding>
{
static const bool value = true; // If the input and output encodings are the same, then it's copyable.
};
}
}

File diff suppressed because it is too large Load Diff

@ -1,615 +0,0 @@
/*
* Copyright (c) Contributors to the Open 3D Engine Project.
* For complete copyright and license terms please see the LICENSE at the root of this distribution.
*
* SPDX-License-Identifier: Apache-2.0 OR MIT
*
*/
// Description : Encoded Unicode sequence iteration.
//
// For lower level accessing of encoded text, an STL compatible iterator wrapper is provided.
// This iterator will decode the underlying sequence, abstracting it to a sequence of UCS code-points.
// Using the iterator wrapper, you can find where in an encoded string code-points (or encoding errors) are located.
// Note: The iterator is an input-only iterator, you cannot write to the underlying sequence.
#pragma once
#include "UnicodeBinding.h"
namespace Unicode
{
namespace Detail
{
// MoveNext(it, checker, tag):
// Advances the base iterator by exactly one UCS code-point.
// Generic overload, only valid for encodings that map one code-unit to one code-point.
template<typename BaseIterator, typename BoundsChecker, EEncoding Encoding>
inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, const integral_constant<EEncoding, Encoding>)
{
    // Multi-unit encodings (UTF-8, UTF-16) are handled by dedicated overloads; reject anything else at compile-time.
    COMPILE_TIME_ASSERT(
        Encoding == eEncoding_ASCII ||
        Encoding == eEncoding_UTF32 ||
        Encoding == eEncoding_Latin1 ||
        Encoding == eEncoding_Win1252);

    assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");

    // One code-unit is one code-point, so a single step is sufficient.
    ++it;
}
// MoveNext(it, checker, tag):
// Moves the iterator to the next UCS code-point in the encoded sequence.
// Specialized for UTF-8.
template<typename BaseIterator, typename BoundsChecker>
inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF8>)
{
assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");
// UTF-8: just need to skip up to 3 continuation bytes.
for (int i = 0; i < 4; ++i)
{
++it;
if (checker.IsEnd(it)) // :WARN: always returns false if "safe" bool is false!
{
break;
}
uint32 val = static_cast<uint32>(*it);
if ((val & 0xC0) != 0x80)
{
break;
}
}
}
// MoveNext(it, checker, tag):
// Moves the iterator to the next UCS code-point in the encoded sequence.
// Specialized for UTF-16.
template<typename BaseIterator, typename BoundsChecker>
inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF16>)
{
assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");
// UTF-16: just need to skip one lead surrogate.
++it;
uint32 val = static_cast<uint32>(*it);
if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast)
{
if (!checker.IsEnd(it))
{
++it;
}
}
}
// MovePrev(it, checker, tag):
// Moves the base iterator back by exactly one UCS code-point.
// Generic overload, only valid for encodings that map one code-unit to one code-point.
template<typename BaseIterator, typename BoundsChecker, EEncoding Encoding>
inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, const integral_constant<EEncoding, Encoding>)
{
    // Multi-unit encodings (UTF-8, UTF-16) are handled by dedicated overloads; reject anything else at compile-time.
    COMPILE_TIME_ASSERT(
        Encoding == eEncoding_ASCII ||
        Encoding == eEncoding_UTF32 ||
        Encoding == eEncoding_Latin1 ||
        Encoding == eEncoding_Win1252);

    assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");

    // One code-unit is one code-point, so a single step back is sufficient.
    --it;
}
// MovePrev(it, checker, tag):
// Moves the iterator to the previous UCS code-point in the encoded sequence.
// Specialized for UTF-8.
template<typename BaseIterator, typename BoundsChecker>
inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF8>)
{
assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");
// UTF-8: just need to skip up to 3 continuation bytes.
for (int i = 0; i < 4; ++i)
{
--it;
if (checker.IsBegin(it))
{
break;
}
uint32 val = static_cast<uint32>(*it);
if ((val & 0xC0) != 0x80)
{
break;
}
}
}
// MovePrev(it, checker, tag):
// Moves the iterator to the previous UCS code-point in the encoded sequence.
// Specialized for UTF-16.
template<typename BaseIterator, typename BoundsChecker>
inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF16>)
{
assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");
// UTF-16: just need to skip one lead surrogate.
--it;
uint32 val = static_cast<uint32>(*it);
if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast)
{
if (!checker.IsBegin(it))
{
--it;
}
}
}
// SBaseIterators<BaseIterator, BoundsChecked>:
// Utility to access base iterators properties from CIterator.
// This is the bounds-checked version, the range information is kept to defend against malformed sequences.
template<typename BaseIterator, bool BoundsChecked>
struct SBaseIterators
{
    typedef BaseIterator type;

    // The range [begin, end) of the underlying encoded sequence.
    type begin, end;

    // The current position inside the range (or equal to end).
    type it;

    // Construct for the given range, the current position starts at the beginning of the range.
    SBaseIterators(const BaseIterator& _begin, const BaseIterator& _end)
        : begin(_begin)
        , end(_end)
        , it(_begin) {}

    SBaseIterators(const SBaseIterators& other)
        : begin(other.begin)
        , end(other.end)
        , it(other.it) {}

    SBaseIterators& operator =(const SBaseIterators& other)
    {
        begin = other.begin;
        end = other.end;
        it = other.it;
        return *this;
    }

    // Test if the given position is the beginning of the range.
    bool IsBegin(const BaseIterator& _it) const
    {
        return begin == _it;
    }

    // Test if the given position is the (past-the-)end of the range.
    bool IsEnd(const BaseIterator& _it) const
    {
        return end == _it;
    }

    // Test if both the current position and the range are the same as those of the other instance.
    bool IsEqual(const SBaseIterators& other) const
    {
        return it == other.it
               && begin == other.begin
               && end == other.end;
    }

    // Note: Only called inside assert.
    // O(N) version; works with any forward-iterator (or better)
    // The past-the-end position counts as in range: it is a valid (non-dereferenceable) iterator position.
    bool IsInRange(const BaseIterator& _it, std::forward_iterator_tag) const
    {
        for (BaseIterator i = begin; i != end; ++i)
        {
            if (_it == i)
            {
                return true;
            }
        }
        return _it == end;
    }

    // Note: Only called inside assert.
    // O(1) version; requires random-access-iterator.
    // The past-the-end position counts as in range: it is a valid (non-dereferenceable) iterator position.
    bool IsInRange(const BaseIterator& _it, std::random_access_iterator_tag) const
    {
        return (begin <= _it && _it <= end);
    }

    // Note: Only called inside assert.
    // Dispatches to the O(1) version if a random-access iterator is used (common case).
    bool IsInRange(const BaseIterator& _it) const
    {
        return IsInRange(_it, typename std::iterator_traits<BaseIterator>::iterator_category());
    }
};
// SBaseIterators<BaseIterator, false>:
// Utility to access base iterators properties from CIterator.
// Un-checked variant for sequences that are known to be safe: only the current position is stored.
// Since no range is kept, none of the range queries can actually detect the bounds.
template<typename BaseIterator>
struct SBaseIterators<BaseIterator, false>
{
    typedef BaseIterator type;

    // The current position in the underlying sequence.
    type it;

    // Construct from just a starting position.
    explicit SBaseIterators(const BaseIterator& first)
        : it(first)
    {
    }

    // Construct from a range; the end of the range is accepted but not stored.
    SBaseIterators(const BaseIterator& first, const BaseIterator&)
        : it(first)
    {
    }

    SBaseIterators(const SBaseIterators& rhs)
        : it(rhs.it)
    {
    }

    SBaseIterators& operator =(const SBaseIterators& rhs)
    {
        it = rhs.it;
        return *this;
    }

    // The beginning is unknown, so no position is ever reported as the beginning.
    bool IsBegin(const BaseIterator&) const
    {
        return false;
    }

    // The end is unknown, so no position is ever reported as the end.
    bool IsEnd(const BaseIterator&) const
    {
        return false;
    }

    // Only the current positions can be compared.
    bool IsEqual(const SBaseIterators& rhs) const
    {
        return it == rhs.it;
    }

    // Without range information every position is accepted.
    bool IsInRange(const BaseIterator&) const
    {
        return true;
    }
};
// SIteratorSink<Safe>:
// Helper to store the last code-point and error bit that was decoded.
// This is the safe version for potentially malformed sequences.
template<bool Safe>
struct SIteratorSink
{
    // Sentinel stored in value while nothing has been decoded for the current position.
    static const uint32 cEmpty = 0xFFFFFFFFU;

    // The cached code-point (or cEmpty).
    uint32 value;

    // Set if decoding at the current position failed.
    bool error;

    // Drop the cached code-point and error bit.
    void Clear()
    {
        value = cEmpty;
        error = false;
    }

    // Test if nothing is cached.
    bool IsEmpty() const
    {
        return value == cEmpty;
    }

    // Test if the last decode failed.
    bool IsError() const
    {
        return error;
    }

    // Get the cached code-point (cReplacementCharacter after a decoding error).
    const uint32& GetValue() const
    {
        return value;
    }

    // Record a decoding error; the replacement character becomes the cached value.
    void MarkDecodingError()
    {
        value = cReplacementCharacter;
        error = true;
    }

    // Decode the code-point starting at the current position of the given base iterators.
    // Code-units are fed to the decoder until it reports a value or an error, or until the end of the range is reached.
    template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
    void Decode(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding>)
    {
        // This object is passed to the decoder twice; the operator() overloads below receive its callbacks.
        typedef SDecoder<Encoding, SIteratorSink&, SIteratorSink&> DecoderType;
        DecoderType decoder(*this, *this);
        Clear();
        for (BaseIterator it = its.it; IsEmpty(); ++it)
        {
            // The bounds must be checked *before* dereferencing.
            // Otherwise a truncated multi-unit sequence at the end of the range would read past the end.
            if (its.IsEnd(it))
            {
                break;
            }
            uint32 val = static_cast<uint32>(*it);
            decoder(val);
        }
        if (IsEmpty())
        {
            // If we still have neither a new value or an error flag, just treat as error.
            // This can happen if we reached the end of the sequence, but it ends in an incomplete code-sequence.
            MarkDecodingError();
        }
    }

    // Decode only if nothing is cached for the current position yet.
    template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
    void DecodeIfEmpty(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding> tag)
    {
        if (IsEmpty())
        {
            Decode(its, tag);
        }
    }

    // Stores the given value as the cached code-point.
    // NOTE(review): presumably invoked by SDecoder with the decoded code-point - confirm against SDecoder.
    void operator()(uint32 unit)
    {
        value = unit;
    }

    // Marks a decoding error, all arguments are ignored.
    // NOTE(review): presumably invoked by SDecoder as its error-recovery callback - confirm against SDecoder.
    void operator()(SIteratorSink&, uint32, uint32)
    {
        MarkDecodingError();
    }
};
// SIteratorSink<Safe>:
// Helper to store the last code-point that was decoded.
// This is the un-safe specialization for known-valid sequences.
// Note: No error-state is tracked since we won't handle that regardless for un-safe CIterator.
template<>
struct SIteratorSink<false>
{
static const uint32 cEmpty = 0xFFFFFFFFU;
uint32 value;
void Clear()
{
value = cEmpty;
}
bool IsEmpty() const
{
return value == cEmpty;
}
bool IsError() const
{
return false;
}
const uint32& GetValue() const
{
return value;
}
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
void Decode(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding>)
{
typedef SDecoder<Encoding, SIteratorSink&, void> DecoderType;
DecoderType decoder(*this);
for (BaseIterator it = its.it; IsEmpty(); ++it)
{
uint32 val = static_cast<uint32>(*it);
decoder(val);
}
}
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
void DecodeIfEmpty(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding> tag)
{
if (IsEmpty())
{
Decode(its, tag);
}
}
void operator()(uint32 unit)
{
value = unit;
}
};
}
// CIterator<BaseIterator [, Safe, Encoding]>:
// Helper class that can iterate over an encoded text sequence and read the underlying UCS code-points.
// If the Safe flag is set, bounds checking is performed inside multi-unit sequences to guard against decoding errors.
// This requires the user to know where the sequence ends (use the constructor taking two parameters).
// Note: The BaseIterator must be forward-iterator or better when Safe flag is set.
// If the Safe flag is not set, you must guarantee the sequence is validly encoded, and allows the use of the single argument constructor.
// In the case of unsafe iterator used for C-style string pointer, look for a U+0000 dereferenced value to end the iteration.
// Regardless of the Safe flag, the user must ensure that the iterator is never moved past the beginning or end of the range (just like any other STL iterator).
// Example of typical usage:
// string utf8 = "foo"; // UTF-8
// for (Unicode::CIterator<string::const_iterator> it(utf8.begin(), utf8.end()); it != utf8.end(); ++it)
// {
// uint32 codepoint = *it; // 32-bit UCS code-point
// }
// Example unsafe usage: (for known-valid encoded C-style strings):
// const char *pValid = "foo"; // UTF-8
// for (Unicode::CIterator<const char *, false> it = pValid; *it != 0; ++it)
// {
// uint32 codepoint = *it; // 32-bit UCS code-point
// }
template<typename BaseIterator, bool Safe = true, EEncoding Encoding = Detail::SInferEncoding<BaseIterator, true>::value>
class CIterator
{
    // The iterator value in the encoded sequence.
    // Optionally provides bounds-checking.
    Detail::SBaseIterators<BaseIterator, Safe> its;

    // The cached UCS code-point at the current position.
    // Mutable because dereferencing is conceptually const, but does cache some state in this case.
    mutable Detail::SIteratorSink<Safe> sink;

public:
    // Types for compatibility with STL bidirectional iterator requirements.
    typedef const uint32 value_type;
    typedef const uint32& reference;
    typedef const uint32* pointer;
    typedef const ptrdiff_t difference_type;
    typedef std::bidirectional_iterator_tag iterator_category;

    // Construct an iterator for the given range.
    // The initial position of the iterator is at the beginning of the range.
    CIterator(const BaseIterator& begin, const BaseIterator& end)
        : its(begin, end)
    {
        sink.Clear();
    }

    // Construct an iterator from a single iterator (typically C-style string pointer).
    // This can only be used for unsafe iterators.
    template<typename IteratorType>
    CIterator(const IteratorType& it, typename Detail::SRequire<!Safe && Detail::is_convertible<IteratorType, BaseIterator>::value, IteratorType>::type* = 0)
        : its(static_cast<const BaseIterator&>(it))
    {
        sink.Clear();
    }

    // Copy-construct an iterator.
    CIterator(const CIterator& other)
        : its(other.its)
        , sink(other.sink) {}

    // Copy-assign an iterator.
    CIterator& operator =(const CIterator& other)
    {
        its = other.its;
        sink = other.sink;
        return *this;
    }

    // Test if the iterator points at a validly encoded code-point in the underlying encoded sequence.
    // If the iterator points at an encoding error instead, the function returns false.
    // When using an un-safe iterator, this function always returns true, if a sequence can contain encoding errors, you must use the safe variant.
    // Note: This requires the underlying iterator to be dereferenced, so you can use it only while the iterator is inside the valid range.
    bool IsAtValidCodepoint() const
    {
        assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator");
        Detail::integral_constant<EEncoding, Encoding> tag;
        sink.DecodeIfEmpty(its, tag);
        return !sink.IsError();
    }

    // Gets the current position in the underlying encoded sequence.
    // If the iterator points to an invalidly encoded sequence (ie, IsAtValidCodepoint() returns false), the direction of iteration is significant.
    // In that case the returned position is approximated; to work around this: move all iterators of which the position is compared in the same direction.
    const BaseIterator& GetPosition() const
    {
        return its.it;
    }

    // Sets the current position in the underlying encoded sequence.
    // You may not set the position outside the range for which this iterator was constructed.
    void SetPosition(const BaseIterator& it)
    {
        assert(its.IsInRange(it) && "Attempt to set the underlying iterator outside of the supported range");
        its.it = it;

        // The cached code-point belongs to the old position, drop it so the next dereference decodes at the new position.
        sink.Clear();
    }

    // Test if this iterator is equal to another iterator instance.
    // Note: In the presence of an invalidly encoded sequence (ie, IsAtValidCodepoint() returns false), the direction of iteration is significant.
    // To work around this, you can either:
    // 1) Move all iterators that will be compared in the same direction; or
    // 2) Compare the dereferenced iterator value(s) instead (if applicable).
    bool operator ==(const CIterator& other) const
    {
        return its.IsEqual(other.its);
    }

    // Test if this iterator is equal to another base iterator.
    // Note: If the provided iterator does not point to the first code-unit of an UCS code-point, the behavior is undefined.
    bool operator ==(const BaseIterator& other) const
    {
        return its.it == other;
    }

    // Test if this iterator is not equal to another iterator instance.
    // Note: In the presence of an invalidly encoded sequence (ie, IsAtValidCodepoint() returns false), the direction of iteration is significant.
    // To work around this, you can either:
    // 1) Move all iterators that will be compared in the same direction; or
    // 2) Compare the dereferenced iterator value(s) instead (if applicable).
    bool operator !=(const CIterator& other) const
    {
        return !its.IsEqual(other.its);
    }

    // Test if this iterator is not equal to another base iterator.
    // Note: If the provided iterator does not point to the first code-unit of an UCS code-point, the behavior is undefined.
    bool operator !=(const BaseIterator& other) const
    {
        return its.it != other;
    }

    // Get the decoded UCS code-point at the current position in the sequence.
    // If the iterator points to an invalidly encoded sequence (ie, IsAtValidCodepoint() returns false) the function returns U+FFFD (replacement character).
    reference operator *() const
    {
        assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator");
        Detail::integral_constant<EEncoding, Encoding> tag;
        sink.DecodeIfEmpty(its, tag);
        return sink.GetValue();
    }

    // Advance the iterator to the next UCS code-point.
    // Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode.
    // However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors.
    CIterator& operator ++()
    {
        Detail::integral_constant<EEncoding, Encoding> tag;
        Detail::MoveNext(its.it, its, tag);
        sink.Clear();
        return *this;
    }

    // Go back to the previous UCS code-point.
    // Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode.
    // However, in Safe mode, the iterators will never move past the beginning of the sequence in the presence of encoding errors.
    CIterator& operator --()
    {
        Detail::integral_constant<EEncoding, Encoding> tag;
        Detail::MovePrev(its.it, its, tag);
        sink.Clear();
        return *this;
    }

    // Advance the iterator to the next UCS code-point, return a copy of the iterator position before advancing.
    // Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode.
    // However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors.
    CIterator operator ++(int)
    {
        CIterator result = *this;
        ++*this;
        return result;
    }

    // Go back to the previous UCS code-point, return a copy of the iterator position before going back.
    // Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode.
    // However, in Safe mode, the iterators will never move past the beginning of the sequence in the presence of encoding errors.
    CIterator operator --(int)
    {
        CIterator result = *this;
        --*this;
        return result;
    }
};
namespace Detail
{
// SIteratorSpecializer<T>:
// Picks the CIterator instantiation that belongs to a given string type.
// Note: This indirection exists because MSVC doesn't want to deduce this on the MakeIterator declaration.
template<typename StringType>
struct SIteratorSpecializer
{
    // The string's own read-only iterator is used as the base iterator.
    typedef typename StringType::const_iterator base_iterator_type;

    // CIterator over that base iterator, with the default Safe and Encoding arguments.
    typedef CIterator<base_iterator_type> type;
};
}
// MakeIterator(const StringType &str):
// Creates an UCS code-point iterator that covers the whole of the given Unicode string.
// The iterator starts at str.begin() and is bounded by str.end().
// Example usage:
// string utf8 = "foo"; // UTF-8
// auto it = Unicode::MakeIterator(utf8);
// while (it != utf8.end())
// {
// uint32 codepoint = *it; // 32-bit UCS code-point
// ++it;
// }
// Or, in a for-loop:
// for (auto it = Unicode::MakeIterator(utf8); it != utf8.end(); ++it) {}
template<typename StringType>
inline typename Detail::SIteratorSpecializer<StringType>::type MakeIterator(const StringType& str)
{
    typedef typename Detail::SIteratorSpecializer<StringType>::type IteratorType;
    return IteratorType(str.begin(), str.end());
}
}

@ -116,10 +116,6 @@ set(FILES
TimeValue_info.h
TypeInfo_decl.h
TypeInfo_impl.h
UnicodeBinding.h
UnicodeEncoding.h
UnicodeFunctions.h
UnicodeIterator.h
VectorMap.h
VectorSet.h
VertexFormats.h

Loading…
Cancel
Save