You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
o3de/Code/Legacy/CryCommon/UnicodeIterator.h

615 lines
24 KiB
C++

/*
* Copyright (c) Contributors to the Open 3D Engine Project
*
* SPDX-License-Identifier: Apache-2.0 OR MIT
*
*/
// Description : Encoded Unicode sequence iteration.
//
// For lower level accessing of encoded text, an STL compatible iterator wrapper is provided.
// This iterator will decode the underlying sequence, abstracting it to a sequence of UCS code-points.
// Using the iterator wrapper, you can find where in an encoded string code-points (or encoding errors) are located.
// Note: The iterator is an input-only iterator, you cannot write to the underlying sequence.
#pragma once
#include "UnicodeBinding.h"
namespace Unicode
{
namespace Detail
{
// MoveNext(it, checker, tag):
// Advances the base iterator by exactly one UCS code-point.
// Generic overload, used for the encodings in which one code-unit always equals one code-point.
//   it:      base iterator to advance (modified in place).
//   checker: bounds checker; only consulted by the assert.
//   tag:     compile-time encoding selector (value unused).
template<typename BaseIterator, typename BoundsChecker, EEncoding Encoding>
inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, const integral_constant<EEncoding, Encoding>)
{
    // Only fixed-width encodings may end up here; UTF-8 and UTF-16 have dedicated overloads.
    COMPILE_TIME_ASSERT(
        Encoding == eEncoding_Win1252 ||
        Encoding == eEncoding_Latin1 ||
        Encoding == eEncoding_UTF32 ||
        Encoding == eEncoding_ASCII);
    assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");

    // A single code-unit step is a full code-point step for these encodings.
    ++it;
}
// MoveNext(it, checker, tag):
// Moves the iterator to the next UCS code-point in the encoded sequence.
// Specialized for UTF-8.
template<typename BaseIterator, typename BoundsChecker>
inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF8>)
{
assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");
// UTF-8: just need to skip up to 3 continuation bytes.
for (int i = 0; i < 4; ++i)
{
++it;
if (checker.IsEnd(it)) // :WARN: always returns false if "safe" bool is false!
{
break;
}
uint32 val = static_cast<uint32>(*it);
if ((val & 0xC0) != 0x80)
{
break;
}
}
}
// MoveNext(it, checker, tag):
// Moves the iterator to the next UCS code-point in the encoded sequence.
// Specialized for UTF-16.
template<typename BaseIterator, typename BoundsChecker>
inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF16>)
{
assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");
// UTF-16: just need to skip one lead surrogate.
++it;
uint32 val = static_cast<uint32>(*it);
if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast)
{
if (!checker.IsEnd(it))
{
++it;
}
}
}
// MovePrev(it, checker, tag):
// Moves the base iterator back by exactly one UCS code-point.
// Generic overload, used for the encodings in which one code-unit always equals one code-point.
//   it:      base iterator to move (modified in place).
//   checker: bounds checker; only consulted by the assert.
//   tag:     compile-time encoding selector (value unused).
template<typename BaseIterator, typename BoundsChecker, EEncoding Encoding>
inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, const integral_constant<EEncoding, Encoding>)
{
    // Only fixed-width encodings may end up here; UTF-8 and UTF-16 have dedicated overloads.
    COMPILE_TIME_ASSERT(
        Encoding == eEncoding_Win1252 ||
        Encoding == eEncoding_Latin1 ||
        Encoding == eEncoding_UTF32 ||
        Encoding == eEncoding_ASCII);
    assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");

    // A single code-unit step is a full code-point step for these encodings.
    --it;
}
// MovePrev(it, checker, tag):
// Moves the iterator to the previous UCS code-point in the encoded sequence.
// Specialized for UTF-8.
template<typename BaseIterator, typename BoundsChecker>
inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF8>)
{
assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");
// UTF-8: just need to skip up to 3 continuation bytes.
for (int i = 0; i < 4; ++i)
{
--it;
if (checker.IsBegin(it))
{
break;
}
uint32 val = static_cast<uint32>(*it);
if ((val & 0xC0) != 0x80)
{
break;
}
}
}
// MovePrev(it, checker, tag):
// Moves the iterator to the previous UCS code-point in the encoded sequence.
// Specialized for UTF-16.
template<typename BaseIterator, typename BoundsChecker>
inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF16>)
{
assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");
// UTF-16: just need to skip one lead surrogate.
--it;
uint32 val = static_cast<uint32>(*it);
if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast)
{
if (!checker.IsBegin(it))
{
--it;
}
}
}
// SBaseIterators<BaseIterator, BoundsChecked>:
// Bundles the base-iterator state that CIterator needs.
// This primary template is the bounds-checked flavor: besides the current position it remembers the
// [begin, end) range, so that malformed sequences cannot push the position outside of it.
template<typename BaseIterator, bool BoundsChecked>
struct SBaseIterators
{
    typedef BaseIterator type;

    type begin, end; // The range the iterator was constructed for.
    type it;         // The current position inside that range.

    // Starts at the beginning of the given range.
    SBaseIterators(const BaseIterator& rangeBegin, const BaseIterator& rangeEnd)
        : begin(rangeBegin)
        , end(rangeEnd)
        , it(rangeBegin)
    {
    }

    // Member-wise copy.
    SBaseIterators(const SBaseIterators& other)
        : begin(other.begin)
        , end(other.end)
        , it(other.it)
    {
    }

    // Member-wise assignment.
    SBaseIterators& operator =(const SBaseIterators& other)
    {
        begin = other.begin;
        end = other.end;
        it = other.it;
        return *this;
    }

    // True if the given position is the first position of the range.
    bool IsBegin(const BaseIterator& pos) const
    {
        return begin == pos;
    }

    // True if the given position is the past-the-end position of the range.
    bool IsEnd(const BaseIterator& pos) const
    {
        return end == pos;
    }

    // True if both the current position and the range match those of the other instance.
    bool IsEqual(const SBaseIterators& other) const
    {
        if (!(it == other.it))
        {
            return false;
        }
        if (!(begin == other.begin))
        {
            return false;
        }
        return end == other.end;
    }

    // Note: Only called inside assert.
    // Linear scan of [begin, end); usable with any forward-iterator (or better).
    bool IsInRange(const BaseIterator& pos, std::forward_iterator_tag) const
    {
        BaseIterator cursor = begin;
        while (cursor != end)
        {
            if (pos == cursor)
            {
                return true;
            }
            ++cursor;
        }
        return false;
    }

    // Note: Only called inside assert.
    // Constant-time comparison against [begin, end); needs a random-access-iterator.
    bool IsInRange(const BaseIterator& pos, std::random_access_iterator_tag) const
    {
        if (!(begin <= pos))
        {
            return false;
        }
        return pos < end;
    }

    // Note: Only called inside assert.
    // Picks the constant-time overload when BaseIterator is random-access (common case), the scan otherwise.
    bool IsInRange(const BaseIterator& pos) const
    {
        typedef typename std::iterator_traits<BaseIterator>::iterator_category CategoryType;
        return IsInRange(pos, CategoryType());
    }
};
// SBaseIterators<BaseIterator, BoundsChecked>:
// Bundles the base-iterator state that CIterator needs.
// This specialization is the un-checked flavor for known-safe sequences: only the current position is
// stored, and every range query answers as if the position is strictly inside the range.
template<typename BaseIterator>
struct SBaseIterators<BaseIterator, false>
{
    typedef BaseIterator type;

    type it; // The current position.

    // Starts at the given position; no range information is needed.
    explicit SBaseIterators(const BaseIterator& begin)
        : it(begin)
    {
    }

    // Range constructor for interface parity with the checked flavor; the end of the range is ignored.
    SBaseIterators(const BaseIterator& begin, const BaseIterator& /*end*/)
        : it(begin)
    {
    }

    // Copies the current position.
    SBaseIterators(const SBaseIterators& other)
        : it(other.it)
    {
    }

    // Assigns the current position.
    SBaseIterators& operator =(const SBaseIterators& other)
    {
        it = other.it;
        return *this;
    }

    // The range is unknown, so no position is ever reported as the beginning.
    bool IsBegin(const BaseIterator&) const
    {
        return false;
    }

    // The range is unknown, so no position is ever reported as the end.
    bool IsEnd(const BaseIterator&) const
    {
        return false;
    }

    // Two instances are equal when they are at the same position.
    bool IsEqual(const SBaseIterators& other) const
    {
        return it == other.it;
    }

    // The range is unknown, so every position is accepted.
    bool IsInRange(const BaseIterator&) const
    {
        return true;
    }
};
// SIteratorSink<Safe>:
// Helper to store the last code-point and error bit that was decoded.
// This is the safe specialization for potentially malformed sequences.
template<bool Safe>
struct SIteratorSink
{
static const uint32 cEmpty = 0xFFFFFFFFU;
uint32 value;
bool error;
void Clear()
{
value = cEmpty;
error = false;
}
bool IsEmpty() const
{
return value == cEmpty;
}
bool IsError() const
{
return error;
}
const uint32& GetValue() const
{
return value;
}
void MarkDecodingError()
{
value = cReplacementCharacter;
error = true;
}
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
void Decode(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding>)
{
typedef SDecoder<Encoding, SIteratorSink&, SIteratorSink&> DecoderType;
DecoderType decoder(*this, *this);
Clear();
for (BaseIterator it = its.it; IsEmpty(); ++it)
{
uint32 val = static_cast<uint32>(*it);
decoder(val);
if (its.IsEnd(it))
{
break;
}
}
if (IsEmpty())
{
// If we still have neither a new value or an error flag, just treat as error.
// This can happen if we reached the end of the sequence, but it ends in an incomplete code-sequence.
MarkDecodingError();
}
}
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
void DecodeIfEmpty(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding> tag)
{
if (IsEmpty())
{
Decode(its, tag);
}
}
void operator()(uint32 unit)
{
value = unit;
}
void operator()(SIteratorSink&, uint32, uint32)
{
MarkDecodingError();
}
};
// SIteratorSink<Safe>:
// Helper to store the last code-point that was decoded.
// This is the un-safe specialization for known-valid sequences.
// Note: No error-state is tracked since we won't handle that regardless for un-safe CIterator.
template<>
struct SIteratorSink<false>
{
static const uint32 cEmpty = 0xFFFFFFFFU;
uint32 value;
void Clear()
{
value = cEmpty;
}
bool IsEmpty() const
{
return value == cEmpty;
}
bool IsError() const
{
return false;
}
const uint32& GetValue() const
{
return value;
}
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
void Decode(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding>)
{
typedef SDecoder<Encoding, SIteratorSink&, void> DecoderType;
DecoderType decoder(*this);
for (BaseIterator it = its.it; IsEmpty(); ++it)
{
uint32 val = static_cast<uint32>(*it);
decoder(val);
}
}
template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
void DecodeIfEmpty(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding> tag)
{
if (IsEmpty())
{
Decode(its, tag);
}
}
void operator()(uint32 unit)
{
value = unit;
}
};
}
// CIterator<BaseIterator [, Safe, Encoding]>:
// Helper class that can iterate over an encoded text sequence and read the underlying UCS code-points.
// If the Safe flag is set, bounds checking is performed inside multi-unit sequences to guard against decoding errors.
// This requires the user to know where the sequence ends (use the constructor taking two parameters).
// Note: The BaseIterator must be forward-iterator or better when Safe flag is set.
// If the Safe flag is not set, you must guarantee the sequence is validly encoded; in return, the single argument constructor may be used.
// In the case of unsafe iterator used for C-style string pointer, look for a U+0000 dereferenced value to end the iteration.
// Regardless of the Safe flag, the user must ensure that the iterator is never moved past the beginning or end of the range (just like any other STL iterator).
// Example of typical usage:
// string utf8 = "foo"; // UTF-8
// for (Unicode::CIterator<string::const_iterator> it(utf8.begin(), utf8.end()); it != utf8.end(); ++it)
// {
//     uint32 codepoint = *it; // 32-bit UCS code-point
// }
// Example unsafe usage: (for known-valid encoded C-style strings):
// const char *pValid = "foo"; // UTF-8
// for (Unicode::CIterator<const char *, false> it = pValid; *it != 0; ++it)
// {
//     uint32 codepoint = *it; // 32-bit UCS code-point
// }
template<typename BaseIterator, bool Safe = true, EEncoding Encoding = Detail::SInferEncoding<BaseIterator, true>::value>
class CIterator
{
    // The iterator value in the encoded sequence.
    // Optionally provides bounds-checking.
    Detail::SBaseIterators<BaseIterator, Safe> its;

    // The cached UCS code-point at the current position.
    // Mutable because dereferencing is conceptually const, but does cache some state in this case.
    // Invariant: must be cleared whenever the position changes, otherwise a stale code-point is returned.
    mutable Detail::SIteratorSink<Safe> sink;

public:
    // Types for compatibility with STL bidirectional iterator requirements.
    // value_type and difference_type are deliberately not const-qualified: generic code
    // (for example std::distance) declares and modifies local variables of these types.
    typedef uint32 value_type;
    typedef const uint32& reference;
    typedef const uint32* pointer;
    typedef ptrdiff_t difference_type;
    typedef std::bidirectional_iterator_tag iterator_category;

    // Construct an iterator for the given range.
    // The initial position of the iterator is at the beginning of the range.
    CIterator(const BaseIterator& begin, const BaseIterator& end)
        : its(begin, end)
    {
        sink.Clear();
    }

    // Construct an iterator from a single iterator (typically C-style string pointer).
    // This can only be used for unsafe iterators.
    template<typename IteratorType>
    CIterator(const IteratorType& it, typename Detail::SRequire<!Safe && Detail::is_convertible<IteratorType, BaseIterator>::value, IteratorType>::type* = 0)
        : its(static_cast<const BaseIterator&>(it))
    {
        sink.Clear();
    }

    // Copy-construct an iterator (position and cached code-point).
    CIterator(const CIterator& other)
        : its(other.its)
        , sink(other.sink)
    {
    }

    // Copy-assign an iterator (position and cached code-point).
    CIterator& operator =(const CIterator& other)
    {
        its = other.its;
        sink = other.sink;
        return *this;
    }

    // Test if the iterator points at a validly encoded code-point in the underlying encoded sequence.
    // Returns false if the iterator points at an encoding error, true otherwise.
    // When using an un-safe iterator, this function always returns true; if a sequence can contain encoding errors, you must use the safe variant.
    // Note: This requires the underlying iterator to be dereferenced, so you can only use it while the iterator is inside the valid range.
    bool IsAtValidCodepoint() const
    {
        assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator");
        Detail::integral_constant<EEncoding, Encoding> tag;
        sink.DecodeIfEmpty(its, tag);
        return !sink.IsError();
    }

    // Gets the current position in the underlying encoded sequence.
    // If the iterator points to an invalidly encoded sequence (ie, IsAtValidCodepoint() returns false), the direction of iteration is significant.
    // In that case the returned position is approximated; to work around this: move all iterators of which the position is compared in the same direction.
    const BaseIterator& GetPosition() const
    {
        return its.it;
    }

    // Sets the current position in the underlying encoded sequence.
    // You may not set the position outside the range for which this iterator was constructed.
    // The code-point cached for the old position is discarded, so the next dereference decodes at the new position.
    void SetPosition(const BaseIterator& it)
    {
        assert(its.IsInRange(it) && "Attempt to set the underlying iterator outside of the supported range");
        its.it = it;
        sink.Clear();
    }

    // Test if this iterator is equal to another iterator instance.
    // Note: In the presence of an invalidly encoded sequence (ie, IsAtValidCodepoint() returns false), the direction of iteration is significant.
    // To work around this, you can either:
    // 1) Move all iterators that will be compared in the same direction; or
    // 2) Compare the dereferenced iterator value(s) instead (if applicable).
    bool operator ==(const CIterator& other) const
    {
        return its.IsEqual(other.its);
    }

    // Test if this iterator is equal to another base iterator.
    // Note: If the provided iterator does not point to the first code-unit of an UCS code-point, the behavior is undefined.
    bool operator ==(const BaseIterator& other) const
    {
        return its.it == other;
    }

    // Test if this iterator is not equal to another iterator instance.
    // Note: In the presence of an invalidly encoded sequence (ie, IsAtValidCodepoint() returns false), the direction of iteration is significant.
    // To work around this, you can either:
    // 1) Move all iterators that will be compared in the same direction; or
    // 2) Compare the dereferenced iterator value(s) instead (if applicable).
    bool operator !=(const CIterator& other) const
    {
        return !its.IsEqual(other.its);
    }

    // Test if this iterator is not equal to another base iterator.
    // Note: If the provided iterator does not point to the first code-unit of an UCS code-point, the behavior is undefined.
    bool operator !=(const BaseIterator& other) const
    {
        return its.it != other;
    }

    // Get the decoded UCS code-point at the current position in the sequence.
    // If the iterator points to an invalidly encoded sequence (ie, IsAtValidCodepoint() returns false) the function returns U+FFFD (replacement character).
    // The returned reference refers to the cache inside this iterator; it is overwritten when the iterator is dereferenced after moving.
    reference operator *() const
    {
        assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator");
        Detail::integral_constant<EEncoding, Encoding> tag;
        sink.DecodeIfEmpty(its, tag);
        return sink.GetValue();
    }

    // Advance the iterator to the next UCS code-point.
    // Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode.
    // However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors.
    CIterator& operator ++()
    {
        Detail::integral_constant<EEncoding, Encoding> tag;
        Detail::MoveNext(its.it, its, tag);
        sink.Clear();
        return *this;
    }

    // Go back to the previous UCS code-point.
    // Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode.
    // However, in Safe mode, the iterator will never move past the beginning of the sequence in the presence of encoding errors.
    CIterator& operator --()
    {
        Detail::integral_constant<EEncoding, Encoding> tag;
        Detail::MovePrev(its.it, its, tag);
        sink.Clear();
        return *this;
    }

    // Advance the iterator to the next UCS code-point, return a copy of the iterator position before advancing.
    // Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode.
    // However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors.
    CIterator operator ++(int)
    {
        CIterator result = *this;
        ++*this;
        return result;
    }

    // Go back to the previous UCS code-point, return a copy of the iterator position before going back.
    // Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode.
    // However, in Safe mode, the iterator will never move past the beginning of the sequence in the presence of encoding errors.
    CIterator operator --(int)
    {
        CIterator result = *this;
        --*this;
        return result;
    }
};
namespace Detail
{
// SIteratorSpecializer<T>:
// Maps a string type onto the CIterator instantiation that iterates over its const_iterator.
// Note: This indirection exists because MSVC doesn't want to deduce the type on the MakeIterator declaration.
template<typename TString>
struct SIteratorSpecializer
{
    // The base iterator that is wrapped.
    typedef typename TString::const_iterator BaseIteratorType;

    // The code-point iterator type (default Safe flag and inferred encoding).
    typedef CIterator<BaseIteratorType> type;
};
}
// MakeIterator(const StringType &str):
// Convenience factory: builds an UCS code-point iterator covering [str.begin(), str.end()) of an Unicode string.
// The returned iterator starts at str.begin().
// Example usage:
// string utf8 = "foo"; // UTF-8
// auto it = Unicode::MakeIterator(utf8);
// while (it != utf8.end())
// {
//     uint32 codepoint = *it; // 32-bit UCS code-point
//     ++it;
// }
// Or, in a for-loop:
// for (auto it = Unicode::MakeIterator(utf8); it != utf8.end(); ++it) {}
template<typename StringType>
inline typename Detail::SIteratorSpecializer<StringType>::type MakeIterator(const StringType& str)
{
    typedef typename Detail::SIteratorSpecializer<StringType>::type IteratorType;
    return IteratorType(str.begin(), str.end());
}
}