o3de/Code/Legacy/CryCommon/UnicodeIterator.h

/*
 * Copyright (c) Contributors to the Open 3D Engine Project
 *
 * SPDX-License-Identifier: Apache-2.0 OR MIT
 *
 */


// Description : Encoded Unicode sequence iteration.
//
// For lower level accessing of encoded text, an STL compatible iterator wrapper is provided.
// This iterator will decode the underlying sequence, abstracting it to a sequence of UCS code-points.
// Using the iterator wrapper, you can find where in an encoded string code-points (or encoding errors) are located.
// Note: The iterator is an input-only iterator, you cannot write to the underlying sequence.


#pragma once
#include "UnicodeBinding.h"
namespace Unicode
{
    namespace Detail
    {
        // MoveNext(it, checker, tag):
        // Moves the iterator to the next UCS code-point in the encoded sequence.
        // Non-specialized version (for 1:1 code-unit to code-point).
        template<typename BaseIterator, typename BoundsChecker, EEncoding Encoding>
        inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, const integral_constant<EEncoding, Encoding>)
        {
            COMPILE_TIME_ASSERT(
                Encoding == eEncoding_ASCII ||
                Encoding == eEncoding_UTF32 ||
                Encoding == eEncoding_Latin1 ||
                Encoding == eEncoding_Win1252);
            assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");

            // All of these encodings use a single code-unit for each code-point.
            ++it;
        }

        // MoveNext(it, checker, tag):
        // Moves the iterator to the next UCS code-point in the encoded sequence.
        // Specialized for UTF-8.
        template<typename BaseIterator, typename BoundsChecker>
        inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF8>)
        {
            assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");

            // UTF-8: just need to skip up to 3 continuation bytes.
            for (int i = 0; i < 4; ++i)
            {
                ++it;
                if (checker.IsEnd(it))  // :WARN: always returns false if "safe" bool is false!
                {
                    break;
                }
                uint32 val = static_cast<uint32>(*it);
                if ((val & 0xC0) != 0x80)
                {
                    break;
                }
            }
        }

        // MoveNext(it, checker, tag):
        // Moves the iterator to the next UCS code-point in the encoded sequence.
        // Specialized for UTF-16.
        template<typename BaseIterator, typename BoundsChecker>
        inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF16>)
        {
            assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence");

            // UTF-16: just need to skip one lead surrogate.
            ++it;
            uint32 val = static_cast<uint32>(*it);
            if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast)
            {
                if (!checker.IsEnd(it))
                {
                    ++it;
                }
            }
        }

        // MovePrev(it, checker, tag):
        // Moves the iterator to the previous UCS code-point in the encoded sequence.
        // Non-specialized version (for 1:1 code-unit to code-point).
        template<typename BaseIterator, typename BoundsChecker, EEncoding Encoding>
        inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, const integral_constant<EEncoding, Encoding>)
        {
            COMPILE_TIME_ASSERT(
                Encoding == eEncoding_ASCII ||
                Encoding == eEncoding_UTF32 ||
                Encoding == eEncoding_Latin1 ||
                Encoding == eEncoding_Win1252);
            assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");

            // All of these encodings use a single code-unit for each code-point.
            --it;
        }

        // MovePrev(it, checker, tag):
        // Moves the iterator to the previous UCS code-point in the encoded sequence.
        // Specialized for UTF-8.
        template<typename BaseIterator, typename BoundsChecker>
        inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF8>)
        {
            assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");

            // UTF-8: just need to skip up to 3 continuation bytes.
            for (int i = 0; i < 4; ++i)
            {
                --it;
                if (checker.IsBegin(it))
                {
                    break;
                }
                uint32 val = static_cast<uint32>(*it);
                if ((val & 0xC0) != 0x80)
                {
                    break;
                }
            }
        }

        // MovePrev(it, checker, tag):
        // Moves the iterator to the previous UCS code-point in the encoded sequence.
        // Specialized for UTF-16.
        template<typename BaseIterator, typename BoundsChecker>
        inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant<EEncoding, eEncoding_UTF16>)
        {
            assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence");

            // UTF-16: just need to skip one lead surrogate.
            --it;
            uint32 val = static_cast<uint32>(*it);
            if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast)
            {
                if (!checker.IsBegin(it))
                {
                    --it;
                }
            }
        }

        // SBaseIterators<BaseIterator, BoundsChecked>:
        // Utility to access base iterators properties from CIterator.
        // This is the bounds-checked specialization, the range information is kept to defend against malformed sequences.
        template<typename BaseIterator, bool BoundsChecked>
        struct SBaseIterators
        {
            typedef BaseIterator type;
            type begin, end;
            type it;

            SBaseIterators(const BaseIterator& _begin, const BaseIterator& _end)
                : begin(_begin)
                , end(_end)
                , it(_begin) {}

            SBaseIterators(const SBaseIterators& other)
                : begin(other.begin)
                , end(other.end)
                , it(other.it) {}

            SBaseIterators& operator =(const SBaseIterators& other)
            {
                begin = other.begin;
                end = other.end;
                it = other.it;
                return *this;
            }

            bool IsBegin(const BaseIterator& _it) const
            {
                return begin == _it;
            }

            bool IsEnd(const BaseIterator& _it) const
            {
                return end == _it;
            }

            bool IsEqual(const SBaseIterators& other) const
            {
                return it == other.it
                       && begin == other.begin
                       && end == other.end;
            }

            // Note: Only called inside assert.
            // O(N) version; works with any forward-iterator (or better)
            bool IsInRange(const BaseIterator& _it, std::forward_iterator_tag) const
            {
                for (BaseIterator i = begin; i != end; ++i)
                {
                    if (_it == i)
                    {
                        return true;
                    }
                }
                return false;
            }

            // Note: Only called inside assert.
            // O(1) version; requires random-access-iterator.
            bool IsInRange(const BaseIterator& _it, std::random_access_iterator_tag) const
            {
                return (begin <= _it && _it < end);
            }

            // Note: Only called inside assert.
            // Dispatches to the O(1) version if a random-access iterator is used (common case).
            bool IsInRange(const BaseIterator& _it) const
            {
                return IsInRange(_it, typename std::iterator_traits<BaseIterator>::iterator_category());
            }
        };

        // SBaseIterators<BaseIterator, BoundsChecked>:
        // Utility to access base iterators properties from CIterator.
        // This is the un-checked specialization for known-safe sequences.
        template<typename BaseIterator>
        struct SBaseIterators<BaseIterator, false>
        {
            typedef BaseIterator type;
            type it;

            explicit SBaseIterators(const BaseIterator& begin)
                : it(begin) {}

            SBaseIterators(const BaseIterator& begin, const BaseIterator& end)
                : it(begin) {}

            SBaseIterators(const SBaseIterators& other)
                : it(other.it) {}

            SBaseIterators& operator =(const SBaseIterators& other)
            {
                it = other.it;
                return *this;
            }

            bool IsBegin(const BaseIterator&) const
            {
                return false;
            }

            bool IsEnd(const BaseIterator&) const
            {
                return false;
            }

            bool IsEqual(const SBaseIterators& other) const
            {
                return it == other.it;
            }

            bool IsInRange(const BaseIterator&) const
            {
                return true;
            }
        };

        // SIteratorSink<Safe>:
        // Helper to store the last code-point and error bit that was decoded.
        // This is the safe specialization for potentially malformed sequences.
        template<bool Safe>
        struct SIteratorSink
        {
            static const uint32 cEmpty = 0xFFFFFFFFU;
            uint32 value;
            bool error;

            void Clear()
            {
                value = cEmpty;
                error = false;
            }

            bool IsEmpty() const
            {
                return value == cEmpty;
            }

            bool IsError() const
            {
                return error;
            }

            const uint32& GetValue() const
            {
                return value;
            }

            void MarkDecodingError()
            {
                value = cReplacementCharacter;
                error = true;
            }

            template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
            void Decode(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding>)
            {
                typedef SDecoder<Encoding, SIteratorSink&, SIteratorSink&> DecoderType;
                DecoderType decoder(*this, *this);
                Clear();
                for (BaseIterator it = its.it; IsEmpty(); ++it)
                {
                    uint32 val = static_cast<uint32>(*it);
                    decoder(val);
                    if (its.IsEnd(it))
                    {
                        break;
                    }
                }
                if (IsEmpty())
                {
                    // If we still have neither a new value or an error flag, just treat as error.
                    // This can happen if we reached the end of the sequence, but it ends in an incomplete code-sequence.
                    MarkDecodingError();
                }
            }

            template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
            void DecodeIfEmpty(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding> tag)
            {
                if (IsEmpty())
                {
                    Decode(its, tag);
                }
            }

            void operator()(uint32 unit)
            {
                value = unit;
            }

            void operator()(SIteratorSink&, uint32, uint32)
            {
                MarkDecodingError();
            }
        };

        // SIteratorSink<Safe>:
        // Helper to store the last code-point that was decoded.
        // This is the un-safe specialization for known-valid sequences.
        // Note: No error-state is tracked since we won't handle that regardless for un-safe CIterator.
        template<>
        struct SIteratorSink<false>
        {
            static const uint32 cEmpty = 0xFFFFFFFFU;
            uint32 value;

            void Clear()
            {
                value = cEmpty;
            }

            bool IsEmpty() const
            {
                return value == cEmpty;
            }

            bool IsError() const
            {
                return false;
            }

            const uint32& GetValue() const
            {
                return value;
            }

            template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
            void Decode(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding>)
            {
                typedef SDecoder<Encoding, SIteratorSink&, void> DecoderType;
                DecoderType decoder(*this);
                for (BaseIterator it = its.it; IsEmpty(); ++it)
                {
                    uint32 val = static_cast<uint32>(*it);
                    decoder(val);
                }
            }

            template<EEncoding Encoding, typename BaseIterator, bool BoundsChecked>
            void DecodeIfEmpty(const SBaseIterators<BaseIterator, BoundsChecked>& its, integral_constant<EEncoding, Encoding> tag)
            {
                if (IsEmpty())
                {
                    Decode(its, tag);
                }
            }

            void operator()(uint32 unit)
            {
                value = unit;
            }
        };
    }

    // CIterator<BaseIterator [, Safe, Encoding]>:
    // Helper class that can iterate over an encoded text sequence and read the underlying UCS code-points.
    // If the Safe flag is set, bounds checking is performed inside multi-unit sequences to guard against decoding errors.
    // This requires the user to know where the sequence ends (use the constructor taking two parameters).
    // Note: The BaseIterator must be forward-iterator or better when Safe flag is set.
    // If the Safe flag is not set, you must guarantee the sequence is validly encoded, and allows the use of the single argument constructor.
    // In the case of unsafe iterator used for C-style string pointer, look for a U+0000 dereferenced value to end the iteration.
    // Regardless of the Safe flag, the user must ensure that the iterator is never moved past the beginning or end of the range (just like any other STL iterator).
    // Example of typical usage:
    // string utf8 = "foo"; // UTF-8
    // for (Unicode::CIterator<string::const_iterator> it(utf8.begin(), utf8.end()); it != utf8.end(); ++it)
    // {
    //     uint32 codepoint = *it; // 32-bit UCS code-point
    // }
    // Example unsafe usage: (for known-valid encoded C-style strings):
    // const char *pValid = "foo"; // UTF-8
    // for (Unicode::CIterator<const char *, false> it = pValid; *it != 0; ++it)
    // {
    //     uint32 codepoint = *it; // 32-bit UCS code-point
    // }
    template<typename BaseIterator, bool Safe = true, EEncoding Encoding = Detail::SInferEncoding<BaseIterator, true>::value>
    class CIterator
    {
        // The iterator value in the encoded sequence.
        // Optionally provides bounds-checking.
        Detail::SBaseIterators<BaseIterator, Safe> its;

        // The cached UCS code-point at the current position.
        // Mutable because dereferencing is conceptually const, but does cache some state in this case.
        mutable Detail::SIteratorSink<Safe> sink;

    public:
        // Types for compatibility with STL bidirectional iterator requirements.
        typedef const uint32 value_type;
        typedef const uint32& reference;
        typedef const uint32* pointer;
        typedef const ptrdiff_t difference_type;
        typedef std::bidirectional_iterator_tag iterator_category;

        // Construct an iterator for the given range.
        // The initial position of the iterator as at the beginning of the range.
        CIterator(const BaseIterator& begin, const BaseIterator& end)
            : its(begin, end)
        {
            sink.Clear();
        }

        // Construct an iterator from a single iterator (typically C-style string pointer).
        // This can only be used for unsafe iterators.
        template<typename IteratorType>
        CIterator(const IteratorType& it, typename Detail::SRequire<!Safe&& Detail::is_convertible<IteratorType, BaseIterator>::value, IteratorType>::type* = 0)
            : its(static_cast<const BaseIterator&>(it))
        {
            sink.Clear();
        }

        // Copy-construct an iterator.
        CIterator(const CIterator& other)
            : its(other.its)
            , sink(other.sink) {}

        // Copy-assign an iterator.
        CIterator& operator =(const CIterator& other)
        {
            its = other.its;
            sink = other.sink;
            return *this;
        }

        // Test if the iterator points at an encoding error in the underlying encoded sequence.
        // If so, the function returns false.
        // When using an un-safe iterator, this function always returns true, if a sequence can contain encoding errors, you must use the safe variant.
        // Note: This requires the underlying iterator to be dereferenced, so you cannot use it only while the iterator is inside the valid range.
        bool IsAtValidCodepoint() const
        {
            assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator");
            Detail::integral_constant<EEncoding, Encoding> tag;
            sink.DecodeIfEmpty(its, tag);
            return !sink.IsError();
        }

        // Gets the current position in the underlying encoded sequence.
        // If the iterator points to an invalidly encoded sequence (ie, IsError() returns true), the direction of iteration is significant.
        // In that case the returned position is approximated; to work around this: move all iterators of which the position is compared in the same direction.
        const BaseIterator& GetPosition() const
        {
            return its.it;
        }

        // Sets the current position in the underlying encoded sequence.
        // You may not set the position outside the range for which this iterator was constructed.
        void SetPosition(const BaseIterator& it)
        {
            assert(its.IsInRange(it) && "Attempt to set the underlying iterator outside of the supported range");
            its.it = it;
        }

        // Test if this iterator is equal to another iterator instance.
        // Note: In the presence of an invalidly encoded sequence (ie, IsError() returns true), the direction of iteration is significant.
        // To work around this, you can either:
        // 1) Move all iterators that will be compared in the same direction; or
        // 2) Compare the dereferenced iterator value(s) instead (if applicable).
        bool operator ==(const CIterator& other) const
        {
            return its.IsEqual(other.its);
        }

        // Test if this iterator is equal to another base iterator.
        // Note: If the provided iterator does not point to the the first code-unit of an UCS code-point, the behavior is undefined.
        bool operator ==(const BaseIterator& other) const
        {
            return its.it == other;
        }

        // Test if this iterator is equal to another iterator instance.
        // Note: In the presence of an invalidly encoded sequence (ie, IsError() returns true), the direction of iteration is significant.
        // To work around this, you can either:
        // 1) Move all iterators that will be compared in the same direction; or
        // 2) Compare the dereferenced iterator value(s) instead (if applicable).
        bool operator !=(const CIterator& other) const
        {
            return !its.IsEqual(other.its);
        }

        // Test if this iterator is equal to another base iterator.
        // Note: If the provided iterator does not point to the the first code-unit of an UCS code-point, the behavior is undefined.
        bool operator !=(const BaseIterator& other) const
        {
            return its.it != other;
        }

        // Get the decoded UCS code-point at the current position in the sequence.
        // If the iterator points to an invalidly encoded sequence (ie, IsError() returns true) the function returns U+FFFD (replacement character).
        reference operator *() const
        {
            assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator");
            Detail::integral_constant<EEncoding, Encoding> tag;
            sink.DecodeIfEmpty(its, tag);
            return sink.GetValue();
        }

        // Advance the iterator to the next UCS code-point.
        // Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode.
        // However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors.
        CIterator& operator ++()
        {
            Detail::integral_constant<EEncoding, Encoding> tag;
            Detail::MoveNext(its.it, its, tag);
            sink.Clear();
            return *this;
        }

        // Go back to the previous UCS code-point.
        // Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode.
        // However, in Safe mode, the iterators will never move past the beginning of the sequence in the presence of encoding errors.
        CIterator& operator --()
        {
            Detail::integral_constant<EEncoding, Encoding> tag;
            Detail::MovePrev(its.it, its, tag);
            sink.Clear();
            return *this;
        }

        // Advance the iterator to the next UCS code-point, return a copy of the iterator position before advancing.
        // Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode.
        // However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors.
        CIterator operator ++(int)
        {
            CIterator result = *this;
            ++*this;
            return result;
        }

        // Go back to the previous UCS code-point, return a copy of the iterator position before going back.
        // Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode.
        // However, in Safe mode, the iterators will never move past the beginning of the sequence in the presence of encoding errors.
        CIterator operator --(int)
        {
            CIterator result = *this;
            --*this;
            return result;
        }
    };

    namespace Detail
    {
        // SIteratorSpecializer<T>:
        // Specializes the CIterator template to use for a given string type.
        // Note: The reason we use this is because MSVC doesn't want to deduce this on the MakeIterator declaration.
        template<typename StringType>
        struct SIteratorSpecializer
        {
            typedef CIterator<typename StringType::const_iterator> type;
        };
    }

    // MakeIterator(const StringType &str):
    // Helper function to make an UCS code-point iterator given an Unicode string.
    // Example usage:
    // string utf8 = "foo"; // UTF-8
    // auto it = Unicode::MakeIterator(utf8);
    // while (it != utf8.end())
    // {
    //     uint32 codepoint = *it; // 32-bit UCS code-point
    // }
    // Or, in a for-loop:
    // for (auto it = Unicode::MakeIterator(utf8); it != utf8.end(); ++it) {}
    template<typename StringType>
    inline typename Detail::SIteratorSpecializer<StringType>::type MakeIterator(const StringType& str)
    {
        return typename Detail::SIteratorSpecializer<StringType>::type(str.begin(), str.end());
    }
}