You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
772 lines
35 KiB
C++
772 lines
35 KiB
C++
/*
|
|
* All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or
|
|
* its licensors.
|
|
*
|
|
* For complete copyright and license terms please see the LICENSE at the root of this
|
|
* distribution (the "License"). All use of this software is governed by the License,
|
|
* or, if provided, by the license below or the license accompanying this file. Do not
|
|
* remove or modify any license notices. This file is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
*
|
|
*/
|
|
// Original file Copyright Crytek GMBH or its affiliates, used under license.
|
|
|
|
// Description : Generic Unicode encoding helpers.
|
|
//
|
|
// Defines encoding and decoding functions used by the higher-level functions.
|
|
// These are used by the various conversion functions in UnicodeFunctions.h and UnicodeIterator.h.
|
|
// Note: You can use these functions manually for low-level functionality, but we don't recommend that.
|
|
// In that case, you probably want to check inside the nested Detail namespace for the elementary bits.
|
|
|
|
|
|
#pragma once
|
|
#include "BaseTypes.h" // For uint8, uint16, uint32
|
|
#include "CompileTimeAssert.h" // For COMPILE_TIME_ASSERT macro
|
|
namespace Unicode
|
|
{
|
|
// Supported encoding/conversion types.
// The enumerators are used as template arguments to select the decoder/encoder implementation.
enum EEncoding
{
    // UTF-8 encoding, see http://www.unicode.org/resources/utf8.html.
    // Input and output are supported.
    // Note: This format maps the entire UCS, where each code-point can take [1, 4] 8-bit code-units.
    // Note: The set of code-points it can represent is a strict super-set of Latin1/ISO-8859-1 as well as ASCII.
    eEncoding_UTF8,

    // UTF-16 encoding, see http://tools.ietf.org/html/rfc2781.
    // Input and output are supported.
    // Note: This format maps the entire UCS, where each code-point can take [1, 2] 16-bit code-units.
    eEncoding_UTF16,

    // UTF-32 encoding, see http://www.unicode.org/reports/tr17/.
    // Input and output are supported.
    // Note: This format maps the entire UCS, each code-point is stored in a single 32-bit code-unit.
    eEncoding_UTF32,

    // ASCII encoding, see http://en.wikipedia.org/wiki/ASCII.
    // Input and output are supported (any output UCS values out of supported range are mapped to question mark).
    // Note: Only values [U+0000, U+007F] can be mapped.
    eEncoding_ASCII,

    // Latin1, aka ISO-8859-1 encoding, see http://en.wikipedia.org/wiki/ISO/IEC_8859-1.
    // Only input is supported.
    // Note: This is a strict super-set of ASCII, it additionally maps [U+00A0, U+00FF].
    eEncoding_Latin1,

    // Windows ANSI codepage 1252, see http://en.wikipedia.org/wiki/Windows-1252.
    // Only input is supported.
    // Note: This is a strict super-set of ASCII and Latin1/ISO-8859-1, it maps some code-units from [0x80, 0x9F].
    eEncoding_Win1252,
};
|
|
|
|
// Strategies for dealing with input that is not validly encoded.
enum EErrorRecovery
{
    // Input is trusted: nothing is validated.
    // Feeding invalid input produces undefined output (in debug, this condition will cause an assert to trigger).
    eErrorRecovery_None,

    // An invalidly encoded sequence is dropped; it contributes nothing to the output.
    // Meant for logic/hashing purposes where the input is almost certainly valid.
    eErrorRecovery_Discard,

    // An invalidly encoded sequence is substituted with the replacement-character (U+FFFD).
    // Meant for output that ends up being displayed in UI.
    eErrorRecovery_Replace,

    // An invalidly encoded sequence is re-interpreted as eEncoding_Latin1.
    // Whatever is not valid Latin1 either is dropped.
    // Meant for reading generic text files with 1-byte code-units.
    // Note: Only available when decoding UTF-8.
    eErrorRecovery_FallbackLatin1ThenDiscard,

    // An invalidly encoded sequence is re-interpreted as eEncoding_Win1252.
    // Whatever is not valid codepage 1252 either is dropped.
    // Meant for reading text files generated on Windows with 1-byte code-units.
    // Note: Only available when decoding UTF-8.
    eErrorRecovery_FallbackWin1252ThenDiscard,

    // An invalidly encoded sequence is re-interpreted as eEncoding_Latin1.
    // Whatever is not valid Latin1 either is substituted with the replacement-character (U+FFFD).
    // Meant for reading generic text files with 1-byte code-units.
    // Note: Only available when decoding UTF-8.
    eErrorRecovery_FallbackLatin1ThenReplace,

    // An invalidly encoded sequence is re-interpreted as eEncoding_Win1252.
    // Whatever is not valid codepage 1252 either is substituted with the replacement-character (U+FFFD).
    // Meant for reading text files generated on Windows with 1-byte code-units.
    // Note: Only available when decoding UTF-8.
    eErrorRecovery_FallbackWin1252ThenReplace,
};
|
|
|
|
namespace Detail
|
|
{
|
|
// Decode<Encoding, Safe>(state, unit): Decodes a single code-unit of an encoding into an UCS code-point.
|
|
// When Safe flag is set, encoding errors are detected so a fall-back encoding or other recovery method can be used.
|
|
// Interpret return value as follows:
|
|
// < 0x001FFFFF: Decoded codepoint (== return value), call again with next code-unit and clear state.
|
|
// < 0x80000000: Intermediate state returned, call again with next code-unit and the returned state.
|
|
// >= 0x80000000: Bad encoding detected, up to 16 bits (UTF-16) or 24 bits (UTF-8, last in lower bits)
|
|
// contain previous consumed values (does not happen if Safe == false).
|
|
template<EEncoding InputEncoding, bool Safe>
|
|
inline uint32 Decode(uint32 state, uint32 unit);
|
|
|
|
// Constants shared by the decoders and encoders.
// Note: cDecodeErrorBit is (1 << 31) on an int, so the enumerator itself is negative; the code in this file
// always combines it with uint32 values, where it acts as 0x80000000.
enum
{
    // Layout of the intermediate decoder state.
    cDecodeShiftRemaining = 26,                          // Bit position of the "code-units remaining" counter.
    cDecodeOneRemaining   = 1 << cDecodeShiftRemaining,  // Counter value meaning "one code-unit remaining".
    cDecodeMaskRemaining  = 3 << cDecodeShiftRemaining,  // Mask covering every bit the counter can occupy.
    cDecodeLeadBit        = 1 << 22,                     // All bits up to and including this one are reserved.

    // Flags reported in a decoder result.
    cDecodeErrorBit       = 1 << 31,                     // An error occurred during decoding.
    cDecodeOverlongBit    = 1 << 30,                     // An overlong sequence was used.
    cDecodeSurrogateBit   = 1 << 29,                     // A surrogate code-point was decoded in UTF-8.
    cDecodeInvalidBit     = 1 << 28,                     // An invalid code-point was decoded (U+FFFE/FFFF).
    cDecodeSuccess        = 0,                           // Placeholder meaning "no error occurred".

    // Code-point ranges and special values.
    cCodepointMax         = 0x10FFFF,                    // Largest UCS code-point.
    cLeadSurrogateFirst   = 0xD800,                      // First valid UTF-16 lead-surrogate.
    cLeadSurrogateLast    = 0xDBFF,                      // Last valid UTF-16 lead-surrogate.
    cTrailSurrogateFirst  = 0xDC00,                      // First valid UTF-16 trail-surrogate.
    cTrailSurrogateLast   = 0xDFFF,                      // Last valid UTF-16 trail-surrogate.
    cReplacementCharacter = 0xFFFD,                      // Default replacement character.
};
|
|
|
|
// Validate the UTF-8 state of a multi-byte sequence.
|
|
// The safe decoder of UTF-8 will call this function when a full potential code-point has been decoded.
|
|
// This function is (at most) called for 50% of the decoded UTF-8 code-units, but likely at much lower frequency.
|
|
inline uint32 DecodeValidate8(uint32 state)
|
|
{
|
|
uint32 errorbits = (state >> 8) | cDecodeErrorBit;
|
|
state ^= (state & 0x400000) >> 1; // For 3-byte sequences, bit 5 of the lead byte needs to be cleared.
|
|
const uint32 cp =
|
|
(state & 0x3F) |
|
|
((state & 0x3F00) >> 2) |
|
|
((state & 0x3F0000) >> 4) |
|
|
((state & 0x07000000) >> 6);
|
|
if (cp <= cCodepointMax)
|
|
{
|
|
if (cp >= cLeadSurrogateFirst && cp <= cTrailSurrogateLast)
|
|
{
|
|
errorbits += cDecodeSurrogateBit; // CESU-8 encoding might have been used.
|
|
}
|
|
else
|
|
{
|
|
uint32 minval = 0x80;
|
|
minval += (0x00400000 & state) ? 0x800 - 0x80 : 0;
|
|
minval += (0x40000000 & state) ? 0x10000 - 0x80 : 0;
|
|
if (cp >= minval)
|
|
{
|
|
if ((cp & 0xFFFFFFFEU) != 0xFFFEU)
|
|
{
|
|
return cp; // Valid code-point.
|
|
}
|
|
errorbits += cDecodeInvalidBit; // Invalid character used.
|
|
}
|
|
errorbits += cDecodeOverlongBit; // Overlong encoding used.
|
|
}
|
|
}
|
|
return errorbits;
|
|
}
|
|
|
|
// Decode UTF-8, unsafe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_UTF8, false>(uint32 state, uint32 unit)
|
|
{
|
|
if (state == 0) // First byte.
|
|
{
|
|
unit = unit & 0xFF;
|
|
if (unit < 0xC0)
|
|
{
|
|
return unit; // Single-unit (ASCII).
|
|
}
|
|
uint32 remaining = (unit >> 4) - 0xC;
|
|
remaining += (remaining == 0);
|
|
return (unit & 0x1F) + (remaining << cDecodeShiftRemaining); // Lead byte of multi-byte.
|
|
}
|
|
state = (state << 6) + (unit & 0x3F) + (state & cDecodeMaskRemaining) - cDecodeOneRemaining; // Apply c-byte.
|
|
return state & ~cDecodeLeadBit; // Mask off the lead bits of a 4-byte sequence.
|
|
}
|
|
|
|
// Decode UTF-8, safe
|
|
template<>
|
|
inline uint32 Decode<eEncoding_UTF8, true>(uint32 state, uint32 unit)
|
|
{
|
|
if (unit <= 0xF4) // Discard out-of-range values immediately.
|
|
{
|
|
if (state == 0) // First byte.
|
|
{
|
|
if (unit < 0x80)
|
|
{
|
|
return unit; // Single-byte.
|
|
}
|
|
if (unit < 0xC2)
|
|
{
|
|
return cDecodeErrorBit; // Invalid continuation byte (or illegal 0xC0/0xC1).
|
|
}
|
|
uint32 remaining = (unit >> 4) - 0xC;
|
|
remaining += (remaining == 0);
|
|
return unit + (remaining << cDecodeShiftRemaining); // Multi-byte.
|
|
}
|
|
if ((unit & 0xC0) == 0x80)
|
|
{
|
|
const uint32 remaining = (state & cDecodeMaskRemaining) - cDecodeOneRemaining;
|
|
state = (state << 8) + unit;
|
|
if (remaining != 0)
|
|
{
|
|
return state | remaining; // Intermediate byte of a multi-byte sequence.
|
|
}
|
|
return DecodeValidate8(state); // Final byte of a multi-byte sequence.
|
|
}
|
|
}
|
|
return cDecodeErrorBit | state;
|
|
}
|
|
|
|
// Decode UTF-16, unsafe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_UTF16, false>(uint32 state, uint32 unit)
|
|
{
|
|
const bool bLead = (unit >= cLeadSurrogateFirst) && (unit <= cLeadSurrogateLast);
|
|
const uint32 initial = unit + (bLead << cDecodeShiftRemaining);
|
|
const uint32 pair = 0x10000 + ((state & 0x3FF) << 10) + (unit & 0x3FF);
|
|
return state == 0 ? initial : pair;
|
|
}
|
|
|
|
// Decode UTF-16, safe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_UTF16, true>(uint32 state, uint32 unit)
|
|
{
|
|
const bool bTrail = (unit >= cTrailSurrogateFirst) && (unit <= cTrailSurrogateLast);
|
|
if (state != 0 && !bTrail)
|
|
{
|
|
return cDecodeErrorBit + (state & 0xFFFF); // Lead surrogate without trail surrogate
|
|
}
|
|
uint32 result = Decode<eEncoding_UTF16, false>(state, unit);
|
|
bool bValid = (result & 0xFFFFFFFEU) != 0xFFFEU;
|
|
return bValid ? result : result + cDecodeErrorBit + cDecodeInvalidBit;
|
|
}
|
|
|
|
// Decode UTF-32, unsafe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_UTF32, false>([[maybe_unused]] uint32 state, uint32 unit)
|
|
{
|
|
return unit;
|
|
}
|
|
|
|
// Decode UTF-32, safe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_UTF32, true>([[maybe_unused]] uint32 state, uint32 unit)
|
|
{
|
|
if (unit > cCodepointMax)
|
|
{
|
|
return cDecodeErrorBit;
|
|
}
|
|
if (unit >= cLeadSurrogateFirst && unit <= cTrailSurrogateLast)
|
|
{
|
|
return cDecodeErrorBit | cDecodeSurrogateBit;
|
|
}
|
|
if ((unit & 0xFFFEU) == 0xFFFEU)
|
|
{
|
|
return cDecodeErrorBit | cDecodeInvalidBit;
|
|
}
|
|
return unit;
|
|
}
|
|
|
|
// Decode ASCII, unsafe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_ASCII, false>([[maybe_unused]] uint32 state, uint32 unit)
|
|
{
|
|
return unit;
|
|
}
|
|
|
|
// Decode ASCII, safe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_ASCII, true>([[maybe_unused]] uint32 state, uint32 unit)
|
|
{
|
|
if (unit > 0x7F)
|
|
{
|
|
return cDecodeErrorBit;
|
|
}
|
|
return unit;
|
|
}
|
|
|
|
// Decode Latin1, unsafe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_Latin1, false>([[maybe_unused]] uint32 state, uint32 unit)
|
|
{
|
|
return unit;
|
|
}
|
|
|
|
// Decode Latin1, safe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_Latin1, true>([[maybe_unused]] uint32 state, uint32 unit)
|
|
{
|
|
if ((unit >= 0x80 && unit <= 0x9F) || (unit > 0xFF))
|
|
{
|
|
return cDecodeErrorBit;
|
|
}
|
|
return unit;
|
|
}
|
|
|
|
// Decode Windows CP-1252, unsafe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_Win1252, false>([[maybe_unused]] uint32 state, uint32 unit)
|
|
{
|
|
static const uint16 cp1252[] =
|
|
{
|
|
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
|
|
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
|
|
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
|
|
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
|
|
};
|
|
return (unit < 0x80 || unit > 0x9F) ? unit : cp1252[unit - 0x80];
|
|
}
|
|
|
|
// Decode Windows CP-1252, safe.
|
|
template<>
|
|
inline uint32 Decode<eEncoding_Win1252, true>(uint32 state, uint32 unit)
|
|
{
|
|
if (unit > 0xFF)
|
|
{
|
|
return cDecodeErrorBit;
|
|
}
|
|
uint32 result = Decode<eEncoding_Win1252, false>(state, unit);
|
|
if (!(unit < 0x80 || unit > 0x9F) && (result == unit))
|
|
{
|
|
return cDecodeErrorBit; // Not defined in codepage 1252.
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// SBase<T, Tag>:
// Holds a T by deriving from it, so that an empty T can benefit from empty-base-optimization.
// Tag only serves to make otherwise identical bases distinct types.
// GetBase() gives access to the held object.
template<typename T, int Tag = 0>
struct SBase
    : T
{
    SBase(T value)
        : T(value)
    {
    }

    T& GetBase()
    {
        return *this;
    }

    const T& GetBase() const
    {
        return *this;
    }
};

// SBase<T&, Tag>:
// A reference cannot be a base class, so for reference types the reference is stored as a member instead.
template<typename T, int Tag>
struct SBase<T&, Tag>
{
    T& base;

    SBase(T& referee)
        : base(referee)
    {
    }

    T& GetBase()
    {
        return base;
    }

    const T& GetBase() const
    {
        return base;
    }
};
|
|
|
|
// SDecoder<Encoding, Sink, Recovery>:
|
|
// Functor to decode UCS code-points from an input range.
|
|
// Recovery functor will be invoked as a fall-back if decoding fails.
|
|
// This allows ensuring all the output is valid (even if the input isn't).
|
|
// Note: The destructor will automatically flush any remaining (erroneous) state, you can also call Finalize().
|
|
template<EEncoding InputEncoding, typename Sink, typename Recovery = void>
|
|
struct SDecoder
|
|
: SBase<Sink, 1>
|
|
, SBase<Recovery, 2>
|
|
{
|
|
uint32 state;
|
|
SDecoder(Sink sink, Recovery recovery = Recovery())
|
|
: SBase<Sink, 1>(sink)
|
|
, SBase<Recovery, 2>(recovery)
|
|
, state(0) {}
|
|
SDecoder() { Finalize(); }
|
|
Recovery& recovery() { return SBase<Recovery, 2>::GetBase(); }
|
|
Sink& sink() { return SBase<Sink, 1>::GetBase(); }
|
|
void operator()(uint32 unit)
|
|
{
|
|
state = Detail::Decode<InputEncoding, true>(state, unit);
|
|
if (state <= 0x1FFFFF)
|
|
{
|
|
sink()(state);
|
|
state = 0;
|
|
}
|
|
else if (state & Detail::cDecodeErrorBit)
|
|
{
|
|
recovery()(sink(), state, unit);
|
|
state = 0;
|
|
}
|
|
}
|
|
void Finalize()
|
|
{
|
|
if (state)
|
|
{
|
|
recovery()(sink(), state, 0);
|
|
state = 0;
|
|
}
|
|
}
|
|
};
|
|
|
|
// SDecoder<Encoding, Sink>:
|
|
// Functor to decode to UCS code-points from an input range.
|
|
// No attempt to discover or recover from encoding errors is made, can only safely be used with known-valid input.
|
|
template<EEncoding InputEncoding, typename Sink>
|
|
struct SDecoder<InputEncoding, Sink, void>
|
|
: SBase<Sink>
|
|
{
|
|
uint32 state;
|
|
SDecoder(Sink sink)
|
|
: SBase<Sink>(sink)
|
|
, state(0) {}
|
|
Sink& sink() { return SBase<Sink>::GetBase(); }
|
|
void operator()(uint32 unit)
|
|
{
|
|
state = Detail::Decode<InputEncoding, false>(state, unit);
|
|
if (state <= 0x1FFFFF)
|
|
{
|
|
sink()(state);
|
|
state = 0;
|
|
}
|
|
}
|
|
void Finalize() {}
|
|
};
|
|
|
|
// SEncoder<Encoding, Sink>:
|
|
// Generic Unicode encoder functor.
|
|
// Encoding must be one an encoding type for which output is supported.
|
|
// The Sink type must have HintSequence member for UTF-8 and UTF-16 (although it may be a no-op).
|
|
// In general, you feed operator() with UCS code-points and it will emit code-units.
|
|
template<EEncoding OutputEncoding, typename Sink>
|
|
struct SEncoder
|
|
{
|
|
static const bool value = false;
|
|
};
|
|
|
|
// SEncoder<Encoding, Sink>:
|
|
// Specialization of ASCII encoder functor.
|
|
// Note: Any out-of-range character is mapped to question mark.
|
|
template<typename Sink>
|
|
struct SEncoder<eEncoding_ASCII, Sink>
|
|
: SBase<Sink>
|
|
{
|
|
static const bool value = true;
|
|
typedef uint8 value_type;
|
|
SEncoder(Sink sink)
|
|
: SBase<Sink>(sink) {}
|
|
void operator()(uint32 cp)
|
|
{
|
|
cp = cp < 0x80 ? cp : (uint32)'?';
|
|
SBase<Sink>::GetBase()(value_type(cp));
|
|
}
|
|
};
|
|
|
|
// SEncoder<Encoding, Sink>:
|
|
// Specialization of UTF-8 encoder functor.
|
|
template<typename Sink>
|
|
struct SEncoder<eEncoding_UTF8, Sink>
|
|
: SBase<Sink>
|
|
{
|
|
static const bool value = true;
|
|
typedef uint8 value_type;
|
|
SEncoder(Sink sink)
|
|
: SBase<Sink>(sink) {}
|
|
Sink& sink() { return SBase<Sink>::GetBase(); }
|
|
void operator()(uint32 cp)
|
|
{
|
|
if (cp < 0x80)
|
|
{
|
|
// Single byte sequence.
|
|
sink()(value_type(cp));
|
|
}
|
|
else
|
|
{
|
|
// Expand 21-bit value to 32-bit.
|
|
uint32 bits =
|
|
(cp & 0x00003F) +
|
|
((cp & 0x000FC0) << 2) +
|
|
((cp & 0x03F000) << 4) +
|
|
((cp & 0x1C0000) << 6);
|
|
|
|
// Type of sequence.
|
|
const bool bSeq4 = (cp >= 0x10000);
|
|
const bool bSeq3 = (cp >= 0x800);
|
|
|
|
// Mask lead-bytes and continuation-bytes.
|
|
uint32 mask = 0xEFE0C080;
|
|
mask ^= (bSeq3 << 14);
|
|
mask += (bSeq4 ? 0xA00000 : 0);
|
|
bits |= mask;
|
|
|
|
// Length of the sequence.
|
|
const uint32 length = (uint32)bSeq4 + (uint32)bSeq3 + 1;
|
|
sink().HintSequence(length);
|
|
|
|
// Sink the multi-byte sequence.
|
|
if (bSeq4)
|
|
{
|
|
sink()(value_type(bits >> 24));
|
|
}
|
|
if (bSeq3)
|
|
{
|
|
sink()(value_type(bits >> 16));
|
|
}
|
|
sink()(value_type(bits >> 8));
|
|
sink()(value_type(bits));
|
|
}
|
|
}
|
|
};
|
|
|
|
// SEncoder<Encoding, Sink>:
|
|
// Specialization of UTF-16 encoder functor.
|
|
template<typename Sink>
|
|
struct SEncoder<eEncoding_UTF16, Sink>
|
|
: SBase<Sink>
|
|
{
|
|
static const bool value = true;
|
|
typedef uint16 value_type;
|
|
SEncoder(Sink sink)
|
|
: SBase<Sink>(sink) {}
|
|
Sink& sink() { return SBase<Sink>::GetBase(); }
|
|
void operator()(uint32 cp)
|
|
{
|
|
if (cp < 0x10000)
|
|
{
|
|
// Single unit
|
|
sink()(value_type(cp));
|
|
}
|
|
else
|
|
{
|
|
// We will generate two-element sequence
|
|
sink().HintSequence(2);
|
|
|
|
// Surrogate pair
|
|
cp -= 0x10000;
|
|
uint32 lead = ((cp >> 10) & 0x3FF) + Detail::cLeadSurrogateFirst;
|
|
uint32 trail = (cp & 0x3FF) + Detail::cTrailSurrogateFirst;
|
|
sink()(value_type(lead));
|
|
sink()(value_type(trail));
|
|
}
|
|
}
|
|
};
|
|
|
|
// SEncoder<Encoding, Sink>:
|
|
// Specialization of UTF-32 encoder functor.
|
|
// Note: This is a no-op, but we want to be able to express UTF-32 just like the other encodings.
|
|
template<typename Sink>
|
|
struct SEncoder<eEncoding_UTF32, Sink>
|
|
: SBase<Sink>
|
|
{
|
|
static const bool value = true;
|
|
typedef uint32 value_type;
|
|
SEncoder(Sink sink)
|
|
: SBase<Sink>(sink) {}
|
|
void operator()(uint32 cp)
|
|
{
|
|
SBase<Sink>::GetBase()(value_type(cp));
|
|
}
|
|
};
|
|
|
|
// SDecoder<Encoding, SEncoder<Encoding>, void>:
|
|
// Specialization for unsafe no-op trans-coding.
|
|
// Since the conversion is a no-op, no need to keep any state or do any computation.
|
|
// Note: For a decoding with a fallback, this is not possible since we can't guarantee the input is valid.
|
|
template<EEncoding SameEncoding, typename Sink>
|
|
struct SDecoder<SameEncoding, SEncoder<SameEncoding, Sink>, void>
|
|
{
|
|
Sink sink;
|
|
SDecoder(Sink s)
|
|
: sink(s) {}
|
|
void operator()(uint32 unit)
|
|
{
|
|
sink(unit);
|
|
}
|
|
void Finalize() {}
|
|
};
|
|
|
|
// SRecoveryDiscard<Sink>:
|
|
// Recovery handler that, on encoding error, discards the offending sequence.
|
|
template<typename Sink>
|
|
struct SRecoveryDiscard
|
|
{
|
|
SRecoveryDiscard() {}
|
|
void operator()([[maybe_unused]] Sink& sink, [[maybe_unused]] uint32 error, [[maybe_unused]] uint32 unit) {}
|
|
};
|
|
|
|
// SRecoveryReplace<Sink>:
|
|
// Recovery handler that, on encoding error, replaces the sequence with replacement-character (U+FFFD).
|
|
// Note: This implementation matches a whole invalid sequence, it could be changed to emit for every code-unit.
|
|
template<typename Sink>
|
|
struct SRecoveryReplace
|
|
{
|
|
SRecoveryReplace() {}
|
|
void operator()(Sink& sink, uint32 error, uint32 unit) { sink(cReplacementCharacter); }
|
|
};
|
|
|
|
// SRecoveryFallback<Sink>:
|
|
// Recovery handler that, on encoding error, falls back to another encoding.
|
|
// The fallback encoding must be stateless (ie: ASCII, Latin1 or Win1252).
|
|
// This type assumes an 8-bit primary encoding since the only viable fallback encodings are 8-bit.
|
|
template<typename Sink, EEncoding FallbackEncoding, typename NextFallback>
|
|
struct SRecoveryFallback
|
|
: NextFallback
|
|
{
|
|
SRecoveryFallback()
|
|
: NextFallback() {}
|
|
void operator()(Sink& sink, uint32 error, uint32 unit)
|
|
{
|
|
SDecoder<FallbackEncoding, Sink&, NextFallback&> fallback(sink, *static_cast<NextFallback*>(this));
|
|
uint8 byte1(error >> 16);
|
|
uint8 byte2(error >> 8);
|
|
uint8 byte3(error);
|
|
uint8 byte4(unit);
|
|
if (byte1)
|
|
{
|
|
fallback(byte1);
|
|
}
|
|
if (byte1 | byte2)
|
|
{
|
|
fallback(byte2);
|
|
}
|
|
if (byte1 | byte2 | byte3)
|
|
{
|
|
fallback(byte3);
|
|
}
|
|
fallback(byte4);
|
|
}
|
|
};
|
|
|
|
// SRecoveryFallbackHelper<Sink, RecoveryMethod>:
|
|
// Helper to pick a SRecoveryFallback instantiation based on RecoveryMethod.
|
|
template<EEncoding OutputEncoding, typename Sink, EErrorRecovery RecoveryMethod>
|
|
struct SRecoveryFallbackHelper
|
|
{
|
|
// A compilation error here means RecoveryMethod value was unexpected here
|
|
COMPILE_TIME_ASSERT(
|
|
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard ||
|
|
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenReplace ||
|
|
RecoveryMethod == eErrorRecovery_FallbackWin1252ThenDiscard ||
|
|
RecoveryMethod == eErrorRecovery_FallbackWin1252ThenReplace);
|
|
typedef SEncoder<OutputEncoding, Sink> SinkType;
|
|
static const EEncoding FallbackEncoding =
|
|
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard ||
|
|
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenReplace
|
|
? eEncoding_Latin1 : eEncoding_Win1252;
|
|
template<typename Dummy, bool WithDiscard>
|
|
struct Pick
|
|
{
|
|
typedef SRecoveryDiscard<SinkType> type;
|
|
};
|
|
template<typename Dummy>
|
|
struct Pick<Dummy, false>
|
|
{
|
|
typedef SRecoveryReplace<SinkType> type;
|
|
};
|
|
typedef typename Pick<Sink,
|
|
RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard ||
|
|
RecoveryMethod == eErrorRecovery_FallbackWin1252ThenDiscard>::type NextFallback;
|
|
typedef SRecoveryFallback<SinkType, FallbackEncoding, NextFallback> RecoveryType;
|
|
typedef SDecoder<eEncoding_UTF8, SinkType, RecoveryType> FullType;
|
|
};
|
|
|
|
// STranscoderSelect<InputEncoding, OutputEncoding, Sink, RecoveryMethod>:
|
|
// Derives a chained decoder/encoder pair that performs code-unit -> code-unit transform.
|
|
// The RecoveryMethod template parameter determines the behavior during encoding.
|
|
// This is the basic way to perform trans-coding, and is the type instantiated by the higher-level functions.
|
|
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink, EErrorRecovery RecoveryMethod>
|
|
struct STranscoderSelect;
|
|
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink>
|
|
struct STranscoderSelect<InputEncoding, OutputEncoding, Sink, eErrorRecovery_None>
|
|
: SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, void>
|
|
{
|
|
typedef SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, void> TranscoderType;
|
|
STranscoderSelect(Sink sink)
|
|
: TranscoderType(sink) {}
|
|
};
|
|
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink>
|
|
struct STranscoderSelect<InputEncoding, OutputEncoding, Sink, eErrorRecovery_Discard>
|
|
: SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, SRecoveryDiscard<SEncoder<OutputEncoding, Sink> > >
|
|
{
|
|
typedef SRecoveryDiscard<SEncoder<OutputEncoding, Sink> > RecoveryType;
|
|
typedef SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, RecoveryType> TranscoderType;
|
|
STranscoderSelect(Sink sink)
|
|
: TranscoderType(sink) {}
|
|
};
|
|
template<EEncoding InputEncoding, EEncoding OutputEncoding, typename Sink>
|
|
struct STranscoderSelect<InputEncoding, OutputEncoding, Sink, eErrorRecovery_Replace>
|
|
: SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, SRecoveryReplace<SEncoder<OutputEncoding, Sink> > >
|
|
{
|
|
typedef SRecoveryReplace<SEncoder<OutputEncoding, Sink> > RecoveryType;
|
|
typedef SDecoder<InputEncoding, SEncoder<OutputEncoding, Sink>, RecoveryType> TranscoderType;
|
|
STranscoderSelect(Sink sink)
|
|
: TranscoderType(sink) {}
|
|
};
|
|
template<EEncoding OutputEncoding, typename Sink>
|
|
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenDiscard>
|
|
: SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenDiscard>::FullType
|
|
{
|
|
static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackLatin1ThenDiscard;
|
|
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType RecoveryType;
|
|
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType TranscoderType;
|
|
STranscoderSelect(Sink sink)
|
|
: TranscoderType(sink) {}
|
|
};
|
|
template<EEncoding OutputEncoding, typename Sink>
|
|
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenReplace>
|
|
: SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackLatin1ThenReplace>::FullType
|
|
{
|
|
static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackLatin1ThenReplace;
|
|
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType RecoveryType;
|
|
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType TranscoderType;
|
|
STranscoderSelect(Sink sink)
|
|
: TranscoderType(sink) {}
|
|
};
|
|
template<EEncoding OutputEncoding, typename Sink>
|
|
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenDiscard>
|
|
: SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenDiscard>::FullType
|
|
{
|
|
static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackWin1252ThenDiscard;
|
|
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType RecoveryType;
|
|
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType TranscoderType;
|
|
STranscoderSelect(Sink sink)
|
|
: TranscoderType(sink) {}
|
|
};
|
|
template<EEncoding OutputEncoding, typename Sink>
|
|
struct STranscoderSelect<eEncoding_UTF8, OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenReplace>
|
|
: SRecoveryFallbackHelper<OutputEncoding, Sink, eErrorRecovery_FallbackWin1252ThenReplace>::FullType
|
|
{
|
|
static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackWin1252ThenReplace;
|
|
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::RecoveryType RecoveryType;
|
|
typedef typename SRecoveryFallbackHelper<OutputEncoding, Sink, RecoveryMethod>::FullType TranscoderType;
|
|
STranscoderSelect(Sink sink)
|
|
: TranscoderType(sink) {}
|
|
};
|
|
|
|
// SIsSafeEncoding<R>:
|
|
// Check if the given recovery mode is safe.
|
|
// This is used for SFINAE checks in higher-level functions.
|
|
template<EErrorRecovery R>
|
|
struct SIsSafeEncoding
|
|
{
|
|
static const bool value =
|
|
R == eErrorRecovery_Discard ||
|
|
R == eErrorRecovery_Replace ||
|
|
R == eErrorRecovery_FallbackLatin1ThenDiscard ||
|
|
R == eErrorRecovery_FallbackLatin1ThenReplace ||
|
|
R == eErrorRecovery_FallbackWin1252ThenDiscard ||
|
|
R == eErrorRecovery_FallbackWin1252ThenReplace;
|
|
};
|
|
|
|
// SIsCopyableEncoding<I, O>:
|
|
// Check if data in one encoding can be copied directly to another encoding.
|
|
// This is the basis for block-copy and string-assign optimizations in un-safe conversion functions.
|
|
// Note: There are more valid combinations, they are left out since those can't occur with the output encodings supported.
|
|
// Note: Only used for un-safe functions since it doesn't account for potential invalid sequences (they would be copied over).
|
|
template<EEncoding InputEncoding, EEncoding OutputEncoding>
|
|
struct SIsCopyableEncoding
|
|
{
|
|
static const bool value =
|
|
InputEncoding == eEncoding_ASCII || // ASCII and Latin1 values don't change in any encoding.
|
|
(InputEncoding == eEncoding_Latin1 && OutputEncoding != eEncoding_ASCII); // Except Latin1 -> ASCII is lossy.
|
|
};
|
|
template<EEncoding SameEncoding>
|
|
struct SIsCopyableEncoding<SameEncoding, SameEncoding>
|
|
{
|
|
static const bool value = true; // If the input and output encodings are the same, then it's copyable.
|
|
};
|
|
}
|
|
}
|