diff options
Diffstat (limited to 'Source/modules/asura-core/Font/Utf.inc')
-rw-r--r-- | Source/modules/asura-core/Font/Utf.inc | 752 |
1 files changed, 752 insertions, 0 deletions
diff --git a/Source/modules/asura-core/Font/Utf.inc b/Source/modules/asura-core/Font/Utf.inc new file mode 100644 index 0000000..69a523b --- /dev/null +++ b/Source/modules/asura-core/Font/Utf.inc @@ -0,0 +1,752 @@ +//////////////////////////////////////////////////////////// +// +// SFML - Simple and Fast Multimedia Library +// Copyright (C) 2007-2019 Laurent Gomila (laurent@sfml-dev.org) +// +// This software is provided 'as-is', without any express or implied warranty. +// In no event will the authors be held liable for any damages arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it freely, +// subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; +// you must not claim that you wrote the original software. +// If you use this software in a product, an acknowledgment +// in the product documentation would be appreciated but is not required. +// +// 2. Altered source versions must be plainly marked as such, +// and must not be misrepresented as being the original software. +// +// 3. This notice may not be removed or altered from any source distribution. +// +//////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////// +// References: +// +// https://www.unicode.org/ +// https://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c +// https://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h +// https://people.w3.org/rishida/scripts/uniview/conversion +// +//////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////// +template <typename In> +In Utf<8>::Decode(In begin, In end, Uint32& output, Uint32 replacement) +{ + // Some useful precomputed data + static const int trailing[256] = + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 + }; + static const Uint32 offsets[6] = + { + 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 + }; + + // decode the character + int trailingBytes = trailing[static_cast<Uint8>(*begin)]; + if (begin + trailingBytes < end) + { + output = 0; + switch (trailingBytes) + { + case 5: output += static_cast<Uint8>(*begin++); output <<= 6; + case 4: output += static_cast<Uint8>(*begin++); output <<= 6; + case 3: output += static_cast<Uint8>(*begin++); output <<= 6; + case 2: output += static_cast<Uint8>(*begin++); output <<= 6; + case 1: output += static_cast<Uint8>(*begin++); output <<= 6; + case 0: output += static_cast<Uint8>(*begin++); + } + output -= offsets[trailingBytes]; + } + else + { + // Incomplete character + begin = end; + output = replacement; + } + + return begin; +} + + +//////////////////////////////////////////////////////////// +template <typename Out> +Out Utf<8>::Encode(Uint32 input, Out output, Uint8 replacement) +{ + // Some useful precomputed data + static const Uint8 firstBytes[7] = + { + 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC + }; + + // encode the character + if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF))) + { + // Invalid character + if (replacement) + *output++ = replacement; + } + else + { + // Valid character + + // Get the number of bytes to write + std::size_t bytestoWrite = 1; + if (input < 0x80) bytestoWrite = 1; + else if (input < 0x800) bytestoWrite = 2; + else if (input < 0x10000) bytestoWrite = 3; + else if (input <= 0x0010FFFF) bytestoWrite = 4; + + // Extract the bytes to write + Uint8 bytes[4]; + switch (bytestoWrite) + { + case 4: bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; + case 3: bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; + case 2: bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; + case 1: bytes[0] = static_cast<Uint8> (input | firstBytes[bytestoWrite]); + } + + // Add them to the output + output = std::copy(bytes, bytes + bytestoWrite, output); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In> +In Utf<8>::Next(In begin, In end) +{ + Uint32 codepoint; + return Decode(begin, end, codepoint); +} + + +//////////////////////////////////////////////////////////// +template <typename In> +std::size_t Utf<8>::Count(In begin, In end) +{ + std::size_t length = 0; + while (begin < end) + { + begin = Next(begin, end); + ++length; + } + + return length; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<8>::FromAnsi(In begin, In end, Out output, const std::locale& locale) +{ + while (begin < end) + { + Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); + output = Encode(codepoint, output); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<8>::FromWide(In begin, In end, Out output) +{ + while (begin < end) + { + Uint32 codepoint = Utf<32>::DecodeWide(*begin++); + output = Encode(codepoint, output); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<8>::FromLatin1(In begin, In end, Out output) +{ + // Latin-1 is directly compatible with Unicode encodings, + // and can thus be treated as (a sub-range of) UTF-32 + while (begin < end) + output = Encode(*begin++, output); + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<8>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) +{ + while (begin < end) + { + Uint32 codepoint; + begin = Decode(begin, end, codepoint); + output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<8>::ToWide(In begin, In end, Out output, wchar_t replacement) +{ + while (begin < end) + { + Uint32 codepoint; + begin = Decode(begin, end, codepoint); + output = Utf<32>::EncodeWide(codepoint, output, replacement); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<8>::ToLatin1(In begin, In end, Out output, char replacement) +{ + // Latin-1 is directly compatible with Unicode encodings, + // and can thus be treated as (a sub-range of) UTF-32 + while (begin < end) + { + Uint32 codepoint; + begin = Decode(begin, end, codepoint); + *output++ = codepoint < 256 ? static_cast<char>(codepoint) : replacement; + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<8>::ToUtf8(In begin, In end, Out output) +{ + return std::copy(begin, end, output); +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<8>::ToUtf16(In begin, In end, Out output) +{ + while (begin < end) + { + Uint32 codepoint; + begin = Decode(begin, end, codepoint); + output = Utf<16>::Encode(codepoint, output); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<8>::ToUtf32(In begin, In end, Out output) +{ + while (begin < end) + { + Uint32 codepoint; + begin = Decode(begin, end, codepoint); + *output++ = codepoint; + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In> +In Utf<16>::Decode(In begin, In end, Uint32& output, Uint32 replacement) +{ + Uint16 first = *begin++; + + // If it's a surrogate pair, first convert to a single UTF-32 character + if ((first >= 0xD800) && (first <= 0xDBFF)) + { + if (begin < end) + { + Uint32 second = *begin++; + if ((second >= 0xDC00) && (second <= 0xDFFF)) + { + // The second element is valid: convert the two elements to a UTF-32 character + output = ((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000; + } + else + { + // Invalid character + output = replacement; + } + } + else + { + // Invalid character + begin = end; + output = replacement; + } + } + else + { + // We can make a direct copy + output = first; + } + + return begin; +} + + +//////////////////////////////////////////////////////////// +template <typename Out> +Out Utf<16>::Encode(Uint32 input, Out output, Uint16 replacement) +{ + if (input <= 0xFFFF) + { + // The character can be copied directly, we just need to check if it's in the valid range + if ((input >= 0xD800) && (input <= 0xDFFF)) + { + // Invalid character (this range is reserved) + if (replacement) + *output++ = replacement; + } + else + { + // Valid character directly convertible to a single UTF-16 character + *output++ = static_cast<Uint16>(input); + } + } + else if (input > 0x0010FFFF) + { + // Invalid character (greater than the maximum Unicode value) + if (replacement) + *output++ = replacement; + } + else + { + // The input character will be converted to two UTF-16 elements + input -= 0x0010000; + *output++ = static_cast<Uint16>((input >> 10) + 0xD800); + *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In> +In Utf<16>::Next(In begin, In end) +{ + Uint32 codepoint; + return Decode(begin, end, codepoint); +} + + +//////////////////////////////////////////////////////////// +template <typename In> +std::size_t Utf<16>::Count(In begin, In end) +{ + std::size_t length = 0; + while (begin < end) + { + begin = Next(begin, end); + ++length; + } + + return length; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<16>::FromAnsi(In begin, In end, Out output, const std::locale& locale) +{ + while (begin < end) + { + Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); + output = Encode(codepoint, output); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<16>::FromWide(In begin, In end, Out output) +{ + while (begin < end) + { + Uint32 codepoint = Utf<32>::DecodeWide(*begin++); + output = Encode(codepoint, output); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<16>::FromLatin1(In begin, In end, Out output) +{ + // Latin-1 is directly compatible with Unicode encodings, + // and can thus be treated as (a sub-range of) UTF-32 + return std::copy(begin, end, output); +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<16>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) +{ + while (begin < end) + { + Uint32 codepoint; + begin = Decode(begin, end, codepoint); + output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<16>::ToWide(In begin, In end, Out output, wchar_t replacement) +{ + while (begin < end) + { + Uint32 codepoint; + begin = Decode(begin, end, codepoint); + output = Utf<32>::EncodeWide(codepoint, output, replacement); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<16>::ToLatin1(In begin, In end, Out output, char replacement) +{ + // Latin-1 is directly compatible with Unicode encodings, + // and can thus be treated as (a sub-range of) UTF-32 + while (begin < end) + { + *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement; + begin++; + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<16>::ToUtf8(In begin, In end, Out output) +{ + while (begin < end) + { + Uint32 codepoint; + begin = Decode(begin, end, codepoint); + output = Utf<8>::Encode(codepoint, output); + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<16>::ToUtf16(In begin, In end, Out output) +{ + return std::copy(begin, end, output); +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<16>::ToUtf32(In begin, In end, Out output) +{ + while (begin < end) + { + Uint32 codepoint; + begin = Decode(begin, end, codepoint); + *output++ = codepoint; + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In> +In Utf<32>::Decode(In begin, In /*end*/, Uint32& output, Uint32 /*replacement*/) +{ + output = *begin++; + return begin; +} + + +//////////////////////////////////////////////////////////// +template <typename Out> +Out Utf<32>::Encode(Uint32 input, Out output, Uint32 /*replacement*/) +{ + *output++ = input; + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In> +In Utf<32>::Next(In begin, In /*end*/) +{ + return ++begin; +} + + +//////////////////////////////////////////////////////////// +template <typename In> +std::size_t Utf<32>::Count(In begin, In end) +{ + return begin - end; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<32>::FromAnsi(In begin, In end, Out output, const std::locale& locale) +{ + while (begin < end) + *output++ = DecodeAnsi(*begin++, locale); + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<32>::FromWide(In begin, In end, Out output) +{ + while (begin < end) + *output++ = DecodeWide(*begin++); + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<32>::FromLatin1(In begin, In end, Out output) +{ + // Latin-1 is directly compatible with Unicode encodings, + // and can thus be treated as (a sub-range of) UTF-32 + return std::copy(begin, end, output); +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<32>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) +{ + while (begin < end) + output = EncodeAnsi(*begin++, output, replacement, locale); + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<32>::ToWide(In begin, In end, Out output, wchar_t replacement) +{ + while (begin < end) + output = EncodeWide(*begin++, output, replacement); + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<32>::ToLatin1(In begin, In end, Out output, char replacement) +{ + // Latin-1 is directly compatible with Unicode encodings, + // and can thus be treated as (a sub-range of) UTF-32 + while (begin < end) + { + *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement; + begin++; + } + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<32>::ToUtf8(In begin, In end, Out output) +{ + while (begin < end) + output = Utf<8>::Encode(*begin++, output); + + return output; +} + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<32>::ToUtf16(In begin, In end, Out output) +{ + while (begin < end) + output = Utf<16>::Encode(*begin++, output); + + return output; +} + + +//////////////////////////////////////////////////////////// +template <typename In, typename Out> +Out Utf<32>::ToUtf32(In begin, In end, Out output) +{ + return std::copy(begin, end, output); +} + + +//////////////////////////////////////////////////////////// +template <typename In> +Uint32 Utf<32>::DecodeAnsi(In input, const std::locale& locale) +{ + // On Windows, GCC's standard library (glibc++) has almost + // no support for Unicode stuff. As a consequence, in this + // context we can only use the default locale and ignore + // the one passed as parameter. + + #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ + (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ + !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */ + + (void)locale; // to avoid warnings + + wchar_t character = 0; + mbtowc(&character, &input, 1); + return static_cast<Uint32>(character); + + #else + + // Get the facet of the locale which deals with character conversion + const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); + + // Use the facet to convert each character of the input string + return static_cast<Uint32>(facet.widen(input)); + + #endif +} + + +//////////////////////////////////////////////////////////// +template <typename In> +Uint32 Utf<32>::DecodeWide(In input) +{ + // The encoding of wide characters is not well defined and is left to the system; + // however we can safely assume that it is UCS-2 on Windows and + // UCS-4 on Unix systems. + // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4, + // and UCS-4 *is* UTF-32). + + return input; +} + + +//////////////////////////////////////////////////////////// +template <typename Out> +Out Utf<32>::EncodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale) +{ + // On Windows, gcc's standard library (glibc++) has almost + // no support for Unicode stuff. As a consequence, in this + // context we can only use the default locale and ignore + // the one passed as parameter. + + #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ + (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ + !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */ + + (void)locale; // to avoid warnings + + char character = 0; + if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0) + *output++ = character; + else if (replacement) + *output++ = replacement; + + return output; + + #else + + // Get the facet of the locale which deals with character conversion + const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); + + // Use the facet to convert each character of the input string + *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement); + + return output; + + #endif +} + + +//////////////////////////////////////////////////////////// +template <typename Out> +Out Utf<32>::EncodeWide(Uint32 codepoint, Out output, wchar_t replacement) +{ + // The encoding of wide characters is not well defined and is left to the system; + // however we can safely assume that it is UCS-2 on Windows and + // UCS-4 on Unix systems. + // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4). + // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32). + + switch (sizeof(wchar_t)) + { + case 4: + { + *output++ = static_cast<wchar_t>(codepoint); + break; + } + + default: + { + if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF))) + { + *output++ = static_cast<wchar_t>(codepoint); + } + else if (replacement) + { + *output++ = replacement; + } + break; + } + } + + return output; +} |