166 lines
3.1 KiB
C++
166 lines
3.1 KiB
C++
#include <psemek/util/unicode.hpp>
|
|
|
|
#include <codecvt>
|
|
#include <locale>
|
|
|
|
namespace psemek::util
|
|
{
|
|
|
|
#ifdef __clang__
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
|
#endif
|
|
|
|
std::string to_utf8(std::u32string const & str)
|
|
{
|
|
std::string result;
|
|
result.resize(str.size() * 4);
|
|
|
|
char * out = result.data();
|
|
for (char32_t c : str)
|
|
out = append_utf8(c, out);
|
|
result.resize(out - result.data());
|
|
|
|
return result;
|
|
}
|
|
|
|
std::u32string from_utf8(std::string const & str)
|
|
{
|
|
std::u32string result;
|
|
result.reserve(str.size());
|
|
for (char32_t c : utf8_range(str))
|
|
result.push_back(c);
|
|
return result;
|
|
}
|
|
|
|
// Encodes a single code point as UTF-8 and writes it to `out`.
//
// c:   code point to encode. Values above U+10FFFF cannot be represented
//      in UTF-8; they are written as U+FFFD (REPLACEMENT CHARACTER) rather
//      than as a malformed sequence. Surrogate values (U+D800..U+DFFF) are
//      not treated specially and are encoded with the 3-byte form.
// out: destination buffer; up to four bytes are written starting here.
//
// Returns a pointer one past the last byte written.
char * append_utf8(char32_t c, char * out)
{
    // Anything above the last Unicode code point would need a lead byte
    // that does not fit the 4-byte form 0b11110xxx.
    constexpr char32_t max_code_point = 0x10FFFF;
    constexpr char32_t replacement_character = 0xFFFD;
    if (c > max_code_point)
        c = replacement_character;

    if (c < 0x80)
    {
        // 0xxxxxxx
        *out++ = static_cast<char>(c);
    }
    else if (c < 0x800)
    {
        // 110xxxxx 10xxxxxx
        *out++ = static_cast<char>(0b11000000 | (c >> 6));
        *out++ = static_cast<char>(0b10000000 | (c & 0b00111111));
    }
    else if (c < 0x10000)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        *out++ = static_cast<char>(0b11100000 | (c >> 12));
        *out++ = static_cast<char>(0b10000000 | ((c >> 6) & 0b00111111));
        *out++ = static_cast<char>(0b10000000 | (c & 0b00111111));
    }
    else
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        *out++ = static_cast<char>(0b11110000 | (c >> 18));
        *out++ = static_cast<char>(0b10000000 | ((c >> 12) & 0b00111111));
        *out++ = static_cast<char>(0b10000000 | ((c >> 6) & 0b00111111));
        *out++ = static_cast<char>(0b10000000 | (c & 0b00111111));
    }

    return out;
}
|
|
|
|
#ifdef __clang__
|
|
#pragma clang diagnostic pop
|
|
#endif
|
|
|
|
// True when the top bit of c is clear (bit pattern 0xxxxxxx), i.e. c is a
// complete single-byte sequence.
static bool is_1_byte(char c)
{
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 7) == 0b0;
}
|
|
|
|
// True when c has the bit pattern 110xxxxx, the lead byte of a 2-byte
// sequence.
static bool is_2_byte(char c)
{
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 5) == 0b110;
}
|
|
|
|
// True when c has the bit pattern 1110xxxx, the lead byte of a 3-byte
// sequence.
static bool is_3_byte(char c)
{
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 4) == 0b1110;
}
|
|
|
|
// True when c has the bit pattern 11110xxx, the lead byte of a 4-byte
// sequence.
static bool is_4_byte(char c)
{
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 3) == 0b11110;
}
|
|
|
|
// True when c has the bit pattern 10xxxxxx, i.e. it is a continuation
// byte rather than the first byte of a sequence.
static bool is_middle_byte(char c)
{
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 6) == 0b10;
}
|
|
|
|
static void assert_middle(char const * ptr)
|
|
{
|
|
#ifdef PSEMEK_DEBUG
|
|
if (!is_middle_byte(*ptr))
|
|
throw invalid_utf8(ptr);
|
|
#else
|
|
(void)is_middle_byte(*ptr);
|
|
#endif
|
|
}
|
|
|
|
// Pre-increment: moves to the start of the next encoded code point.
//
// The step (1 to 4 bytes) is chosen from the lead byte at the current
// position; the continuation bytes themselves are not inspected here.
// A byte that is not a valid lead byte (a continuation byte or 0b11111xxx)
// is skipped on its own, so the iterator always makes forward progress
// instead of stalling forever on malformed input.
utf8_iterator & utf8_iterator::operator ++()
{
    char const lead = ptr[0];

    if (is_1_byte(lead))
        ptr += 1;
    else if (is_2_byte(lead))
        ptr += 2;
    else if (is_3_byte(lead))
        ptr += 3;
    else if (is_4_byte(lead))
        ptr += 4;
    else
        ptr += 1; // not a lead byte: skip it to resynchronise

    return *this;
}
|
|
|
|
// Post-increment: advances this iterator and returns its value from
// before the advance.
utf8_iterator utf8_iterator::operator ++(int)
{
    utf8_iterator previous(*this);
    this->operator ++();
    return previous;
}
|
|
|
|
// Pre-decrement: steps back one byte at a time until the byte under the
// iterator is no longer a continuation byte. At least one step is always
// taken.
utf8_iterator & utf8_iterator::operator --()
{
    do
    {
        --ptr;
    }
    while (is_middle_byte(*ptr));

    return *this;
}
|
|
|
|
// Post-decrement: moves this iterator back and returns its value from
// before the move.
utf8_iterator utf8_iterator::operator --(int)
{
    utf8_iterator previous(*this);
    this->operator --();
    return previous;
}
|
|
|
|
char32_t utf8_iterator::operator *() const
|
|
{
|
|
if (is_1_byte(ptr[0]))
|
|
{
|
|
return ptr[0];
|
|
}
|
|
else if (is_2_byte(ptr[0]))
|
|
{
|
|
assert_middle(ptr + 1);
|
|
return (char32_t(ptr[0] & 0b11111) << 6) | (char32_t(ptr[1] & 0b111111));
|
|
}
|
|
else if (is_3_byte(ptr[0]))
|
|
{
|
|
assert_middle(ptr + 1);
|
|
assert_middle(ptr + 2);
|
|
return (char32_t(ptr[0] & 0b1111) << 12) | (char32_t(ptr[1] & 0b111111) << 6) | char32_t(ptr[2] & 0b111111);
|
|
}
|
|
else if (is_4_byte(ptr[0]))
|
|
{
|
|
assert_middle(ptr + 1);
|
|
assert_middle(ptr + 2);
|
|
assert_middle(ptr + 3);
|
|
return (char32_t(ptr[0] & 0b111) << 18) | (char32_t(ptr[1] & 0b111111) << 12) | (char32_t(ptr[2] & 0b111111) << 6) | char32_t(ptr[3] & 0b111111);
|
|
}
|
|
else
|
|
throw invalid_utf8(ptr);
|
|
}
|
|
|
|
}
|