psemek/libs/util/source/unicode.cpp

113 lines
2.2 KiB
C++

#include <psemek/util/unicode.hpp>
#include <codecvt>
#include <locale>
namespace psemek::util
{
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
// Standard-library codec used for whole-string conversion between UTF-8 bytes
// and UTF-32 code points. std::wstring_convert / std::codecvt_utf8 are
// deprecated since C++17, hence the diagnostic suppression around this section.
using converter = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;

// Encodes a UTF-32 string as UTF-8.
//
// A fresh codec is created on every call, so no conversion state is shared
// between calls. Conversion failures are reported by std::wstring_convert
// (it throws std::range_error when no fallback string was configured).
std::string to_utf8(std::u32string const & str)
{
    converter codec;
    return codec.to_bytes(str);
}

// Decodes a UTF-8 byte string into UTF-32 code points.
//
// Mirrors to_utf8: a fresh codec per call, with failures reported by
// std::wstring_convert (std::range_error).
std::u32string from_utf8(std::string const & str)
{
    converter codec;
    return codec.from_bytes(str);
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
// Returns true when the top bit of `c` is clear (bit pattern 0xxxxxxx),
// i.e. the byte is a complete single-byte UTF-8 sequence.
static bool is_1_byte(char c)
{
    // Work on unsigned char so the shift does not depend on whether plain
    // char is signed on this platform.
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 7) == 0;
}
// Returns true when `c` has the bit pattern 110xxxxx, the lead byte of a
// two-byte UTF-8 sequence.
static bool is_2_byte(char c)
{
    // Compare only the top three bits; unsigned char keeps the shift
    // independent of the signedness of plain char.
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 5) == 0b110;
}
// Returns true when `c` has the bit pattern 1110xxxx, the lead byte of a
// three-byte UTF-8 sequence.
static bool is_3_byte(char c)
{
    // Compare only the top four bits; unsigned char keeps the shift
    // independent of the signedness of plain char.
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 4) == 0b1110;
}
// Returns true when `c` has the bit pattern 11110xxx, the lead byte of a
// four-byte UTF-8 sequence.
static bool is_4_byte(char c)
{
    // Compare only the top five bits; unsigned char keeps the shift
    // independent of the signedness of plain char.
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 3) == 0b11110;
}
// Returns true when `c` has the bit pattern 10xxxxxx, i.e. it is a
// continuation (non-leading) byte of a multi-byte UTF-8 sequence.
static bool is_middle_byte(char c)
{
    // Compare only the top two bits; unsigned char keeps the shift
    // independent of the signedness of plain char.
    auto const byte = static_cast<unsigned char>(c);
    return (byte >> 6) == 0b10;
}
// Checks that the byte at `ptr` is a UTF-8 continuation byte.
//
// With PSEMEK_DEBUG defined, a non-continuation byte raises invalid_utf8(ptr).
// Without it, the check is evaluated and its result discarded, so nothing is
// ever thrown from here.
static void assert_middle(char const * ptr)
{
#ifdef PSEMEK_DEBUG
    bool const is_continuation = is_middle_byte(*ptr);
    if (is_continuation)
        return;
    throw invalid_utf8(ptr);
#else
    // Result intentionally ignored; presumably this call only keeps
    // is_middle_byte referenced in non-debug builds.
    static_cast<void>(is_middle_byte(*ptr));
#endif
}
// Pre-increment: advances to the first byte of the next encoded code point.
//
// The step size (1-4 bytes) is taken from the lead byte at the current
// position only; continuation bytes are not inspected here and no end-of-buffer
// check is made.
//
// Throws invalid_utf8 when the current byte is not a valid lead byte (a
// continuation byte 10xxxxxx, or 11111xxx). Previously that case fell through
// every branch and left the iterator where it was, so a loop that only
// increments could never terminate; operator* already throws for the same
// condition.
utf8_iterator & utf8_iterator::operator ++()
{
    char const lead = ptr[0];

    if (is_1_byte(lead))
        ptr += 1;
    else if (is_2_byte(lead))
        ptr += 2;
    else if (is_3_byte(lead))
        ptr += 3;
    else if (is_4_byte(lead))
        ptr += 4;
    else
        throw invalid_utf8(ptr);

    return *this;
}
// Post-increment: advances this iterator via the pre-increment operator and
// returns a copy of the iterator as it was before the advance.
utf8_iterator utf8_iterator::operator ++(int)
{
    utf8_iterator previous(*this);
    this->operator ++();
    return previous;
}
// Decodes and returns the code point encoded at the current position.
//
// The lead byte selects the sequence length (1-4 bytes). Each continuation
// byte is passed through assert_middle before the payload bits are combined:
//   1 byte : 0xxxxxxx                              ->  7 payload bits
//   2 bytes: 110xxxxx 10xxxxxx                     -> 11 payload bits
//   3 bytes: 1110xxxx 10xxxxxx 10xxxxxx            -> 16 payload bits
//   4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx   -> 21 payload bits
//
// Throws invalid_utf8 when the current byte is not a valid lead byte.
// No further validation (overlong forms, surrogates, value range) is done.
char32_t utf8_iterator::operator *() const
{
    char const lead = ptr[0];

    if (is_1_byte(lead))
        return static_cast<char32_t>(lead);

    if (is_2_byte(lead))
    {
        assert_middle(ptr + 1);

        char32_t const b0 = static_cast<char32_t>(lead & 0x1F);
        char32_t const b1 = static_cast<char32_t>(ptr[1] & 0x3F);
        return (b0 << 6) | b1;
    }

    if (is_3_byte(lead))
    {
        assert_middle(ptr + 1);
        assert_middle(ptr + 2);

        char32_t const b0 = static_cast<char32_t>(lead & 0x0F);
        char32_t const b1 = static_cast<char32_t>(ptr[1] & 0x3F);
        char32_t const b2 = static_cast<char32_t>(ptr[2] & 0x3F);
        return (b0 << 12) | (b1 << 6) | b2;
    }

    if (is_4_byte(lead))
    {
        assert_middle(ptr + 1);
        assert_middle(ptr + 2);
        assert_middle(ptr + 3);

        char32_t const b0 = static_cast<char32_t>(lead & 0x07);
        char32_t const b1 = static_cast<char32_t>(ptr[1] & 0x3F);
        char32_t const b2 = static_cast<char32_t>(ptr[2] & 0x3F);
        char32_t const b3 = static_cast<char32_t>(ptr[3] & 0x3F);
        return (b0 << 18) | (b1 << 12) | (b2 << 6) | b3;
    }

    // Continuation byte or 11111xxx: not a legal start of a sequence.
    throw invalid_utf8(ptr);
}
}