#include #include #include namespace psemek::util { #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" #endif using converter = std::wstring_convert, char32_t>; std::string to_utf8(std::u32string const & str) { return converter{}.to_bytes(str); } std::u32string from_utf8(std::string const & str) { return converter{}.from_bytes(str); } #ifdef __clang__ #pragma clang diagnostic pop #endif static bool is_1_byte(char c) { return (c & 0b10000000) == 0b00000000; } static bool is_2_byte(char c) { return (c & 0b11100000) == 0b11000000; } static bool is_3_byte(char c) { return (c & 0b11110000) == 0b11100000; } static bool is_4_byte(char c) { return (c & 0b11111000) == 0b11110000; } static bool is_middle_byte(char c) { return (c & 0b11000000) == 0b10000000; } static void assert_middle(char const * ptr) { #ifdef PSEMEK_DEBUG if (!is_middle_byte(*ptr)) throw invalid_utf8(ptr); #else (void)is_middle_byte(*ptr); #endif } utf8_iterator & utf8_iterator::operator ++() { if (is_1_byte(ptr[0])) ptr += 1; else if (is_2_byte(ptr[0])) ptr += 2; else if (is_3_byte(ptr[0])) ptr += 3; else if (is_4_byte(ptr[0])) ptr += 4; return *this; } utf8_iterator utf8_iterator::operator ++(int) { utf8_iterator copy(*this); ++(*this); return copy; } char32_t utf8_iterator::operator *() const { if (is_1_byte(ptr[0])) { return ptr[0]; } else if (is_2_byte(ptr[0])) { assert_middle(ptr + 1); return (char32_t(ptr[0] & 0b11111) << 6) | (char32_t(ptr[1] & 0b111111)); } else if (is_3_byte(ptr[0])) { assert_middle(ptr + 1); assert_middle(ptr + 2); return (char32_t(ptr[0] & 0b1111) << 12) | (char32_t(ptr[1] & 0b111111) << 6) | char32_t(ptr[2] & 0b111111); } else if (is_4_byte(ptr[0])) { assert_middle(ptr + 1); assert_middle(ptr + 2); assert_middle(ptr + 3); return (char32_t(ptr[0] & 0b111) << 18) | (char32_t(ptr[1] & 0b111111) << 12) | (char32_t(ptr[2] & 0b111111) << 6) | char32_t(ptr[3] & 0b111111); } else throw invalid_utf8(ptr); } char const * invalid_utf8::what() const noexcept { return "Invalid UTF-8 string"; } }