#include #include #include namespace psemek::util { #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" #endif std::string to_utf8(std::u32string const & str) { std::string result; result.resize(str.size() * 4); char * out = result.data(); for (char32_t c : str) out = append_utf8(c, out); result.resize(out - result.data()); return result; } std::u32string from_utf8(std::string const & str) { std::u32string result; result.reserve(str.size()); for (char32_t c : utf8_range(str)) result.push_back(c); return result; } char * append_utf8(char32_t c, char * out) { if (c < 0x80) { *out++ = c; } else if (c < 0x800) { *out++ = 0b11000000 | (c >> 6); *out++ = 0b10000000 | (c & 0b00111111); } else if (c < 0x10000) { *out++ = 0b11100000 | (c >> 12); *out++ = 0b10000000 | ((c >> 6) & 0b00111111); *out++ = 0b10000000 | (c & 0b00111111); } else { *out++ = 0b11110000 | (c >> 18); *out++ = 0b10000000 | ((c >> 12) & 0b00111111); *out++ = 0b10000000 | ((c >> 6) & 0b00111111); *out++ = 0b10000000 | (c & 0b00111111); } return out; } #ifdef __clang__ #pragma clang diagnostic pop #endif static bool is_1_byte(char c) { return (c & 0b10000000) == 0b00000000; } static bool is_2_byte(char c) { return (c & 0b11100000) == 0b11000000; } static bool is_3_byte(char c) { return (c & 0b11110000) == 0b11100000; } static bool is_4_byte(char c) { return (c & 0b11111000) == 0b11110000; } static bool is_middle_byte(char c) { return (c & 0b11000000) == 0b10000000; } static void assert_middle(char const * ptr) { #ifdef PSEMEK_DEBUG if (!is_middle_byte(*ptr)) throw invalid_utf8(ptr); #else (void)is_middle_byte(*ptr); #endif } utf8_iterator & utf8_iterator::operator ++() { if (is_1_byte(ptr[0])) ptr += 1; else if (is_2_byte(ptr[0])) ptr += 2; else if (is_3_byte(ptr[0])) ptr += 3; else if (is_4_byte(ptr[0])) ptr += 4; return *this; } utf8_iterator utf8_iterator::operator ++(int) { utf8_iterator copy(*this); ++(*this); return copy; } utf8_iterator & utf8_iterator::operator --() { --ptr; while (is_middle_byte(*ptr)) --ptr; return *this; } utf8_iterator utf8_iterator::operator --(int) { utf8_iterator copy(*this); --(*this); return copy; } char32_t utf8_iterator::operator *() const { if (is_1_byte(ptr[0])) { return ptr[0]; } else if (is_2_byte(ptr[0])) { assert_middle(ptr + 1); return (char32_t(ptr[0] & 0b11111) << 6) | (char32_t(ptr[1] & 0b111111)); } else if (is_3_byte(ptr[0])) { assert_middle(ptr + 1); assert_middle(ptr + 2); return (char32_t(ptr[0] & 0b1111) << 12) | (char32_t(ptr[1] & 0b111111) << 6) | char32_t(ptr[2] & 0b111111); } else if (is_4_byte(ptr[0])) { assert_middle(ptr + 1); assert_middle(ptr + 2); assert_middle(ptr + 3); return (char32_t(ptr[0] & 0b111) << 18) | (char32_t(ptr[1] & 0b111111) << 12) | (char32_t(ptr[2] & 0b111111) << 6) | char32_t(ptr[3] & 0b111111); } else throw invalid_utf8(ptr); } }