From 7e2c2a4c6d5388338828250baf55a082643657ab Mon Sep 17 00:00:00 2001
From: lisyarus
Date: Sun, 13 Apr 2025 12:43:40 +0300
Subject: [PATCH] Implement utf8 encoding and replace std::codecvt with custom code in utf32 <-> utf8 conversions

---
 libs/util/include/psemek/util/unicode.hpp |  2 +
 libs/util/source/unicode.cpp              | 64 +++++++++++++++++++++--
 2 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/libs/util/include/psemek/util/unicode.hpp b/libs/util/include/psemek/util/unicode.hpp
index 8cbb914c..66cf3f3b 100644
--- a/libs/util/include/psemek/util/unicode.hpp
+++ b/libs/util/include/psemek/util/unicode.hpp
@@ -11,6 +11,8 @@ namespace psemek::util
 	std::string to_utf8(std::u32string const & str);
 	std::u32string from_utf8(std::string const & str);
 
+	char * append_utf8(char32_t c, char * out);
+
 	struct utf8_iterator
 	{
 		using value_type = char32_t;
diff --git a/libs/util/source/unicode.cpp b/libs/util/source/unicode.cpp
index e7fa9ef0..e981d70a 100644
--- a/libs/util/source/unicode.cpp
+++ b/libs/util/source/unicode.cpp
@@ -11,16 +11,72 @@ namespace psemek::util
 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
-	using converter = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;
-
+	// Converts a UTF-32 string to UTF-8. Values that are not Unicode scalar
+	// values come out as U+FFFD, because append_utf8 substitutes them.
 	std::string to_utf8(std::u32string const & str)
 	{
-		return converter{}.to_bytes(str);
+		// No code point needs more than 4 bytes: allocate the worst case up
+		// front, encode in place, then trim to the bytes actually written.
+		std::string result;
+		result.resize(str.size() * 4);
+
+		char * out = result.data();
+		for (char32_t c : str)
+			out = append_utf8(c, out);
+		result.resize(out - result.data());
+
+		return result;
 	}
 
+	// Converts a UTF-8 string to UTF-32 by collecting the code points that
+	// utf8_range yields for it.
 	std::u32string from_utf8(std::string const & str)
 	{
-		return converter{}.from_bytes(str);
+		std::u32string result;
+		// Capacity hint: one code point per byte is the pure-ASCII case.
+		result.reserve(str.size());
+		for (char32_t c : utf8_range(str))
+			result.push_back(c);
+		return result;
+	}
+
+	// Encodes `c` as UTF-8 starting at `out` and returns the position one past
+	// the last byte written. Between 1 and 4 bytes are written; the caller
+	// must provide that much room.
+	// Surrogates (U+D800..U+DFFF) and values above U+10FFFF are not Unicode
+	// scalar values and have no well-formed UTF-8 encoding, so U+FFFD
+	// REPLACEMENT CHARACTER is written in their place.
+	char * append_utf8(char32_t c, char * out)
+	{
+		// Without this, a value of 0x200000 or more would spill past the three
+		// payload bits of the 4-byte lead and produce a bogus lead byte.
+		if ((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
+			c = 0xFFFD;
+
+		if (c < 0x80)
+		{
+			*out++ = static_cast<char>(c);
+		}
+		else if (c < 0x800)
+		{
+			*out++ = static_cast<char>(0b11000000 | (c >> 6));
+			*out++ = static_cast<char>(0b10000000 | (c & 0b00111111));
+		}
+		else if (c < 0x10000)
+		{
+			*out++ = static_cast<char>(0b11100000 | (c >> 12));
+			*out++ = static_cast<char>(0b10000000 | ((c >> 6) & 0b00111111));
+			*out++ = static_cast<char>(0b10000000 | (c & 0b00111111));
+		}
+		else
+		{
+			*out++ = static_cast<char>(0b11110000 | (c >> 18));
+			*out++ = static_cast<char>(0b10000000 | ((c >> 12) & 0b00111111));
+			*out++ = static_cast<char>(0b10000000 | ((c >> 6) & 0b00111111));
+			*out++ = static_cast<char>(0b10000000 | (c & 0b00111111));
+		}
+
+		return out;
 	}
 
 #ifdef __clang__