// Origin: commit 7520901f30d35e37e847993c735bb920670a0048
// Author: lisyarus, Tue, 23 Feb 2021 20:13:33 +0300
// Subject: [PATCH] Add utf8 iterator & range
//
// The patch adds code to two files, reproduced below as two sections:
//   * libs/util/include/psemek/util/unicode.hpp  (header, guarded by `#pragma once`)
//   * libs/util/source/unicode.cpp               (implementation)

// ===========================================================================
// libs/util/include/psemek/util/unicode.hpp
// ===========================================================================

#include <cstddef>
#include <exception>
#include <iterator>
#include <string>
#include <string_view>

namespace psemek::util
{

    // Pre-existing conversions between UTF-8 and UTF-32 strings. They are
    // only declared in this section; the patch shows just the tail of
    // from_utf8's definition (`return converter{}.from_bytes(str);`) as
    // unchanged context in unicode.cpp.
    std::string to_utf8(std::u32string const & str);
    std::u32string from_utf8(std::string const & str);

    /// Input iterator that walks a UTF-8 byte sequence one code point at a time.
    ///
    /// The iterator holds nothing but a pointer to the lead byte of the
    /// current sequence. It has no knowledge of where the buffer ends, so
    /// neither operator++ nor operator* performs bounds checks: a sequence
    /// truncated by the end of the buffer makes them touch bytes past it.
    struct utf8_iterator
    {
        using value_type = char32_t;
        using pointer = void;
        // operator* yields the decoded code point by value.
        using reference = char32_t;
        using difference_type = std::ptrdiff_t;
        using iterator_category = std::input_iterator_tag;

        /// Lead byte of the sequence the iterator currently designates.
        char const * ptr;

        /// Advances to the lead byte of the next sequence.
        /// @throws invalid_utf8 if the current byte is not a valid lead byte.
        utf8_iterator & operator ++();

        /// Post-increment: advances and returns the previous position.
        /// @throws invalid_utf8 if the current byte is not a valid lead byte.
        utf8_iterator operator ++(int);

        /// Decodes the code point at the current position.
        /// @throws invalid_utf8 if the current byte is not a valid lead byte,
        ///         or (only when PSEMEK_DEBUG is defined) if a continuation
        ///         byte is malformed.
        char32_t operator *() const;
    };

    /// Two iterators are equal when they point at the same byte.
    inline bool operator == (utf8_iterator const & i1, utf8_iterator const & i2)
    {
        return i1.ptr == i2.ptr;
    }

    inline bool operator != (utf8_iterator const & i1, utf8_iterator const & i2)
    {
        return i1.ptr != i2.ptr;
    }

    /// Non-owning view of a UTF-8 string that can be iterated by code point,
    /// e.g. `for (char32_t c : utf8_range(text))`.
    struct utf8_range
    {
        /// The viewed bytes; the range does not own them.
        std::string_view str;

        /// Views a null-terminated string (the terminator is not included).
        utf8_range(char const * begin)
            : str(begin)
        {}

        /// Views the bytes in [begin, end).
        utf8_range(char const * begin, char const * end)
            : str(begin, static_cast<std::size_t>(end - begin))
        {}

        /// Views the bytes of an existing string_view.
        utf8_range(std::string_view str)
            : str(str)
        {}

        utf8_range(utf8_range const &) = default;

        auto begin() const
        {
            return utf8_iterator{str.data()};
        }

        auto end() const
        {
            return utf8_iterator{str.data() + str.size()};
        }

        /// Number of code points in the range (linear in the byte length).
        /// @throws invalid_utf8 if an invalid lead byte is encountered.
        std::size_t size() const
        {
            std::size_t count = 0;
            auto const last = end();
            // Stop on `<` rather than `!=`: a truncated trailing sequence
            // moves the iterator past end(), and `!=` would keep walking.
            for (auto it = begin(); it.ptr < last.ptr; ++it)
                ++count;
            return count;
        }
    };

    /// Thrown when a byte sequence cannot be decoded as UTF-8.
    struct invalid_utf8
        : std::exception
    {
        /// @param data pointer stored for later retrieval through data();
        ///        the throw sites in unicode.cpp pass the offending byte.
        invalid_utf8(char const * data)
            : data_{data}
        {}

        char const * what() const noexcept override;

        /// The pointer that was passed to the constructor.
        char const * data() const { return data_; }

    private:
        char const * data_;
    };

}

// ===========================================================================
// libs/util/source/unicode.cpp
// (the definitions of to_utf8 / from_utf8 precede this code in the original
//  file and are not touched by the patch)
// ===========================================================================

namespace psemek::util
{

    // Lead-byte classifiers. Bytes are taken as unsigned char so the masks do
    // not depend on whether plain `char` is signed.

    // 0xxxxxxx: single-byte (ASCII) sequence.
    static bool is_1_byte(unsigned char c)
    {
        return (c & 0b10000000) == 0b00000000;
    }

    // 110xxxxx: lead byte of a 2-byte sequence.
    static bool is_2_byte(unsigned char c)
    {
        return (c & 0b11100000) == 0b11000000;
    }

    // 1110xxxx: lead byte of a 3-byte sequence.
    static bool is_3_byte(unsigned char c)
    {
        return (c & 0b11110000) == 0b11100000;
    }

    // 11110xxx: lead byte of a 4-byte sequence.
    static bool is_4_byte(unsigned char c)
    {
        return (c & 0b11111000) == 0b11110000;
    }

    // 10xxxxxx: continuation byte. Only referenced when PSEMEK_DEBUG is set.
    [[maybe_unused]] static bool is_middle_byte(unsigned char c)
    {
        return (c & 0b11000000) == 0b10000000;
    }

    // Debug-only validation of a continuation byte; compiles to nothing
    // unless PSEMEK_DEBUG is defined.
    static void assert_middle([[maybe_unused]] char const * ptr)
    {
#ifdef PSEMEK_DEBUG
        if (!is_middle_byte(static_cast<unsigned char>(*ptr)))
            throw invalid_utf8(ptr);
#endif
    }

    utf8_iterator & utf8_iterator::operator ++()
    {
        auto const lead = static_cast<unsigned char>(ptr[0]);
        if (is_1_byte(lead))
            ptr += 1;
        else if (is_2_byte(lead))
            ptr += 2;
        else if (is_3_byte(lead))
            ptr += 3;
        else if (is_4_byte(lead))
            ptr += 4;
        else
            // Leaving ptr unchanged here would make every increment-only
            // loop spin forever, so report the error like operator* does.
            throw invalid_utf8(ptr);
        return *this;
    }

    utf8_iterator utf8_iterator::operator ++(int)
    {
        utf8_iterator copy(*this);
        ++(*this);
        return copy;
    }

    char32_t utf8_iterator::operator *() const
    {
        // Byte i of the current sequence, widened without sign extension.
        auto const byte = [this](std::size_t i)
        {
            return static_cast<char32_t>(static_cast<unsigned char>(ptr[i]));
        };

        auto const lead = static_cast<unsigned char>(ptr[0]);

        // The lead byte carries the MOST significant payload bits; each
        // continuation byte contributes the next 6 bits below it.
        if (is_1_byte(lead))
        {
            return byte(0);
        }
        else if (is_2_byte(lead))
        {
            assert_middle(ptr + 1);
            return ((byte(0) & 0b11111) << 6)
                 | (byte(1) & 0b111111);
        }
        else if (is_3_byte(lead))
        {
            assert_middle(ptr + 1);
            assert_middle(ptr + 2);
            return ((byte(0) & 0b1111) << 12)
                 | ((byte(1) & 0b111111) << 6)
                 | (byte(2) & 0b111111);
        }
        else if (is_4_byte(lead))
        {
            assert_middle(ptr + 1);
            assert_middle(ptr + 2);
            assert_middle(ptr + 3);
            return ((byte(0) & 0b111) << 18)
                 | ((byte(1) & 0b111111) << 12)
                 | ((byte(2) & 0b111111) << 6)
                 | (byte(3) & 0b111111);
        }
        else
            throw invalid_utf8(ptr);
    }

    char const * invalid_utf8::what() const noexcept
    {
        return "Invalid UTF-8 string";
    }

}